Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,19 @@ RUN pip install --upgrade setuptools
RUN pip install pysam

WORKDIR /opt/repos/uta/

# Install dependencies
COPY pyproject.toml ./
RUN pip install -e .[dev]
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two pip-installs is the best solution I could come up with to prevent reinstalling dependencies whenever src changed (for a faster development cycle using docker).


# Install uta package
COPY etc ./etc
COPY misc ./misc
COPY sbin ./sbin
COPY src ./src
RUN pip install -e .[dev]
RUN pip install -e .

# ---------- #

# UTA test image
FROM uta as uta-test
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
```
docker compose run ncbi-download
docker compose run uta-extract
[OPTIONAL] docker compose run uta-check-transcripts
docker compose run seqrepo-load
docker compose run uta-load
```
Expand All @@ -351,7 +352,7 @@ docker compose run uta-load
```

#### 2C. Manual splign transcripts
To load splign-manual transcripts, the workflow expects an input txdata.yaml file and splign alignments. Define this path
To load splign-manual transcripts, the workflow expects an input txdata.yaml file and splign alignments. Define this path
using the environment variable $UTA_SPLIGN_MANUAL_DIR. These file paths should exist:
- `$UTA_SPLIGN_MANUAL_DIR/splign-manual/txdata.yaml`
- `$UTA_SPLIGN_MANUAL_DIR/splign-manual/alignments/*.splign`
Expand Down
10 changes: 10 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,16 @@ services:
interval: 10s
retries: 80
network_mode: host
# Optional step: runs sbin/uta-check-transcripts against the old UTA version,
# using the mounted work directory for input/output and the mounted log directory for logs.
uta-check-transcripts:
image: uta-update
command: sbin/uta-check-transcripts ${UTA_ETL_OLD_UTA_VERSION} /uta-check-transcripts/work /uta-check-transcripts/logs
# Do not start until the uta service reports healthy.
depends_on:
uta:
condition: service_healthy
# Host work/log directories are mounted at the container paths passed to the command above.
volumes:
- ${UTA_ETL_WORK_DIR}:/uta-check-transcripts/work
- ${UTA_ETL_LOG_DIR}:/uta-check-transcripts/logs
network_mode: host
uta-load:
image: uta-update
command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} ${UTA_ETL_NEW_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
Expand Down
28 changes: 28 additions & 0 deletions sbin/uta-check-transcripts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

# Find transcripts in the current UTA database version which are not in the txinfo file,
# and write those transcripts to the check-transcripts.txt file.
#
# uta-extract, which produces the needed txinfo file, must run before this script.
# Any action taken with respect to the identified transcripts is case-dependent and optional.
#
# Usage: uta-check-transcripts <source_uta_v> <working_dir> <log_dir>
#
# source_uta_v is the current UTA database version.
# working_dir stores input and output files
#   (reads $working_dir/txinfo.gz, writes $working_dir/check-transcripts.txt).
# log_dir stores log files (writes $log_dir/check-transcripts.log); created if missing.

set -euxo pipefail

# Default missing positional arguments to the empty string. With `set -u` active,
# a bare $1/$2/$3 aborts with "unbound variable" before the usage message below
# can ever be printed.
source_uta_v=${1:-}
working_dir=${2:-}
log_dir=${3:-}

if [ -z "$source_uta_v" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
then
    echo 'Usage: uta-check-transcripts <source_uta_v> <working_dir> <log_dir>'
    exit 1
fi

mkdir -p "$log_dir"

# UTA_USE_SCHEMA=false is passed to the uta CLI through its environment.
# stderr is merged into stdout so the log file captures both streams; `pipefail`
# makes a failing uta command fail the script even though tee succeeds.
UTA_USE_SCHEMA=false uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf check-transcripts --prefixes=NM,NR "$working_dir/txinfo.gz" "$source_uta_v" "$working_dir/check-transcripts.txt" 2>&1 | \
    tee "$log_dir/check-transcripts.log"
2 changes: 2 additions & 0 deletions src/uta/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
uta --version
uta (-C CONF ...) [options] shell
uta (-C CONF ...) [options] drop-schema
uta (-C CONF ...) [options] check-transcripts [--prefixes=PREFIXES] TXINFO_FILE UTA_SCHEMA OUTPUT_FILE
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found it confusing to define the cli interface in the docstring.

uta (-C CONF ...) [options] create-schema
uta (-C CONF ...) [options] update-meta-data
uta (-C CONF ...) [options] load-sql FILES ...
Expand Down Expand Up @@ -68,6 +69,7 @@ def main():
dispatch_table = [
("align-exons", ul.align_exons),
("analyze", ul.analyze),
("check-transcripts", ul.check_transcripts),
("create-schema", ul.create_schema),
("update-meta-data", ul.update_meta_data),
("drop-schema", ul.drop_schema),
Expand Down
42 changes: 41 additions & 1 deletion src/uta/loading.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import absolute_import, division, print_function, unicode_literals

from configparser import ConfigParser
import csv
import datetime
import gzip
Expand All @@ -16,7 +17,7 @@
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy import text
from sqlalchemy import or_, text
import psycopg2.extras
import six
from uta_align.align.algorithms import cigar_alignment, needleman_wunsch_gotoh_align
Expand Down Expand Up @@ -166,6 +167,45 @@ def analyze(session, opts, cf):
session.commit()


def check_transcripts(session: Session, opts: Dict, cf: ConfigParser):
    """
    Find transcripts in the given UTA database version which are not in the given txinfo file,
    and write those transcripts to the specified file.

    Keys read from ``opts``:

    * ``TXINFO_FILE`` -- path to a gzip-compressed, tab-delimited file whose
      header row includes an ``ac`` column.
    * ``UTA_SCHEMA`` -- schema name to place on the session's search_path.
    * ``OUTPUT_FILE`` -- path to write; one accession per line, sorted.
    * ``--prefixes`` (optional) -- comma-separated accession prefixes. When
      given, only database transcripts starting with one of them are
      considered. Surrounding whitespace and empty entries are ignored.

    ``cf`` supplies the role to assume (``admin_role`` in section ``uta``).
    """
    # required opts
    txinfo_file = opts['TXINFO_FILE']
    uta_schema = opts['UTA_SCHEMA']
    output_file = opts['OUTPUT_FILE']

    # optional opts: comma-separated list of prefixes.
    # Empty entries (e.g. from a trailing comma) are dropped: an empty prefix
    # would match every accession and silently disable the filter.
    prefixes = opts.get('--prefixes') or ''
    transcript_prefixes = [p.strip() for p in prefixes.split(',') if p.strip()]

    role = cf.get('uta', 'admin_role')
    session.execute(text(f"set role {role};"))
    session.execute(text(f"set search_path = {uta_schema};"))

    # fetch transcript accessions from uta
    Transcript = uta.models.Transcript
    query = session.query(Transcript)
    if transcript_prefixes:
        # autoescape so that "_" and "%" in a prefix (e.g. "NM_") are matched
        # literally instead of acting as LIKE wildcards.
        query = query.filter(
            or_(*[Transcript.ac.startswith(p, autoescape=True) for p in transcript_prefixes])
        )
    query = query.with_entities(Transcript.ac)
    uta_transcripts = {ac for (ac, ) in query}

    # collect incoming txinfo transcripts
    with gzip.open(txinfo_file, 'rt') as txinfo_fp:
        txinfo_transcripts = {
            row['ac'] for row in csv.DictReader(txinfo_fp, delimiter='\t')
        }

    # transcripts known to uta but absent from the txinfo file
    result_transcripts = uta_transcripts - txinfo_transcripts

    # write difference to output file
    with open(output_file, 'wt') as output_fp:
        output_fp.writelines(f'{t}\n' for t in sorted(result_transcripts))


def create_schema(session, opts, cf):
"""Create and populate initial schema"""
session.execute(text("set role {admin_role};".format(
Expand Down
4 changes: 3 additions & 1 deletion src/uta/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
"""

import datetime
from distutils.util import strtobool
import hashlib
import os

import sqlalchemy as sa
import sqlalchemy.orm as sao
Expand All @@ -17,7 +19,7 @@
# also see etc/uta.conf

schema_version = "1.2"
use_schema = True
use_schema = strtobool(os.environ.get('UTA_USE_SCHEMA', 'true'))
if use_schema:
schema_name = "uta"
else:
Expand Down
63 changes: 63 additions & 0 deletions tests/test_uta_loading.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import configparser
import signal
from tempfile import NamedTemporaryFile
import unittest
from unittest.mock import Mock, patch

Expand Down Expand Up @@ -153,6 +154,68 @@ def test_load_assoc_ac(self):
]
self.assertEqual(aa_list, expected_aa_list)

def test_check_transcripts(self):
    """
    check_transcripts should report a database transcript that is missing from
    tests/data/txinfo.gz and omit one that is present in it.
    """
    o1 = usam.Origin(
        name='NCBI',
        url='http://bogus.com/ncbi',
        url_ac_fmt='http://bogus.com/ncbi/{ac}',
    )
    g1 = usam.Gene(
        gene_id='140606',
        hgnc='SELENOM',
        symbol='SELENOM',
        maploc='22q12.2',
        descr='selenoprotein M',
        summary='selenoprotein M',
        aliases='SELM,SEPM',
        type='protein-coding',
        xrefs='MIM:610918,HGNC:HGNC:30397,Ensembl:ENSG00000198832,AllianceGenome:HGNC:30397',
    )
    self.session.add(o1)
    self.session.add(g1)
    # commit first so the origin/gene ids referenced by the transcripts below exist
    self.session.commit()

    # add transcript in txinfo.gz
    t1 = usam.Transcript(
        ac='NM_080430.4',
        origin_id=o1.origin_id,
        gene_id=g1.gene_id,
        hgnc=g1.hgnc,
        cds_start_i=63,
        cds_end_i=501,
        cds_md5='abc123',
        codon_table=1,
    )
    # add transcript not in txinfo.gz
    t2 = usam.Transcript(
        ac='NM_080430.3',
        origin_id=o1.origin_id,
        gene_id=g1.gene_id,
        hgnc=g1.hgnc,
        cds_start_i=63,
        cds_end_i=501,
        cds_md5='abc123',
        codon_table=1,
    )
    self.session.add(t1)
    self.session.add(t2)
    self.session.commit()

    with NamedTemporaryFile(mode='w+t') as tf:
        opts = {
            'TXINFO_FILE': 'tests/data/txinfo.gz',
            'UTA_SCHEMA': 'uta',
            'OUTPUT_FILE': tf.name,
        }

        ul.check_transcripts(self.session, opts, self.cf)

        # check_transcripts wrote through a separate handle on the same path;
        # rewind this handle and read back what was written.
        tf.seek(0)
        reported = tf.read().splitlines()

    # expect one transcript but not the other; compare whole lines so that a
    # longer accession sharing the same leading characters cannot satisfy the check
    self.assertIn('NM_080430.3', reported)
    self.assertNotIn('NM_080430.4', reported)

def test_load_txinfo(self):
"""
Loading file tests/data/txinfo.gz should create transcript, exon_set, exon, and translation_exception records in the database.
Expand Down
Loading