invitae · nvta1209 · Dec 9, 2024 · Dec 5, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -15,13 +15,19 @@ RUN pip install --upgrade setuptools
 RUN pip install pysam
 
 WORKDIR /opt/repos/uta/
+
+# Install dependencies
 COPY pyproject.toml ./
+RUN pip install -e .[dev]
+
+# Install uta package
 COPY etc ./etc
 COPY misc ./misc
 COPY sbin ./sbin
 COPY src ./src
-RUN pip install -e .[dev]
+RUN pip install -e .
 
+# ---------- #
 
 # UTA test image
 FROM uta as uta-test

diff --git a/README.md b/README.md
@@ -339,6 +339,7 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 ```
 docker compose run ncbi-download
 docker compose run uta-extract
+docker compose run uta-check-transcripts  # optional
 docker compose run seqrepo-load
 docker compose run uta-load
 ```
@@ -351,7 +352,7 @@ docker compose run uta-load
 ```
 
 #### 2C. Manual splign transcripts
-To load splign-manual transcripts, the workflow expects an input txdata.yaml file and splign alignments. Define this path 
+To load splign-manual transcripts, the workflow expects an input txdata.yaml file and splign alignments. Define this path
 using the environment variable $UTA_SPLIGN_MANUAL_DIR. These file paths should exist:
 - `$UTA_SPLIGN_MANUAL_DIR/splign-manual/txdata.yaml`
 - `$UTA_SPLIGN_MANUAL_DIR/splign-manual/alignments/*.splign`

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -46,6 +46,16 @@ services:
       interval: 10s
       retries: 80
     network_mode: host
+  uta-check-transcripts:  # optional; reads txinfo.gz (produced by uta-extract) from the mounted work dir
+    image: uta-update
+    command: sbin/uta-check-transcripts ${UTA_ETL_OLD_UTA_VERSION} /uta-check-transcripts/work /uta-check-transcripts/logs
+    depends_on:
+      uta:
+        condition: service_healthy
+    volumes:
+      - ${UTA_ETL_WORK_DIR}:/uta-check-transcripts/work
+      - ${UTA_ETL_LOG_DIR}:/uta-check-transcripts/logs
+    network_mode: host
   uta-load:
     image: uta-update
     command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} ${UTA_ETL_NEW_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs

diff --git a/sbin/uta-check-transcripts b/sbin/uta-check-transcripts
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Find transcripts in the current UTA database version which are not in the txinfo file,
+# and write those transcripts to the check-transcripts.txt file.
+#
+# uta-extract, which produces the needed txinfo file, must run before this script.
+# Any action taken with respect to the identified transcripts is case-dependent and optional.
+#
+# source_uta_v is the current UTA database version.
+# working_dir holds the input txinfo.gz and receives check-transcripts.txt.
+# log_dir receives check-transcripts.log and is created if it does not exist.
+
+set -euxo pipefail
+
+# Default missing arguments to empty so that `set -u` cannot abort before the usage check runs.
+source_uta_v=${1:-}
+working_dir=${2:-}
+log_dir=${3:-}
+
+if [ -z "$source_uta_v" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
+then
+    echo 'Usage: uta-check-transcripts <source_uta_v> <working_dir> <log_dir>'
+    exit 1
+fi
+
+mkdir -p "$log_dir"
+UTA_USE_SCHEMA=false uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf check-transcripts --prefixes=NM,NR "$working_dir/txinfo.gz" "$source_uta_v" "$working_dir/check-transcripts.txt" 2>&1 | \
+    tee "$log_dir/check-transcripts.log"
diff --git a/src/uta/cli.py b/src/uta/cli.py
@@ -5,6 +5,7 @@
   uta --version
   uta (-C CONF ...) [options] shell
   uta (-C CONF ...) [options] drop-schema
+  uta (-C CONF ...) [options] check-transcripts [--prefixes=PREFIXES] TXINFO_FILE UTA_SCHEMA OUTPUT_FILE
   uta (-C CONF ...) [options] create-schema
   uta (-C CONF ...) [options] update-meta-data
   uta (-C CONF ...) [options] load-sql FILES ...
@@ -68,6 +69,7 @@ def main():
     dispatch_table = [
         ("align-exons",         ul.align_exons),
         ("analyze",             ul.analyze),
+        ("check-transcripts",   ul.check_transcripts),
         ("create-schema",       ul.create_schema),
         ("update-meta-data",    ul.update_meta_data),
         ("drop-schema",         ul.drop_schema),

diff --git a/src/uta/loading.py b/src/uta/loading.py
@@ -1,5 +1,6 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+from configparser import ConfigParser
 import csv
 import datetime
 import gzip
@@ -16,7 +17,7 @@
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session
 from sqlalchemy.orm.exc import NoResultFound
-from sqlalchemy import text
+from sqlalchemy import or_, text
 import psycopg2.extras
 import six
 from uta_align.align.algorithms import cigar_alignment, needleman_wunsch_gotoh_align
@@ -166,6 +167,45 @@ def analyze(session, opts, cf):
     session.commit()
 
 
+def check_transcripts(session: Session, opts: Dict, cf: ConfigParser):
+    """
+    Find transcripts in the given UTA database version which are not in the given txinfo file,
+    and write those transcripts to the specified file.
+    """
+    # required opts
+    txinfo_file = opts['TXINFO_FILE']
+    uta_schema = opts['UTA_SCHEMA']
+    output_file = opts['OUTPUT_FILE']
+
+    # optional opts
+    prefixes = opts.get('--prefixes')
+    # prefixes should be comma-separated list
+    transcript_prefixes = prefixes.split(',') if prefixes else None
+
+    role = cf.get('uta', 'admin_role')
+    session.execute(text(f"set role {role};"))
+    session.execute(text(f"set search_path = {uta_schema};"))
+
+    # fetch transcripts from uta
+    Transcript = uta.models.Transcript
+    query = session.query(Transcript)
+    if transcript_prefixes:
+        query = query.filter(or_(*[Transcript.ac.startswith(p) for p in transcript_prefixes]))
+    query = query.with_entities(Transcript.ac)
+    uta_transcripts = set(ac for (ac, ) in query)
+
+    # subtract incoming txinfo transcripts
+    txinfo_transcripts = set()
+    with gzip.open(txinfo_file, 'rt') as txinfo_fp:
+        for row in csv.DictReader(txinfo_fp, delimiter='\t'):
+            txinfo_transcripts.add(row['ac'])
+    result_transcripts = uta_transcripts - txinfo_transcripts
+
+    # write difference to output file
+    with open(output_file, 'wt') as output_fp:
+        output_fp.writelines(f'{t}\n' for t in sorted(result_transcripts))
+
+
 def create_schema(session, opts, cf):
     """Create and populate initial schema"""
     session.execute(text("set role {admin_role};".format(

diff --git a/src/uta/models.py b/src/uta/models.py
@@ -2,7 +2,9 @@
 """
 
 import datetime
+from distutils.util import strtobool
 import hashlib
+import os
 
 import sqlalchemy as sa
 import sqlalchemy.orm as sao
@@ -17,7 +19,7 @@
 # also see etc/uta.conf
 
 schema_version = "1.2"
-use_schema = True
+use_schema = strtobool(os.environ.get('UTA_USE_SCHEMA', 'true'))
 if use_schema:
     schema_name = "uta"
 else:

diff --git a/tests/test_uta_loading.py b/tests/test_uta_loading.py
@@ -1,5 +1,6 @@
 import configparser
 import signal
+from tempfile import NamedTemporaryFile
 import unittest
 from unittest.mock import Mock, patch
 
@@ -153,6 +154,68 @@ def test_load_assoc_ac(self):
         ]
         self.assertEqual(aa_list, expected_aa_list)
 
+    def test_check_transcripts(self):
+        o1 = usam.Origin(
+            name='NCBI',
+            url='http://bogus.com/ncbi',
+            url_ac_fmt='http://bogus.com/ncbi/{ac}',
+        )
+        g1 = usam.Gene(
+            gene_id='140606',
+            hgnc='SELENOM',
+            symbol='SELENOM',
+            maploc='22q12.2',
+            descr='selenoprotein M',
+            summary='selenoprotein M',
+            aliases='SELM,SEPM',
+            type='protein-coding',
+            xrefs='MIM:610918,HGNC:HGNC:30397,Ensembl:ENSG00000198832,AllianceGenome:HGNC:30397',
+        )
+        self.session.add(o1)
+        self.session.add(g1)
+        self.session.commit()
+
+        # add transcript in txinfo.gz
+        t1 = usam.Transcript(
+            ac='NM_080430.4',
+            origin_id=o1.origin_id,
+            gene_id=g1.gene_id,
+            hgnc=g1.hgnc,
+            cds_start_i=63,
+            cds_end_i=501,
+            cds_md5='abc123',
+            codon_table=1,
+        )
+        # add transcript not in txinfo.gz
+        t2 = usam.Transcript(
+            ac='NM_080430.3',
+            origin_id=o1.origin_id,
+            gene_id=g1.gene_id,
+            hgnc=g1.hgnc,
+            cds_start_i=63,
+            cds_end_i=501,
+            cds_md5='abc123',
+            codon_table=1,
+        )
+        self.session.add(t1)
+        self.session.add(t2)
+        self.session.commit()
+
+        with NamedTemporaryFile(mode='w+t') as tf:
+            opts = {
+                'TXINFO_FILE': 'tests/data/txinfo.gz',
+                'UTA_SCHEMA': 'uta',
+                'OUTPUT_FILE': tf.name,
+            }
+
+            ul.check_transcripts(self.session, opts, self.cf)
+
+            # expect one transcript but not the other
+            tf.seek(0)
+            tf_content = tf.read()
+            assert 'NM_080430.3' in tf_content
+            assert 'NM_080430.4' not in tf_content
+
     def test_load_txinfo(self):
         """
         Loading file tests/data/txinfo.gz should create transcript, exon_set, exon, and translation_exception records in the database.