|
| 1 | +import sys |
| 2 | +from concurrent.futures import ThreadPoolExecutor |
| 3 | + |
| 4 | +import polars as pl |
| 5 | +from tqdm import tqdm |
| 6 | +import pyarrow.parquet as pq |
| 7 | + |
| 8 | +import tdb |
| 9 | + |
def process_sample(path):
    """
    Read one sample's parquet file and tally its allele observations.

    Returns a polars DataFrame with columns LocusID, allele_number, AC,
    where AC is the number of rows seen for that (locus, allele) pair
    in this sample.
    """
    table = pq.ParquetFile(path).read(columns=['LocusID', 'allele_number'])
    per_allele = pl.from_arrow(table).group_by(['LocusID', 'allele_number']).len()
    return per_allele.rename({"len": "AC"})
| 17 | + |
def merge_in(data, results):
    """
    Fold a batch of per-sample allele counts into the running totals.

    `results` is a list of DataFrames with columns LocusID, allele_number,
    AC. Their counts are summed per (LocusID, allele_number) and added onto
    `data`'s AC column; pairs absent from the batch contribute zero.
    """
    batch_totals = (pl.concat(results, how="vertical")
                      .group_by(['LocusID', 'allele_number'])
                      .sum())
    # Left join keeps every row of `data`; the batch's AC arrives as
    # AC_right, with nulls (no observations this batch) zero-filled.
    merged = (data.join(batch_totals, on=['LocusID', 'allele_number'], how="left")
                  .fill_null(0))
    merged = merged.with_columns([
        (pl.col("AC") + pl.col("AC_right")).alias("AC"),
    ])
    return merged.drop(["AC_right"])
| 28 | + |
| 29 | + |
if __name__ == '__main__':
    # Example input: a tdb directory such as "../../AoU_TRs.v0.1.tdb/"
    db_path = sys.argv[1]
    batch_size = 250   # fold per-sample counts into the totals this often
    out_prefix = "result"
    min_af = 0.01      # alleles below this frequency are excluded from LPS
    lps_norm = 100     # LPS is expressed per this many allele observations
    names = tdb.get_tdb_filenames(db_path)

    key_cols = ["LocusID", "allele_number"]

    # Base allele table: one row per (locus, allele), AC initialized to 0.
    counts = pl.from_arrow(pq.read_table(names['allele'], columns=key_cols))
    counts = counts.with_columns(pl.lit(0).alias('AC'))

    # Read samples on a small thread pool; merge counts into the totals in
    # batches so the list of pending per-sample frames stays bounded.
    sample_paths = list(names['sample'].values())
    pending = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        for sample_counts in tqdm(executor.map(process_sample, sample_paths),
                                  total=len(sample_paths), desc="Processing"):
            pending.append(sample_counts)
            if len(pending) >= batch_size:
                counts = merge_in(counts, pending)
                pending = []

    # Fold in whatever is left from the final partial batch.
    if pending:
        counts = merge_in(counts, pending)

    # Per-locus allele number (AN) and per-allele frequency (AF).
    an = counts.group_by("LocusID").agg(pl.col('AC').sum().alias('AN'))
    counts = counts.join(an, on="LocusID", how='left')
    counts = counts.with_columns((pl.col('AC') / pl.col('AN')).alias('AF'))

    counts.write_csv(f"{out_prefix}.allele_seq.txt", separator='\t')

    # Re-read the allele table with lengths and collapse counts by length.
    bylen = pl.from_arrow(pq.read_table(names['allele'],
                                        columns=["LocusID", "allele_number",
                                                 'allele_length']))
    bylen = (bylen.join(counts, on=["LocusID", "allele_number"], how="left")
                  .group_by(['LocusID', 'allele_length'])
                  .agg(pl.col('AC').sum(), pl.col('AN').first()))
    bylen = bylen.with_columns((pl.col('AC') / pl.col('AN')).alias('AF'))

    bylen.write_csv(f"{out_prefix}.allele_len.txt", separator='\t')

    # Length polymorphism score: number of distinct common allele lengths,
    # normalized per lps_norm allele observations at the locus.
    lps = (bylen.filter(bylen['AF'] >= min_af)
                .group_by('LocusID')
                .agg((pl.col('allele_length').n_unique()
                      / (pl.col('AN').first() / lps_norm)).alias('LPS'))
                .select(['LocusID', 'LPS']))

    lps.write_csv(f"{out_prefix}.length_polymorphism.txt", separator='\t')
0 commit comments