diff --git a/src/hub/dataload/sources/clinvar/clinvar_parser.py b/src/hub/dataload/sources/clinvar/clinvar_parser.py
deleted file mode 100644
index ef3014e8..00000000
--- a/src/hub/dataload/sources/clinvar/clinvar_parser.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# -*- coding: utf-8 -*-
-# DO NOT USE THIS SCRIPT, USE THE XML PARSER INSTEAD
-import csv
-import re
-from itertools import imap, groupby
-import os
-
-import vcf
-
-from biothings.utils.dataload import unlist, dict_sweep, value_convert_to_number, merge_duplicate_rows
-
-''' vcf file for clinvar downloaded from
-ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/
-and tab_delimited file for clinvar downloaded from
-ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/'''
-
-VALID_COLUMN_NO = 25
-vcf_reader = vcf.Reader(filename='clinvar_20150305.vcf.gz')
-
-
-# split id lists into dictionary
-def other_id(other_ids):
- p = other_ids.strip(";").replace(";", ",").split(",")
- other_id = {}
- for id in p:
- try:
- ind = id.index(":")
- key, value = id[:ind], id[ind+1:]
- if key in other_id:
- if not isinstance(other_id[key], list):
- other_id[key] = [other_id[key]]
- other_id[key].append(value)
- else:
- other_id[key] = value
- except:
- continue
- return other_id
-
-
-# convert one snp to json
-def _map_line_to_json(fields):
- assert len(fields) == VALID_COLUMN_NO
- chrom = fields[13]
- chromStart = fields[14]
- chromEnd = fields[15]
-
- HGVS = None
- cds = fields[18].split(":")
- cds = cds[1]
- replace = re.findall(r'[ATCGMNYR=]+', cds)
- sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
- ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
- delete = fields[1] == 'deletion'
- indel = fields[1] == 'indel'
- dup = re.search(r'dup', cds)
- inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
- if ins:
- delete = None
- indel = None
- elif delete:
- ins = None
- indel = None
- # parse from vcf file. Input chrom number
- # and chromStart, and return REF, ALT
- if chromStart:
- record = vcf_reader.fetch(chrom, int(chromStart))
- else:
- record = None
- if record:
- REF = record.REF
- ALT = record.ALT
- ALT = ALT[0]
- if record.is_snp and len(ALT) < 2:
- mod = [REF, ALT]
- else:
- mod = ALT
- else:
- return
-
- if sub and record.is_snp:
- HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
- elif ins:
- HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
- elif delete:
- HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
- elif indel:
- try:
- HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
- except AttributeError:
- print "ERROR:", fields[1], cds
- elif dup:
- HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
- elif inv:
- HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
- elif replace:
- HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
- else:
- print 'ERROR:', fields[1], cds
-
- # load as json data
- if HGVS is None:
- print 'None:', fields[1], cds
- return None
-
- one_snp_json = {
-
- "_id": HGVS,
- "clinvar":
- {
- "allele_id": fields[0],
- "hg19":
- {
- "chr": fields[13],
- "start": fields[14],
- "end": fields[15]
- },
- "type": fields[1],
- "name": fields[2],
- "gene":
- {
- "id": fields[3],
- "symbol": fields[4]
- },
- "clinical_significance": fields[5].split(";"),
- "rsid": 'rs' + str(fields[6]),
- "nsv_dbvar": fields[7],
- "rcv_accession": fields[8].split(";"),
- "tested_in_gtr": fields[9],
- "phenotype_id": other_id(fields[10]),
- "origin": fields[11],
- "cytogenic": fields[16],
- "review_status": fields[17],
- "hgvs":
- {
- "coding": fields[18],
- "protein": fields[19]
- },
- "number_submitters": fields[20],
- "last_evaluated": fields[21],
- "guidelines": fields[22],
- "other_ids": other_id(fields[23]),
- "clinvar_id": fields[24]
- }
- }
- return dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["-"])
-
-
-# open file, parse, pass to json mapper
-def load_data(input_file):
- os.system("sort -t$'\t' -k14 -k15 -k20 -n %s > %s_sorted.tsv" \
- % (input_file, input_file))
- open_file = open("%s_sorted.tsv" % (input_file))
- print input_file
- clinvar = csv.reader(open_file, delimiter="\t")
- clinvar = (row for row in clinvar
- if row[18] != '-' and
- row[18].find('?') == -1 and
- row[13] != "" and
- row[12] == "GRCh37" and
- not re.search(r'p.', row[18]))
- json_rows = (row for row in imap(_map_line_to_json, clinvar) if row)
- row_groups = (it for (key, it) in groupby(json_rows, lambda row:
- row["_id"]))
- return (merge_duplicate_rows(rg, "clinvar") for rg in row_groups)
diff --git a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
index 6d3f4661..00631e54 100644
--- a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
+++ b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
@@ -1,105 +1,134 @@
-# a python lib is generated on the fly, in data folder
-# adjust path
+"""
+Two major parsing functions in this script are:
+ 1. `clinvar_doc_feeder()`
+ 2. `merge_rcv_accession()`
-import sys, os, glob
+`clinvar_doc_feeder()` is responsible for the following jobs:
+
+ 1. Receive a ClinVarFullRelease_*.xml.gz file, and split the xml file into `<ClinVarSet>...</ClinVarSet>` blocks
+ 2. Parse each `<ClinVarSet>...</ClinVarSet>` block into a `PublicSetType` object, which is defined in the
+ dynamically imported `clinvarlib`
+ 3. Convert each `PublicSetType` object into a clinvar document
+
+Note that the `PublicSetType` class is defined to map the block structure of `<ClinVarSet>`, which can be inspected
+through its XSD, https://ftp.ncbi.nlm.nih.gov/pub/clinvar/clinvar_public.xsd.
+
+`merge_rcv_accession()` is responsible for only one job:
+
+ 1. Group all the documents by `doc['_id']`, and then inside each group, merge all the `doc['clinvar']['rcv']` values.
+ Each group of documents will be merged into a single document.
+"""
+
+import glob
+import os
+import sys
from itertools import groupby
-from config import DATA_ARCHIVE_ROOT, logger as logging
-import biothings, config
+import biothings
+import config
biothings.config_for_app(config)
-from biothings.utils.dataload import unlist, dict_sweep, \
- value_convert_to_number, rec_handler
+
+from biothings.utils.dataload import unlist, dict_sweep, value_convert_to_number, rec_handler
GLOB_PATTERN = "ClinVarFullRelease_*.xml.gz"
clinvarlib = None
+
def import_clinvar_lib(data_folder):
- sys.path.insert(0,data_folder)
+ # a python lib is generated on the fly, in data folder
+ sys.path.insert(0, data_folder)
import genclinvar as clinvar_mod
global clinvarlib
clinvarlib = clinvar_mod
-def merge_rcv_accession(generator):
- groups = []
- for key, group in groupby(generator, lambda x: x['_id']):
- groups.append(list(group))
-
- # get the number of groups, and uniquekeys
- logging.info("number of groups: %s" % len(groups))
-
- # loop through each item, if item number >1, merge rcv accession number
- for item in groups:
- rcv_new = []
- if len(item) > 1:
- json_item = item[0]
- for _item in item:
- rcv_info = _item['clinvar']['rcv']
- rcv_new.append(rcv_info)
- json_item['clinvar']['rcv'] = rcv_new
- yield json_item
+
+def merge_rcv_accession(docs):
+ doc_groups = []
+ for key, group in groupby(docs, lambda x: x['_id']):
+ doc_groups.append(list(group))
+
+ # log the number of groups (one group per unique `_id`)
+ logging.info("number of groups: %s" % len(doc_groups))
+
+ # Each group is a list of documents sharing the same `_id`
+ # Loop through each group; if a group has more than one document, merge their rcv accessions into a list
+ for doc_list in doc_groups:
+ if len(doc_list) == 1:
+ yield doc_list[0]
else:
- yield item[0]
+ rcv_list = [doc['clinvar']['rcv'] for doc in doc_list]
+
+ merged_doc = doc_list[0]
+ merged_doc['clinvar']['rcv'] = rcv_list
+ yield merged_doc
+
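A toy illustration of the merge behavior (the `_id`s and RCV accessions are made up):

    docs = [
        {"_id": "chr1:g.100A>G", "clinvar": {"rcv": {"accession": "RCV000000001"}}},
        {"_id": "chr1:g.100A>G", "clinvar": {"rcv": {"accession": "RCV000000002"}}},
        {"_id": "chr1:g.200C>T", "clinvar": {"rcv": {"accession": "RCV000000003"}}},
    ]
    merged = list(merge_rcv_accession(docs))
    # merged[0]["clinvar"]["rcv"] == [{"accession": "RCV000000001"}, {"accession": "RCV000000002"}]
    # merged[1]["clinvar"]["rcv"] == {"accession": "RCV000000003"}  (a lone document keeps its rcv dict unchanged)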
+def _map_measure_to_json(measure_obj, hg19=True):
+ """
+ Convert a `clinvarlib.MeasureType` object into json.
-def parse_measure(Measure, hg19=True):
- variation_type = Measure.Type
- # exclude any item of which types belong to
- # 'Variation', 'protein only' or 'Microsatellite'
+ Each `clinvarlib.MeasureType` object is mapped to a `<Measure>` block in the XML, which is part of the hierarchy
+ below:
+
+     <ClinVarSet>
+       <ReferenceClinVarAssertion>
+         <MeasureSet>
+           <Measure>...</Measure>
+           <Measure>...</Measure>
+           <Measure>...</Measure>
+         </MeasureSet>
+       </ReferenceClinVarAssertion>
+     </ClinVarSet>
+ """
+
+ # exclude any item whose type is 'Variation', 'protein only' or 'Microsatellite'
+ variation_type = measure_obj.Type
if variation_type == 'Variation' or variation_type == 'protein only' or variation_type == 'Microsatellite':
return None
- allele_id = Measure.ID
+
+ allele_id = measure_obj.ID
+
chrom = None
- chromStart_19 = None
- chromEnd_19 = None
- chromStart_38 = None
- chromEnd_38 = None
- ref = None
- alt = None
- if Measure.SequenceLocation:
- for SequenceLocation in Measure.SequenceLocation:
- # In this version, only accept information concerning GRCh37
+ chromStart_19, chromEnd_19, chromStart_38, chromEnd_38 = None, None, None, None
+ ref, alt = None, None
+ if measure_obj.SequenceLocation:
+ for SequenceLocation in measure_obj.SequenceLocation:
if 'GRCh37' in SequenceLocation.Assembly:
chrom = SequenceLocation.Chr
chromStart_19 = SequenceLocation.start
chromEnd_19 = SequenceLocation.stop
if not ref:
- ref = SequenceLocation.referenceAllele
- if not ref:
- ref = SequenceLocation.referenceAlleleVCF
- if not alt:
- alt = SequenceLocation.alternateAllele
+ ref = SequenceLocation.referenceAllele or SequenceLocation.referenceAlleleVCF
if not alt:
- alt = SequenceLocation.alternateAlleleVCF
+ alt = SequenceLocation.alternateAllele or SequenceLocation.alternateAlleleVCF
+
if 'GRCh38' in SequenceLocation.Assembly:
chromStart_38 = SequenceLocation.start
chromEnd_38 = SequenceLocation.stop
if not ref:
- ref = SequenceLocation.referenceAllele
- if not ref:
- ref = SequenceLocation.referenceAlleleVCF
- if not alt:
- alt = SequenceLocation.alternateAllele
+ ref = SequenceLocation.referenceAllele or SequenceLocation.referenceAlleleVCF
if not alt:
- alt = SequenceLocation.alternateAlleleVCF
- if Measure.MeasureRelationship:
+ alt = SequenceLocation.alternateAllele or SequenceLocation.alternateAlleleVCF
+
+ symbol = None
+ gene_id = None
+ if measure_obj.MeasureRelationship:
try:
- symbol = Measure.MeasureRelationship[0].\
- Symbol[0].get_ElementValue().valueOf_
+ symbol = measure_obj.MeasureRelationship[0].Symbol[0].get_ElementValue().valueOf_
except:
symbol = None
- gene_id = Measure.MeasureRelationship[0].XRef[0].ID
- else:
- symbol = None
- gene_id = None
- if Measure.Name:
- name = Measure.Name[0].ElementValue.valueOf_
- else:
- name = None
- if len(Measure.CytogeneticLocation) == 1:
- cytogenic = Measure.CytogeneticLocation[0]
+ gene_id = measure_obj.MeasureRelationship[0].XRef[0].ID
+
+ name = None
+ if measure_obj.Name:
+ name = measure_obj.Name[0].ElementValue.valueOf_
+
+ if len(measure_obj.CytogeneticLocation) == 1:
+ cytogenic = measure_obj.CytogeneticLocation[0]
else:
- cytogenic = Measure.CytogeneticLocation
+ cytogenic = measure_obj.CytogeneticLocation
+
hgvs_coding = None
hgvs_genome = None
HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
@@ -111,66 +140,58 @@ def parse_measure(Measure, hg19=True):
else:
chromStart = chromStart_38
chromEnd = chromEnd_38
+
# hgvs_not_validated = None
- if Measure.AttributeSet:
+ if measure_obj.AttributeSet:
# 'copy number loss' or 'gain' have format different\
# from other types, should be dealt with seperately
- if (variation_type == 'copy number loss') or \
- (variation_type == 'copy number gain'):
- for AttributeSet in Measure.AttributeSet:
- if 'HGVS, genomic, top level' in AttributeSet.\
- Attribute.Type:
+ if (variation_type == 'copy number loss') or (variation_type == 'copy number gain'):
+ for AttributeSet in measure_obj.AttributeSet:
+ if 'HGVS, genomic, top level' in AttributeSet.Attribute.Type:
if AttributeSet.Attribute.integerValue == 37:
hgvs_genome = AttributeSet.Attribute.get_valueOf_()
+
if 'genomic' in AttributeSet.Attribute.Type:
- HGVS['genomic'].append(AttributeSet.Attribute.
- get_valueOf_())
+ HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_())
elif 'non-coding' in AttributeSet.Attribute.Type:
- HGVS['non-coding'].append(AttributeSet.Attribute.
- get_valueOf_())
+ HGVS['non-coding'].append(AttributeSet.Attribute.get_valueOf_())
elif 'coding' in AttributeSet.Attribute.Type:
- HGVS['coding'].append(AttributeSet.Attribute.
- get_valueOf_())
+ HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_())
elif 'protein' in AttributeSet.Attribute.Type:
- HGVS['protein'].append(AttributeSet.
- Attribute.get_valueOf_())
+ HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_())
else:
- for AttributeSet in Measure.AttributeSet:
+ for AttributeSet in measure_obj.AttributeSet:
if 'genomic' in AttributeSet.Attribute.Type:
- HGVS['genomic'].append(AttributeSet.
- Attribute.get_valueOf_())
+ HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_())
elif 'non-coding' in AttributeSet.Attribute.Type:
- HGVS['non-coding'].append(AttributeSet.
- Attribute.get_valueOf_())
+ HGVS['non-coding'].append(AttributeSet.Attribute.get_valueOf_())
elif 'coding' in AttributeSet.Attribute.Type:
- HGVS['coding'].append(AttributeSet.Attribute.
- get_valueOf_())
+ HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_())
elif 'protein' in AttributeSet.Attribute.Type:
- HGVS['protein'].append(AttributeSet.
- Attribute.get_valueOf_())
- if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
+ HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_())
+
+ if not hgvs_coding and AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
hgvs_coding = AttributeSet.Attribute.get_valueOf_()
- elif AttributeSet.Attribute.Type == \
- 'HGVS, genomic, top level, previous':
+ if not hgvs_genome and AttributeSet.Attribute.Type == 'HGVS, genomic, top level, previous':
hgvs_genome = AttributeSet.Attribute.get_valueOf_()
- break
+
if chrom and chromStart and chromEnd:
# if its SNP, make sure chrom, chromStart, chromEnd, ref, alt are all provided
if variation_type == 'single nucleotide variant':
if ref and alt:
hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
else:
- print('hgvs not found chr {}, chromStart {}, chromEnd {}, ref {}, alt {}, allele id {}'.format(chrom, chromStart, chromEnd, ref, alt, allele_id))
+ print('hgvs not found chr {}, chromStart {}, chromEnd {}, ref {}, alt {}, allele id {}'
+ .format(chrom, chromStart, chromEnd, ref, alt, allele_id))
# items whose type belong to 'Indel, Insertion, \
# Duplication' might not hava explicit alt information, \
# so we will parse from hgvs_genome
elif variation_type == 'Indel':
- # RCV000156073, NC_000010.10:g.112581638_112581639delinsG
+ # RCV000156073, NC_000010.10:g.112581638_112581639delinsG
if hgvs_genome:
indel_position = hgvs_genome.find('del')
indel_alt = hgvs_genome[indel_position+3:]
- hgvs_id = "chr%s:g.%s_%sdel%s" % \
- (chrom, chromStart, chromEnd, indel_alt)
+ hgvs_id = "chr%s:g.%s_%sdel%s" % (chrom, chromStart, chromEnd, indel_alt)
elif variation_type == 'Deletion':
if chromStart == chromEnd:
# RCV000048406, chr17:g.41243547del
@@ -183,24 +204,19 @@ def parse_measure(Measure, hg19=True):
if 'ins' in hgvs_genome:
ins_ref = hgvs_genome[ins_position+3:]
if chromStart == chromEnd:
- hgvs_id = "chr%s:g.%sins%s" % \
- (chrom, chromStart, ins_ref)
+ hgvs_id = "chr%s:g.%sins%s" % (chrom, chromStart, ins_ref)
else:
- hgvs_id = "chr%s:g.%s_%sins%s" % \
- (chrom, chromStart, chromEnd, ins_ref)
+ hgvs_id = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, ins_ref)
elif variation_type == 'Duplication':
if hgvs_genome:
dup_position = hgvs_genome.find('dup')
if 'dup' in hgvs_genome:
dup_ref = hgvs_genome[dup_position+3:]
if chromStart == chromEnd:
- hgvs_id = "chr%s:g.%sdup%s" % \
- (chrom, chromStart, dup_ref)
+ hgvs_id = "chr%s:g.%sdup%s" % (chrom, chromStart, dup_ref)
else:
- hgvs_id = "chr%s:g.%s_%sdup%s" % \
- (chrom, chromStart, chromEnd, dup_ref)
- elif variation_type == 'copy number loss' or\
- variation_type == 'copy number gain':
+ hgvs_id = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, dup_ref)
+ elif variation_type == 'copy number loss' or variation_type == 'copy number gain':
if hgvs_genome and chrom:
hgvs_id = "chr" + chrom + ":" + hgvs_genome.split('.')[2]
elif hgvs_coding:
@@ -210,16 +226,18 @@ def parse_measure(Measure, hg19=True):
return
else:
return
+
for key in HGVS:
HGVS[key].sort()
+
rsid = None
cosmic = None
dbvar = None
uniprot = None
omim = None
# loop through XRef to find rsid as well as other ids
- if Measure.XRef:
- for XRef in Measure.XRef:
+ if measure_obj.XRef:
+ for XRef in measure_obj.XRef:
if XRef.Type == 'rs':
rsid = 'rs' + str(XRef.ID)
elif XRef.DB == 'COSMIC':
@@ -234,73 +252,85 @@ def parse_measure(Measure, hg19=True):
# make sure the hgvs_id is not none
if hgvs_id:
one_snp_json = {
-
"_id": hgvs_id,
- "clinvar":
- {
- "allele_id": allele_id,
- "chrom": chrom,
- "omim": omim,
- "cosmic": cosmic,
- "uniprot": uniprot,
- "dbvar": dbvar,
- "hg19":
- {
- "start": chromStart_19,
- "end": chromEnd_19
- },
- "hg38":
- {
- "start": chromStart_38,
- "end": chromEnd_38
- },
- "type": variation_type,
- "gene":
- {
- "id": gene_id,
- "symbol": symbol
- },
- "rcv":
- {
- "preferred_name": name,
- },
- "rsid": rsid,
- "cytogenic": cytogenic,
- "hgvs": HGVS,
- "coding_hgvs_only": coding_hgvs_only,
- "ref": ref,
- "alt": alt
- }
+ "clinvar": {
+ "allele_id": allele_id,
+ "chrom": chrom,
+ "omim": omim,
+ "cosmic": cosmic,
+ "uniprot": uniprot,
+ "dbvar": dbvar,
+ "hg19": {
+ "start": chromStart_19,
+ "end": chromEnd_19
+ },
+ "hg38": {
+ "start": chromStart_38,
+ "end": chromEnd_38
+ },
+ "type": variation_type,
+ "gene": {
+ "id": gene_id,
+ "symbol": symbol
+ },
+ "rcv": {
+ "preferred_name": name,
+ },
+ "rsid": rsid,
+ "cytogenic": cytogenic,
+ "hgvs": HGVS,
+ "coding_hgvs_only": coding_hgvs_only,
+ "ref": ref,
+ "alt": alt
+ }
}
return one_snp_json
-def _map_line_to_json(cp, hg19):
+
+def _map_public_set_to_json(public_set_obj, hg19: bool):
+ """
+ Convert a `clinvarlib.PublicSetType` object into a json document.
+
+ Each `clinvarlib.PublicSetType` object is mapped to a `<ClinVarSet>` block in the XML.
+
+ E.g., `public_set_obj.ReferenceClinVarAssertion.MeasureSet` is the parsed value from a block structure below:
+
+     <ClinVarSet>
+       <ReferenceClinVarAssertion>
+         <MeasureSet>
+           <Measure>...</Measure>
+         </MeasureSet>
+       </ReferenceClinVarAssertion>
+     </ClinVarSet>
+ """
try:
- clinical_significance = cp.ReferenceClinVarAssertion.\
- ClinicalSignificance.Description
+ clinical_significance = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.Description
except:
clinical_significance = None
- rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
+
+ rcv_accession = public_set_obj.ReferenceClinVarAssertion.ClinVarAccession.Acc
+
try:
- review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
- ReviewStatus
+ review_status = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus
except:
review_status = None
+
try:
- last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
- DateLastEvaluated
+ last_evaluated = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated
except:
last_evaluated = None
- number_submitters = len(cp.ClinVarAssertion)
+ number_submitters = len(public_set_obj.ClinVarAssertion)
+
# some items in clinvar_xml doesn't have origin information
try:
- origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
+ origin = public_set_obj.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
except:
origin = None
+
conditions = []
- for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait:
+ for _trait in public_set_obj.ReferenceClinVarAssertion.TraitSet.Trait:
synonyms = []
conditions_name = ''
for name in _trait.Name:
@@ -308,6 +338,7 @@ def _map_line_to_json(cp, hg19):
synonyms.append(name.ElementValue.get_valueOf_())
if name.ElementValue.Type == 'Preferred':
conditions_name += name.ElementValue.get_valueOf_()
+
identifiers = {}
for item in _trait.XRef:
if item.DB == 'Human Phenotype Ontology':
@@ -318,55 +349,63 @@ def _map_line_to_json(cp, hg19):
for symbol in _trait.Symbol:
if symbol.ElementValue.Type == 'Preferred':
conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')'
+
age_of_onset = ''
for _set in _trait.AttributeSet:
if _set.Attribute.Type == 'age of onset':
age_of_onset = _set.Attribute.get_valueOf_()
- conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset})
+
+ conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers,
+ "age_of_onset": age_of_onset})
try:
- genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet
+ genotypeset = public_set_obj.ReferenceClinVarAssertion.GenotypeSet
except:
genotypeset = None
+
if genotypeset:
obj_list = []
id_list = []
- for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
+ for _set in public_set_obj.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
variant_id = _set.ID
for _measure in _set.Measure:
- json_obj = parse_measure(_measure, hg19=hg19)
+ json_obj = _map_measure_to_json(_measure, hg19=hg19)
if json_obj:
- json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
+ json_obj['clinvar']['rcv'].update({
+ 'accession': rcv_accession,
'clinical_significance': clinical_significance,
'number_submitters': number_submitters,
'review_status': review_status,
'last_evaluated': str(last_evaluated),
'origin': origin,
- 'conditions': conditions})
+ 'conditions': conditions
+ })
json_obj['clinvar'].update({'variant_id': variant_id})
json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
- ['chrom', 'omim', 'id', 'orphanet', 'gene',
- 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
+ ['chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])),
+ [None, '', 'None']))
obj_list.append(json_obj)
id_list.append(json_obj['_id'])
for _obj in obj_list:
_obj['clinvar'].update({'genotypeset': {
- 'type': 'CompoundHeterozygote',
- 'genotype': id_list
- }})
+ 'type': 'CompoundHeterozygote',
+ 'genotype': id_list
+ }})
yield _obj
else:
- variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
- for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
- json_obj = parse_measure(_measure, hg19=hg19)
+ variant_id = public_set_obj.ReferenceClinVarAssertion.MeasureSet.ID
+ for _measure in public_set_obj.ReferenceClinVarAssertion.MeasureSet.Measure:
+ json_obj = _map_measure_to_json(_measure, hg19=hg19)
if json_obj:
- json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
- 'clinical_significance': clinical_significance,
- 'number_submitters': number_submitters,
- 'review_status': review_status,
- 'last_evaluated': str(last_evaluated),
- 'origin': origin,
- 'conditions': conditions})
+ json_obj['clinvar']['rcv'].update({
+ 'accession': rcv_accession,
+ 'clinical_significance': clinical_significance,
+ 'number_submitters': number_submitters,
+ 'review_status': review_status,
+ 'last_evaluated': str(last_evaluated),
+ 'origin': origin,
+ 'conditions': conditions
+ })
json_obj['clinvar'].update({'variant_id': variant_id})
json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
['chrom', 'omim', 'id', 'orphanet', 'gene',
@@ -374,21 +413,56 @@ def _map_line_to_json(cp, hg19):
yield json_obj
-def rcv_feeder(input_file, hg19):
- # the first two line of clinvar_xml is not useful information
- cv_data = rec_handler(input_file, block_end='</ClinVarSet>\n',
- skip=2, include_block_end=True)
- for record in cv_data:
- # some exceptions
- if record.startswith('\n</ReleaseSet>'):
+def clinvar_doc_feeder(input_file, hg19: bool):
+ """
+ This function will split the xml file into `<ClinVarSet>...</ClinVarSet>` blocks, then parse each block into a
+ `PublicSetType` object (which is defined in the dynamically imported `clinvarlib`), and finally convert each
+ `PublicSetType` object into a clinvar document.
+ """
+
+ """
+ A ClinVarFullRelease_*.xml.gz file has the following structure:
+
+     <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+     <ReleaseSet ...>
+
+       <ClinVarSet ...>
+         ...
+       </ClinVarSet>
+
+       <ClinVarSet ...>
+         ...
+       </ClinVarSet>
+
+       ...
+
+     </ReleaseSet>
+
+ Therefore when splitting the xml into `<ClinVarSet>` blocks, the first 2 lines and the last line should be
+ skipped.
+
+ However the `rec_handler` function cannot skip the last line, and will return "\n</ReleaseSet>" as
+ the last block in this scenario. Therefore in the for-loop below, we will skip any block starting with
+ "\n</ReleaseSet>".
+ """
+ clinvar_set_blocks = rec_handler(input_file, block_end='</ClinVarSet>\n', skip=2, include_block_end=True)
+ for clinvar_set_block in clinvar_set_blocks:
+ # Skip any block starting with "\n</ReleaseSet>"
+ # Actually only the last block will be skipped. See comments above
+ if clinvar_set_block.startswith('\n</ReleaseSet>'):
continue
+
try:
- record_parsed = clinvarlib.parseString(record, silence=1)
+ # Parse each `` block into a `clinvarlib.PublicSetType` object
+ public_set_obj = clinvarlib.parseString(clinvar_set_block, silence=1)
except:
- logging.debug(record)
+ logging.debug(clinvar_set_block)
raise
- for record_mapped in _map_line_to_json(record_parsed, hg19):
- yield record_mapped
+
+ # Convert each `clinvarlib.PublicSetType` object into a json document
+ for doc in _map_public_set_to_json(public_set_obj, hg19):
+ yield doc
+
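`rec_handler` comes from `biothings.utils.dataload`, so its implementation is not shown in this diff; the splitting behavior that `clinvar_doc_feeder()` assumes (described in its docstring above) can be pictured roughly as follows. This is only a sketch, not the real `rec_handler`:

    import gzip

    def split_clinvar_sets(path, block_end='</ClinVarSet>\n', skip=2):
        # Yield raw text blocks, each ending with block_end (block end included)
        buf = []
        with gzip.open(path, 'rt') as fh:
            for _ in range(skip):  # drop the XML declaration and the <ReleaseSet ...> line
                next(fh)
            for line in fh:
                buf.append(line)
                if line.endswith(block_end):
                    yield ''.join(buf)
                    buf = []
        if buf:
            yield ''.join(buf)  # trailing remainder (e.g. "\n</ReleaseSet>"), skipped by the caller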
def load_data(data_folder, version):
# try to get logger from uploader
@@ -397,17 +471,23 @@ def load_data(data_folder, version):
logging = loggingmod.getLogger("clinvar_upload")
import_clinvar_lib(data_folder)
- files = glob.glob(os.path.join(data_folder,GLOB_PATTERN))
- assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN,files)
+
+ files = glob.glob(os.path.join(data_folder, GLOB_PATTERN))
+ assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN, files)
input_file = files[0]
- data_generator = rcv_feeder(input_file, version == "hg19")
- data_list = list(data_generator)
- # TODO: why do we sort this list ? this prevent from using yield/iterator
- data_list_sorted = sorted(data_list, key=lambda k: k['_id'])
- data_merge_rcv = merge_rcv_accession(data_list_sorted)
- return data_merge_rcv
+
+ doc_generator = clinvar_doc_feeder(input_file, hg19=(version == "hg19"))
+
+ # Sorting is necessary because `merge_rcv_accession` will call `itertools.groupby()`
+ # which cannot put non-adjacent items with the same key into a group
+ doc_list = list(doc_generator)
+ doc_list = sorted(doc_list, key=lambda k: k['_id'])
+
+ merged_doc_generator = merge_rcv_accession(doc_list)
+ return merged_doc_generator
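Regarding the sort above: `itertools.groupby()` only groups adjacent equal keys, so unsorted input would split documents with the same `_id` into separate groups. A toy illustration with hypothetical keys:

    from itertools import groupby
    keys = ["a", "b", "a"]
    [(k, len(list(g))) for k, g in groupby(keys)]          # [('a', 1), ('b', 1), ('a', 1)]  ('a' is split)
    [(k, len(list(g))) for k, g in groupby(sorted(keys))]  # [('a', 2), ('b', 1)]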
+
if __name__ == "__main__":
from biothings.utils.mongo import get_data_folder
data_folder = get_data_folder("clinvar")
- load_data(data_folder=data_folder)
+ load_data(data_folder=data_folder, version="hg19")