diff --git a/src/hub/dataload/sources/clinvar/clinvar_parser.py b/src/hub/dataload/sources/clinvar/clinvar_parser.py deleted file mode 100644 index ef3014e8..00000000 --- a/src/hub/dataload/sources/clinvar/clinvar_parser.py +++ /dev/null @@ -1,164 +0,0 @@ -# -*- coding: utf-8 -*- -# DO NOT USE THIS SCRIPT, USE THE XML PARSER INSTEAD -import csv -import re -from itertools import imap, groupby -import os - -import vcf - -from biothings.utils.dataload import unlist, dict_sweep, value_convert_to_number, merge_duplicate_rows - -''' vcf file for clinvar downloaded from -ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/ -and tab_delimited file for clinvar downloaded from -ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/''' - -VALID_COLUMN_NO = 25 -vcf_reader = vcf.Reader(filename='clinvar_20150305.vcf.gz') - - -# split id lists into dictionary -def other_id(other_ids): - p = other_ids.strip(";").replace(";", ",").split(",") - other_id = {} - for id in p: - try: - ind = id.index(":") - key, value = id[:ind], id[ind+1:] - if key in other_id: - if not isinstance(other_id[key], list): - other_id[key] = [other_id[key]] - other_id[key].append(value) - else: - other_id[key] = value - except: - continue - return other_id - - -# convert one snp to json -def _map_line_to_json(fields): - assert len(fields) == VALID_COLUMN_NO - chrom = fields[13] - chromStart = fields[14] - chromEnd = fields[15] - - HGVS = None - cds = fields[18].split(":") - cds = cds[1] - replace = re.findall(r'[ATCGMNYR=]+', cds) - sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds) - ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds) - delete = fields[1] == 'deletion' - indel = fields[1] == 'indel' - dup = re.search(r'dup', cds) - inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds) - if ins: - delete = None - indel = None - elif delete: - ins = None - indel = None - # parse from vcf file. 
Input chrom number - # and chromStart, and return REF, ALT - if chromStart: - record = vcf_reader.fetch(chrom, int(chromStart)) - else: - record = None - if record: - REF = record.REF - ALT = record.ALT - ALT = ALT[0] - if record.is_snp and len(ALT) < 2: - mod = [REF, ALT] - else: - mod = ALT - else: - return - - if sub and record.is_snp: - HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1]) - elif ins: - HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod) - elif delete: - HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd) - elif indel: - try: - HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod) - except AttributeError: - print "ERROR:", fields[1], cds - elif dup: - HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod) - elif inv: - HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod) - elif replace: - HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod) - else: - print 'ERROR:', fields[1], cds - - # load as json data - if HGVS is None: - print 'None:', fields[1], cds - return None - - one_snp_json = { - - "_id": HGVS, - "clinvar": - { - "allele_id": fields[0], - "hg19": - { - "chr": fields[13], - "start": fields[14], - "end": fields[15] - }, - "type": fields[1], - "name": fields[2], - "gene": - { - "id": fields[3], - "symbol": fields[4] - }, - "clinical_significance": fields[5].split(";"), - "rsid": 'rs' + str(fields[6]), - "nsv_dbvar": fields[7], - "rcv_accession": fields[8].split(";"), - "tested_in_gtr": fields[9], - "phenotype_id": other_id(fields[10]), - "origin": fields[11], - "cytogenic": fields[16], - "review_status": fields[17], - "hgvs": - { - "coding": fields[18], - "protein": fields[19] - }, - "number_submitters": fields[20], - "last_evaluated": fields[21], - "guidelines": fields[22], - "other_ids": other_id(fields[23]), - "clinvar_id": fields[24] - } - } - return dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["-"]) - - -# open file, parse, pass to json mapper -def load_data(input_file): - os.system("sort -t$'\t' -k14 -k15 -k20 -n %s > %s_sorted.tsv" \ - % (input_file, input_file)) - open_file = open("%s_sorted.tsv" % (input_file)) - print input_file - clinvar = csv.reader(open_file, delimiter="\t") - clinvar = (row for row in clinvar - if row[18] != '-' and - row[18].find('?') == -1 and - row[13] != "" and - row[12] == "GRCh37" and - not re.search(r'p.', row[18])) - json_rows = (row for row in imap(_map_line_to_json, clinvar) if row) - row_groups = (it for (key, it) in groupby(json_rows, lambda row: - row["_id"])) - return (merge_duplicate_rows(rg, "clinvar") for rg in row_groups) diff --git a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py index 6d3f4661..00631e54 100644 --- a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py +++ b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py @@ -1,105 +1,134 @@ -# a python lib is generated on the fly, in data folder -# adjust path +""" +Two major parsing functions in this script are: + 1. `clinvar_doc_feeder()` + 2. `merge_rcv_accession()` -import sys, os, glob +`clinvar_doc_feeder()` is responsible for the following jobs: + + 1. Receive a ClinVarFullRelease_*.xml.gz file, and split the xml file into `...` blocks + 2. Parse each `...` block into an `PublicSetType` object, which is defined in the + dynamically imported `clinvarlib` + 3. 
Convert each `PublicSetType` object into a clinvar document + +Note that The `PublicSetType` class is defined to map the block structure of ``, which can be inspected +through its XSD, https://ftp.ncbi.nlm.nih.gov/pub/clinvar/clinvar_public.xsd. + +`merge_rcv_accession()` is responsible for only one job: + + 1. Group all the document by `doc['_id']`, and then inside each group, merge all the `doc['clinvar']['rcv']`. + Each group of documents will be merged into a single document. +""" + +import glob +import os +import sys from itertools import groupby -from config import DATA_ARCHIVE_ROOT, logger as logging -import biothings, config +import biothings +import config biothings.config_for_app(config) -from biothings.utils.dataload import unlist, dict_sweep, \ - value_convert_to_number, rec_handler + +from biothings.utils.dataload import unlist, dict_sweep, value_convert_to_number, rec_handler GLOB_PATTERN = "ClinVarFullRelease_*.xml.gz" clinvarlib = None + def import_clinvar_lib(data_folder): - sys.path.insert(0,data_folder) + # a python lib is generated on the fly, in data folder + sys.path.insert(0, data_folder) import genclinvar as clinvar_mod global clinvarlib clinvarlib = clinvar_mod -def merge_rcv_accession(generator): - groups = [] - for key, group in groupby(generator, lambda x: x['_id']): - groups.append(list(group)) - - # get the number of groups, and uniquekeys - logging.info("number of groups: %s" % len(groups)) - - # loop through each item, if item number >1, merge rcv accession number - for item in groups: - rcv_new = [] - if len(item) > 1: - json_item = item[0] - for _item in item: - rcv_info = _item['clinvar']['rcv'] - rcv_new.append(rcv_info) - json_item['clinvar']['rcv'] = rcv_new - yield json_item + +def merge_rcv_accession(docs): + doc_groups = [] + for key, group in groupby(docs, lambda x: x['_id']): + doc_groups.append(list(group)) + + # get the number of groups, and unique keys + logging.info("number of groups: %s" % len(doc_groups)) + + # Each group is a list of documents + # Loop through each group, if doc number >1, merge rcv accessions + for doc_list in doc_groups: + if len(doc_list) == 1: + yield doc_list[0] else: - yield item[0] + rcv_list = [doc['clinvar']['rcv'] for doc in doc_list] + + merged_doc = doc_list[0] + merged_doc['clinvar']['rcv'] = rcv_list + yield merged_doc + +def _map_measure_to_json(measure_obj, hg19=True): + """ + Convert a `clinvarlib.MeasureType` object into json. -def parse_measure(Measure, hg19=True): - variation_type = Measure.Type - # exclude any item of which types belong to - # 'Variation', 'protein only' or 'Microsatellite' + Each `clinvarlib.MeasureType` object is mapped to a `` block in the XML, which is part of the hierarchy + below: + + + + + ... + ... + ... 
+ + + + """ + + # exclude any item of which types belong to 'Variation', 'protein only' or 'Microsatellite' + variation_type = measure_obj.Type if variation_type == 'Variation' or variation_type == 'protein only' or variation_type == 'Microsatellite': return None - allele_id = Measure.ID + + allele_id = measure_obj.ID + chrom = None - chromStart_19 = None - chromEnd_19 = None - chromStart_38 = None - chromEnd_38 = None - ref = None - alt = None - if Measure.SequenceLocation: - for SequenceLocation in Measure.SequenceLocation: - # In this version, only accept information concerning GRCh37 + chromStart_19, chromEnd_19, chromStart_38, chromEnd_38 = None, None, None, None + ref, alt = None, None + if measure_obj.SequenceLocation: + for SequenceLocation in measure_obj.SequenceLocation: if 'GRCh37' in SequenceLocation.Assembly: chrom = SequenceLocation.Chr chromStart_19 = SequenceLocation.start chromEnd_19 = SequenceLocation.stop if not ref: - ref = SequenceLocation.referenceAllele - if not ref: - ref = SequenceLocation.referenceAlleleVCF - if not alt: - alt = SequenceLocation.alternateAllele + ref = SequenceLocation.referenceAllele or SequenceLocation.referenceAlleleVCF if not alt: - alt = SequenceLocation.alternateAlleleVCF + alt = SequenceLocation.alternateAllele or SequenceLocation.alternateAlleleVCF + if 'GRCh38' in SequenceLocation.Assembly: chromStart_38 = SequenceLocation.start chromEnd_38 = SequenceLocation.stop if not ref: - ref = SequenceLocation.referenceAllele - if not ref: - ref = SequenceLocation.referenceAlleleVCF - if not alt: - alt = SequenceLocation.alternateAllele + ref = SequenceLocation.referenceAllele or SequenceLocation.referenceAlleleVCF if not alt: - alt = SequenceLocation.alternateAlleleVCF - if Measure.MeasureRelationship: + alt = SequenceLocation.alternateAllele or SequenceLocation.alternateAlleleVCF + + symbol = None + gene_id = None + if measure_obj.MeasureRelationship: try: - symbol = Measure.MeasureRelationship[0].\ - Symbol[0].get_ElementValue().valueOf_ + symbol = measure_obj.MeasureRelationship[0].Symbol[0].get_ElementValue().valueOf_ except: symbol = None - gene_id = Measure.MeasureRelationship[0].XRef[0].ID - else: - symbol = None - gene_id = None - if Measure.Name: - name = Measure.Name[0].ElementValue.valueOf_ - else: - name = None - if len(Measure.CytogeneticLocation) == 1: - cytogenic = Measure.CytogeneticLocation[0] + gene_id = measure_obj.MeasureRelationship[0].XRef[0].ID + + name = None + if measure_obj.Name: + name = measure_obj.Name[0].ElementValue.valueOf_ + + if len(measure_obj.CytogeneticLocation) == 1: + cytogenic = measure_obj.CytogeneticLocation[0] else: - cytogenic = Measure.CytogeneticLocation + cytogenic = measure_obj.CytogeneticLocation + hgvs_coding = None hgvs_genome = None HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []} @@ -111,66 +140,58 @@ def parse_measure(Measure, hg19=True): else: chromStart = chromStart_38 chromEnd = chromEnd_38 + # hgvs_not_validated = None - if Measure.AttributeSet: + if measure_obj.AttributeSet: # 'copy number loss' or 'gain' have format different\ # from other types, should be dealt with seperately - if (variation_type == 'copy number loss') or \ - (variation_type == 'copy number gain'): - for AttributeSet in Measure.AttributeSet: - if 'HGVS, genomic, top level' in AttributeSet.\ - Attribute.Type: + if (variation_type == 'copy number loss') or (variation_type == 'copy number gain'): + for AttributeSet in measure_obj.AttributeSet: + if 'HGVS, genomic, top level' in 
AttributeSet.Attribute.Type: if AttributeSet.Attribute.integerValue == 37: hgvs_genome = AttributeSet.Attribute.get_valueOf_() + if 'genomic' in AttributeSet.Attribute.Type: - HGVS['genomic'].append(AttributeSet.Attribute. - get_valueOf_()) + HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: - HGVS['non-coding'].append(AttributeSet.Attribute. - get_valueOf_()) + HGVS['non-coding'].append(AttributeSet.Attribute.get_valueOf_()) elif 'coding' in AttributeSet.Attribute.Type: - HGVS['coding'].append(AttributeSet.Attribute. - get_valueOf_()) + HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_()) elif 'protein' in AttributeSet.Attribute.Type: - HGVS['protein'].append(AttributeSet. - Attribute.get_valueOf_()) + HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_()) else: - for AttributeSet in Measure.AttributeSet: + for AttributeSet in measure_obj.AttributeSet: if 'genomic' in AttributeSet.Attribute.Type: - HGVS['genomic'].append(AttributeSet. - Attribute.get_valueOf_()) + HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: - HGVS['non-coding'].append(AttributeSet. - Attribute.get_valueOf_()) + HGVS['non-coding'].append(AttributeSet.Attribute.get_valueOf_()) elif 'coding' in AttributeSet.Attribute.Type: - HGVS['coding'].append(AttributeSet.Attribute. - get_valueOf_()) + HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_()) elif 'protein' in AttributeSet.Attribute.Type: - HGVS['protein'].append(AttributeSet. - Attribute.get_valueOf_()) - if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq': + HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_()) + + if not hgvs_coding and AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq': hgvs_coding = AttributeSet.Attribute.get_valueOf_() - elif AttributeSet.Attribute.Type == \ - 'HGVS, genomic, top level, previous': + if not hgvs_genome and AttributeSet.Attribute.Type == 'HGVS, genomic, top level, previous': hgvs_genome = AttributeSet.Attribute.get_valueOf_() - break + if chrom and chromStart and chromEnd: # if its SNP, make sure chrom, chromStart, chromEnd, ref, alt are all provided if variation_type == 'single nucleotide variant': if ref and alt: hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt) else: - print('hgvs not found chr {}, chromStart {}, chromEnd {}, ref {}, alt {}, allele id {}'.format(chrom, chromStart, chromEnd, ref, alt, allele_id)) + print('hgvs not found chr {}, chromStart {}, chromEnd {}, ref {}, alt {}, allele id {}'. 
+ format(chrom, chromStart, chromEnd, ref, alt, allele_id)) # items whose type belong to 'Indel, Insertion, \ # Duplication' might not hava explicit alt information, \ # so we will parse from hgvs_genome elif variation_type == 'Indel': - # RCV000156073, NC_000010.10:g.112581638_112581639delinsG + # RCV000156073, NC_000010.10:g.112581638_112581639delinsG if hgvs_genome: indel_position = hgvs_genome.find('del') indel_alt = hgvs_genome[indel_position+3:] - hgvs_id = "chr%s:g.%s_%sdel%s" % \ - (chrom, chromStart, chromEnd, indel_alt) + hgvs_id = "chr%s:g.%s_%sdel%s" % (chrom, chromStart, chromEnd, indel_alt) elif variation_type == 'Deletion': if chromStart == chromEnd: # RCV000048406, chr17:g.41243547del @@ -183,24 +204,19 @@ def parse_measure(Measure, hg19=True): if 'ins' in hgvs_genome: ins_ref = hgvs_genome[ins_position+3:] if chromStart == chromEnd: - hgvs_id = "chr%s:g.%sins%s" % \ - (chrom, chromStart, ins_ref) + hgvs_id = "chr%s:g.%sins%s" % (chrom, chromStart, ins_ref) else: - hgvs_id = "chr%s:g.%s_%sins%s" % \ - (chrom, chromStart, chromEnd, ins_ref) + hgvs_id = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, ins_ref) elif variation_type == 'Duplication': if hgvs_genome: dup_position = hgvs_genome.find('dup') if 'dup' in hgvs_genome: dup_ref = hgvs_genome[dup_position+3:] if chromStart == chromEnd: - hgvs_id = "chr%s:g.%sdup%s" % \ - (chrom, chromStart, dup_ref) + hgvs_id = "chr%s:g.%sdup%s" % (chrom, chromStart, dup_ref) else: - hgvs_id = "chr%s:g.%s_%sdup%s" % \ - (chrom, chromStart, chromEnd, dup_ref) - elif variation_type == 'copy number loss' or\ - variation_type == 'copy number gain': + hgvs_id = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, dup_ref) + elif variation_type == 'copy number loss' or variation_type == 'copy number gain': if hgvs_genome and chrom: hgvs_id = "chr" + chrom + ":" + hgvs_genome.split('.')[2] elif hgvs_coding: @@ -210,16 +226,18 @@ def parse_measure(Measure, hg19=True): return else: return + for key in HGVS: HGVS[key].sort() + rsid = None cosmic = None dbvar = None uniprot = None omim = None # loop through XRef to find rsid as well as other ids - if Measure.XRef: - for XRef in Measure.XRef: + if measure_obj.XRef: + for XRef in measure_obj.XRef: if XRef.Type == 'rs': rsid = 'rs' + str(XRef.ID) elif XRef.DB == 'COSMIC': @@ -234,73 +252,85 @@ def parse_measure(Measure, hg19=True): # make sure the hgvs_id is not none if hgvs_id: one_snp_json = { - "_id": hgvs_id, - "clinvar": - { - "allele_id": allele_id, - "chrom": chrom, - "omim": omim, - "cosmic": cosmic, - "uniprot": uniprot, - "dbvar": dbvar, - "hg19": - { - "start": chromStart_19, - "end": chromEnd_19 - }, - "hg38": - { - "start": chromStart_38, - "end": chromEnd_38 - }, - "type": variation_type, - "gene": - { - "id": gene_id, - "symbol": symbol - }, - "rcv": - { - "preferred_name": name, - }, - "rsid": rsid, - "cytogenic": cytogenic, - "hgvs": HGVS, - "coding_hgvs_only": coding_hgvs_only, - "ref": ref, - "alt": alt - } + "clinvar": { + "allele_id": allele_id, + "chrom": chrom, + "omim": omim, + "cosmic": cosmic, + "uniprot": uniprot, + "dbvar": dbvar, + "hg19": { + "start": chromStart_19, + "end": chromEnd_19 + }, + "hg38": { + "start": chromStart_38, + "end": chromEnd_38 + }, + "type": variation_type, + "gene": { + "id": gene_id, + "symbol": symbol + }, + "rcv": { + "preferred_name": name, + }, + "rsid": rsid, + "cytogenic": cytogenic, + "hgvs": HGVS, + "coding_hgvs_only": coding_hgvs_only, + "ref": ref, + "alt": alt + } } return one_snp_json -def _map_line_to_json(cp, hg19): + +def 
_map_public_set_to_json(public_set_obj, hg19: bool):
+    """
+    Convert a `clinvarlib.PublicSetType` object into a json document.
+
+    Each `clinvarlib.PublicSetType` object is mapped to a `<ClinVarSet>` block in the XML.
+
+    E.g., `public_set_obj.ReferenceClinVarAssertion.MeasureSet` is the parsed value from a block structure below:
+
+        <ClinVarSet>
+            <ReferenceClinVarAssertion>
+                <MeasureSet>
+                    ...
+                </MeasureSet>
+            </ReferenceClinVarAssertion>
+        </ClinVarSet>
+    """
     try:
-        clinical_significance = cp.ReferenceClinVarAssertion.\
-            ClinicalSignificance.Description
+        clinical_significance = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.Description
     except:
         clinical_significance = None
-    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
+
+    rcv_accession = public_set_obj.ReferenceClinVarAssertion.ClinVarAccession.Acc
+
     try:
-        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
-            ReviewStatus
+        review_status = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus
     except:
         review_status = None
+
     try:
-        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
-            DateLastEvaluated
+        last_evaluated = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated
     except:
         last_evaluated = None
-    number_submitters = len(cp.ClinVarAssertion)
+
+    number_submitters = len(public_set_obj.ClinVarAssertion)
+
     # some items in clinvar_xml doesn't have origin information
     try:
-        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
+        origin = public_set_obj.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
     except:
         origin = None
+
     conditions = []
-    for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait:
+    for _trait in public_set_obj.ReferenceClinVarAssertion.TraitSet.Trait:
         synonyms = []
         conditions_name = ''
         for name in _trait.Name:
@@ -308,6 +338,7 @@ def _map_line_to_json(cp, hg19):
                 synonyms.append(name.ElementValue.get_valueOf_())
             if name.ElementValue.Type == 'Preferred':
                 conditions_name += name.ElementValue.get_valueOf_()
+
         identifiers = {}
         for item in _trait.XRef:
             if item.DB == 'Human Phenotype Ontology':
@@ -318,55 +349,63 @@ def _map_line_to_json(cp, hg19):
         for symbol in _trait.Symbol:
             if symbol.ElementValue.Type == 'Preferred':
                 conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')'
+
         age_of_onset = ''
         for _set in _trait.AttributeSet:
             if _set.Attribute.Type == 'age of onset':
                 age_of_onset = _set.Attribute.get_valueOf_()
-        conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset})
+
+        conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers,
+                           "age_of_onset": age_of_onset})

     try:
-        genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet
+        genotypeset = public_set_obj.ReferenceClinVarAssertion.GenotypeSet
     except:
         genotypeset = None
+
     if genotypeset:
         obj_list = []
         id_list = []
-        for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
+        for _set in public_set_obj.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
             variant_id = _set.ID
             for _measure in _set.Measure:
-                json_obj = parse_measure(_measure, hg19=hg19)
+                json_obj = _map_measure_to_json(_measure, hg19=hg19)
                 if json_obj:
-                    json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
+                    json_obj['clinvar']['rcv'].update({
+                        'accession': rcv_accession,
                         'clinical_significance': clinical_significance,
                         'number_submitters': number_submitters,
                         'review_status': review_status,
                         'last_evaluated': str(last_evaluated),
                         'origin': origin,
-                        'conditions': conditions})
+                        'conditions': conditions
+                    })
                     json_obj['clinvar'].update({'variant_id': variant_id})
                    json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
-                        ['chrom', 'omim', 'id', 'orphanet', 'gene',
-                        'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
+                        ['chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])),
+                        [None, '', 'None']))
                    obj_list.append(json_obj)
                    id_list.append(json_obj['_id'])
        for _obj in obj_list:
            _obj['clinvar'].update({'genotypeset': {
-                'type': 'CompoundHeterozygote',
-                'genotype': id_list
-            }})
+                'type': 'CompoundHeterozygote',
+                'genotype': id_list
+            }})
            yield _obj
    else:
-        variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
-        for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
-            json_obj = parse_measure(_measure, hg19=hg19)
+        variant_id = public_set_obj.ReferenceClinVarAssertion.MeasureSet.ID
+        for _measure in public_set_obj.ReferenceClinVarAssertion.MeasureSet.Measure:
+            json_obj = _map_measure_to_json(_measure, hg19=hg19)
            if json_obj:
-                json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
-                    'clinical_significance': clinical_significance,
-                    'number_submitters': number_submitters,
-                    'review_status': review_status,
-                    'last_evaluated': str(last_evaluated),
-                    'origin': origin,
-                    'conditions': conditions})
+                json_obj['clinvar']['rcv'].update({
+                    'accession': rcv_accession,
+                    'clinical_significance': clinical_significance,
+                    'number_submitters': number_submitters,
+                    'review_status': review_status,
+                    'last_evaluated': str(last_evaluated),
+                    'origin': origin,
+                    'conditions': conditions
+                })
                json_obj['clinvar'].update({'variant_id': variant_id})
                json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
                    ['chrom', 'omim', 'id', 'orphanet', 'gene',
@@ -374,21 +413,56 @@ def _map_line_to_json(cp, hg19):
                yield json_obj

-def rcv_feeder(input_file, hg19):
-    # the first two line of clinvar_xml is not useful information
-    cv_data = rec_handler(input_file, block_end='</ClinVarSet>\n',
-                          skip=2, include_block_end=True)
-    for record in cv_data:
-        # some exceptions
-        if record.startswith('\n</ReleaseSet>'):
+def clinvar_doc_feeder(input_file, hg19: bool):
+    """
+    This function will split the xml file into `<ClinVarSet>...</ClinVarSet>` blocks, then parse each block into a
+    `PublicSetType` object (which is defined in the dynamically imported `clinvarlib`), and finally convert each
+    `PublicSetType` object into a clinvar document.
+    """
+
+    """
+    A ClinVarFullRelease_*.xml.gz file has the following structure:
+
+        <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+        <ReleaseSet ...>
+
+            <ClinVarSet ...>
+                ...
+            </ClinVarSet>
+
+            <ClinVarSet ...>
+                ...
+            </ClinVarSet>
+
+            ...
+
+        </ReleaseSet>
+
+    Therefore, when splitting the xml into `<ClinVarSet>` blocks, the first 2 lines and the last line should be
+    skipped.
+
+    However, the `rec_handler` function cannot skip the last line, and will return "\n</ReleaseSet>..." as
+    the last block in this scenario. Therefore, in the for-loop below, we will skip any block starting with
+    "\n</ReleaseSet>".
+    """
+    clinvar_set_blocks = rec_handler(input_file, block_end='</ClinVarSet>\n', skip=2, include_block_end=True)
+    for clinvar_set_block in clinvar_set_blocks:
+        # Skip any block starting with "\n</ReleaseSet>"
+        # Actually only the last block will be skipped. See comments above
+        if clinvar_set_block.startswith('\n</ReleaseSet>'):
             continue
+
         try:
-            record_parsed = clinvarlib.parseString(record, silence=1)
+            # Parse each `<ClinVarSet>` block into a `clinvarlib.PublicSetType` object
+            public_set_obj = clinvarlib.parseString(clinvar_set_block, silence=1)
         except:
-            logging.debug(record)
+            logging.debug(clinvar_set_block)
             raise
-        for record_mapped in _map_line_to_json(record_parsed, hg19):
-            yield record_mapped
+
+        # Convert each `clinvarlib.PublicSetType` object into a json document
+        for doc in _map_public_set_to_json(public_set_obj, hg19):
+            yield doc
+

 def load_data(data_folder, version):
     # try to get logger from uploader
@@ -397,17 +471,23 @@ def load_data(data_folder, version):
     logging = loggingmod.getLogger("clinvar_upload")

     import_clinvar_lib(data_folder)
-    files = glob.glob(os.path.join(data_folder,GLOB_PATTERN))
-    assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN,files)
+
+    files = glob.glob(os.path.join(data_folder, GLOB_PATTERN))
+    assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN, files)
     input_file = files[0]
-    data_generator = rcv_feeder(input_file, version == "hg19")
-    data_list = list(data_generator)
-    # TODO: why do we sort this list ? this prevent from using yield/iterator
-    data_list_sorted = sorted(data_list, key=lambda k: k['_id'])
-    data_merge_rcv = merge_rcv_accession(data_list_sorted)
-    return data_merge_rcv
+
+    doc_generator = clinvar_doc_feeder(input_file, hg19=(version == "hg19"))
+
+    # Sorting is necessary because `merge_rcv_accession` will call `itertools.groupby()`
+    # which cannot put non-adjacent items with the same key into a group
+    doc_list = list(doc_generator)
+    doc_list = sorted(doc_list, key=lambda k: k['_id'])
+
+    merged_doc_generator = merge_rcv_accession(doc_list)
+    return merged_doc_generator
+

 if __name__ == "__main__":
     from biothings.utils.mongo import get_data_folder
     data_folder = get_data_folder("clinvar")
-    load_data(data_folder=data_folder)
+    load_data(data_folder=data_folder, version="hg19")
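A minimal, self-contained sketch of the merge behavior that `load_data` depends on (illustration only, not part of the patch; the `_id` and accession values below are made up): `itertools.groupby` only groups adjacent items, so `merge_rcv_accession` can only collapse documents sharing an `_id` if the list has been sorted on `_id` first, which is why `load_data` materializes and sorts the generator before merging.

    # Illustration only (not part of the patch): why documents are sorted by `_id`
    # before the groupby-based merge. The documents below are toy data.
    from itertools import groupby

    def merge_by_id(docs):
        # Mirrors the merge idea: one document per `_id`; when a group holds more than
        # one document, `clinvar.rcv` becomes the list of all rcv entries in the group.
        for _id, group in groupby(docs, key=lambda d: d['_id']):
            group = list(group)
            if len(group) == 1:
                yield group[0]
            else:
                merged = group[0]
                merged['clinvar']['rcv'] = [d['clinvar']['rcv'] for d in group]
                yield merged

    docs = [
        {'_id': 'chr1:g.100A>T', 'clinvar': {'rcv': {'accession': 'RCV000000001'}}},
        {'_id': 'chr2:g.200G>C', 'clinvar': {'rcv': {'accession': 'RCV000000002'}}},
        {'_id': 'chr1:g.100A>T', 'clinvar': {'rcv': {'accession': 'RCV000000003'}}},
    ]

    # Unsorted: the two 'chr1' documents are not adjacent, so groupby keeps them apart.
    print(len(list(merge_by_id(docs))))                                  # -> 3
    # Sorted: the 'chr1' documents become adjacent and are merged into one document.
    print(len(list(merge_by_id(sorted(docs, key=lambda d: d['_id'])))))  # -> 2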