Skip to content

Commit a561fac

Browse files
authored
Merge pull request #270 from opencb/TASK-6558
TASK-6558 - Issue indexing variants with length 0 (i.e. start > end) and CIEND #269
2 parents aacf0bb + 58bee08 commit a561fac

File tree

4 files changed

+409
-155
lines changed

4 files changed

+409
-155
lines changed

biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java

Lines changed: 154 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -301,24 +301,20 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
301301
normalizedVariants.add(variant);
302302
continue;
303303
}
304-
String reference = variant.getReference(); //Save original values, as they can be changed
304+
//Save original values, as they can be changed
305+
String reference = variant.getReference();
305306
String alternate = variant.getAlternate();
306307
Integer start = variant.getStart();
307308
Integer end = variant.getEnd();
308309
String chromosome = variant.getChromosome();
309-
StructuralVariation sv = variant.getSv();
310310

311311
if (variant.getStudies() == null || variant.getStudies().isEmpty()) {
312-
List<VariantKeyFields> keyFieldsList;
313-
if (isSymbolic(variant)) {
314-
keyFieldsList = normalizeSymbolic(start, end, reference, alternate, sv);
315-
} else {
316-
keyFieldsList = normalize(chromosome, start, reference, alternate);
317-
}
312+
List<VariantKeyFields> keyFieldsList = normalizeAlleles(variant);
313+
318314
// Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order!
319315
for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) {
320316
OriginalCall call = new OriginalCall(variant.toString(), keyFields.getNumAllele());
321-
Variant normalizedVariant = newVariant(variant, keyFields, sv);
317+
Variant normalizedVariant = newVariant(variant, keyFields);
322318
if (keyFields.getPhaseSet() != null) {
323319
StudyEntry studyEntry = new StudyEntry();
324320
studyEntry.setSamples(
@@ -332,25 +328,16 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
332328
normalizedVariants.add(normalizedVariant);
333329
}
334330
} else {
335-
for (StudyEntry entry : variant.getStudies()) {
336-
List<String> originalAlternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size());
337-
List<String> alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size());
338-
alternates.add(alternate);
339-
originalAlternates.add(alternate);
340-
for (String secondaryAlternatesAllele : entry.getSecondaryAlternatesAlleles()) {
341-
alternates.add(secondaryAlternatesAllele);
342-
originalAlternates.add(secondaryAlternatesAllele);
343-
}
331+
if (variant.getStudies().size() != 1) {
332+
throw new IllegalStateException("Only one study per variant is supported when normalizing variants. Found "
333+
+ variant.getStudies().size() + " studies. Variant: " + variant);
334+
} else {
335+
StudyEntry entry = variant.getStudies().get(0);
336+
List<String> alternates = getAllAlternates(variant);
344337

345338
// FIXME: assumes there won't be multinucleotide positions with CNVs and short variants mixed
346-
List<VariantKeyFields> keyFieldsList;
347-
List<VariantKeyFields> originalKeyFieldsList;
348-
if (isSymbolic(variant)) {
349-
keyFieldsList = normalizeSymbolic(start, end, reference, alternates, sv);
350-
} else {
351-
keyFieldsList = normalize(chromosome, start, reference, alternates);
352-
}
353-
originalKeyFieldsList = keyFieldsList
339+
List<VariantKeyFields> keyFieldsList = normalizeAlleles(variant);
340+
List<VariantKeyFields> originalKeyFieldsList = keyFieldsList
354341
.stream()
355342
.filter(k -> !k.isReferenceBlock())
356343
.map(k -> k.originalKeyFields)
@@ -373,8 +360,8 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
373360
originalCall = entry.getFiles().get(0).getCall().getVariantId();
374361
} else {
375362
StringBuilder sb = new StringBuilder(variant.toString());
376-
for (int i = 1; i < originalAlternates.size(); i++) {
377-
sb.append(",").append(originalAlternates.get(i));
363+
for (int i = 1; i < alternates.size(); i++) {
364+
sb.append(",").append(alternates.get(i));
378365
}
379366
originalCall = sb.toString();
380367
}
@@ -400,6 +387,9 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
400387
variant.setEnd(keyFields.getEnd());
401388
variant.setReference(keyFields.getReference());
402389
variant.setAlternate(keyFields.getAlternate());
390+
if (keyFields.getSv() != null) {
391+
variant.setSv(keyFields.getSv());
392+
}
403393
variant.reset();
404394
// Variant is being reused, must ensure the SV field is appropriately created
405395
// if (isSymbolic(variant)) {
@@ -415,7 +405,7 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
415405
}
416406
samples = entry.getSamples();
417407
} else {
418-
normalizedVariant = newVariant(variant, keyFields, sv);
408+
normalizedVariant = newVariant(variant, keyFields);
419409

420410
normalizedEntry = new StudyEntry();
421411
normalizedEntry.setStudyId(entry.getStudyId());
@@ -598,17 +588,54 @@ private Collection<VariantKeyFields> sortByPosition(List<VariantKeyFields> keyFi
598588
// }
599589
// }
600590

591+
protected List<VariantKeyFields> normalizeAlleles(Variant variant) {
592+
List<String> alternates = getAllAlternates(variant);
593+
594+
List<VariantKeyFields> keyFieldsList;
595+
if (isSymbolic(variant)) {
596+
keyFieldsList = normalizeSymbolic(variant.getStart(), variant.getEnd(), variant.getReference(), alternates, variant.getSv());
597+
} else {
598+
keyFieldsList = normalize(variant.getChromosome(), variant.getStart(), variant.getReference(), alternates, variant.getSv());
599+
}
600+
return keyFieldsList;
601+
}
602+
603+
private static List<String> getAllAlternates(Variant variant) {
604+
List<String> alternates;
605+
if (variant.getStudies() != null && !variant.getStudies().isEmpty()) {
606+
StudyEntry entry = variant.getStudies().get(0);
607+
String alternate = variant.getAlternate();
608+
alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size());
609+
alternates.add(alternate);
610+
for (AlternateCoordinate secondaryAlternate : entry.getSecondaryAlternates()) {
611+
if (secondaryAlternate.getStart() != null && !secondaryAlternate.getStart().equals(variant.getStart())) {
612+
throw new IllegalStateException("Unable to normalize variant where secondary alternates do not start at the same position. "
613+
+ "Variant: " + variant + " , secondaryAlternate: " + secondaryAlternate);
614+
}
615+
if (secondaryAlternate.getEnd() != null && !secondaryAlternate.getEnd().equals(variant.getEnd())) {
616+
throw new IllegalStateException("Unable to normalize variant where secondary alternates do not end at the same position. "
617+
+ "Variant: " + variant + " (end=" + variant.getEnd() + ") , secondaryAlternate: " + secondaryAlternate);
618+
}
619+
alternates.add(secondaryAlternate.getAlternate());
620+
}
621+
} else {
622+
alternates = Collections.singletonList(variant.getAlternate());
623+
}
624+
return Collections.unmodifiableList(alternates);
625+
}
626+
627+
@Deprecated // Test purposes only
601628
public List<VariantKeyFields> normalizeSymbolic(Integer start, Integer end, String reference, String alternate, StructuralVariation sv) {
602629
return normalizeSymbolic(start, end, reference, Collections.singletonList(alternate), sv);
603630
}
604631

605-
@Deprecated
632+
@Deprecated // Test purposes only
606633
public List<VariantKeyFields> normalizeSymbolic(final Integer start, final Integer end, final String reference,
607634
final List<String> alternates) {
608635
return normalizeSymbolic(start, end, reference, alternates, null);
609636
}
610637

611-
public List<VariantKeyFields> normalizeSymbolic(final Integer start, final Integer end, final String reference,
638+
protected List<VariantKeyFields> normalizeSymbolic(final Integer start, final Integer end, final String reference,
612639
final List<String> alternates, StructuralVariation sv) {
613640
List<VariantKeyFields> list = new ArrayList<>(alternates.size());
614641

@@ -624,12 +651,56 @@ public List<VariantKeyFields> normalizeSymbolic(final Integer start, final Integ
624651
Integer copyNumber = sv == null ? null : sv.getCopyNumber();
625652
keyFields = normalizeSymbolic(start, end, reference, alternate, alternates, copyNumber, numAllelesIdx);
626653
}
654+
655+
if (alternate.equals(VariantBuilder.DUP_TANDEM_ALT)) {
656+
if (keyFields.getSv() == null) {
657+
keyFields.setSv(new StructuralVariation());
658+
}
659+
keyFields.getSv().setType(StructuralVariantType.TANDEM_DUPLICATION);
660+
}
661+
662+
normalizeSvField(sv, keyFields);
663+
627664
list.add(keyFields);
628665
}
629666

630667
return list;
631668
}
632669

670+
private static void normalizeSvField(StructuralVariation sv, VariantKeyFields keyFields) {
671+
if (sv != null) {
672+
StructuralVariation normalizedSv = keyFields.getSv();
673+
if (normalizedSv == null) {
674+
normalizedSv = new StructuralVariation();
675+
}
676+
// CI positions may change during the normalization. Update them.
677+
normalizedSv.setCiStartLeft(sv.getCiStartLeft());
678+
normalizedSv.setCiStartRight(sv.getCiStartRight());
679+
680+
// Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND.
681+
// At this point, we're removing the CIEND from the normalized variant.
682+
// Do not remove the value from the INFO field (if any).
683+
// The END is the same as the start (which, in base-1 means that "end == start -1" , so "end < start")
684+
if (keyFields.getEnd() < keyFields.getStart()) {
685+
normalizedSv.setCiEndLeft(null);
686+
normalizedSv.setCiEndRight(null);
687+
} else {
688+
normalizedSv.setCiEndLeft(sv.getCiEndLeft());
689+
normalizedSv.setCiEndRight(sv.getCiEndRight());
690+
}
691+
normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq());
692+
normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq());
693+
694+
if (keyFields.getSv() == null) {
695+
if (normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null
696+
|| normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null
697+
|| normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null) {
698+
keyFields.setSv(normalizedSv);
699+
}
700+
}
701+
}
702+
}
703+
633704
private boolean isNonRef(String alternate) {
634705
return alternate.equals(Allele.NO_CALL_STRING)
635706
|| alternate.equals(VariantBuilder.NON_REF_ALT)
@@ -695,7 +766,7 @@ private static VariantKeyFields normalizeMateBreakend(
695766
}
696767

697768
VariantKeyFields keyFields = new VariantKeyFields(newStart, newStart - 1, numAllelesIdx, newReference, newAlternate);
698-
keyFields.getSv().setBreakend(breakend);
769+
keyFields.setBreakend(breakend);
699770
return keyFields;
700771
}
701772

@@ -718,29 +789,37 @@ private VariantKeyFields normalizeSymbolic(
718789
+ "contain 0 or 1 nt, but no more. Please, check.");
719790
}
720791

721-
Integer cn = VariantBuilder.getCopyNumberFromAlternate(alternate);
722792
// if (cn != null) {
723793
// // Alternate with the form <CNxxx>, being xxx the number of copies, must be normalized into "<CNV>"
724794
// newAlternate = "<CNV>";
725795
// }
726796
String newAlternate;
797+
Integer newCn;
727798
if (alternate.equals("<CNV>") && copyNumber != null) {
728799
// Alternate must be of the form <CNxxx>, being xxx the number of copies
729800
newAlternate = "<CN" + copyNumber + ">";
801+
newCn = copyNumber;
730802
} else {
731803
newAlternate = alternate;
804+
newCn = VariantBuilder.getCopyNumberFromAlternate(alternate);
732805
}
806+
733807
return new VariantKeyFields(newStart, end, numAllelesIdx, newReference, newAlternate,
734-
null, cn, false);
808+
null, newCn, false);
735809
}
736810

737811

812+
@Deprecated // Test purposes only
738813
public List<VariantKeyFields> normalize(String chromosome, int position, String reference, String alternate) {
739-
return normalize(chromosome, position, reference, Collections.singletonList(alternate));
814+
return normalize(chromosome, position, reference, Collections.singletonList(alternate), null);
815+
}
816+
817+
@Deprecated // Test purposes only
818+
public List<VariantKeyFields> normalize(String chromosome, int position, String reference, List<String> alternates) {
819+
return normalize(chromosome, position, reference, alternates, null);
740820
}
741821

742-
public List<VariantKeyFields> normalize(String chromosome, int position, String reference, List<String> alternates)
743-
{
822+
protected List<VariantKeyFields> normalize(String chromosome, int position, String reference, List<String> alternates, StructuralVariation sv) {
744823

745824
List<VariantKeyFields> list = new ArrayList<>(alternates.size());
746825
int numAllelesIdx = 0; // This index is necessary for getting the samples where the mutated allele is present
@@ -784,6 +863,8 @@ public List<VariantKeyFields> normalize(String chromosome, int position, String
784863
}
785864
}
786865

866+
normalizeSvField(sv, keyFields);
867+
787868
if (keyFields != null) {
788869

789870
// To deal with cases such as A>GT
@@ -1380,34 +1461,24 @@ private int[] getGenotypesReorderingMap(int numAllele, int[] alleleMap) {
13801461
}
13811462
}
13821463

1383-
1384-
private Variant newVariant(Variant variant, VariantKeyFields keyFields, StructuralVariation sv) {
1464+
private Variant newVariant(Variant variant, VariantKeyFields keyFields) {
13851465
Variant normalizedVariant = new Variant(variant.getChromosome(), keyFields.getStart(), keyFields.getEnd(), keyFields.getReference(), keyFields.getAlternate())
13861466
.setId(variant.getId())
13871467
.setNames(variant.getNames())
13881468
.setStrand(variant.getStrand());
13891469

1390-
if (sv != null) {
1391-
if (normalizedVariant.getSv() != null) {
1392-
// CI positions may change during the normalization. Update them.
1393-
normalizedVariant.getSv().setCiStartLeft(sv.getCiStartLeft());
1394-
normalizedVariant.getSv().setCiStartRight(sv.getCiStartRight());
1395-
normalizedVariant.getSv().setCiEndLeft(sv.getCiEndLeft());
1396-
normalizedVariant.getSv().setCiEndRight(sv.getCiEndRight());
1397-
normalizedVariant.getSv().setLeftSvInsSeq(sv.getLeftSvInsSeq());
1398-
normalizedVariant.getSv().setRightSvInsSeq(sv.getRightSvInsSeq());
1399-
1400-
// Variant will never have CopyNumber, because the Alternate is normalized from <CNxx> to <CNV>
1401-
normalizedVariant.getSv().setCopyNumber(keyFields.getCopyNumber());
1402-
VariantType cnvSubtype = VariantBuilder.getCopyNumberSubtype(keyFields.getCopyNumber());
1403-
if (cnvSubtype != null) {
1404-
normalizedVariant.setType(cnvSubtype);
1405-
}
1406-
}
1470+
if (keyFields.getSv() != null) {
1471+
normalizedVariant.setSv(keyFields.getSv());
14071472
}
1408-
14091473
normalizedVariant.setAnnotation(variant.getAnnotation());
14101474

1475+
if (keyFields.getCopyNumber() != null) {
1476+
VariantType cnvSubtype = VariantBuilder.getCopyNumberSubtype(keyFields.getCopyNumber());
1477+
if (cnvSubtype != null) {
1478+
normalizedVariant.setType(cnvSubtype);
1479+
}
1480+
}
1481+
14111482
return normalizedVariant;
14121483
// normalizedVariant.setAnnotation(variant.getAnnotation());
14131484
// if (isSymbolic(variant)) {
@@ -1527,8 +1598,10 @@ public VariantKeyFields(int start, int end, int numAllele, String reference, Str
15271598
this.alternate = alternate;
15281599
this.originalKeyFields = originalKeyFields == null ? this : originalKeyFields;
15291600
this.referenceBlock = referenceBlock;
1530-
this.sv = new StructuralVariation();
1531-
setCopyNumber(copyNumber);
1601+
this.sv = null;
1602+
if (copyNumber != null) {
1603+
setCopyNumber(copyNumber);
1604+
}
15321605
}
15331606

15341607

@@ -1604,7 +1677,28 @@ public Integer getCopyNumber() {
16041677
}
16051678

16061679
public VariantKeyFields setCopyNumber(Integer copyNumber) {
1607-
sv.setCopyNumber(copyNumber);
1680+
if (sv == null) {
1681+
if (copyNumber != null) {
1682+
sv = new StructuralVariation();
1683+
sv.setCopyNumber(copyNumber);
1684+
sv.setType(VariantBuilder.getCNVSubtype(copyNumber));
1685+
}
1686+
} else {
1687+
sv.setCopyNumber(copyNumber);
1688+
sv.setType(VariantBuilder.getCNVSubtype(copyNumber));
1689+
}
1690+
return this;
1691+
}
1692+
1693+
public VariantKeyFields setBreakend(Breakend breakend) {
1694+
if (sv == null) {
1695+
if (breakend != null) {
1696+
sv = new StructuralVariation();
1697+
sv.setBreakend(breakend);
1698+
}
1699+
} else {
1700+
sv.setBreakend(breakend);
1701+
}
16081702
return this;
16091703
}
16101704

biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -230,20 +230,24 @@ protected Variant newVariant(int position, String ref, String altsCsv) {
230230
return newVariant(position, position, ref, Arrays.asList(altsCsv.split(",")), "2");
231231
}
232232

233-
protected Variant newVariant(int start, int end, String ref, String altsCsv) {
233+
protected Variant newVariant(int start, Integer end, String ref, String altsCsv) {
234234
return newVariant(start, end, ref, Arrays.asList(altsCsv.split(",")), "2");
235235
}
236236

237237
protected Variant newVariant(int position, String ref, List<String> altsList, String studyId) {
238238
return newVariant(position, position, ref, altsList, studyId);
239239
}
240240

241-
protected Variant newVariant(int start, int end, String ref, List<String> altsList, String studyId) {
241+
protected Variant newVariant(int start, Integer end, String ref, List<String> altsList, String studyId) {
242242
return newVariantBuilder(start, end, ref, altsList, studyId).build();
243243
}
244244

245-
protected VariantBuilder newVariantBuilder(int position, int end, String ref, List<String> altsList, String studyId) {
246-
return Variant.newBuilder("1", position, end, ref, String.join(",", altsList))
245+
protected VariantBuilder newVariantBuilder(int position, Integer end, String ref, List<String> altsList, String studyId) {
246+
return newVariantBuilder(position, end, ref, String.join(",", altsList), studyId);
247+
}
248+
249+
protected VariantBuilder newVariantBuilder(int position, Integer end, String ref, String alts, String studyId) {
250+
return Variant.newBuilder("1", position, end, ref, alts)
247251
.setStudyId(studyId)
248252
.setSampleDataKeys("GT")
249253
.setSamples(new ArrayList<>())

0 commit comments

Comments
 (0)