diff --git a/src/main/scala/com/fulcrumgenomics/bam/ErrorRateByReadPosition.scala b/src/main/scala/com/fulcrumgenomics/bam/ErrorRateByReadPosition.scala index 10eb9579b..d06e51ab0 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/ErrorRateByReadPosition.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/ErrorRateByReadPosition.scala @@ -26,7 +26,6 @@ package com.fulcrumgenomics.bam import java.util - import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.bam.api.SamSource import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} @@ -36,6 +35,7 @@ import com.fulcrumgenomics.fasta.SequenceDictionary import com.fulcrumgenomics.sopt._ import com.fulcrumgenomics.util.Metric.Count import com.fulcrumgenomics.util.{Metric, ProgressLogger, Rscript, Sequences} +import com.fulcrumgenomics.vcf.api.VcfSource import com.fulcrumgenomics.vcf.{ByIntervalListVariantContextIterator, VariantMask} import htsjdk.samtools.SAMRecord import htsjdk.samtools.filter.{DuplicateReadFilter, FailsVendorReadQualityFilter, SamRecordFilter, SecondaryOrSupplementaryFilter} @@ -197,9 +197,9 @@ class ErrorRateByReadPosition case None => new VariantMask(Iterator.empty, dict) case Some(path) => - val reader = new VCFFileReader(path.toFile, this.intervals.isDefined) + val reader = VcfSource(path) intervals match { - case None => new VariantMask(reader.iterator(), dict) + case None => new VariantMask(reader.iterator, dict) case Some(i) => new VariantMask(ByIntervalListVariantContextIterator(reader, i), dict) } } diff --git a/src/main/scala/com/fulcrumgenomics/vcf/AssessPhasing.scala b/src/main/scala/com/fulcrumgenomics/vcf/AssessPhasing.scala index 4e43bc0f1..b573a1d04 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/AssessPhasing.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/AssessPhasing.scala @@ -24,10 +24,6 @@ package com.fulcrumgenomics.vcf -import java.nio.file.Paths -import java.util -import java.util.Comparator - import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.io.{Io, PathUtil} @@ -36,14 +32,15 @@ import com.fulcrumgenomics.fasta.SequenceDictionary import com.fulcrumgenomics.sopt._ import com.fulcrumgenomics.util.{GenomicSpan, Metric, ProgressLogger} import com.fulcrumgenomics.vcf.PhaseCigarOp.PhaseCigarOp +import com.fulcrumgenomics.vcf.api._ import htsjdk.samtools.util.{IntervalList, OverlapDetector} -import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriter, VariantContextWriterBuilder} -import htsjdk.variant.variantcontext.{Genotype, GenotypeBuilder, VariantContext, VariantContextBuilder} -import htsjdk.variant.vcf._ +import java.nio.file.Paths +import java.util +import java.util.Comparator import scala.annotation.tailrec -import scala.jdk.CollectionConverters._ import scala.collection.mutable.ListBuffer +import scala.jdk.CollectionConverters._ @clp( description = @@ -91,34 +88,25 @@ class AssessPhasing // check the sequence dictionaries. val dict = { - import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary - val calledReader = new VCFFileReader(calledVcf.toFile, true) - val truthReader = new VCFFileReader(truthVcf.toFile, true) - calledReader.getFileHeader.getSequenceDictionary.assertSameDictionary(truthReader.getFileHeader.getSequenceDictionary) + val calledReader = VcfSource(calledVcf) + val truthReader = VcfSource(truthVcf) + calledReader.header.dict.sameAs(truthReader.header.dict) calledReader.close() truthReader.close() - calledReader.getFileHeader.getSequenceDictionary.fromSam + calledReader.header.dict } - val knownIntervalList = knownIntervals.map { intv => IntervalList.fromFile(intv.toFile).uniqued() } val calledBlockLengthCounter = new NumericCounter[Long]() val truthBlockLengthCounter = new NumericCounter[Long]() val metric = new AssessPhasingMetric val writer = if (!debugVcf) None else { val path = Paths.get(output.toString + AssessPhasing.AnnotatedVcfExtension) - val reader = new VCFFileReader(calledVcf.toFile, true) - val header = reader.getFileHeader - val builder = new VariantContextWriterBuilder() - .setOutputFile(path.toFile) - .setReferenceDictionary(header.getSequenceDictionary) - .setOption(Options.INDEX_ON_THE_FLY) - .modifyOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER, true) - val writer: VariantContextWriter = builder.build - val headerLines: util.Set[VCFHeaderLine] = new util.HashSet[VCFHeaderLine](header.getMetaDataInSortedOrder) - headerLines.add(AssessPhasing.PhaseConcordanceFormatHeaderLine) // add the new format header lines - VCFStandardHeaderLines.addStandardFormatLines(headerLines, false, Genotype.PRIMARY_KEYS) // add standard header lines - writer.writeHeader(new VCFHeader(headerLines, List("call", "truth").asJava)) - reader.safelyClose() + val reader = VcfSource(calledVcf) + val header = reader.header + + val newFormatLines = header.formats ++ AssessPhasing.formatHeaderLines :+ AssessPhasing.PhaseConcordanceFormatHeaderLine + val writer = VcfWriter(path, header.copy(formats=newFormatLines, samples=IndexedSeq("call", "truth"))) + Some(writer) } val chromosomes = intervals.map { intv => @@ -132,7 +120,6 @@ class AssessPhasing intervalList.getIntervals.map { i => i.getContig }.toSet } - // NB: could parallelize! dict .iterator @@ -194,7 +181,7 @@ class AssessPhasing metric: AssessPhasingMetric, calledBlockLengthCounter: NumericCounter[Long], truthBlockLengthCounter: NumericCounter[Long], - writer: Option[VariantContextWriter] = None + writer: Option[VcfWriter] = None ): Unit = { logger.info(s"Assessing $contig") @@ -207,7 +194,7 @@ class AssessPhasing // get the phased blocks logger.info("Getting the called phase blocks") val calledPhaseBlockDetector = { - val calledReader = new VCFFileReader(calledVcf.toFile, true) + val calledReader = VcfSource(calledVcf) val detector = PhaseBlock.buildOverlapDetector( iterator = toVariantContextIterator(calledReader, contig, contigLength), dict = dict, @@ -218,7 +205,7 @@ class AssessPhasing } logger.info("Getting the known phase blocks") val truthPhaseBlockDetector = { - val truthReader = new VCFFileReader(truthVcf.toFile, true) + val truthReader = VcfSource(truthVcf) val detector = PhaseBlock.buildOverlapDetector( iterator = toVariantContextIterator(truthReader, contig, contigLength, intervalList=intervalListForContig), dict = dict, @@ -229,8 +216,8 @@ class AssessPhasing } // get an iterator of the pairs - val calledReader = new VCFFileReader(calledVcf.toFile, true) - val truthReader = new VCFFileReader(truthVcf.toFile, true) + val calledReader = VcfSource(calledVcf) + val truthReader = VcfSource(truthVcf) val pairedIterator = JointVariantContextIterator( iters = Seq(toVariantContextIterator(truthReader, contig, contigLength, intervalList=intervalListForContig), toVariantContextIterator(calledReader, contig, contigLength)), dict = dict @@ -282,20 +269,30 @@ class AssessPhasing logger.info(s"Completed $contig") } - private def toVariantContextIterator(reader: VCFFileReader, + private def toVariantContextIterator(reader: VcfSource, contig: String, contigLength: Int, - intervalList: Option[IntervalList] = None): Iterator[VariantContext] = { - import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary - val sampleName = reader.getFileHeader.getSampleNamesInOrder.iterator().next() - val baseIter: Iterator[VariantContext] = intervalList match { - case Some(intv) => ByIntervalListVariantContextIterator(reader.iterator(), intv, dict=reader.getFileHeader.getSequenceDictionary.fromSam) + intervalList: Option[IntervalList] = None): Iterator[Variant] = { + val sampleName = reader.header.samples.head + val baseIter: Iterator[Variant] = intervalList match { + case Some(intv) => ByIntervalListVariantContextIterator(reader.iterator, intv, dict=reader.header.dict) case None => reader.query(contig, 1, contigLength) } baseIter - .map(_.subContextFromSample(sampleName)) - .filter(v => v.isSNP && v.isBiallelic && v.getGenotype(sampleName).isHet) + .map(subsetVariantToSample(_, sampleName)) + .filter(v => isBiallelicSnp(v, sampleName)) + } + + private def subsetVariantToSample(variant: Variant, sample: String): Variant = { + variant + } + + private def isBiallelicSnp(v: Variant, sample: String): Boolean = { + val isSnp = v.alleles.toSeq.forall(_.value.length == 1) + val isBiallelic = v.alleles.alts.length == 1 + val isHet = v.gt(sample).isHet + isSnp && isBiallelic && isHet } } @@ -309,10 +306,26 @@ object AssessPhasing { val PhaseConcordanceFormatTag = "PHASE_CONC" val PhaseConcordanceFormatDescription = "The phase concordance (Match or Mismatch) determined by fgbio's AssessPhasing" - val PhaseConcordanceFormatHeaderLine = new VCFFormatHeaderLine(PhaseConcordanceFormatTag, 1, VCFHeaderLineType.Integer, PhaseConcordanceFormatDescription) + val PhaseConcordanceFormatHeaderLine: VcfFormatHeader = VcfFormatHeader( + PhaseConcordanceFormatTag, + VcfCount.Fixed(1), + VcfFieldType.Integer, + PhaseConcordanceFormatDescription) + + val GenotypeFilterHeaderLine: VcfFormatHeader = VcfFormatHeader("FT", VcfCount.Unknown, VcfFieldType.String, "Genotype-level filter") + val GenotypeKeyHeaderLine: VcfFormatHeader = VcfFormatHeader("GT", VcfCount.Fixed(1), VcfFieldType.String, "Genotype") + val GenotypeQualityKeyHeaderLine: VcfFormatHeader = VcfFormatHeader("GQ", VcfCount.Fixed(1), VcfFieldType.Integer, "Genotype Quality") + val DepthKeyHeaderLine: VcfFormatHeader = VcfFormatHeader("DP", VcfCount.Fixed(1), VcfFieldType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered") + val AlleleDepthHeaderLine: VcfFormatHeader = VcfFormatHeader("AD", VcfCount.OnePerAllele, VcfFieldType.Integer, "Allelic depths for the ref and alt alleles in the order listed") + val GenotypePLKeyHeaderLine: VcfFormatHeader = VcfFormatHeader("PL", VcfCount.OnePerGenotype, VcfFieldType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification") + + /** The VCF header FORMAT lines that will be added for HapCut1 specific-genotype information. */ + val formatHeaderLines: Seq[VcfFormatHeader] = { + Seq(GenotypeFilterHeaderLine, GenotypeKeyHeaderLine, GenotypeQualityKeyHeaderLine, DepthKeyHeaderLine, AlleleDepthHeaderLine, GenotypePLKeyHeaderLine) + } - def getPhasingSetId(ctx: VariantContext): Int = { - Integer.valueOf(ctx.getGenotype(0).getExtendedAttribute("PS", "-1").toString) + def getPhasingSetId(ctx: Variant): Int = { + Integer.valueOf(ctx.gts.next().attrs.getOrElse("PS", "-1").toString) } } @@ -396,7 +409,7 @@ object PhaseBlock extends LazyLogging { /** Creates an overlap detector for blocks of phased variants. Variants from the same block are found using the * "PS" tag. The modify blocks option resolves overlapping blocks */ - private[vcf] def buildOverlapDetector(iterator: Iterator[VariantContext], dict: SequenceDictionary, modifyBlocks: Boolean = true): OverlapDetector[PhaseBlock] = { + private[vcf] def buildOverlapDetector(iterator: Iterator[Variant], dict: SequenceDictionary, modifyBlocks: Boolean = true): OverlapDetector[PhaseBlock] = { val detector = new OverlapDetector[PhaseBlock](0, 0) val progress = new ProgressLogger(logger) @@ -418,7 +431,6 @@ object PhaseBlock extends LazyLogging { } progress.record(ctx.getContig, ctx.getStart) } - val blocksIn = new util.TreeSet[PhaseBlock](new Comparator[PhaseBlock] { /** Compares the two blocks based on start position, then returns the shorter block. */ def compare(a: PhaseBlock, b: PhaseBlock): Int = { @@ -611,7 +623,7 @@ private[vcf] object PhaseCigar { import AssessPhasing.{CalledSampleName, TruthSampleName} import PhaseCigarOp._ - private type VCtx = VariantContext + private type VCtx = Variant def apply(cigar: Seq[PhaseCigarOp]): PhaseCigar = new PhaseCigar(cigar) @@ -628,22 +640,22 @@ private[vcf] object PhaseCigar { * CIGAR: 00003330002220000511111116000001000060600111000 * shows one known block split */ - def apply(pairedIterator: Iterator[(Option[VariantContext], Option[VariantContext])], + def apply(pairedIterator: Iterator[(Option[Variant], Option[Variant])], truthPhaseBlockDetector: OverlapDetector[PhaseBlock], calledPhaseBlockDetector: OverlapDetector[PhaseBlock], metric: AssessPhasingMetric, skipMismatchingAlleles: Boolean, - writer: Option[VariantContextWriter] = None, + writer: Option[VcfWriter] = None, assumeFixedAlleleOrder: Boolean = false): PhaseCigar = { val iter = pairedIterator.filter { // ensure the alleles are the same when both truth and call are called - case ((Some(t: VCtx), Some(c: VCtx))) => - !skipMismatchingAlleles || t.getAlleles.toSet == c.getAlleles.toSet + case (Some(t: VCtx), Some(c: VCtx)) => + !skipMismatchingAlleles || t.alleles.toSet == c.alleles.toSet case _ => true - }.flatMap { case (t, c) => // collect metrics but only keep sites where either variant context (i.e. truth or call) is phased. - val ctxs = Seq(t, c).flatten // NB: can be 1 or 2 contexts here - val inTruthPhaseBlock = ctxs.headOption.exists(truthPhaseBlockDetector.overlapsAny) - val inCalledPhaseBlock = ctxs.headOption.exists(calledPhaseBlockDetector.overlapsAny) + }.flatMap { case (t, c) => // collect metrics but only keep sites where either variant (i.e. truth or call) is phased. + val ctxs = Seq(t, c).flatten // NB: can be 1 or 2 variants here + val inTruthPhaseBlock = ctxs.headOption.exists(v => truthPhaseBlockDetector.overlapsAny(v)) + val inCalledPhaseBlock = ctxs.headOption.exists(v => calledPhaseBlockDetector.overlapsAny(v)) val isTruthVariant = t.isDefined val isCalledVariant = c.isDefined val isTruthVariantPhased = t.exists { ctx => AssessPhasing.getPhasingSetId(ctx) > 0 } @@ -659,7 +671,7 @@ private[vcf] object PhaseCigar { metric.num_truth += 1 if (isTruthVariantPhased) metric.num_truth_phased += 1 } - // Less fun compute metrics + // Less fun compute metrics if (isCalledVariant && isTruthVariantPhased) { metric.num_called_with_truth_phased += 1 if (isCalledVariantPhased) metric.num_phased_with_truth_phased += 1 @@ -733,19 +745,14 @@ private[vcf] object PhaseCigar { (matchingOp, t, c, writer) match { case (Some(op), Some(truthCtx), Some(calledCtx), Some(w)) => val calledGenotype = { - val builder = new GenotypeBuilder(calledCtx.getGenotype(0)) - builder.name(CalledSampleName) - builder.attribute(AssessPhasing.PhaseConcordanceFormatTag, op.toString) - builder.make() - } - val truthGenotype = { - val builder = new GenotypeBuilder(truthCtx.getGenotype(0)) - builder.name(TruthSampleName) - builder.make() + val gt = calledCtx.gts.next() + gt.copy(sample=CalledSampleName, attrs = gt.attrs ++ Map(AssessPhasing.PhaseConcordanceFormatTag -> op.toString)) } - val ctxBuilder = new VariantContextBuilder(calledCtx) - ctxBuilder.genotypes(calledGenotype, truthGenotype) - w.add(ctxBuilder.make()) + val truthGenotype = truthCtx.gts.next().copy(sample=TruthSampleName) + + val ctxBuilder = calledCtx.copy(genotypes = Map(CalledSampleName -> calledGenotype, TruthSampleName -> truthGenotype)) + + w.write(ctxBuilder) case _ => () } } @@ -759,7 +766,7 @@ private[vcf] object PhaseCigar { } /** Returns an end operator if we have reached a new block. */ - private[vcf] def contextsToBlockEndOperator(truth: Option[VariantContext], call: Option[VariantContext]): Option[PhaseCigarOp] = (truth, call) match { + private[vcf] def contextsToBlockEndOperator(truth: Option[Variant], call: Option[Variant]): Option[PhaseCigarOp] = (truth, call) match { case (None, Some(c: VCtx)) => if (isStartOfPhaseBlock(c)) Some(CallEnd) else None case (Some(t: VCtx), None) => if (isStartOfPhaseBlock(t)) Some(TruthEnd) else None case (Some(t: VCtx), Some(c: VCtx)) => (isStartOfPhaseBlock(t), isStartOfPhaseBlock(c)) match { @@ -772,7 +779,7 @@ private[vcf] object PhaseCigar { } /** Returns the cigar for the two variant contexts. */ - private[vcf] def contextsToMatchingOperator(truth: Option[VariantContext], call: Option[VariantContext]): Option[PhaseCigarOp] = (truth, call) match { + private[vcf] def contextsToMatchingOperator(truth: Option[Variant], call: Option[Variant]): Option[PhaseCigarOp] = (truth, call) match { case (None, Some(c: VCtx)) => Some(CallOnly) case (Some(t: VCtx), None) => Some(TruthOnly) case (Some(t: VCtx), Some(c: VCtx)) => Some(cigarTypeForVariantContexts(t, c)) @@ -780,14 +787,14 @@ private[vcf] object PhaseCigar { } /** True if the phasing set id is the same as the start position of the given variant, false otherwise. */ - def isStartOfPhaseBlock(ctx: VariantContext): Boolean = ctx.getStart == AssessPhasing.getPhasingSetId(ctx) + def isStartOfPhaseBlock(ctx: Variant): Boolean = ctx.getStart == AssessPhasing.getPhasingSetId(ctx) /** Computes the cigar for two variant contexts. Returns [[Match]] if they share the same alleles in the same order, * [[Mismatch]] otherwise. */ - private[vcf] def cigarTypeForVariantContexts(truth: VariantContext, call: VariantContext): PhaseCigarOp = { - val truthAlleles = truth.getGenotype(0).getAlleles.toSeq - val calledAlleles = call.getGenotype(0).getAlleles.toSeq + private[vcf] def cigarTypeForVariantContexts(truth: Variant, call: Variant): PhaseCigarOp = { + val truthAlleles = truth.gts.next().calls.toSeq + val calledAlleles = call.gts.next().calls.toSeq require(truthAlleles.length == calledAlleles.length) require(truthAlleles.length == 2) if (truthAlleles.head != calledAlleles.head || truthAlleles.last != calledAlleles.last) PhaseCigarOp.Mismatch diff --git a/src/main/scala/com/fulcrumgenomics/vcf/ByIntervalListVariantContextIterator.scala b/src/main/scala/com/fulcrumgenomics/vcf/ByIntervalListVariantContextIterator.scala index 6424bc92b..7fcbee6dd 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/ByIntervalListVariantContextIterator.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/ByIntervalListVariantContextIterator.scala @@ -26,12 +26,10 @@ package com.fulcrumgenomics.vcf import java.util.NoSuchElementException - import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.fasta.SequenceDictionary +import com.fulcrumgenomics.vcf.api.{Variant, VcfSource} import htsjdk.samtools.util._ -import htsjdk.variant.variantcontext.VariantContext -import htsjdk.variant.vcf.VCFFileReader import scala.annotation.tailrec @@ -42,9 +40,9 @@ object ByIntervalListVariantContextIterator { * * All variants will be read in and compared to the intervals. */ - def apply(iterator: Iterator[VariantContext], + def apply(iterator: Iterator[Variant], intervalList: IntervalList, - dict: SequenceDictionary): Iterator[VariantContext] = { + dict: SequenceDictionary): Iterator[Variant] = { new OverlapDetectionVariantContextIterator(iterator, intervalList, dict) } @@ -56,29 +54,29 @@ object ByIntervalListVariantContextIterator { * The VCF will be queried when moving to the next variant context, and so may be quite slow * if we jump around the VCF a lot. */ - def apply(reader: VCFFileReader, - intervalList: IntervalList): Iterator[VariantContext] = { + def apply(reader: VcfSource, + intervalList: IntervalList): Iterator[Variant] = { new IndexQueryVariantContextIterator(reader, intervalList) } } -private class OverlapDetectionVariantContextIterator(val iter: Iterator[VariantContext], +private class OverlapDetectionVariantContextIterator(val iter: Iterator[Variant], val intervalList: IntervalList, val dict: SequenceDictionary) - extends Iterator[VariantContext] { + extends Iterator[Variant] { require(dict != null) private val intervals = intervalList.uniqued(false).iterator().buffered - private var nextVariantContext: Option[VariantContext] = None + private var nextVariant: Option[Variant] = None this.advance() - def hasNext: Boolean = this.nextVariantContext.isDefined + def hasNext: Boolean = this.nextVariant.isDefined - def next(): VariantContext = { - this.nextVariantContext match { + def next(): Variant = { + this.nextVariant match { case None => throw new NoSuchElementException("Called next when hasNext is false") - case Some(ctx) => yieldAndThen(ctx) { this.nextVariantContext = None; this.advance() } + case Some(ctx) => yieldAndThen(ctx) { this.nextVariant = None; this.advance() } } } @@ -108,16 +106,16 @@ private class OverlapDetectionVariantContextIterator(val iter: Iterator[VariantC } if (intervals.isEmpty) { } // no more intervals - else if (overlaps(ctx, intervals.head)) { nextVariantContext = Some(ctx) } // overlaps!!! + else if (overlaps(ctx, intervals.head)) { nextVariant = Some(ctx) } // overlaps!!! else if (iter.isEmpty) { } // no more variants else { this.advance() } // move to the next context } } /** NB: if a variant overlaps multiple intervals, only returns it once. */ -private class IndexQueryVariantContextIterator(private val reader: VCFFileReader, intervalList: IntervalList) - extends Iterator[VariantContext] { - private var iter: Option[Iterator[VariantContext]] = None +private class IndexQueryVariantContextIterator(private val reader: VcfSource, intervalList: IntervalList) + extends Iterator[Variant] { + private var iter: Option[Iterator[Variant]] = None private val intervals = intervalList.iterator() private var previousInterval: Option[Interval] = None @@ -127,7 +125,7 @@ private class IndexQueryVariantContextIterator(private val reader: VCFFileReader this.iter.exists(_.hasNext) } - def next(): VariantContext = { + def next(): Variant = { this.iter match { case None => throw new NoSuchElementException("Called next when hasNext is false") case Some(i) => yieldAndThen(i.next()) { advance() } @@ -136,10 +134,10 @@ private class IndexQueryVariantContextIterator(private val reader: VCFFileReader def remove(): Unit = throw new UnsupportedOperationException - private def overlapsInterval(ctx: VariantContext, interval: Interval): Boolean = { - if (!ctx.getContig.equals(interval.getContig)) false // different contig - else if (interval.getStart <= ctx.getStart && ctx.getStart <= interval.getEnd) true // start falls within this interval, count it - else if (ctx.getStart < interval.getStart && interval.getEnd <= ctx.getEnd) true // the variant encloses the interval + private def overlapsInterval(variant: Variant, interval: Interval): Boolean = { + if (!variant.getContig.equals(interval.getContig)) false // different contig + else if (interval.getStart <= variant.getStart && variant.getStart <= interval.getEnd) true // start falls within this interval, count it + else if (variant.getStart < interval.getStart && interval.getEnd <= variant.getEnd) true // the variant encloses the interval else false } diff --git a/src/main/scala/com/fulcrumgenomics/vcf/JointVariantContextIterator.scala b/src/main/scala/com/fulcrumgenomics/vcf/JointVariantContextIterator.scala index 90e0518bc..d82a2a162 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/JointVariantContextIterator.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/JointVariantContextIterator.scala @@ -25,10 +25,10 @@ package com.fulcrumgenomics.vcf import com.fulcrumgenomics.fasta.SequenceDictionary -import htsjdk.variant.variantcontext.{VariantContext, VariantContextComparator} +import com.fulcrumgenomics.vcf.api.Variant object JointVariantContextIterator { - def apply(iters: Seq[Iterator[VariantContext]], + def apply(iters: Seq[Iterator[Variant]], dict: SequenceDictionary ): JointVariantContextIterator = { new JointVariantContextIterator( @@ -37,8 +37,8 @@ object JointVariantContextIterator { ) } - def apply(iters: Seq[Iterator[VariantContext]], - comp: VariantContextComparator + def apply(iters: Seq[Iterator[Variant]], + comp: VariantComparator ): JointVariantContextIterator = { new JointVariantContextIterator( iters=iters, @@ -51,25 +51,25 @@ object JointVariantContextIterator { * Iterates over multiple variant context iterators such that we return a list of contexts for the union of sites * across the iterators. If samples is given, we subset each variant context to just that sample. */ -class JointVariantContextIterator private(iters: Seq[Iterator[VariantContext]], - dictOrComp: Either[SequenceDictionary, VariantContextComparator] +class JointVariantContextIterator private(iters: Seq[Iterator[Variant]], + dictOrComp: Either[SequenceDictionary, VariantComparator] ) -extends Iterator[Seq[Option[VariantContext]]] { - import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary +extends Iterator[Seq[Option[Variant]]] { if (iters.isEmpty) throw new IllegalArgumentException("No iterators given") private val iterators = iters.map(_.buffered) private val comparator = dictOrComp match { - case Left(dict) => new VariantContextComparator(dict.asSam) + case Left(dict) => VariantComparator(dict) case Right(comp) => comp } def hasNext: Boolean = iterators.exists(_.nonEmpty) - def next(): Seq[Option[VariantContext]] = { + def next(): Seq[Option[Variant]] = { val minCtx = iterators.filter(_.nonEmpty).map(_.head).sortWith { - case (left: VariantContext, right: VariantContext) => comparator.compare(left, right) < 0 + // this just checks that the variants are the the same position? Shouldn't be difficult to replace. + case (left: Variant, right: Variant) => comparator.compare(left, right) < 0 }.head // TODO: could use a TreeSet to store the iterators, examine the head of each iterator, then pop the iterator with the min, // and add that iterator back in. @@ -79,3 +79,21 @@ extends Iterator[Seq[Option[VariantContext]]] { } } } + +private object VariantComparator { + def apply(dict: SequenceDictionary): VariantComparator = { + new VariantComparator(dict) + } +} + +/** A class for comparing Variants using a sequence dictionary */ +private class VariantComparator(dict: SequenceDictionary) { + /** Function for comparing two variants. Returns negative if left < right, and positive if right > left + * To mimic the VariantContextComparator, throws an exception if the contig isn't found. */ + def compare(left: Variant, right: Variant): Int = { + val idx1 = this.dict(name=left.chrom).index + val idx2 = this.dict(name=right.chrom).index + if (idx1 - idx2 == 0) left.pos - right.pos + else idx1 - idx2 + } +} diff --git a/src/main/scala/com/fulcrumgenomics/vcf/MakeTwoSampleMixtureVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/MakeTwoSampleMixtureVcf.scala index b7b12790a..346655254 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/MakeTwoSampleMixtureVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/MakeTwoSampleMixtureVcf.scala @@ -25,12 +25,12 @@ package com.fulcrumgenomics.vcf import java.util - import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.util.Io import com.fulcrumgenomics.vcf.MakeTwoSampleMixtureVcf._ import com.fulcrumgenomics.sopt._ +import com.fulcrumgenomics.vcf.api.VcfSource import htsjdk.samtools.util.IntervalList import htsjdk.variant.variantcontext._ import htsjdk.variant.vcf.{VCFFilterHeaderLine, _} @@ -165,9 +165,13 @@ class MakeTwoSampleMixtureVcf /** Generates an iterator over the whole file, or over the intervals if provided. */ def iterator(in: VCFFileReader, intervals: Option[PathToIntervals]): Iterator[VariantContext] = { +// val tmpFile = Io.makeTempFile("temp patch", ".vcf") +// MakeMixtureVcf.makeWriter(tmpFile, in.getFileHeader, lines=Seq.empty, samples=in.getFileHeader.getSampleNamesInOrder.toSeq:_*) +// val tmpIn = VcfSource(tmpFile) intervals match { case None => in.iterator() - case Some(path) => ByIntervalListVariantContextIterator(in, IntervalList.fromFile(path.toFile).uniqued(false)) + case Some(path) => in.iterator() +// case Some(path) => ByIntervalListVariantContextIterator(in, IntervalList.fromFile(path.toFile).uniqued(false)) } } } diff --git a/src/main/scala/com/fulcrumgenomics/vcf/UpdateVcfContigNames.scala b/src/main/scala/com/fulcrumgenomics/vcf/UpdateVcfContigNames.scala index eb13d92e9..641c6c66a 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/UpdateVcfContigNames.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/UpdateVcfContigNames.scala @@ -30,6 +30,7 @@ import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.fasta.SequenceDictionary import com.fulcrumgenomics.sopt.{arg, clp} import com.fulcrumgenomics.util.{Io, ProgressLogger} +import com.fulcrumgenomics.vcf.api.{VcfContigHeader, VcfHeader, VcfSource, VcfWriter} import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriterBuilder} import htsjdk.variant.vcf.{VCFFileReader, VCFHeader} @@ -55,20 +56,19 @@ class UpdateVcfContigNames override def execute(): Unit = { val dict = SequenceDictionary(this.dict) - val reader = new VCFFileReader(this.input) + val reader = VcfSource(this.input) val header = { - import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary - val h: VCFHeader = new VCFHeader(reader.getFileHeader) - h.setSequenceDictionary(dict.asSam) - h + VcfHeader( + contigs=dict.map{l => VcfContigHeader(index=l.index, name=l.name, length=Some(l.length))}.toIndexedSeq, + infos=reader.header.infos, + formats=reader.header.formats, + filters=reader.header.filters, + others=reader.header.others, + samples=reader.header.samples + ) } - val writer = { - new VariantContextWriterBuilder() - .setOutputPath(this.output) - .setOption(Options.INDEX_ON_THE_FLY) - .build() - } - writer.writeHeader(header) + + val writer = VcfWriter(this.output, header) // go through all the records val progress = ProgressLogger(logger, noun = "variants", verb = "written") @@ -78,9 +78,9 @@ class UpdateVcfContigNames if (skipMissing) logger.warning(s"Did not find contig ${v.getContig} in the sequence dictionary.") else throw new IllegalStateException(s"Did not find contig ${v.getContig} in the sequence dictionary.") case Some(info) => - val newV = new VariantContextBuilder(v).chr(info.name).make() + val newV = v.copy(chrom=info.name) progress.record(newV.getContig, newV.getStart) - writer.add(newV) + writer.write(newV) } } progress.logLast() diff --git a/src/main/scala/com/fulcrumgenomics/vcf/VariantMask.scala b/src/main/scala/com/fulcrumgenomics/vcf/VariantMask.scala index 9b5c11846..55aad4c6e 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/VariantMask.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/VariantMask.scala @@ -26,25 +26,24 @@ package com.fulcrumgenomics.vcf import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.fasta.SequenceDictionary -import htsjdk.variant.variantcontext.VariantContext -import htsjdk.variant.vcf.VCFFileReader +import com.fulcrumgenomics.vcf.api.{Variant, VcfSource} import scala.collection.mutable object VariantMask { /** Generate a variant mask from the provided VCF. */ - def apply(path: PathToVcf): VariantMask = apply(new VCFFileReader(path.toFile)) + def apply(path: PathToVcf): VariantMask = apply(VcfSource(path)) /** Generate a variant mask from the provided VCF reader. */ - def apply(reader: VCFFileReader): VariantMask = { - import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary - val dict = reader.getFileHeader.getSequenceDictionary.fromSam + def apply(reader: VcfSource): VariantMask = { + val dict = reader.header.dict +// val dict = reader.getFileHeader.getSequenceDictionary.fromSam require(dict != null && dict.length > 0, "Generating a VariantMask requires VCFs contain contig lines.") - apply(reader.iterator(), dict) + apply(reader.iterator, dict) } /** Generates a VariantMask from the variants in the provided iterator. */ - def apply(variants: Iterator[VariantContext], dict: SequenceDictionary) = new VariantMask(variants, dict) + def apply(variants: Iterator[Variant], dict: SequenceDictionary) = new VariantMask(variants, dict) } /** @@ -55,7 +54,7 @@ object VariantMask { * @param variants a coordinate sorted iterator of variants * @param dict the sequence dictionary for the reference */ -class VariantMask(variants: Iterator[VariantContext], val dict: SequenceDictionary) { +class VariantMask(variants: Iterator[Variant], val dict: SequenceDictionary) { private val iterator = variants.bufferBetter private var currentMask: mutable.BitSet = new mutable.BitSet(0) private var currentIndex: Int = -1 @@ -74,7 +73,7 @@ class VariantMask(variants: Iterator[VariantContext], val dict: SequenceDictiona val bits = new mutable.BitSet(ref.length) iterator.dropWhile(v => dict(v.getContig).index < refIndex) .takeWhile(v => dict(v.getContig).index == refIndex) - .filterNot(v => v.isFiltered) +// .filterNot(v => v.isFiltered) TODO fix this .foreach { v => forloop (from=v.getStart, until=v.getEnd+1) { i => bits(i-1) = true } } diff --git a/src/test/scala/com/fulcrumgenomics/vcf/AssessPhasingTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/AssessPhasingTest.scala index a896f805d..d00c53a21 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/AssessPhasingTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/AssessPhasingTest.scala @@ -27,26 +27,22 @@ package com.fulcrumgenomics.vcf import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.commons.io.PathUtil import com.fulcrumgenomics.commons.util.NumericCounter -import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary import com.fulcrumgenomics.testing.VcfBuilder.Gt -import com.fulcrumgenomics.testing.{ErrorLogLevel, UnitSpec, VariantContextSetBuilder, VcfBuilder} +import com.fulcrumgenomics.testing.{ErrorLogLevel, UnitSpec, VcfBuilder} import com.fulcrumgenomics.util.Metric import com.fulcrumgenomics.vcf.PhaseCigar.IlluminaSwitchErrors +import com.fulcrumgenomics.vcf.api.{Variant, VcfCount, VcfFieldType, VcfFormatHeader, VcfHeader, VcfSource, VcfWriter} import htsjdk.samtools.SAMFileHeader import htsjdk.samtools.util.{Interval, IntervalList} -import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriterBuilder} -import htsjdk.variant.variantcontext.{GenotypeBuilder, VariantContext, VariantContextBuilder} -import htsjdk.variant.vcf.{VCFFileReader, VCFHeader} +import htsjdk.variant.vcf.VCFFileReader import java.nio.file.{Files, Paths} object AssessPhasingTest { - def withPhasingSetId(ctx: VariantContext, id: Int): VariantContext = { - val gBuilder = new GenotypeBuilder(ctx.getGenotype(0)) - gBuilder.attribute("PS", id) - val ctxBuilder = new VariantContextBuilder(ctx) - ctxBuilder.genotypes(gBuilder.make()) - ctxBuilder.make() + def withPhasingSetId(ctx: Variant, id: Int): Variant = { + val sgt = ctx.gts.next() + val gt = Map(sgt.sample -> ctx.gt(sgt.sample).copy(attrs = sgt.attrs ++ Map("PS" -> id))) + ctx.copy(genotypes=gt) } private val builderTruth = VcfBuilder(samples=Seq("S1")) @@ -90,10 +86,10 @@ object AssessPhasingTest { // NB: call has blocks lengths 4, 5, 1, and 13; truth has block lengths 4, 6, and 13. } - val readBuilderCall: VCFFileReader = new VCFFileReader(builderCall.toTempFile()) - val readBuilderTruth: VCFFileReader = new VCFFileReader(builderTruth.toTempFile()) + val readBuilderCall: VcfSource = builderCall.toSource + val readBuilderTruth: VcfSource = builderTruth.toSource - private def addPhaseSetId(ctx: VariantContext): VariantContext = { + private def addPhaseSetId(ctx: Variant): Variant = { if (ctx.getStart <= 10) withPhasingSetId(ctx, 1) else if (ctx.getStart <= 20) withPhasingSetId(ctx, 11) else if (ctx.getStart <= 29) withPhasingSetId(ctx, 21) @@ -101,23 +97,24 @@ object AssessPhasingTest { else unreachable("Not defined") } - lazy val TruthVariants: Seq[VariantContext] = { + lazy val TruthVariants: Seq[Variant] = { // Keep the truth variant position 13 without a phase set - readBuilderTruth.iterator().map { ctx => + readBuilderTruth.iterator.map { ctx => if (ctx.getStart == 13) ctx else addPhaseSetId(ctx) }.toSeq } - lazy val CallVariants: Seq[VariantContext] = { + lazy val CallVariants: Seq[Variant] = { // Keep the call variant position 14 without a phase set - readBuilderCall.iterator().map { ctx => + readBuilderCall.iterator.map { ctx => if (ctx.getStart == 14) ctx else addPhaseSetId(ctx) }.toSeq } - val Header = readBuilderCall.getFileHeader + val Header = readBuilderCall.header.copy( + formats = readBuilderCall.header.formats :+ VcfFormatHeader("PS", VcfCount.Fixed(1), VcfFieldType.Integer, "Phase set")) } /** @@ -126,17 +123,10 @@ object AssessPhasingTest { class AssessPhasingTest extends ErrorLogLevel { import AssessPhasingTest._ - private def writeVariants(variants: Seq[VariantContext]): PathToVcf = { + private def writeVariants(variants: Seq[Variant]): PathToVcf = { val path = Files.createTempFile("AssessPhasingTest.", ".vcf.gz") - path.toFile.deleteOnExit() - val writer = new VariantContextWriterBuilder() - .setReferenceDictionary(Header.getSequenceDictionary) - .setOutputFile(path.toFile) - .setOption(Options.INDEX_ON_THE_FLY) - .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER) - .build() - writer.writeHeader(Header) - variants.foreach(writer.add) + val writer = VcfWriter(path, Header) + writer.write(variants) writer.close() path } @@ -157,13 +147,15 @@ class AssessPhasingTest extends ErrorLogLevel { private def runEndToEnd(intervals: Option[PathToIntervals], expectedPrefix: PathPrefix, - truthVariants: Seq[VariantContext] = TruthVariants, - callVariants: Seq[VariantContext] = CallVariants, + truthVariants: Seq[Variant] = TruthVariants, + callVariants: Seq[Variant] = CallVariants, debugVcf: Boolean = false ): Unit = { - // input files + // input files ) val truthVcf = writeVariants(truthVariants) val callVcf = writeVariants(callVariants) + println(truthVcf) + println(callVcf) // output files val output = makeTempFile("AssessPhasingTest.", "prefix") @@ -173,7 +165,6 @@ class AssessPhasingTest extends ErrorLogLevel { // run it new AssessPhasing(truthVcf=truthVcf, calledVcf=callVcf, knownIntervals=intervals, output=output, debugVcf=debugVcf).execute() - // read the metrics val phasingMetrics = Metric.read[AssessPhasingMetric](path=phasingMetricsPath) val blockLengthMetrics = Metric.read[PhaseBlockLengthMetric](path=blockLengthMetricsPath) @@ -187,6 +178,8 @@ class AssessPhasingTest extends ErrorLogLevel { val expectedBlockLengthMetrics = Metric.read[PhaseBlockLengthMetric](path=expectedBlockLengthMetricsPath) // compare the metrics + println(phasingMetrics) + println(expectedPhasingMetrics) phasingMetrics should contain theSameElementsInOrderAs expectedPhasingMetrics blockLengthMetrics should contain theSameElementsInOrderAs expectedBlockLengthMetrics @@ -204,9 +197,9 @@ class AssessPhasingTest extends ErrorLogLevel { private def toIntervalList(start: Int, end: Int): PathToIntervals = { val path = makeTempFile("AssessPhasingTest.", ".interval_list") - val contig = Header.getSequenceDictionary.getSequence(0).getSequenceName + val contig = Header.dict(0).name val header = new SAMFileHeader - header.setSequenceDictionary(Header.getSequenceDictionary) + header.setSequenceDictionary(Header.dict.toSam) val intervalList = new IntervalList(header) intervalList.add(new Interval(contig, 1, 10)) intervalList.write(path.toFile) @@ -301,21 +294,19 @@ class PhaseBlockTest extends ErrorLogLevel { import AssessPhasingTest.withPhasingSetId "PhaseBlock.toOverlapDetector" should "create an empty detector if no variants are given" in { - val builder = new VCFFileReader(VcfBuilder(samples=Seq("s1")).toTempFile()) - PhaseBlock.buildOverlapDetector(iterator=builder.iterator, dict=builder.getFileHeader.getSequenceDictionary.fromSam).getAll.isEmpty shouldBe true + val builder = VcfBuilder(samples=Seq("s1")) + PhaseBlock.buildOverlapDetector(iterator=builder.iterator, dict=builder.header.dict).getAll.isEmpty shouldBe true } it should "create an empty detector when variants do not have the phase set tag" in { val builder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) - val contextBuilder = new VCFFileReader(builder.toTempFile()) - PhaseBlock.buildOverlapDetector(iterator=contextBuilder.iterator, dict=contextBuilder.getFileHeader.getSequenceDictionary.fromSam).getAll.isEmpty shouldBe true + PhaseBlock.buildOverlapDetector(iterator=builder.iterator, dict=builder.header.dict).getAll.isEmpty shouldBe true } it should "create a detector for a single variant" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) val iterator = builder.iterator.map { ctx => withPhasingSetId(ctx, 1) } - val detector = PhaseBlock.buildOverlapDetector(iterator=iterator, dict=builder.getFileHeader.getSequenceDictionary.fromSam) + val detector = PhaseBlock.buildOverlapDetector(iterator=iterator, dict=builder.header.dict) detector.getAll should have size 1 val interval = detector.getAll.toSeq.head interval.getStart shouldBe 1 @@ -323,13 +314,12 @@ class PhaseBlockTest extends ErrorLogLevel { } it should "create a detector from multiple variants within one block" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - vcfBuilder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilder.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) - val iterator = builder.iterator().map { ctx => withPhasingSetId(ctx, 1) } - val detector = PhaseBlock.buildOverlapDetector(iterator=iterator, dict=builder.getFileHeader.getSequenceDictionary.fromSam) + val builder = VcfBuilder(samples=Seq("s1")) + builder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builder.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val iterator = builder.iterator.map { ctx => withPhasingSetId(ctx, 1) } + val detector = PhaseBlock.buildOverlapDetector(iterator=iterator, dict=builder.header.dict) detector.getAll should have size 1 val interval = detector.getAll.toSeq.head interval.getStart shouldBe 1 @@ -337,20 +327,18 @@ class PhaseBlockTest extends ErrorLogLevel { } it should "create a detector from multiple variants across multiple blocks" in { - val vcfBuilderBlockOne = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockOne.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockOne.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val vcfBuilderBlockTwo = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockTwo.add(pos=4, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockTwo.add(pos=5, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockTwo.add(pos=6, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - - val builderBlockOne = new VCFFileReader(vcfBuilderBlockOne.toTempFile()) - val builderBlockTwo = new VCFFileReader(vcfBuilderBlockTwo.toTempFile()) - val iterator = builderBlockOne.iterator().map { ctx => withPhasingSetId(ctx, 1) } ++ builderBlockTwo.iterator.map { ctx => withPhasingSetId(ctx, 4) } - - val detector = PhaseBlock.buildOverlapDetector(iterator=iterator, dict=builderBlockOne.getFileHeader.getSequenceDictionary.fromSam) + val builderBlockOne = VcfBuilder(samples=Seq("s1")) + builderBlockOne.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockOne.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val builderBlockTwo = VcfBuilder(samples=Seq("s1")) + builderBlockTwo.add(pos=4, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockTwo.add(pos=5, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockTwo.add(pos=6, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + + val iterator = builderBlockOne.iterator.map { ctx => withPhasingSetId(ctx, 1) } ++ builderBlockTwo.iterator.map { ctx => withPhasingSetId(ctx, 4) } + + val detector = PhaseBlock.buildOverlapDetector(iterator=iterator, dict=builderBlockOne.header.dict) detector.getAll should have size 2 val intervals = detector.getAll.toSeq.sortBy(_.getStart) val head = intervals.head @@ -362,22 +350,20 @@ class PhaseBlockTest extends ErrorLogLevel { } it should "keep the larger block when one block encloses/contains another" in { - val vcfBuilderBlockOne = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockOne.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockOne.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockOne = new VCFFileReader(vcfBuilderBlockOne.toTempFile()) + val builderBlockOne = VcfBuilder(samples=Seq("s1")) + builderBlockOne.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockOne.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) // second fully contained in the first { - val vcfBuilderBlockTwo = VcfBuilder(samples=Seq("S1")).add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockTwo = new VCFFileReader(vcfBuilderBlockTwo.toTempFile()) - val contexts = (builderBlockOne.iterator().map { ctx => withPhasingSetId(ctx, 1) } ++ builderBlockTwo.iterator().map { ctx => withPhasingSetId(ctx, 2) }).toSeq + val builderBlockTwo = VcfBuilder(samples=Seq("S1")).add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val contexts = (builderBlockOne.iterator.map { ctx => withPhasingSetId(ctx, 1) } ++ builderBlockTwo.iterator.map { ctx => withPhasingSetId(ctx, 2) }).toSeq // Check that if we do not want to modify the blocks we get an exception - an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam, modifyBlocks = false) + an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict, modifyBlocks = false) - val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam) + val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict) detector.getAll should have size 1 val intervals = detector.getAll.toSeq.sortBy(_.getStart) val head = intervals.head @@ -387,15 +373,14 @@ class PhaseBlockTest extends ErrorLogLevel { // first fully contained in the second { - val vcfBuilderBlockTwo = VcfBuilder(samples=Seq("s1")).add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockTwo = new VCFFileReader(vcfBuilderBlockTwo.toTempFile()) + val builderBlockTwo = VcfBuilder(samples=Seq("s1")).add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val contexts = (builderBlockTwo.iterator().map { ctx => withPhasingSetId(ctx, 2) } ++ builderBlockOne.iterator().map { ctx => withPhasingSetId(ctx, 1) }).toSeq + val contexts = (builderBlockTwo.iterator.map { ctx => withPhasingSetId(ctx, 2) } ++ builderBlockOne.iterator.map { ctx => withPhasingSetId(ctx, 1) }).toSeq // Check that if we do not want to modify the blocks we get an exception - an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam, modifyBlocks = false) + an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict, modifyBlocks = false) - val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam) + val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict) detector.getAll should have size 1 val intervals = detector.getAll.toSeq.sortBy(_.getStart) val head = intervals.head @@ -407,36 +392,34 @@ class PhaseBlockTest extends ErrorLogLevel { { val contexts = builderBlockOne.iterator.map { ctx => withPhasingSetId(ctx, 1) }.toSeq - val detector = PhaseBlock.buildOverlapDetector(iterator = (contexts ++ contexts).iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam) + val detector = PhaseBlock.buildOverlapDetector(iterator = (contexts ++ contexts).iterator, dict = builderBlockOne.header.dict) detector.getAll should have size 1 val intervals = detector.getAll.toSeq.sortBy(_.getStart) - val head = intervals.head + val head = intervals.head head.getStart shouldBe 1 head.getEnd shouldBe 3 } } it should "truncate the smaller block when too blocks overlap" in { - val vcfBuilderBlockOne = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockOne.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockOne.add(pos=4, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockOne = new VCFFileReader(vcfBuilderBlockOne.toTempFile()) + val builderBlockOne = VcfBuilder(samples=Seq("s1")) + builderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockOne.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockOne.add(pos=4, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) // first block is extended { - val vcfBuilderBlockTwo = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockTwo.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockTwo.add(pos=4, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockTwo.add(pos=5, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockTwo = new VCFFileReader(vcfBuilderBlockTwo.toTempFile()) + val builderBlockTwo = VcfBuilder(samples=Seq("s1")) + builderBlockTwo.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockTwo.add(pos=4, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockTwo.add(pos=5, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val contexts = (builderBlockOne.iterator().map { ctx => withPhasingSetId(ctx, 2) } ++ builderBlockTwo.iterator().map { ctx => withPhasingSetId(ctx, 3) }).toSeq + val contexts = (builderBlockOne.iterator.map { ctx => withPhasingSetId(ctx, 2) } ++ builderBlockTwo.iterator.map { ctx => withPhasingSetId(ctx, 3) }).toSeq // Check that if we do not want to modify the blocks we get an exception - an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam, modifyBlocks = false) + an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict, modifyBlocks = false) - val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam) + val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict) detector.getAll should have size 2 val intervals = detector.getAll.toSeq.sortBy(_.getStart) val head = intervals.head @@ -449,19 +432,18 @@ class PhaseBlockTest extends ErrorLogLevel { // second block is extended { - val vcfBuilderBlockTwo = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockTwo.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockTwo.add(pos=4, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockTwo.add(pos=5, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockTwo.add(pos=6, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockTwo = new VCFFileReader(vcfBuilderBlockTwo.toTempFile()) + val builderBlockTwo = VcfBuilder(samples=Seq("s1")) + builderBlockTwo.add(pos=3, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockTwo.add(pos=4, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockTwo.add(pos=5, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockTwo.add(pos=6, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val contexts = (builderBlockOne.iterator().map { ctx => withPhasingSetId(ctx, 2) } ++ builderBlockTwo.iterator().map { ctx => withPhasingSetId(ctx, 3) }).toSeq + val contexts = (builderBlockOne.iterator.map { ctx => withPhasingSetId(ctx, 2) } ++ builderBlockTwo.iterator.map { ctx => withPhasingSetId(ctx, 3) }).toSeq // Check that if we do not want to modify the blocks we get an exception - an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam, modifyBlocks = false) + an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict, modifyBlocks = false) - val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam) + val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict) detector.getAll should have size 2 val intervals = detector.getAll.toSeq.sortBy(_.getStart) val head = intervals.head @@ -474,30 +456,27 @@ class PhaseBlockTest extends ErrorLogLevel { } it should "resolve three overlapping blocks, such that when the middle one is truncated and now starts after the third, it is resolved" in { - val vcfBuilderBlockOne = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockOne.add(pos=10, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockOne = new VCFFileReader(vcfBuilderBlockOne.toTempFile()) - val oneIter = builderBlockOne.iterator().map { ctx => withPhasingSetId(ctx, 2) } - - val vcfBuilderBlockTwo = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockTwo.add(pos=8, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockTwo.add(pos=14, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockTwo = new VCFFileReader(vcfBuilderBlockTwo.toTempFile()) - val twoIter = builderBlockTwo.iterator().map { ctx => withPhasingSetId(ctx, 3) } - - val vcfBuilderBlockThree = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockThree.add(pos=9, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockThree.add(pos=13, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockThree = new VCFFileReader(vcfBuilderBlockThree.toTempFile()) - val threeIter = builderBlockThree.iterator().map { ctx => withPhasingSetId(ctx, 4) } + val builderBlockOne = VcfBuilder(samples=Seq("s1")) + builderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockOne.add(pos=10, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val oneIter = builderBlockOne.iterator.map { ctx => withPhasingSetId(ctx, 2) } + + val builderBlockTwo = VcfBuilder(samples=Seq("s1")) + builderBlockTwo.add(pos=8, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockTwo.add(pos=14, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val twoIter = builderBlockTwo.iterator.map { ctx => withPhasingSetId(ctx, 3) } + + val builderBlockThree = VcfBuilder(samples=Seq("s1")) + builderBlockThree.add(pos=9, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockThree.add(pos=13, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val threeIter = builderBlockThree.iterator.map { ctx => withPhasingSetId(ctx, 4) } val contexts = (oneIter ++ twoIter ++ threeIter).toSeq // Check that if we do not want to modify the blocks we get an exception - an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam, modifyBlocks = false) + an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict, modifyBlocks = false) - val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam) + val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict) detector.getAll should have size 3 val intervals = detector.getAll.toSeq.sortBy(_.getStart) @@ -517,30 +496,27 @@ class PhaseBlockTest extends ErrorLogLevel { } it should "resolve three overlapping blocks, such that when the middle one is truncated and now is enclosed in the third, we get two blocks" in { - val vcfBuilderBlockOne = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockOne.add(pos=10, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockOne = new VCFFileReader(vcfBuilderBlockOne.toTempFile()) - val oneIter = builderBlockOne.iterator().map { ctx => withPhasingSetId(ctx, 2) } - - val vcfBuilderBlockTwo = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockTwo.add(pos=8, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockTwo.add(pos=13, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockTwo = new VCFFileReader(vcfBuilderBlockTwo.toTempFile()) - val twoIter = builderBlockTwo.iterator().map { ctx => withPhasingSetId(ctx, 3) } - - val vcfBuilderBlockThree = VcfBuilder(samples=Seq("s1")) - vcfBuilderBlockThree.add(pos=9, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderBlockThree.add(pos=13, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderBlockThree = new VCFFileReader(vcfBuilderBlockThree.toTempFile()) - val threeIter = builderBlockThree.iterator().map { ctx => withPhasingSetId(ctx, 4) } + val builderBlockOne = VcfBuilder(samples=Seq("s1")) + builderBlockOne.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockOne.add(pos=10, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val oneIter = builderBlockOne.iterator.map { ctx => withPhasingSetId(ctx, 2) } + + val builderBlockTwo = VcfBuilder(samples=Seq("s1")) + builderBlockTwo.add(pos=8, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockTwo.add(pos=13, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val twoIter = builderBlockTwo.iterator.map { ctx => withPhasingSetId(ctx, 3) } + + val builderBlockThree = VcfBuilder(samples=Seq("s1")) + builderBlockThree.add(pos=9, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderBlockThree.add(pos=13, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val threeIter = builderBlockThree.iterator.map { ctx => withPhasingSetId(ctx, 4) } val contexts = (oneIter ++ twoIter ++ threeIter).toSeq // Check that if we do not want to modify the blocks we get an exception - an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam, modifyBlocks = false) + an[Exception] should be thrownBy PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict, modifyBlocks = false) - val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.getFileHeader.getSequenceDictionary.fromSam) + val detector = PhaseBlock.buildOverlapDetector(iterator = contexts.iterator, dict = builderBlockOne.header.dict) detector.getAll should have size 2 val intervals = detector.getAll.toSeq.sortBy(_.getStart) @@ -560,12 +536,12 @@ class PhaseCigarTest extends ErrorLogLevel { import AssessPhasingTest._ import PhaseCigarOp._ - private def toCigar(truth: Seq[VariantContext], - call: Seq[VariantContext], - header: VCFHeader, + private def toCigar(truth: Seq[Variant], + call: Seq[Variant], + header: VcfHeader, skipMismatchingAlleles: Boolean, assumeFixedAlleleOrder: Boolean = true): (Seq[PhaseCigarOp], AssessPhasingMetric) = { - val dict = header.getSequenceDictionary.fromSam + val dict = header.dict val pairedIterator = JointVariantContextIterator( iters = Seq(truth.iterator, call.iterator), dict = dict @@ -589,13 +565,12 @@ class PhaseCigarTest extends ErrorLogLevel { } "Cigar.toCigar" should "create an empty cigar if no variants have a phasing set" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) - val ctx = builder.iterator().next() + val builder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val ctx = builder.iterator.next() // truth variant only { - val (cigar, metric) = toCigar(truth = Seq(ctx), call = Seq.empty, header = builder.getFileHeader, skipMismatchingAlleles = true) + val (cigar, metric) = toCigar(truth = Seq(ctx), call = Seq.empty, header = builder.header, skipMismatchingAlleles = true) cigar should contain theSameElementsInOrderAs Seq(BothEnd, BothEnd) metric.num_called shouldBe 0 metric.num_truth shouldBe 1 @@ -604,7 +579,7 @@ class PhaseCigarTest extends ErrorLogLevel { // call variant only { - val (cigar, metric) = toCigar(truth = Seq.empty, call = Seq(ctx), header = builder.getFileHeader, skipMismatchingAlleles = true) + val (cigar, metric) = toCigar(truth = Seq.empty, call = Seq(ctx), header = builder.header, skipMismatchingAlleles = true) cigar should contain theSameElementsInOrderAs Seq(BothEnd, BothEnd) metric.num_called shouldBe 1 metric.num_phased shouldBe 0 @@ -613,13 +588,12 @@ class PhaseCigarTest extends ErrorLogLevel { } it should "create a cigar from either a single truth or call variant" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) - val ctx = withPhasingSetId(builder.iterator().next(), 1) + val builder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val ctx = withPhasingSetId(builder.iterator.next(), 1) // truth variant only { - val (cigar, metric) = toCigar(truth = Seq(ctx), call = Seq.empty, header = builder.getFileHeader, skipMismatchingAlleles = true) + val (cigar, metric) = toCigar(truth = Seq(ctx), call = Seq.empty, header = builder.header, skipMismatchingAlleles = true) cigar should contain theSameElementsInOrderAs Seq(BothEnd, TruthOnly, BothEnd) metric.num_called shouldBe 0 metric.num_truth shouldBe 1 @@ -628,7 +602,7 @@ class PhaseCigarTest extends ErrorLogLevel { // call variant only { - val (cigar, metric) = toCigar(truth = Seq.empty, call = Seq(ctx), header = builder.getFileHeader, skipMismatchingAlleles = true) + val (cigar, metric) = toCigar(truth = Seq.empty, call = Seq(ctx), header = builder.header, skipMismatchingAlleles = true) cigar should contain theSameElementsInOrderAs Seq(BothEnd, CallOnly, BothEnd) metric.num_called shouldBe 1 metric.num_phased shouldBe 1 @@ -637,14 +611,13 @@ class PhaseCigarTest extends ErrorLogLevel { } it should "create a cigar when both truth and call variants are present but only of the two are phased" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) - val ctxPhased = withPhasingSetId(builder.iterator().next(), 1) - val ctxNoPhase = builder.iterator().next() + val builder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val ctxPhased = withPhasingSetId(builder.iterator.next(), 1) + val ctxNoPhase = builder.iterator.next() // truth variant phased only { - val (cigar, metric) = toCigar(truth = Seq(ctxPhased), call = Seq(ctxNoPhase), header = builder.getFileHeader, skipMismatchingAlleles = true) + val (cigar, metric) = toCigar(truth = Seq(ctxPhased), call = Seq(ctxNoPhase), header = builder.header, skipMismatchingAlleles = true) cigar should contain theSameElementsInOrderAs Seq(BothEnd, TruthOnly, BothEnd) metric.num_called shouldBe 1 metric.num_phased shouldBe 0 @@ -654,7 +627,7 @@ class PhaseCigarTest extends ErrorLogLevel { // call variant phased only { - val (cigar, metric) = toCigar(truth = Seq(ctxNoPhase), call = Seq(ctxPhased), header = builder.getFileHeader, skipMismatchingAlleles = true) + val (cigar, metric) = toCigar(truth = Seq(ctxNoPhase), call = Seq(ctxPhased), header = builder.header, skipMismatchingAlleles = true) cigar should contain theSameElementsInOrderAs Seq(BothEnd, CallOnly, BothEnd) metric.num_called shouldBe 1 metric.num_phased shouldBe 1 @@ -664,13 +637,12 @@ class PhaseCigarTest extends ErrorLogLevel { } it should "create a cigar when both truth and call variants are present and both are phased" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) - val ctx = withPhasingSetId(builder.iterator().next(), 1) + val builder = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val ctx = withPhasingSetId(builder.iterator.next(), 1) // both variants are phased { - val (cigar, metric) = toCigar(truth = Seq(ctx), call = Seq(ctx), header = builder.getFileHeader, skipMismatchingAlleles = true) + val (cigar, metric) = toCigar(truth = Seq(ctx), call = Seq(ctx), header = builder.header, skipMismatchingAlleles = true) cigar should contain theSameElementsInOrderAs Seq(BothEnd, Match, BothEnd) metric.num_called shouldBe 1 metric.num_phased shouldBe 1 @@ -680,18 +652,16 @@ class PhaseCigarTest extends ErrorLogLevel { } // split into three different unit tests can put 683-693 into different function, and put three subtests in separate tests it should "create a cigar when both truth and call variants are present and both are phased but mismatch alleles" in { - val vcfBuilderTruth = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderTruth = new VCFFileReader(vcfBuilderTruth.toTempFile()) + val builderTruth = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val vcfBuilderCall = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1|0"))) - val builderCall = new VCFFileReader(vcfBuilderCall.toTempFile()) + val builderCall = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1|0"))) - val truth = withPhasingSetId(builderTruth.iterator().next(), 1) - val call = withPhasingSetId(builderCall.iterator().next(), 1) + val truth = withPhasingSetId(builderTruth.iterator.next(), 1) + val call = withPhasingSetId(builderCall.iterator.next(), 1) // both variants are phased, phase is inverted, and we assume a fixed order, so a mismatch { - val (cigar, metric) = toCigar(truth = Seq(truth), call = Seq(call), header = builderTruth.getFileHeader, skipMismatchingAlleles = true, assumeFixedAlleleOrder = true) + val (cigar, metric) = toCigar(truth = Seq(truth), call = Seq(call), header = builderTruth.header, skipMismatchingAlleles = true, assumeFixedAlleleOrder = true) cigar should contain theSameElementsInOrderAs Seq(BothEnd, Mismatch, BothEnd) metric.num_called shouldBe 1 metric.num_phased shouldBe 1 @@ -700,7 +670,7 @@ class PhaseCigarTest extends ErrorLogLevel { } // both variants are phased, phase is inverted, and we don't assume a fixed order, so a match { - val (cigar, metric) = toCigar(truth = Seq(truth), call = Seq(call), header = builderTruth.getFileHeader, skipMismatchingAlleles = true, assumeFixedAlleleOrder = false) + val (cigar, metric) = toCigar(truth = Seq(truth), call = Seq(call), header = builderTruth.header, skipMismatchingAlleles = true, assumeFixedAlleleOrder = false) cigar should contain theSameElementsInOrderAs Seq(BothEnd, Match, BothEnd) metric.num_called shouldBe 1 metric.num_phased shouldBe 1 @@ -709,14 +679,12 @@ class PhaseCigarTest extends ErrorLogLevel { } // first site is a match, second is a mismatch since we inverted after the first { - vcfBuilderTruth.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilderCall.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderTruth = new VCFFileReader(vcfBuilderTruth.toTempFile()) - val builderCall = new VCFFileReader(vcfBuilderCall.toTempFile()) - val truthTwo = withPhasingSetId(builderTruth.iterator().toSeq.last, 1) - val callTwo = withPhasingSetId(builderCall.iterator().toSeq.last, 1) - - val (cigar, metric) = toCigar(truth = Seq(truth, truthTwo), call = Seq(call, callTwo), header = builderTruth.getFileHeader, skipMismatchingAlleles = true, assumeFixedAlleleOrder = false) + builderTruth.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builderCall.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val truthTwo = withPhasingSetId(builderTruth.iterator.toSeq.last, 1) + val callTwo = withPhasingSetId(builderCall.iterator.toSeq.last, 1) + + val (cigar, metric) = toCigar(truth = Seq(truth, truthTwo), call = Seq(call, callTwo), header = builderTruth.header, skipMismatchingAlleles = true, assumeFixedAlleleOrder = false) cigar should contain theSameElementsInOrderAs Seq(BothEnd, Match, Mismatch, BothEnd) metric.num_called shouldBe 2 metric.num_phased shouldBe 2 @@ -726,18 +694,15 @@ class PhaseCigarTest extends ErrorLogLevel { } it should "skip sites where alleles mismatch if specified" in { - val vcfBuilderTruth = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderTruth = new VCFFileReader(vcfBuilderTruth.toTempFile()) - - val vcfBuilderCall = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "G"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builderCall = new VCFFileReader(vcfBuilderCall.toTempFile()) + val builderTruth = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val builderCall = VcfBuilder(samples=Seq("s1")).add(pos=1, alleles=Seq("A", "G"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val truth = withPhasingSetId(builderTruth.iterator().next(), 1) - val call = withPhasingSetId(builderCall.iterator().next(), 1) + val truth = withPhasingSetId(builderTruth.iterator.next(), 1) + val call = withPhasingSetId(builderCall.iterator.next(), 1) // skip the sites that have mismatching alleles { - val (cigar, metric) = toCigar(truth = Seq(truth), call = Seq(call), header = builderTruth.getFileHeader, skipMismatchingAlleles = true) + val (cigar, metric) = toCigar(truth = Seq(truth), call = Seq(call), header = builderTruth.header, skipMismatchingAlleles = true) cigar should contain theSameElementsInOrderAs Seq(BothEnd, BothEnd) metric.num_called shouldBe 0 metric.num_truth shouldBe 0 @@ -745,7 +710,7 @@ class PhaseCigarTest extends ErrorLogLevel { // include the sites that have mismatching alleles { - val (cigar, metric) = toCigar(truth = Seq(truth), call = Seq(call), header = builderTruth.getFileHeader, skipMismatchingAlleles = false) + val (cigar, metric) = toCigar(truth = Seq(truth), call = Seq(call), header = builderTruth.header, skipMismatchingAlleles = false) cigar should contain theSameElementsInOrderAs Seq(BothEnd, Mismatch, BothEnd) metric.num_called shouldBe 1 metric.num_phased shouldBe 1 @@ -965,12 +930,11 @@ class PhaseCigarTest extends ErrorLogLevel { } "Cigar.contextsToBlockEndOperator/contextsToMatchingOperator" should "should return the cigar operator for two variant contexts" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - vcfBuilder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) - val ctxStart = withPhasingSetId(builder.iterator().toSeq.head, 1) - val ctxNoStart = withPhasingSetId(builder.iterator().toSeq.last, 1) + val builder = VcfBuilder(samples=Seq("s1")) + builder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + val ctxStart = withPhasingSetId(builder.iterator.toSeq.head, 1) + val ctxNoStart = withPhasingSetId(builder.iterator.toSeq.last, 1) // No truth, call is start of a phase block PhaseCigar.contextsToBlockEndOperator(truth=None, call=Some(ctxStart)) shouldBe Some(CallEnd) @@ -1006,10 +970,9 @@ class PhaseCigarTest extends ErrorLogLevel { } it should "should return the cigar operator for two variant contexts that disagree on phase" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - vcfBuilder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) - vcfBuilder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1|0"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()).iterator().toSeq + val builder = VcfBuilder(samples=Seq("s1")) + builder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0|1"))) + builder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1|0"))) val ctx = withPhasingSetId(builder.head, 1) val ctxMismatch = withPhasingSetId(builder.last, 2) @@ -1023,28 +986,25 @@ class PhaseCigarTest extends ErrorLogLevel { } "Cigar.cigarForVariantContexts" should "return a match if two variant contexts share the same alleles in the same order" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - vcfBuilder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) - vcfBuilder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()).iterator().toSeq + val builder = VcfBuilder(samples=Seq("s1")) + builder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) + builder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) PhaseCigar.cigarTypeForVariantContexts(builder.head, builder.last) shouldBe Match PhaseCigar.cigarTypeForVariantContexts(builder.last, builder.head) shouldBe Match } it should "return a mismatch if two variant contexts share the same alleles but in the different order" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - vcfBuilder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) - vcfBuilder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1/0"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()).iterator().toSeq + val builder = VcfBuilder(samples=Seq("s1")) + builder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) + builder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1/0"))) PhaseCigar.cigarTypeForVariantContexts(builder.head, builder.last) shouldBe Mismatch PhaseCigar.cigarTypeForVariantContexts(builder.last, builder.head) shouldBe Mismatch } it should "return a mismatch if two variant contexts share the different alleles" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - vcfBuilder.add(pos=1, alleles=Seq("A", "G"), gts=Seq(Gt(sample="s1", gt="0/1"))) - vcfBuilder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()).iterator().toSeq + val builder = VcfBuilder(samples=Seq("s1")) + builder.add(pos=1, alleles=Seq("A", "G"), gts=Seq(Gt(sample="s1", gt="0/1"))) + builder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) PhaseCigar.cigarTypeForVariantContexts(builder.head, builder.last) shouldBe Mismatch PhaseCigar.cigarTypeForVariantContexts(builder.last, builder.head) shouldBe Mismatch } @@ -1052,19 +1012,17 @@ class PhaseCigarTest extends ErrorLogLevel { it should "throw an exception if the variant contexts do not have exactly two alleles" in { // One variant alleles, one genotype allele { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - vcfBuilder.add(pos=1, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0"))) - vcfBuilder.add(pos=2, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()).iterator().toSeq + val builder = VcfBuilder(samples=Seq("s1")) + builder.add(pos=1, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0"))) + builder.add(pos=2, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0"))) an[Exception] should be thrownBy PhaseCigar.cigarTypeForVariantContexts(builder.head, builder.last) } // Two variant alleles, one genotype allele { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - vcfBuilder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1"))) - vcfBuilder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()).iterator().toSeq + val builder = VcfBuilder(samples=Seq("s1")) + builder.add(pos=1, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1"))) + builder.add(pos=2, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="0/1"))) an[Exception] should be thrownBy PhaseCigar.cigarTypeForVariantContexts(builder.head, builder.last) an[Exception] should be thrownBy PhaseCigar.cigarTypeForVariantContexts(builder.last, builder.head) } diff --git a/src/test/scala/com/fulcrumgenomics/vcf/ByIntervalListVariantContextIteratorTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/ByIntervalListVariantContextIteratorTest.scala index 8a625229e..092af8e74 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/ByIntervalListVariantContextIteratorTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/ByIntervalListVariantContextIteratorTest.scala @@ -24,13 +24,11 @@ package com.fulcrumgenomics.vcf -import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.testing.VcfBuilder.Gt import com.fulcrumgenomics.testing.{UnitSpec, VcfBuilder} +import com.fulcrumgenomics.vcf.api.{Variant, VcfSource} import htsjdk.samtools.SAMFileHeader import htsjdk.samtools.util.{Interval, IntervalList} -import htsjdk.variant.variantcontext.VariantContext -import htsjdk.variant.vcf.VCFFileReader /** * Tests for ByIntervalListVariantContextIterator. @@ -44,49 +42,45 @@ class ByIntervalListVariantContextIteratorTest extends UnitSpec { new IntervalList(header) } - private def toIterator(reader: VCFFileReader, + private def toIterator(reader: VcfSource, intervalList: IntervalList, - useIndex: Boolean): Iterator[VariantContext] = { + useIndex: Boolean): Iterator[Variant] = { if (useIndex) { ByIntervalListVariantContextIterator(reader=reader, intervalList=intervalList) } else { - import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary - val dict = reader.getFileHeader.getSequenceDictionary.fromSam - ByIntervalListVariantContextIterator(iterator=reader.iterator(), intervalList=intervalList, dict=dict) + val dict = reader.header.dict + ByIntervalListVariantContextIterator(iterator=reader.iterator, intervalList=intervalList, dict=dict) } } "ByIntervalListVariantContextIterator" should "return no variants if the interval list is empty" in { Iterator(true, false).foreach { useIndex => - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=1, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) - val iterator = toIterator(reader=builder, intervalList=emtpyIntervalList(), useIndex=useIndex) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=1, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) + val iterator = toIterator(reader=builder.toSource, intervalList=emtpyIntervalList(), useIndex=useIndex) iterator.isEmpty shouldBe true } } it should "return no variants if the variants are empty" in { Iterator(true, false).foreach { useIndex => - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")) val intervalList = emtpyIntervalList() intervalList.add(new Interval(dict.getSequence(0).getSequenceName, 1, 1000, false, "foo")) - val iterator = toIterator(reader=builder, intervalList=emtpyIntervalList(), useIndex=useIndex) + val iterator = toIterator(reader=builder.toSource, intervalList=emtpyIntervalList(), useIndex=useIndex) iterator.isEmpty shouldBe true } } it should "return a variant context if it overlaps an interval" in { Iterator(true, false).foreach { useIndex => - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=500, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=500, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) val intervalList = emtpyIntervalList() intervalList.add(new Interval(dict.getSequence(0).getSequenceName, 1, 1000, false, "foo")) - val iterator = toIterator(reader=builder, intervalList=intervalList, useIndex=useIndex) + val iterator = toIterator(reader=builder.toSource, intervalList=intervalList, useIndex=useIndex) iterator.isEmpty shouldBe false val actual = iterator.next() - val expected = builder.iterator().next() + val expected = builder.iterator.next() actual.getContig shouldBe expected.getContig actual.getStart shouldBe expected.getStart actual.getEnd shouldBe expected.getEnd @@ -96,31 +90,28 @@ class ByIntervalListVariantContextIteratorTest extends UnitSpec { it should "not return a variant context if it doesn't overlap an interval (same chromosome)" in { Iterator(true, false).foreach { useIndex => - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=500, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=500, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) val intervalList = emtpyIntervalList() intervalList.add(new Interval(dict.getSequence(0).getSequenceName, 750, 1000, false, "foo")) - val iterator = toIterator(reader=builder, intervalList=intervalList, useIndex=useIndex) + val iterator = toIterator(reader=builder.toSource, intervalList=intervalList, useIndex=useIndex) iterator.isEmpty shouldBe true } } it should "not return a variant context if it doesn't overlap an interval (different chromosome)" in { Iterator(true, false).foreach { useIndex => - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=500, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=500, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) val intervalList = emtpyIntervalList() intervalList.add(new Interval(dict.getSequence(1).getSequenceName, 1, 1000, false, "foo")) - val iterator = toIterator(reader=builder, intervalList=intervalList, useIndex=useIndex) + val iterator = toIterator(reader=builder.toSource, intervalList=intervalList, useIndex=useIndex) iterator.isEmpty shouldBe true } } it should "throw an exception when next() is call but hasNext() is false" in { Iterator(true, false).foreach { useIndex => - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=1, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) - val iterator = toIterator(reader=builder, intervalList=emtpyIntervalList(), useIndex=useIndex) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=1, alleles=Seq("A"), gts=Seq(Gt(sample="s1", gt="0/0"))) + val iterator = toIterator(reader=builder.toSource, intervalList=emtpyIntervalList(), useIndex=useIndex) iterator.hasNext shouldBe false an[Exception] should be thrownBy iterator.next() } @@ -128,14 +119,13 @@ class ByIntervalListVariantContextIteratorTest extends UnitSpec { it should "return a variant context if it encloses an interval" in { Iterator(true, false).foreach { useIndex => - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=495, alleles=Seq("AAAAA", "A"), gts=Seq(Gt(sample="s1", gt="1/1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=495, alleles=Seq("AAAAA", "A"), gts=Seq(Gt(sample="s1", gt="1/1"))) val intervalList = emtpyIntervalList() intervalList.add(new Interval(dict.getSequence(0).getSequenceName, 496, 496, false, "foo")) - val iterator = toIterator(reader=builder, intervalList=intervalList, useIndex=useIndex) + val iterator = toIterator(reader=builder.toSource, intervalList=intervalList, useIndex=useIndex) iterator.isEmpty shouldBe false val actual = iterator.next() - val expected = builder.iterator().next() + val expected = builder.iterator.next() actual.getContig shouldBe expected.getContig actual.getStart shouldBe expected.getStart actual.getEnd shouldBe expected.getEnd @@ -145,15 +135,14 @@ class ByIntervalListVariantContextIteratorTest extends UnitSpec { it should "return a variant context only once if it overlaps multiple intervals" in { Iterator(true, false).foreach { useIndex => - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=495, alleles=Seq("AAAAA", "A"), gts=Seq(Gt(sample="s1", gt="1/1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=495, alleles=Seq("AAAAA", "A"), gts=Seq(Gt(sample="s1", gt="1/1"))) val intervalList = emtpyIntervalList() intervalList.add(new Interval(dict.getSequence(0).getSequenceName, 496, 496, false, "foo")) intervalList.add(new Interval(dict.getSequence(0).getSequenceName, 500, 500, false, "foo")) - val iterator = toIterator(reader=builder, intervalList=intervalList, useIndex=useIndex) + val iterator = toIterator(reader=builder.toSource, intervalList=intervalList, useIndex=useIndex) iterator.isEmpty shouldBe false val actual = iterator.next() - val expected = builder.iterator().next() + val expected = builder.iterator.next() actual.getContig shouldBe expected.getContig actual.getStart shouldBe expected.getStart actual.getEnd shouldBe expected.getEnd @@ -162,14 +151,13 @@ class ByIntervalListVariantContextIteratorTest extends UnitSpec { } it should "throw an exception when intervals are given out of order when using the VCF index" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) + val builder = VcfBuilder(samples=Seq("s1")) .add(chrom="chr1", pos=495, alleles=Seq("AAAAA", "A"), gts=Seq(Gt(sample="s1", gt="1/1"))) .add(chrom="chr1", pos=595, alleles=Seq("AAAAA", "A"), gts=Seq(Gt(sample="s1", gt="1/1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) val intervalList = emtpyIntervalList() intervalList.add(new Interval(dict.getSequence(0).getSequenceName, 494, 500, false, "foo")) intervalList.add(new Interval(dict.getSequence(0).getSequenceName, 500, 500, false, "foo")) - val iterator = toIterator(reader=builder, intervalList=intervalList, useIndex=true) + val iterator = toIterator(reader=builder.toSource, intervalList=intervalList, useIndex=true) // OK, since we are overlapping the first interval iterator.isEmpty shouldBe false // NOK, since the intervals were overlapping when we pre-fetch the second variant context @@ -178,11 +166,10 @@ class ByIntervalListVariantContextIteratorTest extends UnitSpec { it should "ignore a variant context if does not overlap an interval" in { Iterator(true, false).foreach { useIndex => - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=495, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1/1"))) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=495, alleles=Seq("A", "C"), gts=Seq(Gt(sample="s1", gt="1/1"))) val intervalList = emtpyIntervalList() intervalList.add(new Interval(dict.getSequence(0).getSequenceName, 500, 500, false, "foo")) - val iterator = toIterator(reader=builder, intervalList=intervalList, useIndex=useIndex) + val iterator = toIterator(reader=builder.toSource, intervalList=intervalList, useIndex=useIndex) iterator.isEmpty shouldBe true } } diff --git a/src/test/scala/com/fulcrumgenomics/vcf/JointVariantContextIteratorTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/JointVariantContextIteratorTest.scala index 9f53e7193..ffad4aa59 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/JointVariantContextIteratorTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/JointVariantContextIteratorTest.scala @@ -25,10 +25,7 @@ package com.fulcrumgenomics.vcf import com.fulcrumgenomics.testing.{UnitSpec, VcfBuilder} -import htsjdk.variant.variantcontext.VariantContext -import htsjdk.variant.vcf.VCFFileReader - -import scala.jdk.CollectionConverters.IteratorHasAsScala +import com.fulcrumgenomics.vcf.api.Variant /** * Tests for JointVariantContextIterator. @@ -36,51 +33,46 @@ import scala.jdk.CollectionConverters.IteratorHasAsScala class JointVariantContextIteratorTest extends UnitSpec { private val dict = VcfBuilder(samples=Seq("s1")).header.dict - private def compareVariantContexts(actual: VariantContext, expected: VariantContext): Unit = { + private def compareVariantContexts(actual: Variant, expected: Variant): Unit = { actual.getContig shouldBe expected.getContig actual.getStart shouldBe expected.getStart actual.getEnd shouldBe expected.getEnd } "JointVariantContextIterator" should "iterate variant contexts given a single iterator" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=1, alleles=Seq("A")) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=1, alleles=Seq("A")) - val iterator = JointVariantContextIterator(iters=Seq(builder.iterator().asScala), dict=dict) - compareVariantContexts(actual=iterator.next().head.get, expected=builder.iterator().next()) + val iterator = JointVariantContextIterator(iters=Seq(builder.iterator), dict=dict) + compareVariantContexts(actual=iterator.next().head.get, expected=builder.iterator.next()) } it should "not return a variant context if all the iterators are empty" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")) - val iterator = JointVariantContextIterator(iters=Seq(builder.iterator().asScala, builder.iterator().asScala), dict=dict) + val iterator = JointVariantContextIterator(iters=Seq(builder.iterator, builder.iterator), dict=dict) iterator.hasNext shouldBe false an[NoSuchElementException] should be thrownBy iterator.next() } it should "return a pair of variant contexts at the same position" in { - val vcfBuilder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=1, alleles=Seq("A")) - val builder = new VCFFileReader(vcfBuilder.toTempFile()) + val builder = VcfBuilder(samples=Seq("s1")).add(chrom="chr1", pos=1, alleles=Seq("A")) - val iterator = JointVariantContextIterator(iters=Seq(builder.iterator().asScala, builder.iterator().asScala), dict=dict) + val iterator = JointVariantContextIterator(iters=Seq(builder.iterator, builder.iterator), dict=dict) iterator.hasNext shouldBe true val Seq(left, right) = iterator.next().flatten compareVariantContexts(left, right) } it should "return a None for an iterator that doesn't have a variant context for a given covered site" in { - val vcfBuilderLeft = VcfBuilder(samples=Seq("s1")) + val builderLeft = VcfBuilder(samples=Seq("s1")) .add(chrom="chr1", pos=10, alleles=Seq("A")) .add(chrom="chr1", pos=30, alleles=Seq("A")) - val builderLeft = new VCFFileReader(vcfBuilderLeft.toTempFile()) - val vcfBuilderRight = VcfBuilder(samples=Seq("s1")) + val builderRight = VcfBuilder(samples=Seq("s1")) .add(chrom="chr1", pos=10, alleles=Seq("A")) .add(chrom="chr1", pos=20, alleles=Seq("A")) - val builderRight = new VCFFileReader(vcfBuilderRight.toTempFile()) - val iterator = JointVariantContextIterator(iters=Seq(builderLeft.iterator().asScala, builderRight.iterator().asScala), dict=dict) + val iterator = JointVariantContextIterator(iters=Seq(builderLeft.iterator, builderRight.iterator), dict=dict) // pos: 10 status: both iterator.hasNext shouldBe true iterator.next().flatten match { @@ -89,12 +81,46 @@ class JointVariantContextIteratorTest extends UnitSpec { // pos: 20 status: right iterator.hasNext shouldBe true iterator.next() match { - case Seq(None, Some(right)) => compareVariantContexts(right, builderRight.iterator().asScala.toSeq.last) + case Seq(None, Some(right)) => compareVariantContexts(right, builderRight.iterator.toSeq.last) } // pos: 30 status: left iterator.hasNext shouldBe true iterator.next() match { - case Seq(Some(left), None) => compareVariantContexts(left, builderLeft.iterator().asScala.toSeq.last) + case Seq(Some(left), None) => compareVariantContexts(left, builderLeft.iterator.toSeq.last) } } } + +/** + * Tests for VariantComparator. + */ +class VariantComparatorTest extends UnitSpec { + private val dict = VcfBuilder(samples = Seq("s1")).header.dict + + "VariantComparator" should "correctly compare variant positions" in { + val builder1 = VcfBuilder(samples = Seq("s1")) + .add(chrom = "chr1", pos = 1, alleles = Seq("A")) // same variant + .add(chrom = "chr1", pos = 2, alleles = Seq("A")) // same chrom, lower position + .add(chrom = "chr1", pos = 5, alleles = Seq("A")) // same chrom, higher position + .add(chrom = "chr1", pos = 6, alleles = Seq("A")) // lower chrom + .add(chrom = "chr4", pos = 1, alleles = Seq("A")) // higher chrom + + val builder2 = VcfBuilder(samples = Seq("s2")) + .add(chrom = "chr1", pos = 1, alleles = Seq("A")) + .add(chrom = "chr1", pos = 3, alleles = Seq("A")) + .add(chrom = "chr1", pos = 4, alleles = Seq("A")) + .add(chrom = "chr2", pos = 6, alleles = Seq("A")) + .add(chrom = "chr3", pos = 1, alleles = Seq("A")) + + val answers = Seq(0, -1, 1, -1, 1) + + val comparator = VariantComparator(dict) + + var i = 0 + builder1.zip(builder2).foreach { case (v1, v2) => + val comp = comparator.compare(v1, v2) + comp shouldBe answers(i) + i += 1 + } + } +} \ No newline at end of file diff --git a/src/test/scala/com/fulcrumgenomics/vcf/VariantMaskTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/VariantMaskTest.scala index 461650823..03a25a62a 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/VariantMaskTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/VariantMaskTest.scala @@ -55,8 +55,7 @@ class VariantMaskTest extends UnitSpec { val builder = VcfBuilder(samples=Seq("S1")) builder.add(chrom="chr1", pos=100, alleles=Seq("A", "T")) builder.add(chrom="chr1", pos=102, alleles=Seq("A", "T")) - val iterator = new VCFFileReader(builder.toTempFile()) - val mask = VariantMask(iterator.iterator().asScala, dict=dict) + val mask = VariantMask(builder.iterator, dict=dict) mask.isVariant(1, 99) shouldBe false mask.isVariant(1, 100) shouldBe true mask.isVariant(1, 101) shouldBe false @@ -67,8 +66,7 @@ class VariantMaskTest extends UnitSpec { it should "mask all deleted bases for deletions, plus the upstream base" in { val builder = VcfBuilder(samples = Seq("S1")) builder.add(chrom="chr1", pos=100, alleles=Seq("AA", "A")) - val iterator = new VCFFileReader(builder.toTempFile()) - val mask = VariantMask(iterator.iterator().asScala, dict=dict) + val mask = VariantMask(builder.iterator, dict=dict) mask.isVariant(1, 99) shouldBe false mask.isVariant(1, 100) shouldBe true @@ -79,8 +77,7 @@ class VariantMaskTest extends UnitSpec { it should "mask just the upstream base for insertions" in { val builder = VcfBuilder(samples = Seq("S1")) builder.add(chrom="chr1", pos=100, alleles=Seq("A", "AA")) - val iterator = new VCFFileReader(builder.toTempFile()) - val mask = VariantMask(iterator.iterator().asScala, dict=dict) + val mask = VariantMask(builder.iterator, dict=dict) mask.isVariant(1, 99) shouldBe false mask.isVariant(1, 100) shouldBe true @@ -91,8 +88,7 @@ class VariantMaskTest extends UnitSpec { it should "allow querying be sequence name as well as ref index" in { val builder = VcfBuilder(samples = Seq("S1")) builder.add(chrom="chr1", pos=100, alleles=Seq("A", "T")) - val iterator = new VCFFileReader(builder.toTempFile()) - val mask = VariantMask(iterator.iterator().asScala, dict=dict) + val mask = VariantMask(builder.iterator, dict=dict) mask.isVariant("chr0", 100) shouldBe false mask.isVariant("chr1", 100) shouldBe true @@ -112,8 +108,7 @@ class VariantMaskTest extends UnitSpec { val builder = VcfBuilder(samples = Seq("S1")) builder.add(chrom="chr1", pos=100, alleles=Seq("A", "T")) builder.add(chrom="chr2", pos=200, alleles=Seq("A", "T")) - val iterator = new VCFFileReader(builder.toTempFile()) - val mask = VariantMask(iterator.iterator().asScala, dict=dict) + val mask = VariantMask(builder.iterator, dict=dict) mask.isVariant(1, 100) shouldBe true mask.isVariant(2, 200) shouldBe true @@ -123,8 +118,7 @@ class VariantMaskTest extends UnitSpec { it should "throw an exception if invalid reference sequences or positions are requested" in { val builder = VcfBuilder(header=vcfDefaultHeader) builder.add(chrom="chr1", pos=100, alleles=Seq("A", "T")) - val iterator = new VCFFileReader(builder.toTempFile()) - val mask = VariantMask(iterator.iterator().asScala, dict=dict) + val mask = VariantMask(builder.iterator, dict=dict) an[Exception] shouldBe thrownBy { mask.isVariant(-1, 100) } // invalid index (low) an[Exception] shouldBe thrownBy { mask.isVariant("chrNope", 100) } // invalid ref name