diff --git a/deca-cli/pom.xml b/deca-cli/pom.xml
index 5968280..3fdc011 100644
--- a/deca-cli/pom.xml
+++ b/deca-cli/pom.xml
@@ -19,6 +19,13 @@
         org.apache.maven.plugins
         maven-shade-plugin
+
+
+          org.bdgenomics.adam
+          adam-shade-spark2_${scala.version.prefix}
+          ${adam.version}
+
+
         false
@@ -39,8 +46,9 @@
               shade
+              workaround
-
+
diff --git a/deca-cli/src/main/scala/org/bdgenomics/deca/cli/CNVer.scala b/deca-cli/src/main/scala/org/bdgenomics/deca/cli/CNVer.scala
index 6ab6ab3..cbbc6be 100644
--- a/deca-cli/src/main/scala/org/bdgenomics/deca/cli/CNVer.scala
+++ b/deca-cli/src/main/scala/org/bdgenomics/deca/cli/CNVer.scala
@@ -103,26 +103,26 @@ class CNVer(protected val args: CNVerArgs) extends BDGSparkCommand[CNVerArgs] {
       ARF.duplicateRead,
       ARF.failedVendorQualityChecks,
       ARF.primaryAlignment,
-      ARF.mapq,
-      ARF.contigName,
+      ARF.mappingQuality,
+      ARF.referenceName,
       ARF.start,
       ARF.end,
       ARF.cigar,
       ARF.mateMapped,
-      ARF.mateContigName,
+      ARF.mateReferenceName,
       ARF.mateAlignmentStart,
-      ARF.inferredInsertSize)
+      ARF.insertSize)
     Projection(readFields: _*)
   }

   val readsRdds = args.readsPaths.map(path => {
     // TODO: Add push down filters
-    log.info("Loading {} alignment file", path)
+    info("Loading %s alignment file".format(path))
     sc.loadAlignments(path, optProjection = Some(readProj), stringency = ValidationStringency.SILENT)
   })

   val targetsAsFeatures = {
-    val targetProj = Projection(FF.contigName, FF.start, FF.end)
+    val targetProj = Projection(FF.referenceName, FF.start, FF.end)
     sc.loadFeatures(args.targetsPath, optProjection = Some(targetProj))
   }
diff --git a/deca-cli/src/main/scala/org/bdgenomics/deca/cli/Coverager.scala b/deca-cli/src/main/scala/org/bdgenomics/deca/cli/Coverager.scala
index 946c04d..f6b6e87 100644
--- a/deca-cli/src/main/scala/org/bdgenomics/deca/cli/Coverager.scala
+++ b/deca-cli/src/main/scala/org/bdgenomics/deca/cli/Coverager.scala
@@ -88,25 +88,25 @@ class Coverager(protected val args: CoveragerArgs) extends BDGSparkCommand[Cover
       ARF.duplicateRead,
       ARF.failedVendorQualityChecks,
       ARF.primaryAlignment,
-      ARF.mapq,
-      ARF.contigName,
+      ARF.mappingQuality,
+      ARF.referenceName,
       ARF.start,
       ARF.end,
       ARF.cigar,
       ARF.mateMapped,
-      ARF.mateContigName,
+      ARF.mateReferenceName,
       ARF.mateAlignmentStart,
-      ARF.inferredInsertSize)
+      ARF.insertSize)
     Projection(readFields: _*)
   }

   val readsRdds = args.readsPaths.map(path => {
     // TODO: Add push down filters
-    log.info("Loading {} alignment file", path)
+    info("Loading %s alignment file".format(path))
     sc.loadAlignments(path, optProjection = Some(readProj), stringency = ValidationStringency.SILENT)
   })

-  val targetProj = Projection(FF.contigName, FF.start, FF.end)
+  val targetProj = Projection(FF.referenceName, FF.start, FF.end)
   val targetsAsFeatures = sc.loadFeatures(args.targetsPath, optProjection = Some(targetProj))

   var matrix = Coverage.coverageMatrix(readsRdds, targetsAsFeatures, minMapQ = args.minMappingQuality)
diff --git a/deca-core/src/main/scala/org/bdgenomics/deca/Coverage.scala b/deca-core/src/main/scala/org/bdgenomics/deca/Coverage.scala
index 88a9182..ff265ef 100644
--- a/deca-core/src/main/scala/org/bdgenomics/deca/Coverage.scala
+++ b/deca-core/src/main/scala/org/bdgenomics/deca/Coverage.scala
@@ -23,8 +23,8 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.mllib.linalg.distributed.{ IndexedRow, IndexedRowMatrix }
 import org.apache.spark.rdd.RDD
 import org.bdgenomics.adam.models.ReferenceRegion
-import org.bdgenomics.adam.rdd.feature.FeatureRDD
-import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
+import org.bdgenomics.adam.rdd.feature.FeatureDataset
+import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
 import org.bdgenomics.adam.rich.RichAlignmentRecord
 import org.bdgenomics.deca.coverage.ReadDepthMatrix
 import org.bdgenomics.deca.util.MLibUtils
@@ -69,7 +69,7 @@ object Coverage extends Serializable with Logging {
   def totalCoverageOfRegion(region: ReferenceRegion, read: RichAlignmentRecord): Long = {
     // "Clip" bases when the insert size is shorter than the read length
-    val insert = read.getInferredInsertSize
+    val insert = read.getInsertSize
     val regionStart = if (insert != null && insert < 0) max(region.start, read.getEnd + insert - 1) else region.start
     val regionEnd = if (insert != null && insert > 0) min(region.end, read.getStart + insert + 1) else region.end
@@ -93,7 +93,7 @@ object Coverage extends Serializable with Logging {
   private def perBaseCoverageOfRegion(region: ReferenceRegion, pileup: DenseVector[Int], read: RichAlignmentRecord): DenseVector[Int] = {
     // "Clip" bases when the insert size is shorter than the read length
-    val insert = read.getInferredInsertSize
+    val insert = read.getInsertSize
     val regionStart = if (insert != null && insert < 0) max(region.start, read.getEnd + insert - 1) else region.start
     val regionEnd = if (insert != null && insert > 0) min(region.end, read.getStart + insert + 1) else region.end
@@ -138,12 +138,12 @@ object Coverage extends Serializable with Logging {
   }

   private def readOverlapsTarget(read: AlignmentRecord, target: ReferenceRegion): Boolean = {
-    read.getContigName == target.referenceName && read.getEnd() > target.start && read.getStart < target.end
+    read.getReferenceName == target.referenceName && read.getEnd() > target.start && read.getStart < target.end
   }

   private def couldOverlapMate(read: AlignmentRecord): Boolean = {
     read.getMateMapped &&
-      read.getMateContigName == read.getContigName &&
+      read.getMateReferenceName == read.getReferenceName &&
       read.getMateAlignmentStart >= read.getStart &&
       read.getMateAlignmentStart < read.getEnd
   }
@@ -168,7 +168,7 @@ object Coverage extends Serializable with Logging {
     if (first < targets.length) Some(first) else None
   }

-  def sortedCoverageCalculation(targets: Broadcast[Array[(ReferenceRegion, Int)]], reads: AlignmentRecordRDD): RDD[(Int, Double)] = {
+  def sortedCoverageCalculation(targets: Broadcast[Array[(ReferenceRegion, Int)]], reads: AlignmentRecordDataset): RDD[(Int, Double)] = {
     val targetsArray = targets.value
     reads.rdd.mapPartitions(readIter => {
       if (!readIter.hasNext) {
@@ -193,7 +193,7 @@ object Coverage extends Serializable with Logging {
           }
           return scanForward(read, firstIndex, targetIndex + 1, mateOverlap)
-        } else if (read.getContigName == targetRegion.referenceName && read.getStart >= targetRegion.end)
+        } else if (read.getReferenceName == targetRegion.referenceName && read.getStart >= targetRegion.end)
           return scanForward(read, targetIndex + 1, targetIndex + 1, mateOverlap)
         else
           return firstIndex
@@ -210,7 +210,7 @@ object Coverage extends Serializable with Logging {
       breakable {
         readIter.foreach(read => {
           // Crossed a contig boundary? Reset firstRead and firstTarget for coverage analysis of next contig
-          if (read.getContigName != firstRead.getContigName) {
+          if (read.getReferenceName != firstRead.getReferenceName) {
             firstRead = read
             firstTarget = findFirstTarget(targetsArray, ReferenceRegion.unstranded(read))
             break // Should happen infrequently
@@ -252,7 +252,7 @@ object Coverage extends Serializable with Logging {
     new IndexedRowMatrix(indexedRows, numSamples, numTargets.toInt)
   }

-  def coverageMatrix(readRdds: Seq[AlignmentRecordRDD], targets: FeatureRDD, minMapQ: Int = 0, numPartitions: Option[Int] = None): ReadDepthMatrix = ComputeReadDepths.time {
+  def coverageMatrix(readRdds: Seq[AlignmentRecordDataset], targets: FeatureDataset, minMapQ: Int = 0, numPartitions: Option[Int] = None): ReadDepthMatrix = ComputeReadDepths.time {
     // Sequence dictionary parsing is broken in current ADAM release:
     // https://github.com/bigdatagenomics/adam/issues/1409
     // which breaks the desired sorting of the targets. Upgrading to a newer version of ADAM did not fix the issues as
@@ -273,10 +273,10 @@ object Coverage extends Serializable with Logging {
     // TODO: Verify inputs, e.g. BAM files, are in sorted order
     val samplesDriver = readRdds.map(readsRdd => {
-      val samples = readsRdd.recordGroups.toSamples
+      val samples = readsRdd.readGroups.toSamples
       if (samples.length > 1)
         throw new IllegalArgumentException("reads RDD must be a single sample")
-      samples.head.getSampleId
+      samples.head.getId
     }).toArray

     val numSamples = readRdds.length
@@ -291,7 +291,7 @@ object Coverage extends Serializable with Logging {
         !read.getFailedVendorQualityChecks &&
         read.getPrimaryAlignment &&
         read.getReadMapped &&
-        (minMapQ == 0 || read.getMapq >= minMapQ)
+        (minMapQ == 0 || read.getMappingQuality >= minMapQ)
       }))

       sortedCoverageCalculation(broadcastTargetArray, filteredReadsRdd)
diff --git a/deca-core/src/main/scala/org/bdgenomics/deca/HMM.scala b/deca-core/src/main/scala/org/bdgenomics/deca/HMM.scala
index edb8e6f..7c18bca 100644
--- a/deca-core/src/main/scala/org/bdgenomics/deca/HMM.scala
+++ b/deca-core/src/main/scala/org/bdgenomics/deca/HMM.scala
@@ -20,12 +20,12 @@ package org.bdgenomics.deca
 import org.apache.spark.SparkContext
 import org.apache.spark.storage.StorageLevel
 import org.bdgenomics.adam.models.SequenceDictionary
-import org.bdgenomics.adam.rdd.feature.FeatureRDD
+import org.bdgenomics.adam.rdd.feature.FeatureDataset
 import org.bdgenomics.deca.Timers._
 import org.bdgenomics.deca.coverage.ReadDepthMatrix
 import org.bdgenomics.deca.hmm.{ SampleModel, TransitionProbabilities }
 import org.bdgenomics.deca.util.MLibUtils
-import org.bdgenomics.formats.avro.{ Feature, Strand }
+import org.bdgenomics.formats.avro.{ Feature, Sample, Strand }
 import org.bdgenomics.utils.misc.Logging

 /**
@@ -36,7 +36,7 @@ object HMM extends Serializable with Logging {
   def discoverCNVs(readMatrix: ReadDepthMatrix, sequences: SequenceDictionary = SequenceDictionary.empty,
                    M: Double = 3, T: Double = 6, p: Double = 1e-8, D: Double = 70000,
-                   minSomeQuality: Double = 30.0): FeatureRDD = DiscoverCNVs.time {
+                   minSomeQuality: Double = 30.0): FeatureDataset = DiscoverCNVs.time {

     val sc = SparkContext.getOrCreate()
@@ -81,14 +81,14 @@
         if (start_target.referenceName == end_target.referenceName) {
           val builder = Feature.newBuilder(raw_feature)
-          builder.setContigName(start_target.referenceName)
+          builder.setReferenceName(start_target.referenceName)
           builder.setStart(start_target.start)
           builder.setEnd(end_target.end)

           Iterable(process(builder))
         } else {
           val builder1 = Feature.newBuilder(raw_feature)
-          builder1.setContigName(start_target.referenceName)
+          builder1.setReferenceName(start_target.referenceName)
           builder1.setStart(start_target.start)
           val optCl: Option[Long] = sequences(start_target.referenceName).map(_.length)
           val cl: Long = optCl.getOrElse(Long.MaxValue)
@@ -97,7 +97,7 @@
           builder1.setEnd(jcl)

           val builder2 = Feature.newBuilder(raw_feature)
-          builder2.setContigName(end_target.referenceName)
+          builder2.setReferenceName(end_target.referenceName)
           builder2.setStart(0L)
           builder2.setEnd(end_target.end)
@@ -106,6 +106,6 @@
       })
     })

-    FeatureRDD(cnvs, sequences)
+    FeatureDataset(cnvs, sd = sequences, samples = Seq.empty[Sample])
   }
 }
diff --git a/deca-core/src/test/scala/org/bdgenomics/deca/HMMSuite.scala b/deca-core/src/test/scala/org/bdgenomics/deca/HMMSuite.scala
index 08158a5..9ba6f26 100644
--- a/deca-core/src/test/scala/org/bdgenomics/deca/HMMSuite.scala
+++ b/deca-core/src/test/scala/org/bdgenomics/deca/HMMSuite.scala
@@ -81,7 +81,7 @@ class HMMSuite extends DecaFunSuite {
     val del = cnvs.rdd.filter(f => Option(f.getSource).exists(_.equals("HG00121"))).first()
     assert(del.getFeatureType() == "DEL")
-    assert(del.getContigName() == "22")
+    assert(del.getReferenceName() == "22")
     assert(del.getStart() == 18898401L)
     assert(del.getEnd() == 18913235L)
     assert(aboutEq(del.getScore(), 9.167934190998345))
@@ -95,7 +95,7 @@ class HMMSuite extends DecaFunSuite {
     val dup = cnvs.rdd.filter(f => Option(f.getSource).exists(_.equals("HG00113"))).first()
     assert(dup.getFeatureType() == "DUP")
-    assert(dup.getContigName() == "22")
+    assert(dup.getReferenceName() == "22")
     assert(dup.getStart() == 17071767L)
     assert(dup.getEnd() == 17073440L)
     assert(aboutEq(dup.getScore(), 25.321428730083596))
diff --git a/pom.xml b/pom.xml
index 8a15958..0fb0d1d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -16,15 +16,15 @@
     1.8
-    0.24.0
-    1.8.0
-    2.11.8
+    0.27.0
+    1.8.2
+    2.11.12
     2.11
-    2.1.0
+    2.4.3
     0.12
     2.7.3
-    0.2.13
+    0.2.15
     1.1.1
@@ -87,12 +87,12 @@
         org.apache.maven.plugins
         maven-assembly-plugin
-        2.4.1
+        3.1.0
         org.apache.maven.plugins
         maven-shade-plugin
-        2.4.3
+        3.2.0
         org.codehaus.mojo
@@ -102,7 +102,7 @@
         pl.project13.maven
         git-commit-id-plugin
-        2.2.1
+        2.2.2
         true
@@ -138,7 +138,7 @@
        org.apache.maven.plugins
        maven-compiler-plugin
-       3.5.1
+       3.8.0
        ${java.version}
        ${java.version}
@@ -149,17 +149,67 @@
        scala-maven-plugin
        3.2.2
+       org.apache.maven.plugins
+       maven-clean-plugin
+       3.1.0
+
+       org.apache.maven.plugins
+       maven-deploy-plugin
+       3.0.0-M1
+
+       org.apache.maven.plugins
+       maven-gpg-plugin
+       1.6
+
+       org.apache.maven.plugins
+       maven-install-plugin
+       3.0.0-M1
+
+       org.apache.maven.plugins
+       maven-jar-plugin
+       3.1.1
+
+       org.apache.maven.plugins
+       maven-javadoc-plugin
+       3.1.0
+
+       org.apache.maven.plugins
+       maven-release-plugin
+       2.5.3
+       forked-path
+       false
+       -Psonatype-oss-release
+
+       org.apache.maven.plugins
+       maven-resources-plugin
+       3.1.0
+
+       org.apache.maven.plugins
+       maven-source-plugin
+       3.0.1
        org.apache.maven.plugins
        maven-surefire-plugin
-       2.7
+       3.0.0-M3
        org.scalatest
        scalatest-maven-plugin
-       1.0
+       2.0.0
        org.scalariform
@@ -169,7 +219,7 @@
        exec-maven-plugin
        org.codehaus.mojo
-       1.3.2
+       1.5.0
        org.codehaus.mojo
@@ -264,7 +314,6 @@
        org.scalariform
        scalariform-maven-plugin
-       0.1.4
        default-cli
@@ -282,6 +331,7 @@
+
@@ -379,7 +430,7 @@
      org.scalatest
      scalatest_${scala.version.prefix}
-     2.2.6
+     3.0.7
      test
@@ -390,12 +441,12 @@
      commons-io
      commons-io
-     2.4
+     2.6
      args4j
      args4j
-     2.0.31
+     2.33
      org.apache.spark
@@ -436,7 +487,6 @@
        org.apache.maven.plugins
        maven-source-plugin
-       3.0.1
        attach-sources
@@ -450,7 +500,6 @@
        org.apache.maven.plugins
        maven-javadoc-plugin
-       2.10.4
        attach-javadocs
@@ -505,7 +554,7 @@
        scoverage-maven-plugin
        ${scoverage.plugin.version}
-       2.11.4
+       ${scala.version}
        org.bdgenomics.deca.Timers
        true
        90
@@ -522,7 +571,6 @@
        org.apache.maven.plugins
        maven-gpg-plugin
-       1.6
        sign-artifacts
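Illustrative sketch (not part of the patch): a minimal example of the renamed ADAM 0.27 API that this migration targets, using only identifiers that appear in the diff above. The object name, method name, and the "sample.bam" path are placeholders.

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.projections.{ AlignmentRecordField => ARF, Projection }
import org.bdgenomics.adam.rdd.ADAMContext._

object AdamRenameSketch {
  // Field renames applied by this patch:
  //   mapq -> mappingQuality, contigName -> referenceName, inferredInsertSize -> insertSize
  def highQualityLoci(sc: SparkContext, bamPath: String): RDD[String] = {
    // Project only the renamed fields needed downstream
    val proj = Projection(
      ARF.readMapped,
      ARF.mappingQuality,
      ARF.referenceName,
      ARF.start,
      ARF.end,
      ARF.insertSize)
    // loadAlignments now returns an AlignmentRecordDataset (formerly AlignmentRecordRDD)
    val reads = sc.loadAlignments(bamPath, optProjection = Some(proj))
    // Record getters follow the same renames: getReferenceName, getMappingQuality
    reads.rdd
      .filter(r => r.getReadMapped && r.getMappingQuality != null && r.getMappingQuality >= 20)
      .map(r => s"${r.getReferenceName}:${r.getStart}-${r.getEnd}")
  }
}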