-
-
Notifications
You must be signed in to change notification settings - Fork 75
feat: FastqToBam can extract UMI(s) from the comment in the read name #989
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -71,12 +71,15 @@ import htsjdk.samtools.{ReservedTagConstants, SAMFileHeader, SAMReadGroupRecord} | |
| |For more information on read structures see the | ||
| |[Read Structure Wiki Page](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures) | ||
| | | ||
| |UMIs may be extracted from the read sequences, the read names, or both. If `--extract-umis-from-read-names` is | ||
| |UMIs may be extracted from the read sequences, the read names (or comment), or both. If `--extract-umis-from-read-names` is | ||
| |specified, any UMIs present in the read names are extracted; read names are expected to be `:`-separated with | ||
| |any UMIs present in the 8th field. If this option is specified, the `--umi-qual-tag` option may not be used as | ||
| |qualities are not available for UMIs in the read name. If UMI segments are present in the read structures those | ||
| |will also be extracted. If UMIs are present in both, the final UMIs are constructed by first taking the UMIs | ||
| |from the read names, then adding a hyphen, then the UMIs extracted from the reads. | ||
| |from the read names, then adding a hyphen, then the UMIs extracted from the reads. If `--extract-umis-from-read-comment` is | ||
| |specified, any UMIs present in the read name comments are extracted; the read name comment is the text _after_ | ||
| |the first white space in the read name (like a FASTA). If the comment is `:`-separated, then the UMI will be | ||
| |extracted from the last field, otherwise the full comment will be used. | ||
| | | ||
| |The same number of input files and read structures must be provided, with one exception: if supplying exactly | ||
| |1 or 2 fastq files, both of which are solely template reads, no read structures need be provided. | ||
|
|
@@ -93,7 +96,10 @@ class FastqToBam | |
| @arg(flag='u', doc="Tag in which to store molecular barcodes/UMIs.") val umiTag: String = ConsensusTags.UmiBases, | ||
| @arg(flag='q', doc="Tag in which to store molecular barcode/UMI qualities.") val umiQualTag: Option[String] = None, | ||
| @arg(flag='Q', doc="Store the sample barcode qualities in the QT Tag.") val storeSampleBarcodeQualities: Boolean = false, | ||
| @arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.") val extractUmisFromReadNames: Boolean = false, | ||
| @arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadComment")) | ||
| val extractUmisFromReadNames: Boolean = false, | ||
| @arg(flag='c', doc="Extract UMI(s) from read name comment and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadNames")) | ||
| val extractUmisFromReadComment: Boolean = false, | ||
| @arg( doc="Read group ID to use in the file header.") val readGroupId: String = "A", | ||
| @arg( doc="The name of the sequenced sample.") val sample: String, | ||
| @arg( doc="The name/ID of the sequenced library.") val library: String, | ||
|
|
@@ -117,6 +123,7 @@ class FastqToBam | |
| validate(input.length == actualReadStructures.length, "input and read-structure must be supplied the same number of times.") | ||
| validate(1 to 2 contains actualReadStructures.flatMap(_.templateSegments).size, "read structures must contain 1-2 template reads total.") | ||
| validate(!extractUmisFromReadNames || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read names.") | ||
| validate(!extractUmisFromReadComment || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read description.") | ||
nh13 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| override def execute(): Unit = { | ||
| val encoding = qualityEncoding | ||
|
|
@@ -166,7 +173,11 @@ class FastqToBam | |
| val templates = subs.iterator.filter(_.kind == Template).toList | ||
|
|
||
| // If requested, pull out the UMI(s) from the read name | ||
| val umiFromReadName = if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true) else None | ||
| val umiFromReadName = { | ||
| if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true) | ||
| else if (extractUmisFromReadComment) fqs.head.comment.flatMap(comment => Umis.extractUmisFromReadComment(comment, strict=true)) | ||
|
Comment on lines
+177
to
+178
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since |
||
| else None | ||
| } | ||
|
|
||
| templates.zipWithIndex.map { case (read, index) => | ||
| // If the template read had no bases, we'll substitute in a single N @ Q2 below to keep htsjdk happy | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -55,15 +55,15 @@ object Umis { | |||||
|
|
||||||
| /** | ||||||
| * Extracts the UMI from an Illumina fastq style read name. Illumina documents their FASTQ read names as: | ||||||
| * @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index> | ||||||
| * `@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>`` | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is that an extra backtick at the end? |
||||||
| * | ||||||
| * See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm | ||||||
| * The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI | ||||||
| * field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be | ||||||
| * translated to hyphens before returning. | ||||||
| * | ||||||
| * If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments, | ||||||
| with the UMI being the last in the case of 8 and `None` in the case of 7. | ||||||
| * with the UMI being the last in the case of 8 and `None` in the case of 7. | ||||||
| * | ||||||
| * If `strict` is false the last segment is returned so long as it appears to be a valid UMI. | ||||||
| */ | ||||||
|
|
@@ -88,6 +88,42 @@ object Umis { | |||||
| else umi | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * Extracts the UMI from an Illumina fastq style read name. Illumina documents their FASTQ read names as: | ||||||
| * `@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>`` | ||||||
|
Comment on lines
+92
to
+93
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be updated to reflect the expected structure of the read comment? |
||||||
| * | ||||||
| * The index in the description may be replaced with a UMI field. | ||||||
| * | ||||||
| * See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm | ||||||
| * The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI | ||||||
| * field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be | ||||||
| * translated to hyphens before returning. | ||||||
| * | ||||||
| * If `strict` is true the comment _must_ contain either 4 colon-separated segments, | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
As far as I can tell, there's no other permissible condition when
|
||||||
| * with the UMI being the last in the case of 4. | ||||||
|
Comment on lines
+102
to
+103
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm far enough removed from the original motivation for this PR that I don't recall - is there a convention for the comment to have four segments? |
||||||
| * | ||||||
| * If `strict` is false the last segment is returned so long as it appears to be a valid UMI. | ||||||
| */ | ||||||
| def extractUmisFromReadComment(comment: String, delimiter: Char = ':', strict: Boolean): Option[String] = { | ||||||
| // If strict, check that the read name actually has eight parts, which is expected | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| val rawUmi = if (strict) { | ||||||
| val colons = comment.count(_ == delimiter) | ||||||
| if (colons == 3) Some(comment.substring(comment.lastIndexOf(delimiter) + 1, comment.length)) | ||||||
| else throw new IllegalArgumentException(s"Trying to extract UMI from read comment with ${colons + 1} parts (4 expected): ${comment}") | ||||||
| } else { | ||||||
| val idx = comment.lastIndexOf(delimiter) | ||||||
| if (idx == -1) Some(comment) | ||||||
| else Some(comment.substring(idx + 1, comment.length)) | ||||||
| } | ||||||
|
|
||||||
| val umi = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase) | ||||||
| val valid = umi.forall(u => u.forall(isValidUmiCharacter)) | ||||||
|
|
||||||
| if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${comment}") | ||||||
| else if (!valid) None | ||||||
| else umi | ||||||
| } | ||||||
|
|
||||||
| @inline private def isValidUmiCharacter(ch: Char): Boolean = { | ||||||
| ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-' | ||||||
| } | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -383,6 +383,37 @@ class FastqToBamTest extends UnitSpec { | |
| recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA" | ||
| } | ||
|
|
||
|
|
||
| it should "extract UMIs from the comment in read names when requested" in { | ||
| val r1 = fq( | ||
| FastqRecord("q1:2:3:4:5:6:7:8", "AAAAAAAAAA", "==========", comment=Some("0:1:2:ACGT")), | ||
| FastqRecord("q2:2:3:4:5:6:7:8", "TAAAAAAAAA", "==========", comment=Some("0:1:2:TTGA")), | ||
| FastqRecord("q3:2:3:4:5:6:7", "TAAAAAAAAA", "=========="), | ||
| ) | ||
| val bam = makeTempFile("fastqToBamTest.", ".bam") | ||
| new FastqToBam(input=Seq(r1), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute() | ||
| val recs = readBamRecs(bam) | ||
| recs should have size 3 | ||
| recs(0).apply[String]("RX") shouldBe "ACGT" | ||
| recs(1).apply[String]("RX") shouldBe "TTGA" | ||
| recs(2).get[String]("RX") shouldBe None | ||
| } | ||
|
|
||
| it should "extract UMIs from the comment in read names and read sequences" in { | ||
| val r1 = fq( | ||
| FastqRecord("q1:2:3:4:5:6:7:8", "GGNCCGAAAAAAA", "=============", comment=Some("0:1:2:ACGT+CGTA")), | ||
| FastqRecord("q2:2:3:4:5:6:7:8", "TANAACAAAAAAA", "=============", comment=Some("0:1:2:TTGA+TAAT")), | ||
| ) | ||
| val rs = ReadStructure("2M1S2M+T") | ||
| val bam = makeTempFile("fastqToBamTest.", ".bam") | ||
| new FastqToBam(input=Seq(r1), readStructures=Seq(rs), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute() | ||
| val recs = readBamRecs(bam) | ||
| recs should have size 2 | ||
| recs(0).apply[String]("RX") shouldBe "ACGT-CGTA-GG-CC" | ||
| recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA" | ||
|
Comment on lines
+412
to
+413
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are these suffixed with
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As per the method docs, the UMIs may be extracted from the read names, the read sequences, or both. In this case, the read structure shows UMI bases in the read sequences themselves, as well as the comment in the read name header, so we get four (!) UMI segments, two from the read sequences, and two from the comment in the read header. |
||
| } | ||
|
|
||
|
|
||
| it should "fail when read names don't match up" in { | ||
| val r1 = fq(FastqRecord("q1", "AAAAAAAAAA", "==========")) | ||
| val r2 = fq(FastqRecord("x1", "CCCCCCCCCC", "##########")) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TIL about the
mutexparameter to@arg!There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How are mutually exclusive args rendered in the CLI help doc? Is it worth noting that the two arguments are mutually exclusive in the usage above, or will that be automatically noted in the help for the respective flags?