12 changes: 6 additions & 6 deletions R/bambu-extendAnnotations-utilityCombine.R
@@ -116,7 +116,7 @@ updateStartEndReadCount <- function(combinedFeatureTibble){
filter(row_number()==1)

combinedFeatureTibble <- combinedFeatureTibble %>%
dplyr::select(intronStarts, intronEnds, chr, strand, maxTxScore,
dplyr::select(intronStarts, intronEnds, chr, strand, maxTxScore, firstExonGroup, lastExonGroup,
maxTxScore.noFit, NSampleReadCount, NSampleReadProp,
NSampleTxScore, rowID) %>%
full_join(select(startTibble, rowID, start), by = "rowID") %>%
@@ -134,13 +134,13 @@ combineFeatureTibble <- function(combinedFeatureTibble,
featureTibbleSummarised, index=1, intraGroup = TRUE){
if (is.null(combinedFeatureTibble)) {
combinedTable <- featureTibbleSummarised %>%
select(intronStarts, intronEnds, chr, strand, maxTxScore,
select(intronStarts, intronEnds, chr, strand, maxTxScore, firstExonGroup, lastExonGroup,
maxTxScore.noFit, NSampleReadCount, NSampleReadProp,NSampleTxScore,
starts_with('start'), starts_with('end'), starts_with('readCount'))
} else {
combinedTable <- full_join(combinedFeatureTibble,
featureTibbleSummarised, by = c('intronStarts', 'intronEnds', 'chr',
'strand'), suffix=c('.combined','.new')) %>%
'strand', 'firstExonGroup', 'lastExonGroup'), suffix=c('.combined','.new')) %>%
mutate(NSampleReadCount=pmax0NA(NSampleReadCount.combined) +
pmax0NA(NSampleReadCount.new),
NSampleReadProp = pmax0NA(NSampleReadProp.combined) +
@@ -154,7 +154,7 @@ combineFeatureTibble <- function(combinedFeatureTibble,
select(intronStarts, intronEnds, chr, strand,
NSampleReadCount, NSampleReadProp, NSampleTxScore, maxTxScore,
maxTxScore.noFit, starts_with('start'), starts_with('end'),
starts_with('readCount'))
starts_with('readCount'), firstExonGroup, lastExonGroup)
}
if(intraGroup)
combinedTable <-
@@ -186,12 +186,12 @@ extractFeaturesFromReadClassSE <- function(readClassSe, sample_id,
rowData <- as_tibble(rowData(readClassSe)) %>%
mutate(start = unname(min(start(rowRangesSe))),
end= unname(max(end(rowRangesSe))))
group_var <- c("intronStarts", "intronEnds", "chr", "strand")
group_var <- c("intronStarts", "intronEnds", "chr", "strand", "firstExonGroup", "lastExonGroup")
sum_var <- c("start","end","NSampleReadCount", "maxTxScore",
"maxTxScore.noFit", "readCount","NSampleReadProp",
"NSampleTxScore")
featureTibble <- rowData %>%
dplyr::select(chr = chr.rc, start, end, strand = strand.rc,
dplyr::select(chr = chr.rc, start, end, strand = strand.rc, firstExonGroup, lastExonGroup,
intronStarts, intronEnds, confidenceType, readCount, geneReadProp,
txScore, txScore.noFit, numExons) %>%
filter(readCount >= 1, # only use readCount >= 1 and high-confidence reads
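The changes above add firstExonGroup and lastExonGroup to the columns that are selected, grouped and joined on when read-class features are combined across samples, so read classes that share an intron chain but start or end in different exon groups are kept as separate features. A minimal sketch of the effect of the widened join key, using toy tibbles rather than real bambu feature tables (sampleA, sampleB and the readCount.* columns are made up for illustration):

```r
# Toy illustration of the widened full_join key: two read classes with the
# same intron chain but different firstExonGroup stay separate after combining.
library(dplyr)

sampleA <- tibble(
  intronStarts = "100", intronEnds = "200", chr = "1", strand = "+",
  firstExonGroup = c(1, 2), lastExonGroup = 1, readCount.A = c(5, 3))
sampleB <- tibble(
  intronStarts = "100", intronEnds = "200", chr = "1", strand = "+",
  firstExonGroup = 1, lastExonGroup = 1, readCount.B = 7)

full_join(sampleA, sampleB,
          by = c("intronStarts", "intronEnds", "chr", "strand",
                 "firstExonGroup", "lastExonGroup"))
# Joining on the first four columns only would attach readCount.B to both
# rows of sampleA; the extended key attaches it to the matching row only.
```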
28 changes: 26 additions & 2 deletions R/bambu-extendAnnotations-utilityExtend.R
@@ -10,7 +10,7 @@ isore.extendAnnotations <- function(combinedTranscripts, annotationGrangesList,
combinedTranscripts <- filterTranscripts(combinedTranscripts, min.sampleNumber)
if (nrow(combinedTranscripts) > 0) {
group_var <- c("intronStarts","intronEnds","chr","strand","start","end",
"confidenceType","readCount", "maxTxScore", "maxTxScore.noFit")
"confidenceType","readCount", "maxTxScore", "maxTxScore.noFit", "firstExonGroup", "lastExonGroup")
rowDataTibble <- select(combinedTranscripts,all_of(group_var))
annotationSeqLevels <- seqlevels(annotationGrangesList)
rowDataSplicedTibble <- filter(rowDataTibble,
@@ -343,6 +343,16 @@ addNewSplicedReadClasses <- function(combinedTranscriptRanges,
# annotate with compatible gene id,
rowDataFilteredSpliced$GENEID[equalQhits[!duplicated(equalQhits)]] <-
mcols(annotationGrangesList[equalSubHits[!duplicated(equalQhits)]])$GENEID

# remove TXNAME, GENEID, equal and compatible labels for read classes
# whose first or last exon group is 0 (no matching annotated start/end)
idx_startEnd <- which(rowDataFilteredSpliced$firstExonGroup == 0 |
rowDataFilteredSpliced$lastExonGroup == 0)
if (length(idx_startEnd) > 0) {
classificationTable$compatible[idx_startEnd] <- ""
classificationTable$equal[idx_startEnd] <- ""
rowDataFilteredSpliced$GENEID[idx_startEnd] <- NA
rowDataFilteredSpliced$TXNAME[idx_startEnd] <- NA
}
# annotate as identical, using intron matches
unlistedIntrons <- unlist(intronsByReadClass, use.names = TRUE)
partitioning <- PartitioningByEnd(cumsum(elementNROWS(intronsByReadClass)),
@@ -355,7 +365,8 @@
updateWIntronMatches(unlistedIntrons, unlistedIntronsAnnotations,
partitioning, classificationTable, annotationGrangesList,
rowDataFilteredSpliced, exonsByReadClass, min.exonDistance,
min.primarySecondaryDist, min.primarySecondaryDistStartEnd)
min.primarySecondaryDist, min.primarySecondaryDistStartEnd)
classificationTable <- updateWStartEnd(rowDataFilteredSpliced, classificationTable)
rowDataFilteredSpliced$readClassType <-
apply(classificationTable, 1, function(x){paste(x[x!=""], collapse = ":")})
rowDataFilteredSpliced$novelTranscript = TRUE
@@ -420,6 +431,19 @@ updateWIntronMatches <- function(unlistedIntrons, unlistedIntronsAnnotations,
}


#' update classificationTable by start and end
#' @importFrom GenomicRanges match
#' @noRd
updateWStartEnd <- function(rowDataSplicedTibble, classificationTable) {
idx <- which(rowDataSplicedTibble$firstExonGroup == 0 |
rowDataSplicedTibble$lastExonGroup == 0)
if (length(idx) > 0) {
classificationTable$compatible[idx] <- ""
classificationTable$equal[idx] <- ""
}
return(classificationTable)
}


#' assign gene id by maximum match
#' @importFrom dplyr as_tibble %>% group_by summarise filter ungroup
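The new updateWStartEnd() helper, together with the matching block added in addNewSplicedReadClasses(), withdraws the "equal" and "compatible" classifications (and the assigned GENEID/TXNAME) from read classes whose first or last exon falls into group 0, i.e. outside every annotated start/end window, so that they are carried forward as novel transcripts. A self-contained sketch of that blanking step on toy data frames (rowDataToy and classToy stand in for the real rowData and classificationTable):

```r
# Toy sketch: read classes with firstExonGroup == 0 or lastExonGroup == 0
# lose their equal/compatible labels and their gene/transcript assignment.
rowDataToy <- data.frame(firstExonGroup = c(1, 0, 2),
                         lastExonGroup  = c(1, 3, 0),
                         GENEID = c("g1", "g2", "g3"),
                         TXNAME = c("t1", "t2", "t3"))
classToy <- data.frame(equal      = c("equal", "equal", ""),
                       compatible = c("", "compatible", "compatible"))

idx <- which(rowDataToy$firstExonGroup == 0 | rowDataToy$lastExonGroup == 0)
if (length(idx) > 0) {
  classToy$equal[idx] <- ""
  classToy$compatible[idx] <- ""
  rowDataToy$GENEID[idx] <- NA
  rowDataToy$TXNAME[idx] <- NA
}
classToy  # rows 2 and 3 now carry no equal/compatible label
```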
51 changes: 46 additions & 5 deletions R/bambu-processReads_utilityConstructReadClasses.R
@@ -29,7 +29,7 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions,
uniqueJunctions = uniqueJunctions,
unlisted_junctions = unlisted_junctions,
readGrgList = readGrgList,
stranded = stranded)}
stranded = stranded, annotations)}
else{exonsByRC.spliced = GRangesList()}
end.ptm <- proc.time()
rm(readGrgList, unlisted_junctions, uniqueJunctions)
@@ -57,7 +57,7 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions,
#' @importFrom GenomicRanges match
#' @noRd
constructSplicedReadClasses <- function(uniqueJunctions, unlisted_junctions,
readGrgList, stranded = FALSE) {
readGrgList, annotations, stranded = FALSE) {
options(scipen = 999)
allToUniqueJunctionMatch <- GenomicRanges::match(unlisted_junctions,
uniqueJunctions, ignore.strand = TRUE)
@@ -91,10 +91,11 @@ constructSplicedReadClasses <- function(uniqueJunctions, unlisted_junctions,
rm(lowConfidenceReads, uniqueJunctions, allToUniqueJunctionMatch)
readTable <- createReadTable(start(unlisted_junctions),
end(unlisted_junctions), mcols(unlisted_junctions)$id, readGrgList,
readStrand, readConfidence)
readStrand, readConfidence, annotations)
exonsByReadClass <- createExonsByReadClass(readTable)
readTable <- readTable %>% dplyr::select(chr.rc = chr, strand.rc = strand,
startSD = startSD, endSD = endSD,
firstExonGroup = firstExonGroup, lastExonGroup = lastExonGroup,
readCount.posStrand = readCount.posStrand, intronStarts, intronEnds,
confidenceType, readCount, readIds, sampleIDs)
mcols(exonsByReadClass) <- readTable
@@ -159,7 +160,7 @@ correctReadStrandById <- function(strand, id, stranded = FALSE){
#' row_number .groups
#' @noRd
createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end,
unlisted_junctions_id, readGrgList,readStrand, readConfidence) {
unlisted_junctions_id, readGrgList,readStrand, readConfidence, annotations) {
readRanges <- unlist(range(ranges(readGrgList)), use.names = FALSE)
intronStartCoordinatesInt <-
as.integer(min(splitAsList(unlisted_junctions_start,
@@ -180,15 +181,26 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end,
alignmentStrand = as.character(getStrandFromGrList(readGrgList))=='+',
readId = mcols(readGrgList)$id,
sampleID = mcols(readGrgList)$sampleID)
readTable <- readTable %>%
mutate(intronStartCoordinatesInt = intronStartCoordinatesInt,
intronEndCoordinatesInt = intronEndCoordinatesInt,
firstExon5prime = ifelse(strand != "-", start, end), #assume * is +
firstExon3prime = ifelse(strand != "-", intronStartCoordinatesInt+1, intronEndCoordinatesInt-1),
lastExon5prime = ifelse(strand != "-", intronEndCoordinatesInt-1, intronStartCoordinatesInt+1),
lastExon3prime = ifelse(strand != "-", end, start)
) %>%
select(-intronStartCoordinatesInt, -intronEndCoordinatesInt)
rm(readRanges, readStrand, unlisted_junctions_start,
unlisted_junctions_end, unlisted_junctions_id, readConfidence,
intronStartCoordinatesInt, intronEndCoordinatesInt)
readTable <- splitReadClassByStartEnd(readTable, annotations)
## currently 80%/20% quantile of reads is used to identify start/end sites
readTable <- readTable %>%
group_by(chr, strand, intronEnds, intronStarts, confidenceType) %>%
group_by(chr, strand, intronEnds, intronStarts, confidenceType, firstExonGroup, lastExonGroup) %>%
summarise(readCount = n(), startSD = sd(start), endSD = sd(end),
start = nth(x = start, n = ceiling(readCount / 5), order_by = start),
end = nth(x = end, n = ceiling(readCount / 1.25), order_by = end),
firstExonGroup = unique(firstExonGroup), lastExonGroup = unique(lastExonGroup),
readCount.posStrand = sum(alignmentStrand, na.rm = TRUE),
readIds = list(readId), sampleIDs = list(sampleID),
.groups = 'drop') %>%
@@ -197,6 +209,35 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end,
return(readTable)
}

splitReadClassByStartEnd <- function(readTable, annotations, startEndWindowSize = 35 ){
exons <- unlist(annotations)
mcols(exons) <- cbind(mcols(exons),
mcols(annotations)[rep(seq_along(annotations), elementNROWS(annotations)), ])
annoTable <- tibble(TXNAME = names(exons),
GENEID = mcols(exons)$GENEID,
exonRank = mcols(exons)$exon_rank,
chr = as.character(seqnames(exons)),
start = start(exons),
end = end(exons),
strand = as.character(strand(exons)),
firstExon5prime = ifelse(strand != "-", start(exons), end(exons)), #assume * is +
firstExon3prime = ifelse(strand != "-", end(exons), start(exons)),
lastExon5prime = ifelse(strand != "-", start(exons), end(exons)), #assume * is +
lastExon3prime = ifelse(strand != "-", end(exons), start(exons)))
readTable = bind_rows(readTable, annoTable)
readTable <- readTable %>%
group_by(firstExon3prime) %>%
mutate(firstExonGroup = ifelse(strand != "-",
findInterval(start,sort(start[is.na(readId)]) - startEndWindowSize),
findInterval(-end,sort(-end[is.na(readId)]) - startEndWindowSize))) %>% ungroup() %>%
group_by(lastExon5prime) %>%
mutate(lastExonGroup = ifelse(strand != "-",
findInterval(-end,sort(-end[is.na(readId)]) - startEndWindowSize),
findInterval(start,sort(start[is.na(readId)]) - startEndWindowSize))) %>% ungroup() %>%
filter(!is.na(readId))
return(readTable)
}

#' @noRd
getChrFromGrList <- function(grl) {
return(unlist(seqnames(grl), use.names = FALSE)[cumsum(elementNROWS(grl))])
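The new splitReadClassByStartEnd() appends the annotated exons to the read table and, within each group of rows sharing the same first-exon 3' end (and, separately, the same last-exon 5' end), bins read start/end positions against the annotated coordinates with findInterval() and a 35 bp window (startEndWindowSize); reads lying upstream of every annotated window are put in group 0 and are later stripped of their equal/compatible classification. A small sketch of the plus-strand grouping step with hypothetical coordinates (annotatedStarts and readStarts are invented; the minus-strand case works the same way on negated end positions):

```r
# Hypothetical plus-strand example of the findInterval() grouping used in
# splitReadClassByStartEnd(): annotated first-exon starts, shifted upstream by
# the 35 bp window, form the breakpoints, and each read start is assigned the
# index of the last breakpoint at or below it; 0 means the read starts more
# than 35 bp upstream of every annotated start.
startEndWindowSize <- 35
annotatedStarts <- c(1000, 5000)            # toy annotated first-exon starts
readStarts      <- c(990, 1010, 4980, 700)  # toy read start positions

findInterval(readStarts, sort(annotatedStarts) - startEndWindowSize)
#> [1] 1 1 2 0
# Reads 1 and 2 group with the first annotated start, read 3 with the second,
# and read 4 lies before both windows, so it gets group 0 (a novel start).
```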