Skip to content

Commit 851464e

Browse files
committed
Bug fix to resolve inconsistencies between loading data using the built-in function and pre-loading it using ape. The latter could sometimes result in errors if gaps/Ns were present.
1 parent f296238 commit 851464e

File tree

3 files changed

+21
-11
lines changed

3 files changed

+21
-11
lines changed

DESCRIPTION

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: fastbaps
22
Title: A fast genetic clustering algorithm that approximates a Dirichlet Process Mixture model
3-
Version: 1.0.7
3+
Version: 1.0.8
44
Authors@R: person("Gerry", "Tonkin-Hill", email = "[email protected]", role = c("aut", "cre"))
55
Description: Takes a multiple sequence alignment as input and clusters according to the 'no-admixture' model.
66
It combines ideas from the Bayesian Hierarchical Clustering algorithm of Heller et al. <doi:10.1145/1102351.1102389>
@@ -29,7 +29,7 @@ Suggests: knitr,
2929
ggtree,
3030
ggplot2
3131
LinkingTo: Rcpp, RcppArmadillo
32-
RoxygenNote: 7.2.0
32+
RoxygenNote: 7.2.1
3333
VignetteBuilder: knitr
3434
URL: https://github.com/gtonkinhill/fastbaps
3535
BugReports: https://github.com/gtonkinhill/fastbaps/issues

R/import_fasta_sparse_nt.R

+14-7
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,16 @@
1010
#' @param prior the type of prior to use. Can be one of 'baps' and 'mean' (default=baps)
1111
#' @param check.fasta whether to check the fasta file for issues. Slows things down a little, so it may be disabled for very large fasta files.
1212
#'
13-
#' @return A sparse matrix reprsentation of the SNPs (different to the consensus sequence)
13+
#' @return A sparse matrix representation of the SNPs (different to the consensus sequence)
1414
#'
1515
#' @examples
1616
#' fasta <- system.file("extdata", "seqs.fa", package = "fastbaps")
17-
#' fasta <- ape::read.FASTA(fasta)
1817
#' sparse.data <- import_fasta_sparse_nt(fasta)
1918
#'
19+
#' fasta <- ape::read.FASTA(fasta)
20+
#' sparse.data.ape <- import_fasta_sparse_nt(fasta)
21+
#'
22+
#'
2023
#' @export
2124
import_fasta_sparse_nt <- function(fasta, prior='baps', check.fasta=TRUE){
2225

@@ -38,10 +41,10 @@ import_fasta_sparse_nt <- function(fasta, prior='baps', check.fasta=TRUE){
3841
seqnames <- rownames(fasta)
3942

4043
cons_ref <- c(a=0,c=1,g=2,t=3,`-`=4,`n`=4)
41-
cosensus <- apply(fasta[2:nrow(fasta),,drop=FALSE], 2, function(x){
42-
tbl <- table(x)
43-
cons_ref[names(tbl)[which.max(tbl)]]
44-
})
44+
allele_counts <- apply(fasta, 2, function(x){
45+
c(table(factor(x, levels = c('a','c','g','t','-','n'))))
46+
}, simplify = TRUE)
47+
cosensus <- cons_ref[apply(allele_counts, 2, which.max)]
4548

4649
fasta[fasta=='a'] <- 1
4750
fasta[fasta=='c'] <- 2
@@ -51,7 +54,10 @@ import_fasta_sparse_nt <- function(fasta, prior='baps', check.fasta=TRUE){
5154
fasta[fasta=='n'] <- 5
5255
fasta <- apply(fasta, 2, as.numeric)
5356

54-
ij <- which(t(fasta) != (cosensus+1), arr.ind = TRUE)
57+
ij <- which((t(fasta) != (cosensus+1)) & (t(fasta)!=5), arr.ind = TRUE)
58+
# remove singletons
59+
ij <- ij[allele_counts[cbind(t(fasta)[ij], ij[,1])] > 1, , drop=FALSE]
60+
5561
snp.data <- list(num.seqs=nrow(fasta),
5662
consensus=cosensus,
5763
seq.length=ncol(fasta),
@@ -61,6 +67,7 @@ import_fasta_sparse_nt <- function(fasta, prior='baps', check.fasta=TRUE){
6167
dims = c(snp.data$seq.length, snp.data$num.seqs),
6268
dimnames = list(1:snp.data$seq.length, seqnames)))
6369

70+
6471
} else {
6572
snp.data <- import_fasta_to_vector_each_nt(fasta)
6673
snp.data$seq.names <- gsub("^>","",snp.data$seq.names)

man/import_fasta_sparse_nt.Rd

+5-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)