bug fix to resolve inconsistencies between loading data using the built-in function and pre-loading using ape. Mostly, the latter could sometimes result in errors if gaps/Ns were present.

gtonkinhill · gtonkinhill · commit 851464ee362a · 2022-09-18T23:08:43.000+10:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: fastbaps
 Title: A fast genetic clustering algorithm that approximates a Dirichlet Process Mixture model
-Version: 1.0.7
+Version: 1.0.8
 Authors@R: person("Gerry", "Tonkin-Hill", email = "g.tonkinhill@gmail.com", role = c("aut", "cre"))
 Description: Takes a multiple sequence alignment as input and clusters according to the 'no-admixture' model. 
   It combines ideas from the Bayesian Hierarchical Clustering algorithm of Heller et al. <doi:10.1145/1102351.1102389>
@@ -29,7 +29,7 @@ Suggests: knitr,
     ggtree,
     ggplot2
 LinkingTo: Rcpp, RcppArmadillo
-RoxygenNote: 7.2.0
+RoxygenNote: 7.2.1
 VignetteBuilder: knitr
 URL: https://github.com/gtonkinhill/fastbaps
 BugReports: https://github.com/gtonkinhill/fastbaps/issues
diff --git a/R/import_fasta_sparse_nt.R b/R/import_fasta_sparse_nt.R
@@ -10,13 +10,16 @@
 #' @param prior the type of prior to use. Can be one of 'baps' and 'mean' (default=baps)
 #' @param check.fasta whether to check the fasta file for issue. Slows things down a little but may be avoided for very large fasta files.
 #'
-#' @return A sparse matrix reprsentation of the SNPs (different to the consensus sequence)
+#' @return A sparse matrix representation of the SNPs (different to the consensus sequence)
 #'
 #' @examples
 #' fasta <- system.file("extdata", "seqs.fa", package = "fastbaps")
-#' fasta <- ape::read.FASTA(fasta)
 #' sparse.data <- import_fasta_sparse_nt(fasta)
 #'
+#' fasta <- ape::read.FASTA(fasta)
+#' sparse.data.ape <- import_fasta_sparse_nt(fasta)
+#'
+#'
 #' @export
 import_fasta_sparse_nt <- function(fasta, prior='baps', check.fasta=TRUE){
 
@@ -38,10 +41,10 @@ import_fasta_sparse_nt <- function(fasta, prior='baps', check.fasta=TRUE){
     seqnames <- rownames(fasta)
 
     cons_ref <-  c(a=0,c=1,g=2,t=3,`-`=4,`n`=4)
-    cosensus <- apply(fasta[2:nrow(fasta),,drop=FALSE], 2, function(x){
-      tbl <- table(x)
-      cons_ref[names(tbl)[which.max(tbl)]]
-    })
+    allele_counts <- apply(fasta, 2, function(x){
+      c(table(factor(x, levels = c('a','c','g','t','-','n'))))
+    }, simplify = TRUE)
+    cosensus <- cons_ref[apply(allele_counts, 2, which.max)]
 
     fasta[fasta=='a'] <- 1
     fasta[fasta=='c'] <- 2
@@ -51,7 +54,10 @@ import_fasta_sparse_nt <- function(fasta, prior='baps', check.fasta=TRUE){
     fasta[fasta=='n'] <- 5
     fasta <- apply(fasta, 2, as.numeric)
 
-    ij <- which(t(fasta) != (cosensus+1), arr.ind = TRUE)
+    ij <- which((t(fasta) != (cosensus+1)) & (t(fasta)!=5), arr.ind = TRUE)
+    # remove singletons
+    ij <- ij[allele_counts[cbind(t(fasta)[ij], ij[,1])] > 1, , drop=FALSE]
+
     snp.data <- list(num.seqs=nrow(fasta),
                      consensus=cosensus,
                      seq.length=ncol(fasta),
@@ -61,6 +67,7 @@ import_fasta_sparse_nt <- function(fasta, prior='baps', check.fasta=TRUE){
                                  dims = c(snp.data$seq.length, snp.data$num.seqs),
                                  dimnames = list(1:snp.data$seq.length, seqnames)))
 
+
   } else {
     snp.data <- import_fasta_to_vector_each_nt(fasta)
     snp.data$seq.names <-  gsub("^>","",snp.data$seq.names)
diff --git a/man/import_fasta_sparse_nt.Rd b/man/import_fasta_sparse_nt.Rd