Merge branch 'dev'

Conflicts: README.md bwamem.c bwamem_pair.c fastmap.c main.c
mmokrejs · Sep 16, 2014 · df09f38 · df09f38
2 parents 41b137d + 43ce691
commit df09f38
Show file tree

Hide file tree

Showing 15 changed files with 230 additions and 124 deletions.
diff --git a/README.md b/README.md
@@ -87,16 +87,12 @@ settings:
 * Illumina paired-end reads no longer than ~70bp:
 
 		bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
-		bwa sampe ref.fa reads1.sai reads2.sai reads1.fq reads2.fq > aln-pe.sam
+		bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
 
 * PacBio subreads to a reference genome:
 
 		bwa mem -x pacbio ref.fa reads.fq > aln.sam
 
-* PacBio subreads to themselves (the output is not SAM):
-
-		bwa mem -x pbread reads.fq reads.fq > overlap.pas
-
 BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
 error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
 errors given longer query sequences as the chance of missing all seeds is small.
@@ -130,38 +126,6 @@ case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
 position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment
 as well. BWA-MEM does not have this problem.
 
-####<a name="h38"></a>6. How to map sequences to GRCh38 with ALT contigs?
-
-BWA-backtrack and BWA-MEM partially support mapping to a reference containing
-ALT contigs that represent alternative alleles highly divergent from the
-reference genome.
-
-	# download the K8 executable required by bwa-helper.js
-	wget http://sourceforge.net/projects/lh3/files/k8/k8-0.2.1.tar.bz2/download
-	tar -jxf k8-0.2.1.tar.bz2
-
-	# download the ALT-to-GRCh38 alignment in the SAM format
-	wget http://sourceforge.net/projects/bio-bwa/files/hs38.alt.sam.gz/download
-
-	# download the GRCh38 sequences with ALT contigs
-	wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
-
-	# index and mapping
-	bwa index -p hs38a GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
-	bwa mem -h50 hs38a reads.fq | ./k8-linux bwa-helper.js genalt hs38.alt.sam.gz > out.sam
-
-Here, option `-h50` asks bwa-mem to output multiple hits in the XA tag if the
-read has 50 or fewer hits. For each SAM line containing the XA tag,
-`bwa-helper.js genalt` decodes the alignments in the XA tag, groups hits lifted
-to the same chromosomal region, adjusts mapping quality and outputs all the
-hits overlapping the reported hit. A read may be mapped to both the primary
-assembly and one or more ALT contigs all with high mapping quality.
-
-Note that this procedure assumes reads are single-end and may miss hits to
-highly repetitive regions as these hits will not be reported with option
-`-h50`. `bwa-helper.js` is a prototype implementation not recommended for
-production uses.
-
 
 
 [1]: http://en.wikipedia.org/wiki/GNU_General_Public_License

diff --git a/bntseq.c b/bntseq.c
@@ -37,6 +37,9 @@
 #include "kseq.h"
 KSEQ_DECLARE(gzFile)
 
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, int)
+
 #ifdef USE_MALLOC_WRAPPERS
 #  include "malloc_wrap.h"
 #endif
@@ -94,7 +97,7 @@ void bns_dump(const bntseq_t *bns, const char *prefix)
 
 bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
 {
-	char str[1024];
+	char str[8192];
 	FILE *fp;
 	const char *fname;
 	bntseq_t *bns;
@@ -117,14 +120,14 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
 			if (scanres != 2) goto badread;
 			p->name = strdup(str);
 			// read fasta comments 
-			while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
+			while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
 			while (c != '\n' && c != EOF) c = fgetc(fp);
 			if (c == EOF) {
 				scanres = EOF;
 				goto badread;
 			}
 			*q = 0;
-			if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
+			if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space
 			else p->anno = strdup("");
 			// read the rest
 			scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
@@ -165,11 +168,33 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
 
 bntseq_t *bns_restore(const char *prefix)
 {  
-	char ann_filename[1024], amb_filename[1024], pac_filename[1024];
+	char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024];
+	FILE *fp;
+	bntseq_t *bns;
 	strcat(strcpy(ann_filename, prefix), ".ann");
 	strcat(strcpy(amb_filename, prefix), ".amb");
 	strcat(strcpy(pac_filename, prefix), ".pac");
-	return bns_restore_core(ann_filename, amb_filename, pac_filename);
+	bns = bns_restore_core(ann_filename, amb_filename, pac_filename);
+	if (bns == 0) return 0;
+	if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
+		char str[1024];
+		khash_t(str) *h;
+		int i, absent;
+		khint_t k;
+		h = kh_init(str);
+		for (i = 0; i < bns->n_seqs; ++i) {
+			k = kh_put(str, h, bns->anns[i].name, &absent);
+			kh_val(h, k) = i;
+		}
+		while (fscanf(fp, "%s", str) == 1) {
+			k = kh_get(str, h, str);
+			if (k != kh_end(h))
+				bns->anns[kh_val(h, k)].is_alt = 1;
+		}
+		kh_destroy(str, h);
+		fclose(fp);
+	}
+	return bns;
 }
 
 void bns_destroy(bntseq_t *bns)

diff --git a/bntseq.h b/bntseq.h
@@ -43,6 +43,7 @@ typedef struct {
 	int32_t len;
 	int32_t n_ambs;
 	uint32_t gi;
+	int32_t is_alt;
 	char *name, *anno;
 } bntann1_t;