-
Notifications
You must be signed in to change notification settings - Fork 204
CBW IMPACTT Metagenome sequence data analysis with Kraken2 and Phyloseq
Robyn Wright edited this page Jul 2, 2023
·
16 revisions
conda activate taxonomic
mkdir metagenome_workshop
cd metagenome_workshop/
ln -s ~/CourseData/MIC_data/metagenome_data/cat_reads/ .
mkdir kraken2_kreport
mkdir kraken2_outraw
parallel -j 1 --eta 'kraken2 --db ~/CourseData/MIC_data/metagenome_data/k2_pluspf_08gb_20230314/ --threads 4 --output kraken2_outraw/{1/.}_{2}_minikraken.kraken.txt --report kraken2_kreport/{1/.}_{2}_minikraken.kreport --use-names {1} --confidence {2}' ::: cat_reads/*.fastq ::: 0.0 0.1
conda activate bracken
mkdir bracken_out
parallel -j 2 'bracken -d ~/CourseData/MIC_data/metagenome_data/k2_pluspf_08gb_20230314/ -i {} -o bracken_out/{/.}.species.bracken -r 100 -l S -t 1' ::: kraken2_kreport/*minikraken.kreport
mkdir bracken_out_merged
combine_bracken_outputs.py --files bracken_out/*minikraken.species.bracken -o bracken_out_merged/merged_output_minikraken.species.bracken
conda deactivate
conda activate taxonomic
ln -s ~/CourseData/MIC_data/metagenome_data/kraken2_kreport/*RefSeq* kraken2_kreport/
ln -s ~/CourseData/MIC_data/metagenome_data/kraken2_outraw/*RefSeq* kraken2_outraw/
ln -s ~/CourseData/MIC_data/metagenome_data/bracken_out_merged/merged_output_RefSeq.species.bracken bracken_out_merged/
ln -s ~/CourseData/MIC_data/metagenome_data/combine_merged_files.py .
python combine_merged_files.py
Set up:
library(phyloseq)
library(vegan)
library(ggplot2)
library(randomcoloR)
library(gridExtra)
library(tidyr)
Import metadata:
metadata = read.csv('workspace/metagenome_workshop/Blueberry_metadata_metagenome.tsv', sep='\t')
rownames(metadata) = metadata$SampleID
metadata = metadata[,-1]
Import merged bracken output:
bracken = read.csv('workspace/metagenome_workshop/bracken_out_merged/bracken_combined.csv')
bracken = as.data.frame(bracken)
row.names(bracken) = bracken$name
taxonomy = bracken[, c(1,50)]
row.names(taxonomy) = row.names(bracken)
table_num = data.matrix(bracken[,2:49])
rownames(table_num) = bracken[,1]
bracken = as.matrix(table_num)
taxonomy <- separate(data = taxonomy, col = Taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = "\\;")
taxmat <- taxonomy[,-1]
taxonomy = tax_table(taxmat)
taxa_names(taxonomy) <- rownames(taxmat)
Make phyloseq object:
physeq = phyloseq(otu_table(bracken, taxa_are_rows = TRUE), sample_data(metadata), taxonomy)
Plot sample sums:
sums = as.data.frame(sample_sums(physeq))
sums[,'Database'] = sample_data(physeq)$Database
sums[,'Confidence.Threshold'] = sample_data(physeq)$Confidence.Threshold
colnames(sums) = c('Sum', 'Database', 'Confidence.Threshold')
sums$Confidence.Threshold <- as.factor(sums$Confidence.Threshold)
ggplot(sums, aes(x=Confidence.Threshold, y=Sum, fill=Database)) + geom_boxplot() + scale_y_log10() + labs(y= "Number of reads classified", x = "Confidence Threshold")
Look at only full database and normalise:
db = "RefSeqV214"
conf = 0.1
physeq_red <- prune_samples(sample_data(physeq)$Database == db, physeq)
physeq_red <- prune_samples(sample_data(physeq_red)$Confidence.Threshold == conf, physeq_red)
physeq_rare <- rarefy_even_depth(physeq_red, sample.size = min(sample_sums(physeq_red)), replace = TRUE, trimOTUs = TRUE, verbose = TRUE) #rarefy to the lowest sample depth
physeq_relabun <- transform_sample_counts(physeq_red, function(x) (x / sum(x))*100) #convert to relative abundance
Plot stacked bar at domain level:
palette = distinctColorPalette(30)
rnk = "ta1"
ps.rank = tax_glom(physeq_relabun, taxrank=rnk, NArm=FALSE)
plot_bar(ps.rank, fill=rnk) + facet_wrap(c(~Description_1, ~Description_3), scales="free_x", nrow=1) + theme(legend.text=element_text(size=5), legend.key.size = unit(0.3, "cm")) + guides(fill=guide_legend(ncol=1)) + scale_fill_manual(values=palette)
Plot stacked bar at phylum level:
palette = distinctColorPalette(30)
rnk = "ta2"
ps.rank = tax_glom(physeq_relabun, taxrank=rnk, NArm=FALSE)
rank.sum = tapply(taxa_sums(ps.rank), tax_table(ps.rank)[, rnk], sum, na.rm=TRUE)
top30 = names(sort(rank.sum, TRUE))[1:30]
ps.rank = prune_taxa((tax_table(ps.rank)[, rnk] %in% top30), ps.rank)
plot_bar(ps.rank, fill=rnk) + facet_wrap(c(~Description_1, ~Description_3), scales="free_x", nrow=1) + theme(legend.text=element_text(size=5), legend.key.size = unit(0.3, "cm")) + guides(fill=guide_legend(ncol=1)) + scale_fill_manual(values=palette)
Alpha diversity:
plot_richness(physeq_rare, measures=c("Observed", "Chao1", "Simpson", "Shannon"))
plot_richness(physeq_rare, x="Description_1", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
plot_richness(physeq_rare, x="Description_3", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
Beta diversity:
ps = physeq_rare
ps.ord <- ordinate(ps, "PCoA", "bray")
plot_ordination(ps, ps.ord, type="samples", color="Description_1", shape="Description_3")
distance <- phyloseq::distance(ps, method="bray", weighted=F)
adonis2(distance ~ sample_data(ps)$Description_1*sample_data(ps)$Description_3)
- Please feel free to post a question on the Microbiome Helper google group if you have any issues.
- General comments or inquires about Microbiome Helper can be sent to [email protected].