Skip to content

CBW IMPACTT Metagenome sequence data analysis with Kraken2 and Phyloseq

Robyn Wright edited this page Jul 2, 2023 · 16 revisions

Command line

conda activate taxonomic
mkdir metagenome_workshop
cd metagenome_workshop/
ln -s ~/CourseData/MIC_data/metagenome_data/cat_reads/ .
mkdir kraken2_kreport
mkdir kraken2_outraw
parallel -j 1 --eta 'kraken2 --db  ~/CourseData/MIC_data/metagenome_data/k2_pluspf_08gb_20230314/ --threads 4 --output kraken2_outraw/{1/.}_{2}_minikraken.kraken.txt --report kraken2_kreport/{1/.}_{2}_minikraken.kreport --use-names {1} --confidence {2}' ::: cat_reads/*.fastq ::: 0.0 0.1
conda activate bracken
mkdir bracken_out
parallel -j 2 'bracken -d ~/CourseData/MIC_data/metagenome_data/k2_pluspf_08gb_20230314/ -i {} -o bracken_out/{/.}.species.bracken -r 100 -l S -t 1' ::: kraken2_kreport/*minikraken.kreport
mkdir bracken_out_merged
combine_bracken_outputs.py --files bracken_out/*minikraken.species.bracken -o bracken_out_merged/merged_output_minikraken.species.bracken
conda deactivate
conda activate taxonomic
ln -s ~/CourseData/MIC_data/metagenome_data/kraken2_kreport/*RefSeq* kraken2_kreport/
ln -s ~/CourseData/MIC_data/metagenome_data/kraken2_outraw/*RefSeq* kraken2_outraw/
ln -s ~/CourseData/MIC_data/metagenome_data/bracken_out_merged/merged_output_RefSeq.species.bracken bracken_out_merged/
combine_merged_files.py

R

Set up:

library(phyloseq)
library(vegan)
library(ggplot2)
library(randomcoloR)
library(gridExtra)
library(tidyr)

Import metadata:

metadata = read.csv('workspace/metagenome_workshop/Blueberry_metadata_metagenome.tsv', sep='\t')
rownames(metadata) = metadata$SampleID
metadata = metadata[,-1]

Import merged bracken output:

bracken = read.csv('workspace/metagenome_workshop/bracken_out_merged/bracken_combined.csv')
bracken = as.data.frame(bracken)
row.names(bracken) = bracken$name

taxonomy = bracken[, c(1,50)]
row.names(taxonomy) = row.names(bracken)

table_num = data.matrix(bracken[,2:49])
rownames(table_num) = bracken[,1]
bracken = as.matrix(table_num)

taxonomy <- separate(data = taxonomy, col = Taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = "\\;")
taxmat <- taxonomy[,-1]
taxonomy = tax_table(taxmat)
taxa_names(taxonomy) <- rownames(taxmat)

Make phyloseq object:

physeq = phyloseq(otu_table(bracken, taxa_are_rows = TRUE), sample_data(metadata), taxonomy)

Plot sample sums:

sums = as.data.frame(sample_sums(physeq))
sums[,'Database'] = sample_data(physeq)$Database
sums[,'Confidence.Threshold'] = sample_data(physeq)$Confidence.Threshold
colnames(sums) = c('Sum', 'Database', 'Confidence.Threshold')
sums$Confidence.Threshold <- as.factor(sums$Confidence.Threshold)
ggplot(sums, aes(x=Confidence.Threshold, y=Sum, fill=Database)) + geom_boxplot() + scale_y_log10() + labs(y= "Number of reads classified", x = "Confidence Threshold")

Look at only full database and normalise:

db = "RefSeqV214"
conf = 0.1
physeq_red <- prune_samples(sample_data(physeq)$Database == db, physeq)
physeq_red <- prune_samples(sample_data(physeq_red)$Confidence.Threshold == conf, physeq_red)
physeq_rare <- rarefy_even_depth(physeq_red, sample.size = min(sample_sums(physeq_red)), replace = TRUE, trimOTUs = TRUE, verbose = TRUE) #rarefy to the lowest sample depth
physeq_relabun  <- transform_sample_counts(physeq_red, function(x) (x / sum(x))*100) #convert to relative abundance

Plot stacked bar at domain level:

palette = distinctColorPalette(30)
rnk = "ta1"
ps.rank = tax_glom(physeq_relabun, taxrank=rnk, NArm=FALSE)

plot_bar(ps.rank, fill=rnk) + facet_wrap(c(~Description_1, ~Description_3), scales="free_x", nrow=1) + theme(legend.text=element_text(size=5), legend.key.size = unit(0.3, "cm")) + guides(fill=guide_legend(ncol=1)) + scale_fill_manual(values=palette)

Plot stacked bar at phylum level:

palette = distinctColorPalette(30)
rnk = "ta2"
ps.rank = tax_glom(physeq_relabun, taxrank=rnk, NArm=FALSE)
rank.sum = tapply(taxa_sums(ps.rank), tax_table(ps.rank)[, rnk], sum, na.rm=TRUE)
top30 = names(sort(rank.sum, TRUE))[1:30]
ps.rank = prune_taxa((tax_table(ps.rank)[, rnk] %in% top30), ps.rank)

plot_bar(ps.rank, fill=rnk) + facet_wrap(c(~Description_1, ~Description_3), scales="free_x", nrow=1) + theme(legend.text=element_text(size=5), legend.key.size = unit(0.3, "cm")) + guides(fill=guide_legend(ncol=1)) + scale_fill_manual(values=palette)

Alpha diversity:

plot_richness(physeq_rare, measures=c("Observed", "Chao1", "Simpson", "Shannon"))
plot_richness(physeq_rare, x="Description_1", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
plot_richness(physeq_rare, x="Description_3", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()

Beta diversity:

ps = physeq_rare
ps.ord <- ordinate(ps, "PCoA", "bray")
plot_ordination(ps, ps.ord, type="samples", color="Description_1", shape="Description_3")
distance <- phyloseq::distance(ps, method="bray", weighted=F)
adonis2(distance ~ sample_data(ps)$Description_1*sample_data(ps)$Description_3)
Clone this wiki locally