-
Notifications
You must be signed in to change notification settings - Fork 204
CBW 2024 Advanced Module 2: Metagenomic Assembly and Binning
Robyn Wright edited this page Apr 24, 2024
·
15 revisions
mkdir amb_module2
cd amb_module2
ln -s ~/CourseData/MIC_data/AMB_data/mapped_matched_fastq/ .
ln -s ~/CourseData/MIC_data/AMB_data/raw_data/ .
cp ~/CourseData/MIC_data/AMB_data/sample_ids.txt .
cp ~/CourseData/MIC_data/AMB_data/mgs_metadata.txt .
conda activate /home/ubuntu/CourseData/MIC_data/tools/anvio-7
# Assemble with megahit
R1=$( ls mapped_matched_fastq/*_R1.fastq | tr '\n' ',' | sed 's/,$//' )
R2=$( ls mapped_matched_fastq/*_R2.fastq | tr '\n' ',' | sed 's/,$//' )
mkdir anvio
megahit -1 $R1 \
-2 $R2 \
--min-contig-len 1000 \
--num-cpu-threads 2 \
--presets meta-large \
--memory 0.8 \
-o anvio/megahit_out \
--verbose
# May need to pre-prepare this step
# Remove intermediate contigs to save space
rm -r megahit_out/*/intermediate_contigs
mkdir anvio
mkdir anvio/megahit_out
cp ~/CourseData/MIC_data/AMB_data/final.contigs.fa anvio/megahit_out/
anvi-script-reformat-fasta anvio/megahit_out/final.contigs.fa \
--simplify-names \
--min-len 2500 \
-o anvio/megahit_out/final.contigs.fixed.fa
mkdir anvio/anvio_databases
anvi-gen-contigs-database -f anvio/megahit_out/final.contigs.fixed.fa \
-o anvio/anvio_databases/CONTIGS.db \
-n HMP2
anvi-run-hmms -c anvio/anvio_databases/CONTIGS.db \
--num-threads 2
anvi-get-sequences-for-gene-calls -c anvio/anvio_databases/CONTIGS.db \
-o anvio/anvio_databases/gene_calls.fa
bowtie2-build anvio/megahit_out/final.contigs.fixed.fa \
anvio/megahit_out/final.contigs.fixed
mkdir anvio/bam_files
# Map contigs to samples
for SAMPLE in `awk '{print $1}' sample_ids.txt`
do
# do the bowtie mapping to get the SAM file:
bowtie2 --threads 2 \
-x anvio/megahit_out/final.contigs.fixed \
-1 "raw_data/"$SAMPLE"_R1_subsampled.fastq.gz" \
-2 "raw_data/"$SAMPLE"_R2_subsampled.fastq.gz" \
--no-unal \
-S anvio/bam_files/$SAMPLE.sam
# convert the resulting SAM file to a BAM file:
samtools view -F 4 -bS anvio/bam_files/$SAMPLE.sam > anvio/bam_files/$SAMPLE-RAW.bam
anvi-init-bam anvio/bam_files/$SAMPLE-RAW.bam -o anvio/bam_files/$SAMPLE.bam
rm anvio/bam_files/$SAMPLE-RAW.bam
done
mkdir anvio/anvio_databases/profiles
for SAMPLE in `awk '{print $1}' sample_ids.txt`
do
anvi-profile -c anvio/anvio_databases/CONTIGS.db \
-i anvio/bam_files/$SAMPLE.bam \
--num-threads 2 \
-o anvio/anvio_databases/profiles/$SAMPLE
done
anvi-merge -c anvio/anvio_databases/CONTIGS.db \
-o anvio/anvio_databases/merged_profiles \
anvio/anvio_databases/profiles/*/PROFILE.db
# Display contig stats
anvi-display-contigs-stats --report-as-text \
--output-file contigs_stats.txt \
anvio/anvio_databases/CONTIGS.db
anvi-cluster-contigs -c anvio/anvio_databases/CONTIGS.db \
-p anvio/anvio_databases/merged_profiles/PROFILE.db \
-C "merged_concoct_2500" \
--driver CONCOCT \
--length-threshold 2500 \
--num-threads 2 \
--just-do-it
mkdir anvio/concoct_summary_2500
anvi-summarize -c anvio/anvio_databases/CONTIGS.db \
-p anvio/anvio_databases/merged_profiles/PROFILE.db \
-C "merged_concoct_2500" \
-o anvio/concoct_summary_2500/summary/
#now take a look at the summary of all bins!
less anvio/concoct_summary_2500/summary/bins_summary.txt
less anvio/concoct_summary_2500/summary/bins_across_samples/abundance.txt
less anvio/concoct_summary_2500/summary/bins_across_samples/mean_coverage.txt
#now go to amb_module2/anvio/concoct_summary_2500/ and then right click on summary and click open in new tab
#now go to amb_module2/anvio/concoct_summary_2500 and add /summary/bins_summary.txt to the URL bar
#copy and paste this into a new excel (or similar) document - it will be useful to refer back to
#lets first look at the top bin that's 100% complete and 0% redundant - for me, this is called Bin_17, but I don't know if this naming is always consistent, so if that's a different bin for you, then that's fine!
anvi-refine -c anvio/anvio_databases/CONTIGS.db \
-p anvio/anvio_databases/merged_profiles/PROFILE.db \
-C "merged_concoct_2500" \
-b Bin_17 --server-only -P 8081
#in second window
ssh -L 8081:localhost:8081 -i instructor.pem [email protected]
#go to http://localhost:8081/ in browser
#the press "Draw" in the browser window!
#click on the "Bins" tab
#If we select the whole tree, we can see that this is 100% complete and 0% redundant. If we unselect (right click) some of the tree branches, this makes the completion and length of the genome go down slightly, but doesn't affect the redundancy. We'll leave this and try another.
#ctrl+c to stop
anvi-refine -c anvio/anvio_databases/CONTIGS.db \
-p anvio/anvio_databases/merged_profiles/PROFILE.db \
-C "merged_concoct_2500" \
-b Bin_27 --server-only -P 8081
#Refresh your Anv'io browser window and you should see a different chart now.
#Again, click "Draw" and then go to the "Bins tab"
#This one is 59% complete but has 54.9% redundancy - we ideally want to get the redundancy below 10% while keeping the completion above 50%
#Start by deselecting (right clicking) branches that appear to have a different abundance profile than others. There is one that seems to only be present in sample CSM7KOMH - deselecting this one only reduces the completion to 56.3%, but reduces redundancy to 15.5%
#You can deselect large areas to determine whether they'll have an impact on the redundancy, and then go through in more detail and deselect/reselect individual parts of this
#Make sure you use the zoom +/- and zoom in and out to see the tree in more detail, as well as checking the abundance/GC content profiles of the parts you've deselected
#Sometimes, you might want to add another bin to keep some of the contigs that you've deselected from the first one
#When you are happy with what you've selected, click on "Store refined bins in the database"
#The other two bins that are >50% completion are already <10% redundancy so it's not necessary to do anything to these, but you can have a look at them if you like
anvi-rename-bins -c anvio/anvio_databases/CONTIGS.db \
-p anvio/anvio_databases/merged_profiles/PROFILE.db \
--collection-to-read merged_concoct_2500 \
--collection-to-write FINAL \
--size-for-MAG 1 \
--call-MAGs \
--min-completion-for-MAG 50 \
--max-redundancy-for-MAG 10 \
--prefix HMP2 \
--report-file renaming_bins.txt
anvi-summarize -c anvio/anvio_databases/CONTIGS.db \
-p anvio/anvio_databases/merged_profiles/PROFILE.db \
-C "FINAL" \
-o anvio/FINAL_summary/
#Now we'll have a look at the bins that we've created. Take a look in the folder anvio/FINAL_summary/bin_by_bin
#You should see that some of these are called *MAG* while some are called *Bin* - this is because in the command above, we're only calling things a MAG if they're >50% completion and <10% redundancy. Have a look in one of those MAG folders - you'll see a lot of statistics, as well as a *-contigs.fa. This is a fasta file of the contigs used for each MAG, and we can use it as their genome for further analyses. Now we'll create a copy of these in a new folder.
mkdir MAG_fasta
cp anvio/FINAL_summary/bin_by_bin/*MAG*/*contigs.fa MAG_fasta/
ls MAG_fasta/
#for some reason this has saved the MAGs with <50% completion also, but that's OK. We can delete them if we want - just check their completion in the new bins_summary.txt file
#Now that we have our refined MAGs, we can do anything that we like with them!
export CHECKM_DATA_PATH=/home/ubuntu/CourseData/MIC_data/tools/CheckM_databases
checkm lineage_wf --reduced_tree -t 12 -x fa MAG_fasta MAGs-CHECKM-lineage -f MAGs-CHECKM.txt --tab_table
checkm tree_qa MAGs-CHECKM-lineage -f MAGs-CHECKM-tax.txt --tab_table
less MAGs-CHECKM.txt
less MAGs-CHECKM-tax.txt
Get necessary databases:
conda activate rgi
cd ..
mkdir card_data
cd card_data
wget https://card.mcmaster.ca/latest/data
tar -xvf data ./card.json
rgi load --card_json ./card.json --local
rgi database --version --local
#3.2.9
rgi clean --local
wget -O wildcard_data.tar.bz2 https://card.mcmaster.ca/latest/variants
mkdir -p wildcard
tar -xjf wildcard_data.tar.bz2 -C wildcard
gunzip wildcard/*.gz
rgi card_annotation -i localDB/card.json > card_annotation.log 2>&1
rgi wildcard_annotation -i wildcard --card_json localDB/card.json -v v3.2.9 > wildcard_annotation.log 2>&1
rgi load \
--card_json localDB/card.json \
--debug --local \
--card_annotation card_database_v3.2.9.fasta \
--wildcard_annotation wildcard_database_vv3.2.9.fasta \
--wildcard_index wildcard/index-for-model-sequences.txt \
--wildcard_version 3.2.9 \
--amr_kmers wildcard/all_amr_61mers.txt \
--kmer_database wildcard/61_kmer_db.json \
--kmer_size 61
cd ..
Run:
mkdir amb_module2/card_out/
cd card_data
parallel -j 1 'rgi main -i {} -o ~/workspace/amb_module2/card_out/{/.} -t contig -a DIAMOND -n 2 --include_loose --local --clean' ::: ~/workspace/amb_module2/MAG_fasta/*
cd ~/workspace/amb_module2/
rgi heatmap --input card_out/ --output card_heatmap
#Output file card_heatmap-7: Yellow represents a perfect hit, teal represents a strict hit, purple represents no hit.
Explore the other options within Anvi'o
Some other things that can be done with MAGs:
- GTDB-tk
- Please feel free to post a question on the Microbiome Helper google group if you have any issues.
- General comments or inquires about Microbiome Helper can be sent to [email protected].