CBW 2024 Advanced Module 2: Metagenomic Assembly and Binning

Assemble with Megahit

# Assemble with megahit
R1=$( ls mapped_matched_fastq/*_R1.fastq | tr '\n' ',' | sed 's/,$//' )
R2=$( ls mapped_matched_fastq/*_R2.fastq | tr '\n' ',' | sed 's/,$//' )
mkdir anvio
    
megahit -1 $R1 \
         -2 $R2 \
         --min-contig-len 1000 \
         --num-cpu-threads 12 \
         --presets meta-large \
         --memory 0.1 \
         -o anvio/megahit_out \
        --verbose
# May need to pre-prepare this step

# Remove intermediate contigs to save space
rm -r megahit_out/*/intermediate_contigs

Reformat for Anvi'o and make contigs databases

# Reformat fasta for Anvi'o
anvi-script-reformat-fasta anvio-7/megahit_out/final.contigs.fa \
                               --simplify-names \
                               --min-len 2500 \
                               -o anvio-7/megahit_out/final.contigs.fixed.fa
# make Anvio database                               
mkdir anvio-7/anvio_databases
anvi-gen-contigs-database -f anvio-7/megahit_out/final.contigs.fixed.fa \
                              -o anvio-7/anvio_databases/CONTIGS.db \
                              -n HMP2

Run HMMs to identify genes

# Run HMMs to identify genes
anvi-run-hmms -c anvio-7/anvio_databases/CONTIGS.db --num-threads 12
anvi-get-sequences-for-gene-calls -c anvio-7/anvio_databases/CONTIGS.db -o anvio-7/anvio_databases/gene_calls.fa

Make bowtie2 database of contigs and map contigs to samples

bowtie2-build anvio-7/megahit_out/final.contigs.fixed.fa anvio-7/megahit_out/final.contigs.fixed
mkdir anvio-7/bam_files

# Map contigs to samples 

for SAMPLE in `awk '{print $1}' sample_ids.txt`
do

    # do the bowtie mapping to get the SAM file:
    bowtie2 --threads 12 \
            -x anvio-7/megahit_out/final.contigs.fixed \
            -1 "raw_data/"$SAMPLE"_R1_subsampled.fastq.gz" \
            -2 "raw_data/"$SAMPLE"_R2_subsampled.fastq.gz" \
            --no-unal \
            -S anvio-7/bam_files/$SAMPLE.sam

    # convert the resulting SAM file to a BAM file:
    samtools view -F 4 -bS anvio-7/bam_files/$SAMPLE.sam > anvio-7/bam_files/$SAMPLE-RAW.bam
    
    anvi-init-bam anvio-7/bam_files/$SAMPLE-RAW.bam -o anvio-7/bam_files/$SAMPLE.bam
    rm anvio-7/bam_files/$SAMPLE-RAW.bam

    # sort and index the BAM file:
    # samtools sort anvio/bam_files/$SAMPLE-RAW.bam -o anvio/bam_files/$SAMPLE.bam
    # samtools index anvio/bam_files/$SAMPLE.bam

    # remove temporary files:
    # rm anvio/bam_files/$SAMPLE.sam anvio/bam_files/$SAMPLE-RAW.bam

done

Add coverage and detection statistics to the Anv'io profile

# Make the Anvi'o profile databases that contain coverage and detection statistics
mkdir anvio-7/anvio_databases/profiles

for SAMPLE in `awk '{print $1}' sample_ids.txt`
do

    anvi-profile -c anvio-7/anvio_databases/CONTIGS.db \
                 -i anvio-7/bam_files/$SAMPLE.bam \
                 --num-threads 12 \
                 -o anvio-7/anvio_databases/profiles/$SAMPLE
done

Merge sample profiles

# Merge the sample profiles
anvi-merge -c anvio-7/anvio_databases/CONTIGS.db \
           -o anvio-7/anvio_databases/merged_profiles \
           anvio-7/anvio_databases/profiles/*/PROFILE.db
           
# Display contig stats
anvi-display-contigs-stats --report-as-text \
                           --output-file contigs_stats.txt \
                           anvio-7/anvio_databases/CONTIGS.db

Cluster contigs

anvi-cluster-contigs -c anvio-7/anvio_databases/CONTIGS.db \
                         -p anvio-7/anvio_databases/merged_profiles/PROFILE.db \
                         -C "merged_concoct_2500" \
                         --driver CONCOCT \
                         --length-threshold 2500 \
                         --num-threads 12 \
                         --just-do-it

mkdir anvio-7/concoct_summary_2500
anvi-summarize -c anvio-7/anvio_databases/CONTIGS.db \
                   -p anvio-7/anvio_databases/merged_profiles/PROFILE.db \
                   -C "merged_concoct_2500" \
                   -o anvio-7/concoct_summary_2500/summary

#now take a look at the summary of all bins!
less anvio-7/concoct_summary_2500/summary/bins_summary.txt
less anvio-7/concoct_summary_2500/summary/bins_across_samples/abundance.txt
less anvio-7/concoct_summary_2500/summary/bins_across_samples/mean_coverage.txt
#lets download these files so we can open them up and keep referring to them

anvi-estimate-genome-completeness -c anvio_databases/CONTIGS.db \
                                  -p anvio_databases/merged_profiles/PROFILE.db \
                                  -C "merged_concoct_5000"

Now the interactive refinement!

#lets first look at the top bin that's 100% complete and 0% redundant - for me, this is called Bin_17, but I don't know if this naming is always consistent, so if that's a different bin for you, then that's fine!
anvi-refine -c anvio-7/anvio_databases/CONTIGS.db \
            -p anvio-7/anvio_databases/merged_profiles/PROFILE.db \
            -C "merged_concoct_2500" \
            -b Bin_17 --server-only -P 8082
            
#in second window
ssh -L 8082:localhost:8082 vulcan
#go to http://localhost:8082/ in browser
#the press "Draw" in the browser window!
#click on the "Bins" tab

#If we select the whole tree, we can see that this is 100% complete and 0% redundant. If we unselect (right click) some of the tree branches, this makes the completion and length of the genome go down slightly, but doesn't affect the redundancy. We'll leave this and try another.
#ctrl+c to stop
anvi-refine -c anvio-7/anvio_databases/CONTIGS.db \
            -p anvio-7/anvio_databases/merged_profiles/PROFILE.db \
            -C "merged_concoct_2500" \
            -b Bin_27 --server-only -P 8082
#Refresh your Anv'io browser window and you should see a different chart now.
#Again, click "Draw" and then go to the "Bins tab"
#This one is 59% complete but has 54.9% redundancy - we ideally want to get the redundancy below 10% while keeping the completion above 50%
#Start by deselecting (right clicking) branches that appear to have a different abundance profile than others. There is one that seems to only be present in sample CSM7KOMH - deselecting this one only reduces the completion to 56.3%, but reduces redundancy to 15.5%
#You can deselect large areas to determine whether they'll have an impact on the redundancy, and then go through in more detail and deselect/reselect individual parts of this
#Make sure you use the zoom +/- and zoom in and out to see the tree in more detail, as well as checking the abundance/GC content profiles of the parts you've deselected
#Sometimes, you might want to add another bin to keep some of the contigs that you've deselected from the first one
#When you are happy with what you've selected, click on "Store refined bins in the database"
#The other two bins that are >50% completion are already <10% redundancy so it's not necessary to do anything to these, but you can have a look at them if you like

Rename bins

#anvi-delete-collection -p anvio-7/anvio_databases/merged_profiles/PROFILE.db \
#                      -C FINAL

anvi-rename-bins -c anvio-7/anvio_databases/CONTIGS.db \
                     -p anvio-7/anvio_databases/merged_profiles/PROFILE.db \
                     --collection-to-read merged_concoct_2500 \
                     --collection-to-write FINAL \
                     --size-for-MAG 1 \
                     --call-MAGs \
                     --min-completion-for-MAG 50 \
                     --max-redundancy-for-MAG 10 \
                     --prefix HMP2 \
                     --report-file renaming_bins.txt

Make new collection with these

anvi-summarize -c anvio-7/anvio_databases/CONTIGS.db \
                   -p anvio-7/anvio_databases/merged_profiles/PROFILE.db \
                   -C "FINAL" \
                   -o anvio-7/FINAL_summary/
                   
#Now we'll have a look at the bins that we've created. Take a look in the folder anvio-7/FINAL_summary/bin_by_bin
#You should see that some of these are called *MAG* while some are called *Bin* - this is because in the command above, we're only calling things a MAG if they're >50% completion and <10% redundancy. Have a look in one of those MAG folders - you'll see a lot of statistics, as well as a *-contigs.fa. This is a fasta file of the contigs used for each MAG, and we can use it as their genome for further analyses. Now we'll create a copy of these in a new folder.
mkdir MAG_fasta
cp anvio-7/FINAL_summary/bin_by_bin/*MAG*/*contigs.fa MAG_fasta/
ls MAG_fasta/
#for some reason this has saved the MAGs with <50% completion also, but that's OK. We can delete them if we want - just check their completion in the new bins_summary.txt file
#Now that we have our refined MAGs, we can do anything that we like with them!

If you have time....

Assign taxonomy to the MAGs - there are several programs that I'd usually use, CheckM, GTDB-tk, but unfortunately they use quite a lot of memory.

Here's how I've run them: blah blah blah

And this is the output from them:

You could also try NCBI BLAST:

Now we can get taxonomy of these

Contact

Please feel free to post a question on the Microbiome Helper google group if you have any issues.
General comments or inquires about Microbiome Helper can be sent to [email protected].