Skip to content

CBW 2024 Advanced Module 3: Metagenomic functional annotation

Robyn Wright edited this page May 1, 2024 · 12 revisions

General functional annotation using MMSeqs in the Unix command line

Copy in the data we will be using

cd workspace
mkdir amb_module3
cd amb_module3
ln -s ~/CourseData/MIC_data/AMB_data/raw_data/ .
ln -s ~/CourseData/MIC_data/AMB_data/scripts/ .
ln -s ~/CourseData/MIC_data/AMB_data/MMSeqs2_db/ .

#edit script
vi scripts/concat_paired_end.pl 
#press "i" to edit the documents
#replace top line with #!/home/ubuntu/CourseData/MIC_data/.conda/envs/functional/bin/perl
#press esc and then ":x" 

scripts/concat_paired_end.pl -p 4 -o cat_reads raw_data/*.fastq.gz
gunzip cat_reads/*
#tar -xvf ~/workshops/metagenome_tutorial/MMSeqs2_db.tar.gz
ln -s ~/CourseData/MIC_data/AMB_data/mgs_metadata.txt .

Run MMSeqs

mkdir mmseqs_U90_out
parallel -j 4 --progress 'mmseqs createdb {} mmseqs_U90_out/mmseqs-{/.}-queryDB' ::: cat_reads/*
parallel -j 1 --progress 'mmseqs search mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB tmp --db-load-mode 3 --threads 2 --max-seqs 25 -s 1 -a -e 1e-5' ::: cat_reads/*
#it doesn't work!
cp ~/CourseData/MIC_data/AMB_data/mmseqs_U90_out/*resultDB* mmseqs_U90_out/

parallel -j 2 --progress 'mmseqs convertalis mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB mmseqs_U90_out/mmseqs-{/.}-s1.m8 --db-load-mode 2' ::: cat_reads/*
#if we didn't want the output, we could run it like this:
#parallel -j 1 'mmseqs convertalis mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB mmseqs_U90_out/mmseqs-{/.}-s1.m8 --db-load-mode 2 > /dev/null 2>&1' ::: cat_reads/*
mkdir mmseqs_m8_files
mv mmseqs_U90_out/*.m8 mmseqs_m8_files/

Get MMSeqs top hits

less mmseqs_m8_files/mmseqs-HSM7J4QT-s1.m8 
mkdir mmseqs_U90_out_tophit
python3 scripts/pick_uniref_top_hit.py --unirefm8Dir mmseqs_m8_files --output_path mmseqs_U90_out_tophit

Now let's make a file that maps from our kraken output to the MMSeqs output

python3 #press enter
import os

samples = os.listdir('cat_reads/')
samples = [s.split('.')[0] for s in samples]
print(samples)

kraken_path = '~/CourseData/MIC_data/AMB_data/kraken2_outraw_RefSeqCompleteV205/'
kraken_suffix = '_0.1_RefSeqCompleteV205.kraken.txt'
mmseqs_path = 'mmseqs_U90_out_tophit/mmseqs-'
mmseqs_suffix = '-s1.m8-parsed.txt'
mmseqs_m8_path = 'mmseqs_m8_files-'
mmseqs_m8_suffix = '-s1.m8'

with open('multi-sample-outfiles-w-m8.txt', 'w') as f:
  for sample in samples:
    string = sample+'\t'
    string += kraken_path+sample+kraken_suffix+'\t'+'kraken2'+'\t'
    string += mmseqs_path+sample+mmseqs_suffix+'\t'+'uniref'+'\t'
    string += mmseqs_m8_path+sample+mmseqs_m8_suffix+'\n'
    f.write(string)

quit() #press enter

Get taxonomy and function and normalise

ln -s MMSeqs2_db/*.pbz2 .
python3 scripts/parse_TaxonomyFunction.py --multisample multi-sample-outfiles-w-m8.txt --outputf Workshop_strat_matrix_RPKM.txt --stratified Y --map2EC Y

This might take a while, or it might say "killed". Either way, we can copy the output:

ln -s ~/CourseData/MIC_data/AMB_data/Workshop_strat_matrix_RPKM.txt .
less Workshop_strat_matrix_RPKM.txt

Reformat output

Split the function column to taxonomy and function:

conda activate picrust2
python
import pandas as pd
file = pd.read_csv('Workshop_strat_matrix_RPKM.txt', header=0, sep='\t')
samples = [s for s in file.columns if s != 'function']
file[['EC','taxon']] = file['function'].str.split('|',expand=True)
file = file.drop('function', axis=1).loc[:, ['EC', 'taxon']+samples]
file.to_csv('Workshop_strat_matrix_RPKM_split.txt', index=False, sep='\t')
quit()

add_descriptions.py -i Workshop_strat_matrix_RPKM_split.txt -o Workshop_strat_matrix_RPKM_split_des.txt -m EC

Make the taxonomy dictionary:

#if you want to see how this is made for yourself, you can do that like this. If you just want to copy the pre-made one then that's fine, too.
mkdir taxonomy
cd taxonomy
wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -xvf taxdump.tar.gz

#nodes.dmp = taxid, parent taxid
#names.dmp = taxid, name

cd ..

python
import pickle
nodes_dict = {}
names_dict = {}
nodes_level_dict = {}

for line in open('taxonomy/nodes.dmp', 'r'):
  this_line = line.replace('\t', '').split('|')
  node, parent, level = this_line[0], this_line[1], this_line[2]
  nodes_dict[node] = parent
  nodes_level_dict[node] = level

for line in open('taxonomy/names.dmp', 'r'):
  this_line = line.replace('\t', '').split('|')
  taxid, name, name_type = this_line[0], this_line[1], this_line[3]
  if name_type == 'scientific name':
    names_dict[taxid] = name

full_taxonomy = {}
for taxid in nodes_dict:
  parent_id = nodes_dict[taxid]
  tax_list = [names_dict[taxid]]
  nodes_list = [taxid]
  level_list = [nodes_level_dict[taxid]]
  while parent_id in nodes_dict:
    if parent_id in ['1', '0']:
      break
    tax_list.append(names_dict[parent_id])
    nodes_list.append(parent_id)
    level_list.append(nodes_level_dict[parent_id])
    parent_id = nodes_dict[parent_id]
  full_taxonomy[taxid] = [nodes_list, tax_list, level_list]
  
with open('taxonomy.dict', 'wb') as f:
  pickle.dump(full_taxonomy, f)
quit()

Add the full taxonomy to the file we had, resave it in the full stratified format needed for JarrVis:

python
import pandas as pd
import pickle

file = pd.read_csv('Workshop_strat_matrix_RPKM_split_des.txt', header=0, sep='\t')

with open('taxonomy.dict', 'rb') as f:
  full_taxonomy = pickle.load(f)

tax_levels = {'phylum':'p__', 'class':'c__', 'order':'o__', 'family':'f__', 'genus':'g__', 'species':'s__'}
for row in file.index.values:
  tax = file.loc[row, 'taxon']
  taxid = tax.split('taxid ')[1].replace(')', '')
  full_tax = full_taxonomy[taxid]
  this_tax = []
  for level in tax_levels:
    if level in full_tax[2]:
      index = full_tax[2].index(level)
      name = full_tax[1][index]
      this_tax.append(tax_levels[level]+name)
    else:
      this_tax.append(tax_levels[level]+'Unclassified')
  this_tax = ';'.join(this_tax)
  file.loc[row, 'taxon'] = this_tax

file.to_csv('Workshop_strat_matrix_RPKM_split_des_full_tax.txt', sep='\t', index=False)

new_file = []
samples = [s for s in file.columns if s not in ['EC', 'description', 'taxon']]
for row in file.index.values:
  for sample in samples:
    this_row = [sample, file.loc[row, 'taxon'], file.loc[row, 'EC']+': '+file.loc[row, 'description'], file.loc[row, sample]]
    new_file.append(this_row)
    
jarrvis_file = pd.DataFrame(new_file, columns=['Sample', 'Genus', 'Gene', 'Contribution'])
jarrvis_file.to_csv('Workshop_stratified_for_jarrvis.txt', sep='\t', index=False)
quit()

Visualise with JarrVis

#Workshop_stratified_for_jarrvis.txt #download
#mgs_metadata.txt #download
#modify metadata header so that sample says sample_id
install.packages('shiny')
library(shiny)
runGist("943ff5fdbd94815cc27f302d9f56ff0b")

Add stratified file Add metadata file Click "select metadata categories" Taxonomy level to collapse - choose whatever you'd like, I went with genus Click "Select taxa categories" Filter by taxa: select all Click "Select function categories" Filter by function: select all Click "Update the gene contribution threshold from data" Move up the bottom filter so that we don't plot all of the really low abundance stuff, to ~500 is sensible Scroll to the bottom of the window and click "Display plot" Now scroll back up and you should see the functions in each group

General functional annotation using HUMAnN in the Unix command line

#hopefully you ran this in module 1, but incase you didn't:
conda activate taxonomic
mkdir ~/workspace/amb_module1/metaphlan_out
mkdir ~/workspace/amb_module3/humann_out
cd amb_module3
parallel -j 2 'metaphlan --input_type fastq {} --bowtie2db ~/CourseData/MIC_data/tools/CHOCOPhlAn_201901/ > ~/workspace/amb_module1/metaphlan_out/{/.}.txt' ::: cat_reads/*.fastq

conda activate biobakery3
#you might want to use tmux for this!
parallel -j 1 --eta --progress 'humann -i {} -o humann_out/ --threads 4 --taxonomic-profile ~/workspace/amb_module1/metaphlan_out/{/.}.txt --protein-database ~/CourseData/MIC_data/tools/humann_databases/uniref' ::: cat_reads/*.fastq

humann_join_tables -i humann_out -o HMP_humann_genefamilies.tsv --file_name genefamilies
humann_join_tables -i humann_out -o HMP_humann_pathabundance.tsv --file_name pathabundance
humann_join_tables -i humann_out -o HMP_humann_pathcoverage.tsv --file_name pathcoverage

#if you take a look at these, you'll see that we don't have very much that's well classified. This is because we're using very sub-sampled samples, and for useful functional annotation we really need higher sequencing depth than for taxonomy. I also ran HUMANN with the full samples, you can get that file from here:
wget -O HMP_humann_pathabundance_full.tsv https://www.dropbox.com/scl/fi/xcrd383810v4vvasnn22m/HMP_humann_pathabundance.tsv?rlkey=jedk0l8ctd3qv3hrqbavbge43&dl=1

#and a version that I modified to have the metadata as an additional line just under the header here:
wget -O HMP_humann_pathabundance_full_metadata.tsv https://www.dropbox.com/scl/fi/x6sqim72gr07a8h5z4iqw/HMP_humann_pathabundance_metadata.tsv?rlkey=huaqj9dzb5rsclnsmo79se8m5&dl=1

#humann has some built in functions for plotting this output, so we'll run some of the pathways like so:
mkdir humann_plots
parallel -j 1 'humann_barplot --input HMP_humann_pathabundance_full_metadata.tsv --focal-metadata disease_state --last-metadata disease_state --output humann_plots/{}.png --focal-feature {} --sort sum metadata --scaling logstack' ::: PWY-7221 PEPTIDOGLYCANSYN-PWY PWY-1269 PWY-6277 FUCCAT-PWY P164-PWY PWY-7013 HSERMETANA-PWY P163-PWY
#note that these pathways were chosen more or less at random to show a few different patterns. Feel free to look through the file and run this with some other pathways, e.g.:
humann_barplot --input HMP_humann_pathabundance_full_metadata.tsv --focal-metadata disease_state --last-metadata disease_state --output humann_plots/CALVIN-PWY.png --focal-feature CALVIN-PWY --sort sum metadata --scaling logstack
humann_barplot --input HMP_humann_pathabundance_full_metadata.tsv --focal-metadata disease_state --last-metadata disease_state --output humann_plots/ANAEROFRUCAT-PWY.png --focal-feature ANAEROFRUCAT-PWY --sort sum metadata --scaling logstack

AMR annotation of reads using CARD RGI in the Unix command line

conda activate rgi
#if you made it to this part in advanced module 2, you already have the data we need. If not, either run those commands now or copy across the data needed

Run with samples:

mkdir card_out
cd ~/workspace/card_data/
parallel -j 1 --link 'rgi bwt --read_one {1} --read_two {2} --aligner bowtie2 --output_file ~/workspace/amb_module3/card_out/{1/.}_CARD --threads 2 --local --clean' \
 ::: ~/workspace/amb_module3/raw_data/*_R1_subsampled.fastq.gz ::: ~/workspace/amb_module3/raw_data/*_R2_subsampled.fastq.gz
 
 cd ~/workspace/amb_module3/card_out
 less HSMA33J3_R1_subsampled.fastq_CARD.allele_mapping_data.txt
 less HSM6XRQY_R1_subsampled.fastq_CARD.overall_mapping_stats.txt
 less HSM7J4QT_R1_subsampled.fastq_CARD.reference_mapping_stats.txt
 less HSM7J4QT_R1_subsampled.fastq_CARD.gene_mapping_data.txt
 mkdir gene_mapping
 mv *gene_mapping_data* gene_mapping/
 mkdir other_read_files
 mv *subsampled* other_read_files

Just combine these and get people to look at them in excel? R heatmap?? See some details about these output files here

Clone this wiki locally