Skip to content

CBW 2024 Advanced Module 3: Metagenomic functional annotation

Robyn Wright edited this page Apr 24, 2024 · 12 revisions

General functional annotation using MMSeqs in the Unix command line

Copy in the data we will be using

cd workspace
mkdir amb_module3
cd amb_module3
ln -s ~/CourseData/MIC_data/AMB_data/raw_data/ .
ln -s ~/CourseData/MIC_data/AMB_data/scripts/ .
ln -s ~/CourseData/MIC_data/AMB_data/MMSeqs2_db/ .
scripts/concat_paired_end.pl -p 4 -o cat_reads raw_data/*.fastq.gz
gunzip cat_reads/*
#tar -xvf ~/workshops/metagenome_tutorial/MMSeqs2_db.tar.gz
cp ~/CourseData/MIC_data/AMB_data/mgs_metadata.txt .

Run MMSeqs

mkdir mmseqs_U90_out
parallel -j 4 --progress 'mmseqs createdb {} mmseqs_U90_out/mmseqs-{/.}-queryDB' ::: cat_reads/*
parallel -j 1 --progress 'mmseqs search mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB tmp --db-load-mode 3 --threads 2 --max-seqs 25 -s 1 -a -e 1e-5' ::: cat_reads/*
#it doesn't work!
cp ~/CourseData/MIC_data/AMB_data/mmseqs_U90_out/*resultDB* mmseqs_U90_out/

mmseqs convertalis mmseqs_U90_out/mmseqs-HSMA33J3queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-HSMA33J3resultDB mmseqs_U90_out/mmseqs-HSMA33J3-s1.m8 --db-load-mode 2

parallel -j 2 --progress 'mmseqs convertalis mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB mmseqs_U90_out/mmseqs-{/.}-s1.m8 --db-load-mode 2' ::: cat_reads/*
#if we didn't want the output, we could run it like this:
#parallel -j 1 'mmseqs convertalis mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB mmseqs_U90_out/mmseqs-{/.}-s1.m8 --db-load-mode 2 > /dev/null 2>&1' ::: cat_reads/*
mkdir mmseqs_m8_files
mv mmseqs_U90_out/*.m8 mmseqs_m8_files/

Get MMSeqs top hits

less mmseqs_m8_files/mmseqs-HSM7J4QT-s1.m8 
mkdir mmseqs_U90_out_tophit
python3 scripts/pick_uniref_top_hit.py --unirefm8Dir mmseqs_m8_files --output_path mmseqs_U90_out_tophit

Now let's make a file that maps from our kraken output to the MMSeqs output

python3 #press enter
import os

samples = os.listdir('cat_reads/')
samples = [s.split('.')[0] for s in samples]
print(samples)

kraken_path = '~/CourseData/MIC_data/AMB_data/kraken2_outraw_RefSeqCompleteV205/'
#kraken_path = '~/amb_module1/kraken2_outraw/'
kraken_suffix = '_0.1_RefSeqCompleteV205.kraken.txt'
mmseqs_path = 'mmseqs_U90_out_tophit/mmseqs-'
mmseqs_suffix = '-s1.m8-parsed.txt'
mmseqs_m8_path = 'mmseqs_m8_files-'
mmseqs_m8_suffix = '-s1.m8'

with open('multi-sample-outfiles-w-m8.txt', 'w') as f:
  for sample in samples:
    string = sample+'\t'
    string += kraken_path+sample+kraken_suffix+'\t'+'kraken2'+'\t'
    string += mmseqs_path+sample+mmseqs_suffix+'\t'+'uniref'+'\t'
    string += mmseqs_m8_path+sample+mmseqs_m8_suffix+'\n'
    f.write(string)

quit() #press enter

Get taxonomy and function and normalise

ln -s MMSeqs2_db/*.pbz2 .
python3 scripts/parse_TaxonomyFunction.py --multisample multi-sample-outfiles-w-m8.txt --outputf Workshop_strat_matrix_RPKM.txt --stratified Y --map2EC Y

This might take a while, or it might say "killed". Either way, we can copy the output:

ln -s ~/CourseData/MIC_data/AMB_data/Workshop_strat_matrix_RPKM.txt .
less Workshop_strat_matrix_RPKM.txt

Reformat output

Split the function column to taxonomy and function:

conda activate picrust2
python
import pandas as pd
file = pd.read_csv('Workshop_strat_matrix_RPKM.txt', header=0, sep='\t')
samples = [s for s in file.columns if s != 'function']
file[['EC','taxon']] = file['function'].str.split('|',expand=True)
file = file.drop('function', axis=1).loc[:, ['EC', 'taxon']+samples]
file.to_csv('Workshop_strat_matrix_RPKM_split.txt', index=False, sep='\t')
quit()

add_descriptions.py -i Workshop_strat_matrix_RPKM_split.txt -o Workshop_strat_matrix_RPKM_split_des.txt -m EC

Make the taxonomy dictionary:

#if you want to see how this is made for yourself, you can do that like this. If you just want to copy the pre-made one then that's fine, too.
mkdir taxonomy
cd taxonomy
wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -xvf taxdump.tar.gz

#nodes.dmp = taxid, parent taxid
#names.dmp = taxid, name

python
import pickle
nodes_dict = {}
names_dict = {}
nodes_level_dict = {}

for line in open('taxonomy/nodes.dmp', 'r'):
  this_line = line.replace('\t', '').split('|')
  node, parent, level = this_line[0], this_line[1], this_line[2]
  nodes_dict[node] = parent
  nodes_level_dict[node] = level

for line in open('taxonomy/names.dmp', 'r'):
  this_line = line.replace('\t', '').split('|')
  taxid, name, name_type = this_line[0], this_line[1], this_line[3]
  if name_type == 'scientific name':
    names_dict[taxid] = name

full_taxonomy = {}
for taxid in nodes_dict:
  parent_id = nodes_dict[taxid]
  tax_list = [names_dict[taxid]]
  nodes_list = [taxid]
  level_list = [nodes_level_dict[taxid]]
  while parent_id in nodes_dict:
    if parent_id in ['1', '0']:
      break
    tax_list.append(names_dict[parent_id])
    nodes_list.append(parent_id)
    level_list.append(nodes_level_dict[parent_id])
    parent_id = nodes_dict[parent_id]
  full_taxonomy[taxid] = [nodes_list, tax_list, level_list]
  
with open('taxonomy.dict', 'wb') as f:
  pickle.dump(full_taxonomy, f)
quit()

Add the full taxonomy to the file we had, resave it in the full stratified format needed for JarrVis:

python
import pandas as pd
import pickle

file = pd.read_csv('Workshop_strat_matrix_RPKM_split_des.txt', header=0, sep='\t')

with open('taxonomy.dict', 'rb') as f:
  full_taxonomy = pickle.load(f)

tax_levels = {'phylum':'p__', 'class':'c__', 'order':'o__', 'family':'f__', 'genus':'g__', 'species':'s__'}
for row in file.index.values:
  tax = file.loc[row, 'taxon']
  taxid = tax.split('taxid ')[1].replace(')', '')
  full_tax = full_taxonomy[taxid]
  this_tax = []
  for level in tax_levels:
    if level in full_tax[2]:
      index = full_tax[2].index(level)
      name = full_tax[1][index]
      this_tax.append(tax_levels[level]+name)
    else:
      this_tax.append(tax_levels[level]+'Unclassified')
  this_tax = ';'.join(this_tax)
  file.loc[row, 'taxon'] = this_tax

file.to_csv('Workshop_strat_matrix_RPKM_split_des_full_tax.txt', sep='\t', index=False)

new_file = []
samples = [s for s in file.columns if s not in ['EC', 'description', 'taxon']]
for row in file.index.values:
  for sample in samples:
    this_row = [sample, file.loc[row, 'taxon'], file.loc[row, 'EC']+': '+file.loc[row, 'description'], file.loc[row, sample]]
    new_file.append(this_row)
    
jarrvis_file = pd.DataFrame(new_file, columns=['Sample', 'Genus', 'Gene', 'Contribution'])
jarrvis_file.to_csv('Workshop_stratified_for_jarrvis.txt', sep='\t', index=False)
quit()

Visualise with JarrVis

#Workshop_stratified_for_jarrvis.txt #download
#mgs_metadata.txt #download
install.packages('shiny')
library(shiny)
runGist("943ff5fdbd94815cc27f302d9f56ff0b")

Add stratified file Add metadata file Click "select metadata categories" Taxonomy level to collapse - choose whatever you'd like, I went with genus Click "Select taxa categories" Filter by taxa: select all Click "Select function categories" Filter by function: select all Click "Update the gene contribution threshold from data" Move up the bottom filter so that we don't plot all of the really low abundance stuff, to ~500 is sensible Scroll to the bottom of the window and click "Display plot" Now scroll back up and you should see the functions in each group

General functional annotation using HUMAnN in the Unix command line

AMR annotation of reads using CARD RGI in the Unix command line

conda activate rgi

If time - load database:

mkdir card_data
cd card_data
wget https://card.mcmaster.ca/latest/data
tar -xvf data ./card.json
rgi load --card_json ./card.json --local
rgi database --version --local
#3.2.9
rgi clean --local
wget -O wildcard_data.tar.bz2 https://card.mcmaster.ca/latest/variants
mkdir -p wildcard
tar -xjf wildcard_data.tar.bz2 -C wildcard
gunzip wildcard/*.gz
rgi card_annotation -i localDB/card.json > card_annotation.log 2>&1
rgi wildcard_annotation -i wildcard --card_json localDB/card.json -v v3.2.9 > wildcard_annotation.log 2>&1
rgi load \
  --card_json localDB/card.json \
  --debug --local \
  --card_annotation card_database_v3.2.9.fasta \
  --wildcard_annotation wildcard_database_vv3.2.9.fasta \
  --wildcard_index wildcard/index-for-model-sequences.txt \
  --wildcard_version 3.2.9 \
  --amr_kmers wildcard/all_amr_61mers.txt \
  --kmer_database wildcard/61_kmer_db.json \
  --kmer_size 61

Run with samples:

mkdir card_out
parallel -j 1 --link 'rgi bwt --read_one {1} --read_two {2} --aligner bowtie2 --output_file /home/robyn/workshops/CBW_2024/card_out/{1/.}_CARD --threads 12 --local --clean' \
 ::: /home/robyn/workshops/CBW_2024/raw_data/*_R1_subsampled.fastq.gz ::: /home/robyn/workshops/CBW_2024/raw_data//*_R2_subsampled.fastq.gz
 
 cd ..
 cd card_out
 less HSMA33J3_R1_subsampled.fastq_CARD.allele_mapping_data.txt
 less HSM6XRQY_R1_subsampled.fastq_CARD.overall_mapping_stats.txt
 less HSM7J4QT_R1_subsampled.fastq_CARD.reference_mapping_stats.txt
 less HSM7J4QT_R1_subsampled.fastq_CARD.gene_mapping_data.txt
 mkdir gene_mapping
 mv *gene_mapping_data* gene_mapping/
 mkdir other_read_files
 mv *subsampled* other_read_files

See some details about these output files here

Clone this wiki locally