-
Notifications
You must be signed in to change notification settings - Fork 204
CBW 2024 Advanced Module 3: Metagenomic functional annotation
cd workspace
mkdir amb_module3
cd amb_module3
ln -s ~/CourseData/MIC_data/AMB_data/raw_data/ .
ln -s ~/CourseData/MIC_data/AMB_data/scripts/ .
ln -s ~/CourseData/MIC_data/AMB_data/MMSeqs2_db/ .
#edit script
vi scripts/concat_paired_end.pl
#press "i" to edit the documents
#replace top line with #!/home/ubuntu/CourseData/MIC_data/.conda/envs/functional/bin/perl
#press esc and then ":x"
scripts/concat_paired_end.pl -p 4 -o cat_reads raw_data/*.fastq.gz
gunzip cat_reads/*
#tar -xvf ~/workshops/metagenome_tutorial/MMSeqs2_db.tar.gz
ln -s ~/CourseData/MIC_data/AMB_data/mgs_metadata.txt .
mkdir mmseqs_U90_out
parallel -j 4 --progress 'mmseqs createdb {} mmseqs_U90_out/mmseqs-{/.}-queryDB' ::: cat_reads/*
parallel -j 1 --progress 'mmseqs search mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB tmp --db-load-mode 3 --threads 2 --max-seqs 25 -s 1 -a -e 1e-5' ::: cat_reads/*
#it doesn't work!
cp ~/CourseData/MIC_data/AMB_data/mmseqs_U90_out/*resultDB* mmseqs_U90_out/
parallel -j 2 --progress 'mmseqs convertalis mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB mmseqs_U90_out/mmseqs-{/.}-s1.m8 --db-load-mode 2' ::: cat_reads/*
#if we didn't want the output, we could run it like this:
#parallel -j 1 'mmseqs convertalis mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB mmseqs_U90_out/mmseqs-{/.}-s1.m8 --db-load-mode 2 > /dev/null 2>&1' ::: cat_reads/*
mkdir mmseqs_m8_files
mv mmseqs_U90_out/*.m8 mmseqs_m8_files/
less mmseqs_m8_files/mmseqs-HSM7J4QT-s1.m8
mkdir mmseqs_U90_out_tophit
python3 scripts/pick_uniref_top_hit.py --unirefm8Dir mmseqs_m8_files --output_path mmseqs_U90_out_tophit
python3 #press enter
import os
samples = os.listdir('cat_reads/')
samples = [s.split('.')[0] for s in samples]
print(samples)
kraken_path = '~/CourseData/MIC_data/AMB_data/kraken2_outraw_RefSeqCompleteV205/'
kraken_suffix = '_0.1_RefSeqCompleteV205.kraken.txt'
mmseqs_path = 'mmseqs_U90_out_tophit/mmseqs-'
mmseqs_suffix = '-s1.m8-parsed.txt'
mmseqs_m8_path = 'mmseqs_m8_files-'
mmseqs_m8_suffix = '-s1.m8'
with open('multi-sample-outfiles-w-m8.txt', 'w') as f:
for sample in samples:
string = sample+'\t'
string += kraken_path+sample+kraken_suffix+'\t'+'kraken2'+'\t'
string += mmseqs_path+sample+mmseqs_suffix+'\t'+'uniref'+'\t'
string += mmseqs_m8_path+sample+mmseqs_m8_suffix+'\n'
f.write(string)
quit() #press enter
ln -s MMSeqs2_db/*.pbz2 .
python3 scripts/parse_TaxonomyFunction.py --multisample multi-sample-outfiles-w-m8.txt --outputf Workshop_strat_matrix_RPKM.txt --stratified Y --map2EC Y
This might take a while, or it might say "killed". Either way, we can copy the output:
ln -s ~/CourseData/MIC_data/AMB_data/Workshop_strat_matrix_RPKM.txt .
less Workshop_strat_matrix_RPKM.txt
Split the function column to taxonomy and function:
conda activate picrust2
python
import pandas as pd
file = pd.read_csv('Workshop_strat_matrix_RPKM.txt', header=0, sep='\t')
samples = [s for s in file.columns if s != 'function']
file[['EC','taxon']] = file['function'].str.split('|',expand=True)
file = file.drop('function', axis=1).loc[:, ['EC', 'taxon']+samples]
file.to_csv('Workshop_strat_matrix_RPKM_split.txt', index=False, sep='\t')
quit()
add_descriptions.py -i Workshop_strat_matrix_RPKM_split.txt -o Workshop_strat_matrix_RPKM_split_des.txt -m EC
Make the taxonomy dictionary:
#if you want to see how this is made for yourself, you can do that like this. If you just want to copy the pre-made one then that's fine, too.
mkdir taxonomy
cd taxonomy
wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -xvf taxdump.tar.gz
#nodes.dmp = taxid, parent taxid
#names.dmp = taxid, name
cd ..
python
import pickle
nodes_dict = {}
names_dict = {}
nodes_level_dict = {}
for line in open('taxonomy/nodes.dmp', 'r'):
this_line = line.replace('\t', '').split('|')
node, parent, level = this_line[0], this_line[1], this_line[2]
nodes_dict[node] = parent
nodes_level_dict[node] = level
for line in open('taxonomy/names.dmp', 'r'):
this_line = line.replace('\t', '').split('|')
taxid, name, name_type = this_line[0], this_line[1], this_line[3]
if name_type == 'scientific name':
names_dict[taxid] = name
full_taxonomy = {}
for taxid in nodes_dict:
parent_id = nodes_dict[taxid]
tax_list = [names_dict[taxid]]
nodes_list = [taxid]
level_list = [nodes_level_dict[taxid]]
while parent_id in nodes_dict:
if parent_id in ['1', '0']:
break
tax_list.append(names_dict[parent_id])
nodes_list.append(parent_id)
level_list.append(nodes_level_dict[parent_id])
parent_id = nodes_dict[parent_id]
full_taxonomy[taxid] = [nodes_list, tax_list, level_list]
with open('taxonomy.dict', 'wb') as f:
pickle.dump(full_taxonomy, f)
quit()
Add the full taxonomy to the file we had, resave it in the full stratified format needed for JarrVis:
python
import pandas as pd
import pickle
file = pd.read_csv('Workshop_strat_matrix_RPKM_split_des.txt', header=0, sep='\t')
with open('taxonomy.dict', 'rb') as f:
full_taxonomy = pickle.load(f)
tax_levels = {'phylum':'p__', 'class':'c__', 'order':'o__', 'family':'f__', 'genus':'g__', 'species':'s__'}
for row in file.index.values:
tax = file.loc[row, 'taxon']
taxid = tax.split('taxid ')[1].replace(')', '')
full_tax = full_taxonomy[taxid]
this_tax = []
for level in tax_levels:
if level in full_tax[2]:
index = full_tax[2].index(level)
name = full_tax[1][index]
this_tax.append(tax_levels[level]+name)
else:
this_tax.append(tax_levels[level]+'Unclassified')
this_tax = ';'.join(this_tax)
file.loc[row, 'taxon'] = this_tax
file.to_csv('Workshop_strat_matrix_RPKM_split_des_full_tax.txt', sep='\t', index=False)
new_file = []
samples = [s for s in file.columns if s not in ['EC', 'description', 'taxon']]
for row in file.index.values:
for sample in samples:
this_row = [sample, file.loc[row, 'taxon'], file.loc[row, 'EC']+': '+file.loc[row, 'description'], file.loc[row, sample]]
new_file.append(this_row)
jarrvis_file = pd.DataFrame(new_file, columns=['Sample', 'Genus', 'Gene', 'Contribution'])
jarrvis_file.to_csv('Workshop_stratified_for_jarrvis.txt', sep='\t', index=False)
quit()
#Workshop_stratified_for_jarrvis.txt #download
#mgs_metadata.txt #download
#modify metadata header so that sample says sample_id
install.packages('shiny')
library(shiny)
runGist("943ff5fdbd94815cc27f302d9f56ff0b")
Add stratified file Add metadata file Click "select metadata categories" Taxonomy level to collapse - choose whatever you'd like, I went with genus Click "Select taxa categories" Filter by taxa: select all Click "Select function categories" Filter by function: select all Click "Update the gene contribution threshold from data" Move up the bottom filter so that we don't plot all of the really low abundance stuff, to ~500 is sensible Scroll to the bottom of the window and click "Display plot" Now scroll back up and you should see the functions in each group
#hopefully you ran this in module 1, but incase you didn't:
conda activate taxonomic
mkdir ~/workspace/amb_module1/metaphlan_out
mkdir ~/workspace/amb_module3/humann_out
cd amb_module3
parallel -j 2 'metaphlan --input_type fastq {} --bowtie2db ~/CourseData/MIC_data/tools/CHOCOPhlAn_201901/ > ~/workspace/amb_module1/metaphlan_out/{/.}.txt' ::: cat_reads/*.fastq
conda activate biobakery3
#you might want to use tmux for this!
parallel -j 1 --eta --progress 'humann -i {} -o humann_out/ --threads 4 --taxonomic-profile ~/workspace/amb_module1/metaphlan_out/{/.}.txt --protein-database ~/CourseData/MIC_data/tools/humann_databases/uniref' ::: cat_reads/*.fastq
humann_join_tables -i humann_out -o HMP_humann_genefamilies.tsv --file_name genefamilies
humann_join_tables -i humann_out -o HMP_humann_pathabundance.tsv --file_name pathabundance
humann_join_tables -i humann_out -o HMP_humann_pathcoverage.tsv --file_name pathcoverage
#if you take a look at these, you'll see that we don't have very much that's well classified. This is because we're using very sub-sampled samples, and for useful functional annotation we really need higher sequencing depth than for taxonomy. I also ran HUMANN with the full samples, you can get that file from here:
wget -O HMP_humann_pathabundance_full.tsv https://www.dropbox.com/scl/fi/xcrd383810v4vvasnn22m/HMP_humann_pathabundance.tsv?rlkey=jedk0l8ctd3qv3hrqbavbge43&dl=1
#and a version that I modified to have the metadata as an additional line just under the header here:
wget -O HMP_humann_pathabundance_full_metadata.tsv https://www.dropbox.com/scl/fi/x6sqim72gr07a8h5z4iqw/HMP_humann_pathabundance_metadata.tsv?rlkey=huaqj9dzb5rsclnsmo79se8m5&dl=1
#humann has some built in functions for plotting this output, so we'll run some of the pathways like so:
mkdir humann_plots
parallel -j 1 'humann_barplot --input HMP_humann_pathabundance_full_metadata.tsv --focal-metadata disease_state --last-metadata disease_state --output humann_plots/{}.png --focal-feature {} --sort sum metadata --scaling logstack' ::: PWY-7221 PEPTIDOGLYCANSYN-PWY PWY-1269 PWY-6277 FUCCAT-PWY P164-PWY PWY-7013 HSERMETANA-PWY P163-PWY
#note that these pathways were chosen more or less at random to show a few different patterns. Feel free to look through the file and run this with some other pathways, e.g.:
humann_barplot --input HMP_humann_pathabundance_full_metadata.tsv --focal-metadata disease_state --last-metadata disease_state --output humann_plots/CALVIN-PWY.png --focal-feature CALVIN-PWY --sort sum metadata --scaling logstack
humann_barplot --input HMP_humann_pathabundance_full_metadata.tsv --focal-metadata disease_state --last-metadata disease_state --output humann_plots/ANAEROFRUCAT-PWY.png --focal-feature ANAEROFRUCAT-PWY --sort sum metadata --scaling logstack
conda activate rgi
#if you made it to this part in advanced module 2, you already have the data we need. If not, either run those commands now or copy across the data needed
Run with samples:
mkdir card_out
cd ~/workspace/card_data/
parallel -j 1 --link 'rgi bwt --read_one {1} --read_two {2} --aligner bowtie2 --output_file ~/workspace/amb_module3/card_out/{1/.}_CARD --threads 2 --local --clean' \
::: ~/workspace/amb_module3/raw_data/*_R1_subsampled.fastq.gz ::: ~/workspace/amb_module3/raw_data/*_R2_subsampled.fastq.gz
cd ~/workspace/amb_module3/card_out
less HSMA33J3_R1_subsampled.fastq_CARD.allele_mapping_data.txt
less HSM6XRQY_R1_subsampled.fastq_CARD.overall_mapping_stats.txt
less HSM7J4QT_R1_subsampled.fastq_CARD.reference_mapping_stats.txt
less HSM7J4QT_R1_subsampled.fastq_CARD.gene_mapping_data.txt
mkdir gene_mapping
mv *gene_mapping_data* gene_mapping/
mkdir other_read_files
mv *subsampled* other_read_files
Just combine these and get people to look at them in excel? R heatmap?? See some details about these output files here
- Please feel free to post a question on the Microbiome Helper google group if you have any issues.
- General comments or inquires about Microbiome Helper can be sent to [email protected].