CBW 2024 Advanced Module 3: Metagenomic functional annotation
cd workspace
mkdir amb_module3
cd amb_module3
ln -s ~/CourseData/MIC_data/AMB_data/raw_data/ .
ln -s ~/CourseData/MIC_data/AMB_data/scripts/ .
ln -s ~/CourseData/MIC_data/AMB_data/MMSeqs2_db/ .
scripts/concat_paired_end.pl -p 4 -o cat_reads raw_data/*.fastq.gz
gunzip cat_reads/*
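If you want to double-check the concatenation, a quick read count per file is a useful sanity check. This is a minimal sketch that assumes the files in cat_reads/ are plain, uncompressed FASTQ with four lines per record:
python3 #press enter
# Sanity check: count the reads in each concatenated file.
import glob
for path in sorted(glob.glob('cat_reads/*')):
    with open(path) as handle:
        n_lines = sum(1 for _ in handle)
    print(path, n_lines // 4, 'reads')
quit() #press enter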
#tar -xvf ~/workshops/metagenome_tutorial/MMSeqs2_db.tar.gz
cp ~/CourseData/MIC_data/AMB_data/mgs_metadata.txt .
mkdir mmseqs_U90_out
parallel -j 4 --progress 'mmseqs createdb {} mmseqs_U90_out/mmseqs-{/.}-queryDB' ::: cat_reads/*
parallel -j 1 --progress 'mmseqs search mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB tmp --db-load-mode 3 --threads 2 --max-seqs 25 -s 1 -a -e 1e-5' ::: cat_reads/*
#this search won't finish on the workshop instances, so copy the pre-computed results instead:
cp ~/CourseData/MIC_data/AMB_data/mmseqs_U90_out/*resultDB* mmseqs_U90_out/
mmseqs convertalis mmseqs_U90_out/mmseqs-HSMA33J3-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-HSMA33J3-resultDB mmseqs_U90_out/mmseqs-HSMA33J3-s1.m8 --db-load-mode 2
parallel -j 2 --progress 'mmseqs convertalis mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB mmseqs_U90_out/mmseqs-{/.}-s1.m8 --db-load-mode 2' ::: cat_reads/*
#if we didn't want the output, we could run it like this:
#parallel -j 1 'mmseqs convertalis mmseqs_U90_out/mmseqs-{/.}-queryDB MMSeqs2_db/mmseqsUniref90DB mmseqs_U90_out/mmseqs-{/.}-resultDB mmseqs_U90_out/mmseqs-{/.}-s1.m8 --db-load-mode 2 > /dev/null 2>&1' ::: cat_reads/*
mkdir mmseqs_m8_files
mv mmseqs_U90_out/*.m8 mmseqs_m8_files/
less mmseqs_m8_files/mmseqs-HSM7J4QT-s1.m8
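The .m8 files are standard BLAST tabular output (12 columns). If you prefer a structured view over less, you can load one with pandas; this is just an optional peek, assuming pandas is available in your environment (it is in the picrust2 environment used later):
python3 #press enter
# Load one m8 file with the standard BLAST/MMseqs2 tabular column names.
import pandas as pd
cols = ['query', 'target', 'pident', 'alnlen', 'mismatch', 'gapopen',
        'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bitscore']
hits = pd.read_csv('mmseqs_m8_files/mmseqs-HSM7J4QT-s1.m8', sep='\t', names=cols)
print(hits.head())
print(hits['query'].nunique(), 'reads with at least one UniRef90 hit')
quit() #press enter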
mkdir mmseqs_U90_out_tophit
python3 scripts/pick_uniref_top_hit.py --unirefm8Dir mmseqs_m8_files --output_path mmseqs_U90_out_tophit
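The provided script does the top-hit selection for you; purely as an illustration of the idea, the equivalent pandas operation is to keep the highest-bitscore alignment for each read (the script's exact tie-breaking rules may differ):
python3 #press enter
# Illustration only - pick_uniref_top_hit.py already produces the parsed files.
# "Top hit" here means keeping the highest-bitscore alignment per read.
import pandas as pd
cols = ['query', 'target', 'pident', 'alnlen', 'mismatch', 'gapopen',
        'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bitscore']
hits = pd.read_csv('mmseqs_m8_files/mmseqs-HSM7J4QT-s1.m8', sep='\t', names=cols)
top_hits = hits.sort_values('bitscore', ascending=False).drop_duplicates('query')
print(len(top_hits), 'reads assigned to a single top UniRef90 hit')
quit() #press enter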
python3 #press enter
import os
samples = os.listdir('cat_reads/')
samples = [s.split('.')[0] for s in samples]
print(samples)
kraken_path = os.path.expanduser('~/CourseData/MIC_data/AMB_data/kraken2_outraw_RefSeqCompleteV205/') #expanduser resolves '~' so the parser gets a full path
#kraken_path = os.path.expanduser('~/amb_module1/kraken2_outraw/')
kraken_suffix = '_0.1_RefSeqCompleteV205.kraken.txt'
mmseqs_path = 'mmseqs_U90_out_tophit/mmseqs-'
mmseqs_suffix = '-s1.m8-parsed.txt'
mmseqs_m8_path = 'mmseqs_m8_files/mmseqs-'
mmseqs_m8_suffix = '-s1.m8'
with open('multi-sample-outfiles-w-m8.txt', 'w') as f:
    for sample in samples:
        string = sample+'\t'
        string += kraken_path+sample+kraken_suffix+'\t'+'kraken2'+'\t'
        string += mmseqs_path+sample+mmseqs_suffix+'\t'+'uniref'+'\t'
        string += mmseqs_m8_path+sample+mmseqs_m8_suffix+'\n'
        f.write(string)
quit() #press enter
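Before moving on, it is worth confirming that each line of the mapping file has the six tab-separated fields built above (sample ID, kraken2 output, "kraken2", UniRef top-hit file, "uniref", and the raw m8 file). A minimal check:
python3 #press enter
# Check the per-sample mapping file: each line should have six tab-separated fields.
with open('multi-sample-outfiles-w-m8.txt') as f:
    for line in f:
        fields = line.rstrip('\n').split('\t')
        print(fields[0], '->', len(fields), 'fields')
quit() #press enter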
ln -s MMSeqs2_db/*.pbz2 .
python3 scripts/parse_TaxonomyFunction.py --multisample multi-sample-outfiles-w-m8.txt --outputf Workshop_strat_matrix_RPKM.txt --stratified Y --map2EC Y
This might take a while, or it might get "killed". Either way, we can link to the pre-computed output:
ln -s ~/CourseData/MIC_data/AMB_data/Workshop_strat_matrix_RPKM.txt .
less Workshop_strat_matrix_RPKM.txt
Split the function column into separate EC (function) and taxonomy columns:
conda activate picrust2
python
import pandas as pd
file = pd.read_csv('Workshop_strat_matrix_RPKM.txt', header=0, sep='\t')
samples = [s for s in file.columns if s != 'function']
file[['EC','taxon']] = file['function'].str.split('|',expand=True)
file = file.drop('function', axis=1).loc[:, ['EC', 'taxon']+samples]
file.to_csv('Workshop_strat_matrix_RPKM_split.txt', index=False, sep='\t')
quit()
add_descriptions.py -i Workshop_strat_matrix_RPKM_split.txt -o Workshop_strat_matrix_RPKM_split_des.txt -m EC
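add_descriptions.py should have added a description for each EC number; a quick look to confirm the 'description' column (which the later steps rely on) is there:
python3 #press enter
# Confirm the EC descriptions were added.
import pandas as pd
des = pd.read_csv('Workshop_strat_matrix_RPKM_split_des.txt', sep='\t')
print(des.columns.tolist())
print(des[['EC', 'description']].drop_duplicates().head())
quit() #press enter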
Make the taxonomy dictionary:
#If you want to see how this is made, you can build it yourself like this; if you'd rather just copy the pre-made one, that's fine too.
mkdir taxonomy
cd taxonomy
wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -xvf taxdump.tar.gz
#nodes.dmp = taxid, parent taxid
#names.dmp = taxid, name
cd ..
python
import pickle
nodes_dict = {}
names_dict = {}
nodes_level_dict = {}
#build lookup tables from nodes.dmp (taxid -> parent, taxid -> rank) and names.dmp (taxid -> scientific name)
for line in open('taxonomy/nodes.dmp', 'r'):
    this_line = line.replace('\t', '').split('|')
    node, parent, level = this_line[0], this_line[1], this_line[2]
    nodes_dict[node] = parent
    nodes_level_dict[node] = level

for line in open('taxonomy/names.dmp', 'r'):
    this_line = line.replace('\t', '').split('|')
    taxid, name, name_type = this_line[0], this_line[1], this_line[3]
    if name_type == 'scientific name':
        names_dict[taxid] = name

#walk up the parent links to record the full lineage for every taxid
full_taxonomy = {}
for taxid in nodes_dict:
    parent_id = nodes_dict[taxid]
    tax_list = [names_dict[taxid]]
    nodes_list = [taxid]
    level_list = [nodes_level_dict[taxid]]
    while parent_id in nodes_dict:
        if parent_id in ['1', '0']:
            break
        tax_list.append(names_dict[parent_id])
        nodes_list.append(parent_id)
        level_list.append(nodes_level_dict[parent_id])
        parent_id = nodes_dict[parent_id]
    full_taxonomy[taxid] = [nodes_list, tax_list, level_list]

with open('taxonomy.dict', 'wb') as f:
    pickle.dump(full_taxonomy, f)
quit()
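A quick way to check the pickled taxonomy is to look up one taxid and print its lineage. Taxid 816 (Bacteroides) is just an example here; any taxid from nodes.dmp will work:
python3 #press enter
# Spot check: print the stored lineage for one taxid.
import pickle
with open('taxonomy.dict', 'rb') as f:
    full_taxonomy = pickle.load(f)
nodes_list, tax_list, level_list = full_taxonomy['816']
for taxid, name, level in zip(nodes_list, tax_list, level_list):
    print(taxid, level, name)
quit() #press enter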
Add the full taxonomy to the file we made above, then resave it in the stratified format needed for JarrVis:
python
import pandas as pd
import pickle
file = pd.read_csv('Workshop_strat_matrix_RPKM_split_des.txt', header=0, sep='\t')
with open('taxonomy.dict', 'rb') as f:
full_taxonomy = pickle.load(f)
tax_levels = {'phylum':'p__', 'class':'c__', 'order':'o__', 'family':'f__', 'genus':'g__', 'species':'s__'}
for row in file.index.values:
    tax = file.loc[row, 'taxon']
    taxid = tax.split('taxid ')[1].replace(')', '')
    full_tax = full_taxonomy[taxid]
    this_tax = []
    for level in tax_levels:
        if level in full_tax[2]:
            index = full_tax[2].index(level)
            name = full_tax[1][index]
            this_tax.append(tax_levels[level]+name)
        else:
            this_tax.append(tax_levels[level]+'Unclassified')
    this_tax = ';'.join(this_tax)
    file.loc[row, 'taxon'] = this_tax
file.to_csv('Workshop_strat_matrix_RPKM_split_des_full_tax.txt', sep='\t', index=False)
new_file = []
samples = [s for s in file.columns if s not in ['EC', 'description', 'taxon']]
for row in file.index.values:
    for sample in samples:
        this_row = [sample, file.loc[row, 'taxon'], file.loc[row, 'EC']+': '+file.loc[row, 'description'], file.loc[row, sample]]
        new_file.append(this_row)
jarrvis_file = pd.DataFrame(new_file, columns=['Sample', 'Genus', 'Gene', 'Contribution'])
jarrvis_file.to_csv('Workshop_stratified_for_jarrvis.txt', sep='\t', index=False)
quit()
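Before downloading, you can confirm the JarrVis input has the four expected columns and sensible values:
python3 #press enter
# Sanity check the stratified JarrVis input.
import pandas as pd
jv = pd.read_csv('Workshop_stratified_for_jarrvis.txt', sep='\t')
print(jv.columns.tolist())
print(jv.head())
print(len(jv), 'rows from', jv['Sample'].nunique(), 'samples')
quit() #press enter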
#Download these two files to your local computer for JarrVis:
#Workshop_stratified_for_jarrvis.txt
#mgs_metadata.txt
In R (e.g., RStudio), install and load Shiny, then launch JarrVis from its gist:
install.packages('shiny')
library(shiny)
runGist("943ff5fdbd94815cc27f302d9f56ff0b")
- Add the stratified file, then add the metadata file.
- Click "select metadata categories".
- Taxonomy level to collapse: choose whatever you'd like; I went with genus.
- Click "Select taxa categories" and, under "Filter by taxa", select all.
- Click "Select function categories" and, under "Filter by function", select all.
- Click "Update the gene contribution threshold from data".
- Move up the bottom filter so that we don't plot all of the really low-abundance functions; ~500 is sensible.
- Scroll to the bottom of the window and click "Display plot".
- Now scroll back up and you should see the functions in each group.
conda activate rgi
If there is time, load the CARD database:
mkdir card_data
cd card_data
wget https://card.mcmaster.ca/latest/data
tar -xvf data ./card.json
rgi load --card_json ./card.json --local
rgi database --version --local
#3.2.9
rgi clean --local
wget -O wildcard_data.tar.bz2 https://card.mcmaster.ca/latest/variants
mkdir -p wildcard
tar -xjf wildcard_data.tar.bz2 -C wildcard
gunzip wildcard/*.gz
rgi card_annotation -i localDB/card.json > card_annotation.log 2>&1
rgi wildcard_annotation -i wildcard --card_json localDB/card.json -v v3.2.9 > wildcard_annotation.log 2>&1
rgi load \
--card_json localDB/card.json \
--debug --local \
--card_annotation card_database_v3.2.9.fasta \
--wildcard_annotation wildcard_database_vv3.2.9.fasta \
--wildcard_index wildcard/index-for-model-sequences.txt \
--wildcard_version 3.2.9 \
--amr_kmers wildcard/all_amr_61mers.txt \
--kmer_database wildcard/61_kmer_db.json \
--kmer_size 61
Run RGI on the samples:
mkdir ../card_out
#run this from inside card_data so that --local can find the localDB folder created above
parallel -j 1 --link 'rgi bwt --read_one {1} --read_two {2} --aligner bowtie2 --output_file ../card_out/{1/.}_CARD --threads 12 --local --clean' \
 ::: ../raw_data/*_R1_subsampled.fastq.gz ::: ../raw_data/*_R2_subsampled.fastq.gz
cd ..
cd card_out
less HSMA33J3_R1_subsampled.fastq_CARD.allele_mapping_data.txt
less HSM6XRQY_R1_subsampled.fastq_CARD.overall_mapping_stats.txt
less HSM7J4QT_R1_subsampled.fastq_CARD.reference_mapping_stats.txt
less HSM7J4QT_R1_subsampled.fastq_CARD.gene_mapping_data.txt
mkdir gene_mapping
mv *gene_mapping_data* gene_mapping/
mkdir other_read_files
mv *subsampled* other_read_files
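If you want a quick cross-sample overview before digging into individual files, a short pandas summary of mapped reads per AMR gene family works. The column names used here ('AMR Gene Family', 'All Mapped Reads') are assumed from recent RGI bwt output, so check the header of your own files if this errors:
python3 #press enter
# Summarise mapped reads per AMR gene family across samples.
# Column names are assumed from recent RGI bwt output; adjust if needed.
import glob
import pandas as pd
summaries = []
for path in sorted(glob.glob('gene_mapping/*gene_mapping_data.txt')):
    sample = path.split('/')[-1].split('_R1')[0]
    counts = pd.read_csv(path, sep='\t').groupby('AMR Gene Family')['All Mapped Reads'].sum()
    counts.name = sample
    summaries.append(counts)

summary = pd.concat(summaries, axis=1).fillna(0)
print(summary.sort_values(summary.columns[0], ascending=False).head(10))
quit() #press enter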
See some details about these output files here
- Please feel free to post a question on the Microbiome Helper google group if you have any issues.
- General comments or inquires about Microbiome Helper can be sent to [email protected].