-
Notifications
You must be signed in to change notification settings - Fork 105
Updating the PICRUSt2 database
This page will contain all commands used to update the PICRUSt2 database. It is currently still a work-in-progress. Please contact the Google Group with any questions, but there is a recent update to the PICRUSt2 database. We are still testing this, but you can see details on how you can use it here.
These were downloaded from The GTDB website. You should check for an updated version of GTDB before downloading them.
Download all files (what we are looking for is the genomes from the representative sequences, the metadata for bacteria and archaea, the bacterial and archaeal trees and the taxonomy information for the bacteria and archaea):
mkdir picrust2_database
cd picrust2_database/
mkdir GTDB_r214
cd GTDB_r214
wget https://data.gtdb.ecogenomic.org/releases/release214/214.1/genomic_files_reps/gtdb_genomes_reps_r214.tar.gz https://data.gtdb.ecogenomic.org/releases/release214/214.1/bac120_taxonomy_r214.tsv https://data.gtdb.ecogenomic.org/releases/release214/214.1/bac120_r214.tree.gz https://data.gtdb.ecogenomic.org/releases/release214/214.1/bac120_metadata_r214.tsv.gz https://data.gtdb.ecogenomic.org/releases/release214/214.1/ar53_taxonomy_r214.tsv https://data.gtdb.ecogenomic.org/releases/release214/214.1/ar53_r214.tree https://data.gtdb.ecogenomic.org/releases/release214/214.1/ar53_metadata_r214.tsv.gz
tar -xvf gtdb_genomes_reps_r214.tar.gz
mkdir gtdb_genomes
find gtdb_genomes_reps_r214/ -type f -print0 | xargs -0 mv -t gtdb_genomes/
from ete3 import Tree
import os
tree_name_bac = 'gtdb_picrust_files/bac120_r214.tree'
tree_name_arc = 'gtdb_picrust_files/ar53_r214.tree'
tree_bac = Tree(tree_name_bac, format=1, quoted_node_names=True)
bac_genomes_in_tree = []
for node in tree_bac.traverse("postorder"):
if 'GB_' in node.name or 'GCA_' in node.name or 'GCF_' in node.name:
bac_genomes_in_tree.append(node.name)
tree_arc = Tree(tree_name_arc, format=1, quoted_node_names=True)
arc_genomes_in_tree = []
for node in tree_arc.traverse("postorder"):
if 'GB_' in node.name or 'GCA_' in node.name or 'GCF_' in node.name:
arc_genomes_in_tree.append(node.name)
genomes = os.listdir('GTDB_r214/gtdb_genomes/')
genomes = [g for g in genomes if '.decomp' not in g]
bac_genomes_in_tree = [g.split('_', 1)[1] for g in bac_genomes_in_tree]
arc_genomes_in_tree = [g.split('_', 1)[1] for g in arc_genomes_in_tree]
bac_genomes_in_tree = set(bac_genomes_in_tree)
arc_genomes_in_tree = set(arc_genomes_in_tree)
bac_genomes = []
arc_genomes = []
bac_count = 0
arc_count = 0
for genome in genomes:
if genome.replace('_genomic.fna.gz', '') in bac_genomes_in_tree:
m = os.system('mv GTDB_r214/gtdb_genomes/'+genome+' genomes_to_search_barrnap/bacteria')
bac_count += 1
elif genome.replace('_genomic.fna.gz', '') in arc_genomes_in_tree:
m = os.system('mv GTDB_r214/gtdb_genomes/'+genome+' genomes_to_search_barrnap/archaea')
arc_count += 1
if bac_count % 1000 == 0:
print('Bacterial genomes moved: '+str(bac_count))
if arc_count % 1000 == 0:
print('Archaeal genomes moved: '+str(arc_count))
print(bac_count, arc_count)
#77930 4184
List of bacteria:
import os
genomes = os.listdir('bacteria')
with open('bacteria.txt', 'w') as f:
for genome in genomes:
f.write(genome.replace('.gz', '')+'\n')
cd genomes_to_search_barrnap
mkdir barrnap_bacteria
mkdir barrnap_bacteria_08
mkdir barrnap_archaea_08
mkdir barrnap_archaea
parallel -j 24 --progress --eta 'gunzip {}' ::: archaea/*.gz
parallel -j 8 --progress --eta 'barrnap -q -k arc {} --outseq barrnap_archaea/{/} --reject 0.5 --threads 6' ::: archaea/*.fna
parallel -j 24 --progress --eta -a bacteria.txt 'gunzip bacteria/{}.gz'
parallel -j 8 --progress --eta -a bacteria.txt 'barrnap -q -k bac bacteria/{} --outseq barrnap_bacteria/{} --reject 0.5 --threads 6'
parallel -j 8 --progress --eta -a bacteria.txt 'barrnap -q -k bac bacteria/{} --outseq barrnap_bacteria_08/{} --reject 0.8 --threads 6'
parallel -j 8 --progress --eta 'barrnap -q -k arc {} --outseq barrnap_archaea_08/{/} --reject 0.8 --threads 6' ::: archaea/*.fna
mkdir barrnap_bacteria_16S_single_0.8
mkdir barrnap_archaea_16S_single_0.8
mkdir barrnap_bacteria_16S_multiple_0.8
mkdir barrnap_archaea_16S_multiple_0.8
from Bio import SeqIO
import os
import pandas as pd
import numpy as np
kingdom = 'bacteria'
genomes = os.listdir('barrnap_'+kingdom+'_08')
copies = []
for genome in genomes:
count = 0
sequences = []
for record in SeqIO.parse('barrnap_'+kingdom+'_08/'+genome, "fasta"):
if '16S' in record.id:
sequences.append(record)
count += 1
copies.append([genome, count])
if sequences != []:
if len(sequences) == 1:
w = SeqIO.write(sequences, 'barrnap_'+kingdom+'_16S_single_0.8/'+genome, "fasta")
else:
w = SeqIO.write(sequences, 'barrnap_'+kingdom+'_16S_multiple_0.8/'+genome, "fasta")
with open(kingdom+'_16S_copies_0.8.txt', 'w') as f:
for copy in copies:
w = f.write(copy[0].replace('_genomic.fna', '')+'\t'+str(copy[1])+'\n')
k05 = pd.read_csv(kingdom+'_16S_copies_0.5.txt', sep='\t', header=None, index_col=0)
k05 = k05[k05.max(axis=1) > 0]
k08 = pd.read_csv(kingdom+'_16S_copies_0.8.txt', sep='\t', header=None, index_col=0)
k08 = k08[k08.max(axis=1) > 0]
print(kingdom)
print('50%: ', k05.shape[0], np.mean(k05.iloc[:, 0].values), np.median(k05.iloc[:, 0].values), np.max(k05.iloc[:, 0].values))
print('80%: ', k08.shape[0], np.mean(k08.iloc[:, 0].values), np.median(k08.iloc[:, 0].values), np.max(k08.iloc[:, 0].values))
# archaea
# 50%: 1898 1.1991570073761855 1.0
# 80%: 1553 1.2137797810688988 1.0
# bacteria
# 50%: 36644 1.777071280427901 1.0 37
# 80%: 32595 1.8404663291915937 1.0 37
mkdir barrnap_bacteria_16S_clustered_0.8
parallel -j 8 'vsearch --cluster_fast {} --id 0.9 --centroids barrnap_bacteria_16S_clustered_0.8/{/} --threads 6' ::: barrnap_bacteria_16S_multiple_0.8/*
mkdir barrnap_archaea_16S_clustered_0.8
parallel -j 8 --progress --eta 'vsearch --cluster_fast {} --id 0.9 --centroids barrnap_archaea_16S_clustered_0.8/{/} --threads 6' ::: barrnap_archaea_16S_multiple_0.8/*
import os
from Bio import SeqIO
import random
from Bio.SeqRecord import SeqRecord
kingdoms = ['archaea', 'bacteria']
for kingdom in kingdoms:
clusters = os.listdir('barrnap_'+kingdom+'_16S_clustered_0.8')
singles = []
ids = []
for genome in clusters:
records = []
for record in SeqIO.parse('barrnap_'+kingdom+'_16S_clustered_0.8/'+genome, "fasta"):
this_record = SeqRecord(record.seq, id=genome.replace('_genomic.fna', ''), description='')
records.append(this_record)
if len(records) > 1:
num = int(random.choice(range(len(records))))
singles.append(records[num])
ids.append(records[num].id)
else:
singles.append(records[0])
ids.append(records[0].id)
single_copies = os.listdir('barrnap_'+kingdom+'_16S_single_0.8/')
for genome in single_copies:
for record in SeqIO.parse('barrnap_'+kingdom+'_16S_single_0.8/'+genome, "fasta"):
this_record = SeqRecord(record.seq, id=genome.replace('_genomic.fna', ''), description='')
singles.append(this_record)
ids.append(this_record.id)
SeqIO.write(singles, kingdom+"_16S_genes_0.8.fasta", "fasta")
Look at these sequence lengths:
from Bio import SeqIO
import numpy as np
kingdom = 'archaea'
lengths = []
for record in SeqIO.parse(kingdom+"_16S_genes_0.8.fasta", "fasta"):
lengths.append(len(str(record.seq)))
print(min(lengths), max(lengths), np.mean(lengths), np.median(lengths))
#bacteria
#1268 1862 1509.0679552078539 1520.0
#archaea
#1270 2547 1474.9130714745654 1472.0
vsearch --cluster_fast archaea_16S_genes_0.8.fasta --id 1 --centroids archaea_16S_centroids_0.8.fasta --uc archaea_16S_clusters_0.8.uc --threads 24
# Reading file archaea_16S_genes_0.8.fasta 100%
# 2290540 nt in 1553 seqs, min 1270, max 2547, avg 1475
# Masking 100%
# Sorting by length 100%
# Counting unique k-mers 100%
# Clustering 100%
# Sorting clusters 100%
# Writing clusters 100%
# Clusters: 1539 Size min 1, max 3, avg 1.0
# Singletons: 1526, 98.3% of seqs, 99.2% of clusters
vsearch --cluster_fast bacteria_16S_genes_0.8.fasta --id 1 --centroids bacteria_16S_centroids_0.8.fasta --uc bacteria_16S_clusters_0.8.uc --threads 24
# Reading file bacteria_16S_genes_0.8.fasta 100%
# 49188070 nt in 32595 seqs, min 1268, max 1862, avg 1509
# Masking 100%
# Sorting by length 100%
# Counting unique k-mers 100%
# Clustering 100%
# Sorting clusters 100%
# Writing clusters 100%
# Clusters: 31499 Size min 1, max 35, avg 1.0
# Singletons: 30794, 94.5% of seqs, 97.8% of clusters
ssu-align:
conda activate clustalo
export SSUALIGNDIR="/usr/local/share/ssu-align-0.1.1"
ssu-align --rfonly archaea_16S_centroids_0.8.fasta archaea_16S_centroids_ssu_align_0.8
ssu-align --rfonly bacteria_16S_centroids_0.8.fasta bacteria_16S_centroids_ssu_align_0.8
########################
esl-reformat -o archaea_16S_centroids_ssu_align_0.8.fna afa archaea_16S_centroids_ssu_align_0.8/archaea_16S_centroids_ssu_align_0.8.archaea.stk
esl-reformat -o bacteria_16S_centroids_ssu_align_0.8.fna afa bacteria_16S_centroids_ssu_align_0.8/bacteria_16S_centroids_ssu_align_0.8.bacteria.stk
cd ..
Look at these sequence lengths (unaligned):
from Bio import SeqIO
import numpy as np
kingdom = 'bacteria'
lengths = []
for record in SeqIO.parse(kingdom+'_16S_centroids_ssu_align_0.8.fna', "fasta"):
lengths.append(len(str(record.seq)))
print(min(lengths), max(lengths), np.mean(lengths), np.median(lengths))
#bacteria
#1508 1508 1508.0 1508.0
#archaea
#1582 1582 1582.0 1582.0
Archaea:
import os
import pandas as pd
from Bio import SeqIO
import numpy as np
import random
clusters = 'genomes_to_search_barrnap/archaea_16S_clusters_0.8.uc'
aligned_fasta = 'genomes_to_search_barrnap/archaea_16S_centroids_ssu_align_0.8.fna'
md = pd.read_csv('GTDB_r214/ar53_metadata_r214.tsv', index_col=0, header=0, sep='\t')
md = md[md['gtdb_representative'] == 't']
genes_16S = []
for record in SeqIO.parse(aligned_fasta, "fasta"):
genes_16S.append(record.id)
rename_md = {}
for row in md.index.values:
rename_md[row] = row.split('_', 1)[1]
md = md.rename(index=rename_md)
clusters_all_genomes, clusters_centroid = {}, []
for row in open(clusters, 'r'):
row = row.replace('\n', '').split('\t')
if row[-1] != '*':
genomes = row[-2:]
if genomes[0] in genes_16S:
#print('First one', genomes)
clusters_centroid.append(genomes[0])
if genomes[0] in clusters_all_genomes:
clusters_all_genomes[genomes[0]].append(genomes[1])
else:
clusters_all_genomes[genomes[0]] = [genomes[1]]
elif genomes[1] in genes_16S:
clusters_centroid.append(genomes[1])
if genomes[1] in clusters_all_genomes:
clusters_all_genomes[genomes[1]].append(genomes[0])
else:
clusters_all_genomes[genomes[1]] = [genomes[0]]
else:
print('Neither', genomes)
#now choose best genome from all clusters
cluster_bests = {}
for cluster in clusters_all_genomes:
md_cluster = md.loc[[cluster]+clusters_all_genomes[cluster], :]
completeness = md_cluster['checkm_completeness'].values
contamination = md_cluster['checkm_contamination'].values
best = ''
if len(set(completeness)) == 1:
if len(set(contamination)) == 1:
num = int(random.choice(range(md_cluster.shape[0])))
best = md_cluster.index.values[num]
else:
lowest, best = 100, ''
for row in md_cluster.index.values:
if md_cluster.loc[row, 'checkm_contamination'] < lowest:
lowest, best = md_cluster.loc[row, 'checkm_contamination'], row
else:
highest, best = 0, ''
for row in md_cluster.index.values:
if md_cluster.loc[row, 'checkm_completeness'] > highest:
highest, best = md_cluster.loc[row, 'checkm_completeness'], row
cluster_bests[cluster] = best
#rename fasta file with the best of the cluster
new_records = []
all_ids = []
for record in SeqIO.parse(aligned_fasta, "fasta"):
if record.id in cluster_bests:
print('Changing', record.id, 'to', cluster_bests[record.id])
record.id = cluster_bests[record.id]
all_ids.append(record.id)
new_records.append(record)
SeqIO.write(new_records, aligned_fasta.replace('.fna', '_best.fna'), "fasta")
#make file with name of best genome in cluster and other genomes within the cluster
with open('genomes_to_search_barrnap/archaea_16S_clusters_processed_0.8.txt', 'w') as f:
w = f.write('Centroid\tBest\tAll genomes\n')
for cluster in clusters_all_genomes:
w = f.write(cluster+'\t'+cluster_bests[cluster]+'\t'+cluster+','+','.join(clusters_all_genomes[cluster])+'\n')
#get reduced metadata file including only those genomes that are the best
md = md.loc[all_ids, :]
md.to_csv('archaea_metadata_clusters_ssu_align_centroids_0.8.csv')
Bacteria:
import os
import pandas as pd
from Bio import SeqIO
import numpy as np
import random
clusters = 'genomes_to_search_barrnap/bacteria_16S_clusters_0.8.uc'
aligned_fasta = 'genomes_to_search_barrnap/bacteria_16S_centroids_ssu_align_0.8.fna'
md = pd.read_csv('GTDB_r214/bac120_metadata_r214.tsv', index_col=0, header=0, sep='\t')
md = md[md['gtdb_representative'] == 't']
genes_16S = []
for record in SeqIO.parse(aligned_fasta, "fasta"):
genes_16S.append(record.id)
rename_md = {}
for row in md.index.values:
rename_md[row] = row.split('_', 1)[1]
md = md.rename(index=rename_md)
clusters_all_genomes, clusters_centroid = {}, []
for row in open(clusters, 'r'):
row = row.replace('\n', '').split('\t')
if row[-1] != '*':
#print(row)
genomes = row[-2:]
if genomes[0] in genes_16S:
#print('First one', genomes)
clusters_centroid.append(genomes[0])
if genomes[0] in clusters_all_genomes:
clusters_all_genomes[genomes[0]].append(genomes[1])
else:
clusters_all_genomes[genomes[0]] = [genomes[1]]
elif genomes[1] in genes_16S:
clusters_centroid.append(genomes[1])
if genomes[1] in clusters_all_genomes:
clusters_all_genomes[genomes[1]].append(genomes[0])
else:
clusters_all_genomes[genomes[1]] = [genomes[0]]
else:
neither = True #note that this means that the sequences were removed from this step because they fitted the model of a different domain the best
#now choose best genome from all clusters
cluster_bests = {}
for cluster in clusters_all_genomes:
md_cluster = md.loc[[cluster]+clusters_all_genomes[cluster], :]
completeness = md_cluster['checkm_completeness'].values
contamination = md_cluster['checkm_contamination'].values
best = ''
if len(set(completeness)) == 1:
if len(set(contamination)) == 1:
num = int(random.choice(range(md_cluster.shape[0])))
best = md_cluster.index.values[num]
else:
lowest, best = 100, ''
for row in md_cluster.index.values:
if md_cluster.loc[row, 'checkm_contamination'] < lowest:
lowest, best = md_cluster.loc[row, 'checkm_contamination'], row
else:
highest, best = 0, ''
for row in md_cluster.index.values:
if md_cluster.loc[row, 'checkm_completeness'] > highest:
highest, best = md_cluster.loc[row, 'checkm_completeness'], row
cluster_bests[cluster] = best
#rename fasta file with the best of the cluster
new_records = []
all_ids = []
for record in SeqIO.parse(aligned_fasta, "fasta"):
if record.id in cluster_bests:
print('Changing', record.id, 'to', cluster_bests[record.id])
record.id = cluster_bests[record.id]
all_ids.append(record.id)
new_records.append(record)
SeqIO.write(new_records, aligned_fasta.replace('.fna', '_best.fna'), "fasta")
#make file with name of best genome in cluster and other genomes within the cluster
with open('genomes_to_search_barrnap/bacteria_16S_clusters_processed_0.8.txt', 'w') as f:
w = f.write('Centroid\tBest\tAll genomes\n')
for cluster in clusters_all_genomes:
w = f.write(cluster+'\t'+cluster_bests[cluster]+'\t'+cluster+','+','.join(clusters_all_genomes[cluster])+'\n')
#get reduced metadata file including only those genomes that are the best
md = md.loc[all_ids, :]
md.to_csv('bacteria_metadata_clusters_ssu_align_centroids_0.8.csv')
mv gtdb_picrust_files gtdb_picrust_files_first_try
mkdir gtdb_picrust_files
import pandas as pd
from Bio import SeqIO
from ete3 import Tree
domains = ['bacteria', 'archaea']
trees = ['GTDB_r214/bac120_r214.tree', 'GTDB_r214/ar53_r214.tree']
for d in range(len(domains)):
domain = domains[d]
tree = Tree(trees[d], format=1, quoted_node_names=True)
aligned_fasta = 'genomes_to_search_barrnap/'+domain+'_16S_centroids_ssu_align_0.8_best.fna'
md = pd.read_csv(domain+'_metadata_clusters_ssu_align_centroids_0.8.csv', index_col=0, header=0)
md = md[md['checkm_contamination'] <= 10]
md = md[md['checkm_completeness'] >= 90]
new_records = []
for record in SeqIO.parse(aligned_fasta, "fasta"):
if record.id in md.index.values:
new_records.append(record)
SeqIO.write(new_records, 'gtdb_picrust_files/'+domain+'_16S_centroids_ssu_align_best_reduced_0.8.fna', "fasta")
pruning = []
names = []
for node in tree.traverse("postorder"):
if 'GB_' in node.name or 'GCA_' in node.name or 'GCF_' in node.name:
new_name = str(node.name).split('_', 1)[1]
node.name = new_name
names.append(new_name)
if new_name in md.index.values:
pruning.append(new_name)
tree.prune(pruning)
tree.write(outfile='gtdb_picrust_files/'+domain+'_16S_centroids_ssu_align_best_reduced_0.8.tre')
cd gtdb_picrust_files
raxml-ng --check --msa archaea_16S_centroids_ssu_align_best_reduced_0.8.fna --model GTR+G --prefix archaea_0.8_raxml-check
#this didn't need fixing!
#esl-reformat -o archaea_raxml-check.raxml.reduced.phy phylip archaea_16S_centroids_ssu_align_best_reduced.fna
#but for some reason, the place_seqs step didn't work when I'd used this so I've just made the steps the same for both of them
raxml-ng --check --msa bacteria_16S_centroids_ssu_align_best_reduced_0.8.fna --model GTR+G --prefix bacteria_0.8_raxml-check
#this had some duplicates removed still apparently (copied this from previously, but outcome is the same!!)
Convert archaea alignment to phylip so we have the same for both:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
seq_records = []
seqs, lengths = 0, []
for record in SeqIO.parse('archaea_16S_centroids_ssu_align_best_reduced_0.8.fna', 'fasta'):
seq_records.append(record)
seqs += 1
lengths.append(len(str(record.seq)))
# count = 0
# for row in open('bacteria_0.8_raxml-check.raxml.reduced.phy', 'r'):
# if count < 2: print([row])
# count += 1
with open('archaea_0.8_raxml-check.raxml.reduced.phy', 'w') as f:
f.write(str(seqs)+' '+str(int(lengths[0]))+'\n')
for record in seq_records:
f.write(record.id+' '+str(record.seq)+'\n')
from ete3 import Tree
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
#from Bio.Alphabet import IUPAC
fixed_alignments = ['bacteria_0.8_raxml-check.raxml.reduced.phy', 'archaea_0.8_raxml-check.raxml.reduced.phy']
tree_files = ['bacteria_16S_centroids_ssu_align_best_reduced_0.8.tre', 'archaea_16S_centroids_ssu_align_best_reduced_0.8.tre']
for f in range(len(fixed_alignments)):
fixed_alignment = fixed_alignments[f]
tree_file = tree_files[f]
genomes = []
for row in open(fixed_alignment, 'r'):
if 'GB_' in row or 'GCA_' in row or 'GCF_' in row:
genomes.append(row.split(' ')[0].replace('>', '').replace('\n', ''))
genomes = set(genomes)
print(len(genomes))
#29311,
tree = Tree(tree_file, format=1, quoted_node_names=True)
genome_nodes = []
for node in tree.traverse("postorder"):
if node.name in genomes:
genome_nodes.append(node.name)
tree.prune(genome_nodes)
tree.write(outfile=tree_file.replace('.tre', '_fixed.tre'), format=1)
mkdir archaea_first_raxml
mv archaea_raxml.raxml* archaea_first_raxml/
raxml-ng --evaluate --msa archaea_0.8_raxml-check.raxml.reduced.phy --tree archaea_16S_centroids_ssu_align_best_reduced_0.8_fixed.tre --prefix archaea_0.8_raxml --model GTR+G —threads 24
raxml-ng --evaluate --msa bacteria_0.8_raxml-check.raxml.reduced.phy --tree bacteria_16S_centroids_ssu_align_best_reduced_0.8_fixed.tre --prefix bacteria_0.8_raxml --model GTR+G —threads 24
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
files = ['bacteria_0.8_raxml-check.raxml.reduced.phy', 'archaea_0.8_raxml-check.raxml.reduced.phy']
new_names = ['bacteria_0.8_raxml-check.raxml.reduced.fna', 'archaea_0.8_raxml-check.raxml.reduced.fna']
for f in range(len(files)):
sequences = {}
seq_records = []
count = 0
for row in open(files[f], 'r'):
#if count > 10: break
if '_' in row:
name, seq = row.replace('\n', '').split(' ')
sequences[name] = seq
seq_records.append(SeqRecord(Seq(seq),id=name, description=''))
count += 1
msa = MultipleSeqAlignment(seq_records)
AlignIO.write(msa, new_names[f], "fasta")
Convert the fasta file to DNA and reformat to stockholm:
esl-reformat -d -o bacteria_0.8_raxml-check.raxml.reduced_dna.fna afa bacteria_0.8_raxml-check.raxml.reduced.fna
esl-reformat -o bacteria_0.8_raxml-check.raxml.reduced_dna.sto stockholm bacteria_0.8_raxml-check.raxml.reduced_dna.fna
esl-reformat -d -o archaea_0.8_raxml-check.raxml.reduced_dna.fna afa archaea_0.8_raxml-check.raxml.reduced.fna
esl-reformat -o archaea_0.8_raxml-check.raxml.reduced_dna.sto stockholm archaea_0.8_raxml-check.raxml.reduced_dna.fna
# from Bio import SeqIO
# from Bio.SeqRecord import SeqRecord
# from Bio.Seq import Seq
# from Bio.Alphabet import IUPAC
# from Bio import AlignIO
# from Bio.Align import MultipleSeqAlignment
#
# files = ['bacteria_raxml-check.raxml.reduced.phy']
# new_names = ['bacteria_raxml-check.raxml.reduced.fna']
#
# for f in range(len(files)):
# sequences = {}
# seq_records = []
# count = 0
# for row in open(files[f], 'r'):
# #if count > 10: break
# if '_' in row:
# name, seq = row.replace('\n', '').split(' ')
# sequences[name] = seq
# seq_records.append(SeqRecord(Seq(seq),id=name, description=''))
# count += 1
# msa = MultipleSeqAlignment(seq_records)
# AlignIO.write(msa, new_names[f], "fasta")
#esl-reformat -d -o archaea_16S_centroids_ssu_align_best_reduced_dna.fna afa archaea_16S_centroids_ssu_align_best_reduced.fna
#esl-reformat -o archaea_16S_centroids_ssu_align_best_reduced_dna.sto stockholm archaea_16S_centroids_ssu_align_best_reduced_dna.fna
#hmmbuild --cpu 24 archaea_16S_centroids_ssu_align_best_reduced_dna.hmm archaea_16S_centroids_ssu_align_best_reduced_dna.sto
#esl-reformat -o archaea_16S_centroids_ssu_align_best_reduced_dna_reformat.fna fasta archaea_16S_centroids_ssu_align_best_reduced_dna.sto
hmmbuild --cpu 24 bacteria_0.8_raxml-check.raxml.reduced_dna.hmm bacteria_0.8_raxml-check.raxml.reduced_dna.sto
hmmbuild --cpu 24 archaea_0.8_raxml-check.raxml.reduced_dna.hmm archaea_0.8_raxml-check.raxml.reduced_dna.sto
cd ..
mv gtdb_picrust_ref/ gtdb_0.5_picrust_ref
mkdir gtdb_picrust_ref
#mkdir gtdb_picrust_ref/arc_ref
mkdir gtdb_picrust_ref/bac_ref
#cp gtdb_picrust_files/archaea_16S_centroids_ssu_align_best_reduced_dna_reformat.fna gtdb_picrust_ref/arc_ref/arc_ref.fna
#cp gtdb_picrust_files/archaea_16S_centroids_ssu_align_best_reduced_dna.hmm gtdb_picrust_ref/arc_ref/arc_ref.hmm
#cp gtdb_picrust_files/archaea_16S_centroids_ssu_align_best_reduced.tre gtdb_picrust_ref/arc_ref/arc_ref.tre
#cp gtdb_picrust_files/archaea_raxml.raxml.bestModel gtdb_picrust_ref/arc_ref/arc_ref.model
#mv gtdb_picrust_ref/arc_ref gtdb_picrust_ref/arc_ref_initial
cp gtdb_picrust_files/bacteria_0.8_raxml-check.raxml.reduced_dna.fna gtdb_picrust_ref/bac_ref/bac_ref.fna
cp gtdb_picrust_files/bacteria_0.8_raxml-check.raxml.reduced_dna.hmm gtdb_picrust_ref/bac_ref/bac_ref.hmm
cp gtdb_picrust_files/bacteria_16S_centroids_ssu_align_best_reduced_0.8_fixed.tre gtdb_picrust_ref/bac_ref/bac_ref.tre
cp gtdb_picrust_files/bacteria_0.8_raxml.raxml.bestModel gtdb_picrust_ref/bac_ref/bac_ref.model
mkdir gtdb_picrust_ref/arc_ref
cp gtdb_picrust_files/archaea_0.8_raxml-check.raxml.reduced_dna.fna gtdb_picrust_ref/arc_ref/arc_ref.fna
cp gtdb_picrust_files/archaea_0.8_raxml-check.raxml.reduced_dna.hmm gtdb_picrust_ref/arc_ref/arc_ref.hmm
cp gtdb_picrust_files/archaea_16S_centroids_ssu_align_best_reduced_0.8_fixed.tre gtdb_picrust_ref/arc_ref/arc_ref.tre
cp gtdb_picrust_files/archaea_0.8_raxml.raxml.bestModel gtdb_picrust_ref/arc_ref/arc_ref.model
Please first check our FAQ if you have any questions about PICRUSt2.
For other general questions and comments about PICRUSt2 please search the PICRUSt google group. If the question has not been previously answered then please make a new thread.
To report a bug or to make a feature request please make a new issue at the top of this page.