Skip to content

Updating the PICRUSt2 database

Robyn Wright edited this page Jan 13, 2025 · 3 revisions

This page will contain all commands used to update the PICRUSt2 database. It is currently still a work-in-progress. Please contact the Google Group with any questions, but there is a recent update to the PICRUSt2 database. We are still testing this, but you can see details on how you can use it here.

Get the genomes

These were downloaded from The GTDB website. You should check for an updated version of GTDB before downloading them.

Download all files (what we are looking for is the genomes from the representative sequences, the metadata for bacteria and archaea, the bacterial and archaeal trees and the taxonomy information for the bacteria and archaea):

mkdir picrust2_database
cd picrust2_database/
mkdir GTDB_r214
cd GTDB_r214
wget https://data.gtdb.ecogenomic.org/releases/release214/214.1/genomic_files_reps/gtdb_genomes_reps_r214.tar.gz https://data.gtdb.ecogenomic.org/releases/release214/214.1/bac120_taxonomy_r214.tsv https://data.gtdb.ecogenomic.org/releases/release214/214.1/bac120_r214.tree.gz https://data.gtdb.ecogenomic.org/releases/release214/214.1/bac120_metadata_r214.tsv.gz https://data.gtdb.ecogenomic.org/releases/release214/214.1/ar53_taxonomy_r214.tsv https://data.gtdb.ecogenomic.org/releases/release214/214.1/ar53_r214.tree https://data.gtdb.ecogenomic.org/releases/release214/214.1/ar53_metadata_r214.tsv.gz

tar -xvf gtdb_genomes_reps_r214.tar.gz
mkdir gtdb_genomes
find gtdb_genomes_reps_r214/ -type f -print0 | xargs -0 mv -t gtdb_genomes/

First get bacterial and archaeal genomes separately using tree files

from ete3 import Tree
import os

tree_name_bac = 'gtdb_picrust_files/bac120_r214.tree'
tree_name_arc = 'gtdb_picrust_files/ar53_r214.tree'
  
tree_bac = Tree(tree_name_bac, format=1, quoted_node_names=True)
bac_genomes_in_tree = []
for node in tree_bac.traverse("postorder"):
  if 'GB_' in node.name or 'GCA_' in node.name or 'GCF_' in node.name:
    bac_genomes_in_tree.append(node.name)
  
tree_arc = Tree(tree_name_arc, format=1, quoted_node_names=True)
arc_genomes_in_tree = []
for node in tree_arc.traverse("postorder"):
  if 'GB_' in node.name or 'GCA_' in node.name or 'GCF_' in node.name:
    arc_genomes_in_tree.append(node.name)

genomes = os.listdir('GTDB_r214/gtdb_genomes/')
genomes = [g for g in genomes if '.decomp' not in g]
bac_genomes_in_tree = [g.split('_', 1)[1] for g in bac_genomes_in_tree]
arc_genomes_in_tree = [g.split('_', 1)[1] for g in arc_genomes_in_tree]
bac_genomes_in_tree = set(bac_genomes_in_tree)
arc_genomes_in_tree = set(arc_genomes_in_tree)
bac_genomes = []
arc_genomes = []

bac_count = 0
arc_count = 0
for genome in genomes:
  if genome.replace('_genomic.fna.gz', '') in bac_genomes_in_tree:
    m = os.system('mv GTDB_r214/gtdb_genomes/'+genome+' genomes_to_search_barrnap/bacteria')
    bac_count += 1
  elif genome.replace('_genomic.fna.gz', '') in arc_genomes_in_tree:
    m = os.system('mv GTDB_r214/gtdb_genomes/'+genome+' genomes_to_search_barrnap/archaea')
    arc_count += 1
  if bac_count % 1000 == 0:
    print('Bacterial genomes moved: '+str(bac_count))
  if arc_count % 1000 == 0:
    print('Archaeal genomes moved: '+str(arc_count))
    
print(bac_count, arc_count)
#77930 4184

List of bacteria:

import os

genomes = os.listdir('bacteria')
with open('bacteria.txt', 'w') as f:
  for genome in genomes:
    f.write(genome.replace('.gz', '')+'\n')

Run barrnap

cd genomes_to_search_barrnap
mkdir barrnap_bacteria
mkdir barrnap_bacteria_08
mkdir barrnap_archaea_08
mkdir barrnap_archaea

parallel -j 24 --progress --eta 'gunzip {}' ::: archaea/*.gz
parallel -j 8 --progress --eta 'barrnap -q -k arc {} --outseq barrnap_archaea/{/} --reject 0.5 --threads 6' ::: archaea/*.fna

parallel -j 24 --progress --eta -a bacteria.txt 'gunzip bacteria/{}.gz'
parallel -j 8  --progress --eta -a bacteria.txt 'barrnap -q -k bac bacteria/{} --outseq barrnap_bacteria/{} --reject 0.5 --threads 6'

parallel -j 8  --progress --eta -a bacteria.txt 'barrnap -q -k bac bacteria/{} --outseq barrnap_bacteria_08/{} --reject 0.8 --threads 6'

parallel -j 8 --progress --eta 'barrnap -q -k arc {} --outseq barrnap_archaea_08/{/} --reject 0.8 --threads 6' ::: archaea/*.fna

mkdir barrnap_bacteria_16S_single_0.8
mkdir barrnap_archaea_16S_single_0.8
mkdir barrnap_bacteria_16S_multiple_0.8
mkdir barrnap_archaea_16S_multiple_0.8

Count copies per genome and get files with multiple 16S copies only

from Bio import SeqIO
import os
import pandas as pd
import numpy as np

kingdom = 'bacteria'
genomes = os.listdir('barrnap_'+kingdom+'_08')

copies = []
for genome in genomes:
  count = 0
  sequences = []
  for record in SeqIO.parse('barrnap_'+kingdom+'_08/'+genome, "fasta"):
    if '16S' in record.id:
      sequences.append(record)
      count += 1
  copies.append([genome, count])
  if sequences != []:
    if len(sequences) == 1:
      w = SeqIO.write(sequences, 'barrnap_'+kingdom+'_16S_single_0.8/'+genome, "fasta")
    else:
      w = SeqIO.write(sequences, 'barrnap_'+kingdom+'_16S_multiple_0.8/'+genome, "fasta")

with open(kingdom+'_16S_copies_0.8.txt', 'w') as f:
  for copy in copies:
    w = f.write(copy[0].replace('_genomic.fna', '')+'\t'+str(copy[1])+'\n')
    
k05 = pd.read_csv(kingdom+'_16S_copies_0.5.txt', sep='\t', header=None, index_col=0)
k05 = k05[k05.max(axis=1) > 0]
k08 = pd.read_csv(kingdom+'_16S_copies_0.8.txt', sep='\t', header=None, index_col=0)
k08 = k08[k08.max(axis=1) > 0]
print(kingdom)
print('50%: ', k05.shape[0], np.mean(k05.iloc[:, 0].values), np.median(k05.iloc[:, 0].values), np.max(k05.iloc[:, 0].values))
print('80%: ', k08.shape[0], np.mean(k08.iloc[:, 0].values), np.median(k08.iloc[:, 0].values), np.max(k08.iloc[:, 0].values))
# archaea
# 50%:  1898 1.1991570073761855 1.0
# 80%:  1553 1.2137797810688988 1.0
# bacteria
# 50%:  36644 1.777071280427901 1.0 37
# 80%:  32595 1.8404663291915937 1.0 37

Now cluster the ones with multiple 16S copies

mkdir barrnap_bacteria_16S_clustered_0.8
parallel -j 8 'vsearch --cluster_fast {} --id 0.9 --centroids barrnap_bacteria_16S_clustered_0.8/{/} --threads 6' ::: barrnap_bacteria_16S_multiple_0.8/*

mkdir barrnap_archaea_16S_clustered_0.8
parallel -j 8 --progress --eta 'vsearch --cluster_fast {} --id 0.9 --centroids barrnap_archaea_16S_clustered_0.8/{/} --threads 6' ::: barrnap_archaea_16S_multiple_0.8/*

Make files with single 16S for each genome

import os
from Bio import SeqIO
import random
from Bio.SeqRecord import SeqRecord

kingdoms = ['archaea', 'bacteria']
for kingdom in kingdoms:
  clusters = os.listdir('barrnap_'+kingdom+'_16S_clustered_0.8')
  singles = []
  ids = []
  for genome in clusters:
    records = []
    for record in SeqIO.parse('barrnap_'+kingdom+'_16S_clustered_0.8/'+genome, "fasta"):
      this_record = SeqRecord(record.seq, id=genome.replace('_genomic.fna', ''), description='')
      records.append(this_record)
    if len(records) > 1: 
      num = int(random.choice(range(len(records))))
      singles.append(records[num])
      ids.append(records[num].id)
    else:
      singles.append(records[0])
      ids.append(records[0].id)
  single_copies = os.listdir('barrnap_'+kingdom+'_16S_single_0.8/')
  for genome in single_copies:
    for record in SeqIO.parse('barrnap_'+kingdom+'_16S_single_0.8/'+genome, "fasta"):
      this_record = SeqRecord(record.seq, id=genome.replace('_genomic.fna', ''), description='')
      singles.append(this_record)
      ids.append(this_record.id)
  SeqIO.write(singles, kingdom+"_16S_genes_0.8.fasta", "fasta")

Look at these sequence lengths:

from Bio import SeqIO
import numpy as np

kingdom = 'archaea'
lengths = []
for record in SeqIO.parse(kingdom+"_16S_genes_0.8.fasta", "fasta"):
  lengths.append(len(str(record.seq)))

print(min(lengths), max(lengths), np.mean(lengths), np.median(lengths))
#bacteria
#1268 1862 1509.0679552078539 1520.0
#archaea
#1270 2547 1474.9130714745654 1472.0

Cluster these single 16S genes for all genomes

vsearch --cluster_fast archaea_16S_genes_0.8.fasta --id 1 --centroids archaea_16S_centroids_0.8.fasta --uc archaea_16S_clusters_0.8.uc --threads 24
# Reading file archaea_16S_genes_0.8.fasta 100%  
# 2290540 nt in 1553 seqs, min 1270, max 2547, avg 1475
# Masking 100%  
# Sorting by length 100%
# Counting unique k-mers 100%  
# Clustering 100%  
# Sorting clusters 100%
# Writing clusters 100%  
# Clusters: 1539 Size min 1, max 3, avg 1.0
# Singletons: 1526, 98.3% of seqs, 99.2% of clusters

vsearch --cluster_fast bacteria_16S_genes_0.8.fasta --id 1 --centroids bacteria_16S_centroids_0.8.fasta --uc bacteria_16S_clusters_0.8.uc --threads 24
# Reading file bacteria_16S_genes_0.8.fasta 100%  
# 49188070 nt in 32595 seqs, min 1268, max 1862, avg 1509
# Masking 100%  
# Sorting by length 100%
# Counting unique k-mers 100%  
# Clustering 100%  
# Sorting clusters 100%
# Writing clusters 100%  
# Clusters: 31499 Size min 1, max 35, avg 1.0
# Singletons: 30794, 94.5% of seqs, 97.8% of clusters

Align sequences

ssu-align:

conda activate clustalo
export SSUALIGNDIR="/usr/local/share/ssu-align-0.1.1"
ssu-align --rfonly archaea_16S_centroids_0.8.fasta archaea_16S_centroids_ssu_align_0.8

ssu-align --rfonly bacteria_16S_centroids_0.8.fasta bacteria_16S_centroids_ssu_align_0.8

########################

esl-reformat -o archaea_16S_centroids_ssu_align_0.8.fna afa archaea_16S_centroids_ssu_align_0.8/archaea_16S_centroids_ssu_align_0.8.archaea.stk

esl-reformat -o bacteria_16S_centroids_ssu_align_0.8.fna afa bacteria_16S_centroids_ssu_align_0.8/bacteria_16S_centroids_ssu_align_0.8.bacteria.stk

cd ..

Look at these sequence lengths (unaligned):

from Bio import SeqIO
import numpy as np

kingdom = 'bacteria'
lengths = []
for record in SeqIO.parse(kingdom+'_16S_centroids_ssu_align_0.8.fna', "fasta"):
  lengths.append(len(str(record.seq)))

print(min(lengths), max(lengths), np.mean(lengths), np.median(lengths))
#bacteria
#1508 1508 1508.0 1508.0
#archaea
#1582 1582 1582.0 1582.0

Now look at choosing best genome for each cluster

Archaea:

import os
import pandas as pd
from Bio import SeqIO
import numpy as np
import random

clusters = 'genomes_to_search_barrnap/archaea_16S_clusters_0.8.uc'
aligned_fasta = 'genomes_to_search_barrnap/archaea_16S_centroids_ssu_align_0.8.fna'
md = pd.read_csv('GTDB_r214/ar53_metadata_r214.tsv', index_col=0, header=0, sep='\t')
md = md[md['gtdb_representative'] == 't']

genes_16S = []
for record in SeqIO.parse(aligned_fasta, "fasta"):
  genes_16S.append(record.id)
  
rename_md = {}
for row in md.index.values:
  rename_md[row] = row.split('_', 1)[1]

md = md.rename(index=rename_md)

clusters_all_genomes, clusters_centroid = {}, []
for row in open(clusters, 'r'):
  row = row.replace('\n', '').split('\t')
  if row[-1] != '*':
    genomes = row[-2:]
    if genomes[0] in genes_16S:
      #print('First one', genomes)
      clusters_centroid.append(genomes[0])
      if genomes[0] in clusters_all_genomes:
        clusters_all_genomes[genomes[0]].append(genomes[1])
      else:
        clusters_all_genomes[genomes[0]] = [genomes[1]]
    elif genomes[1] in genes_16S:
      clusters_centroid.append(genomes[1])
      if genomes[1] in clusters_all_genomes:
        clusters_all_genomes[genomes[1]].append(genomes[0])
      else:
        clusters_all_genomes[genomes[1]] = [genomes[0]]
    else:
      print('Neither', genomes)

#now choose best genome from all clusters
cluster_bests = {}
for cluster in clusters_all_genomes:
  md_cluster = md.loc[[cluster]+clusters_all_genomes[cluster], :]
  completeness = md_cluster['checkm_completeness'].values
  contamination = md_cluster['checkm_contamination'].values
  best = ''
  if len(set(completeness)) == 1:
    if len(set(contamination)) == 1:
      num = int(random.choice(range(md_cluster.shape[0])))
      best = md_cluster.index.values[num]
    else:
      lowest, best = 100, ''
      for row in md_cluster.index.values:
        if md_cluster.loc[row, 'checkm_contamination'] < lowest:
          lowest, best = md_cluster.loc[row, 'checkm_contamination'], row
  else:
    highest, best = 0, ''
    for row in md_cluster.index.values:
      if md_cluster.loc[row, 'checkm_completeness'] > highest:
        highest, best = md_cluster.loc[row, 'checkm_completeness'], row
  cluster_bests[cluster] = best

#rename fasta file with the best of the cluster
new_records = []
all_ids = []
for record in SeqIO.parse(aligned_fasta, "fasta"):
  if record.id in cluster_bests:
    print('Changing', record.id, 'to', cluster_bests[record.id])
    record.id = cluster_bests[record.id]
  all_ids.append(record.id)
  new_records.append(record)

SeqIO.write(new_records, aligned_fasta.replace('.fna', '_best.fna'), "fasta")

#make file with name of best genome in cluster and other genomes within the cluster
with open('genomes_to_search_barrnap/archaea_16S_clusters_processed_0.8.txt', 'w') as f:
  w = f.write('Centroid\tBest\tAll genomes\n')
  for cluster in clusters_all_genomes:
    w = f.write(cluster+'\t'+cluster_bests[cluster]+'\t'+cluster+','+','.join(clusters_all_genomes[cluster])+'\n')
  
#get reduced metadata file including only those genomes that are the best
md = md.loc[all_ids, :]
md.to_csv('archaea_metadata_clusters_ssu_align_centroids_0.8.csv')

Bacteria:

import os
import pandas as pd
from Bio import SeqIO
import numpy as np
import random

clusters = 'genomes_to_search_barrnap/bacteria_16S_clusters_0.8.uc'
aligned_fasta = 'genomes_to_search_barrnap/bacteria_16S_centroids_ssu_align_0.8.fna'
md = pd.read_csv('GTDB_r214/bac120_metadata_r214.tsv', index_col=0, header=0, sep='\t')
md = md[md['gtdb_representative'] == 't']

genes_16S = []
for record in SeqIO.parse(aligned_fasta, "fasta"):
  genes_16S.append(record.id)
  
rename_md = {}
for row in md.index.values:
  rename_md[row] = row.split('_', 1)[1]

md = md.rename(index=rename_md)

clusters_all_genomes, clusters_centroid = {}, []
for row in open(clusters, 'r'):
  row = row.replace('\n', '').split('\t')
  if row[-1] != '*':
    #print(row)
    genomes = row[-2:]
    if genomes[0] in genes_16S:
      #print('First one', genomes)
      clusters_centroid.append(genomes[0])
      if genomes[0] in clusters_all_genomes:
        clusters_all_genomes[genomes[0]].append(genomes[1])
      else:
        clusters_all_genomes[genomes[0]] = [genomes[1]]
    elif genomes[1] in genes_16S:
      clusters_centroid.append(genomes[1])
      if genomes[1] in clusters_all_genomes:
        clusters_all_genomes[genomes[1]].append(genomes[0])
      else:
        clusters_all_genomes[genomes[1]] = [genomes[0]]
    else:
      neither = True #note that this means that the sequences were removed from this step because they fitted the model of a different domain the best

#now choose best genome from all clusters
cluster_bests = {}
for cluster in clusters_all_genomes:
  md_cluster = md.loc[[cluster]+clusters_all_genomes[cluster], :]
  completeness = md_cluster['checkm_completeness'].values
  contamination = md_cluster['checkm_contamination'].values
  best = ''
  if len(set(completeness)) == 1:
    if len(set(contamination)) == 1:
      num = int(random.choice(range(md_cluster.shape[0])))
      best = md_cluster.index.values[num]
    else:
      lowest, best = 100, ''
      for row in md_cluster.index.values:
        if md_cluster.loc[row, 'checkm_contamination'] < lowest:
          lowest, best = md_cluster.loc[row, 'checkm_contamination'], row
  else:
    highest, best = 0, ''
    for row in md_cluster.index.values:
      if md_cluster.loc[row, 'checkm_completeness'] > highest:
        highest, best = md_cluster.loc[row, 'checkm_completeness'], row
  cluster_bests[cluster] = best

#rename fasta file with the best of the cluster
new_records = []
all_ids = []
for record in SeqIO.parse(aligned_fasta, "fasta"):
  if record.id in cluster_bests:
    print('Changing', record.id, 'to', cluster_bests[record.id])
    record.id = cluster_bests[record.id]
  all_ids.append(record.id)
  new_records.append(record)

SeqIO.write(new_records, aligned_fasta.replace('.fna', '_best.fna'), "fasta")

#make file with name of best genome in cluster and other genomes within the cluster
with open('genomes_to_search_barrnap/bacteria_16S_clusters_processed_0.8.txt', 'w') as f:
  w = f.write('Centroid\tBest\tAll genomes\n')
  for cluster in clusters_all_genomes:
    w = f.write(cluster+'\t'+cluster_bests[cluster]+'\t'+cluster+','+','.join(clusters_all_genomes[cluster])+'\n')
  
#get reduced metadata file including only those genomes that are the best
md = md.loc[all_ids, :]
md.to_csv('bacteria_metadata_clusters_ssu_align_centroids_0.8.csv')
mv gtdb_picrust_files gtdb_picrust_files_first_try
mkdir gtdb_picrust_files

And filter the files to only include genomes <= 10% contamination and >= 90% completion

import pandas as pd
from Bio import SeqIO
from ete3 import Tree

domains = ['bacteria', 'archaea']
trees = ['GTDB_r214/bac120_r214.tree', 'GTDB_r214/ar53_r214.tree']

for d in range(len(domains)):
  domain = domains[d]
  tree = Tree(trees[d], format=1, quoted_node_names=True)
  aligned_fasta = 'genomes_to_search_barrnap/'+domain+'_16S_centroids_ssu_align_0.8_best.fna'
  md = pd.read_csv(domain+'_metadata_clusters_ssu_align_centroids_0.8.csv', index_col=0, header=0)
  md = md[md['checkm_contamination'] <= 10]
  md = md[md['checkm_completeness'] >= 90]
  new_records = []
  for record in SeqIO.parse(aligned_fasta, "fasta"):
    if record.id in md.index.values:
      new_records.append(record)
  SeqIO.write(new_records, 'gtdb_picrust_files/'+domain+'_16S_centroids_ssu_align_best_reduced_0.8.fna', "fasta")
  pruning = []
  names = []
  for node in tree.traverse("postorder"):
    if 'GB_' in node.name or 'GCA_' in node.name or 'GCF_' in node.name:
      new_name = str(node.name).split('_', 1)[1]
      node.name = new_name
      names.append(new_name)
      if new_name in md.index.values:
        pruning.append(new_name)
  tree.prune(pruning)
  tree.write(outfile='gtdb_picrust_files/'+domain+'_16S_centroids_ssu_align_best_reduced_0.8.tre')
  

Run the raxml check

cd gtdb_picrust_files
raxml-ng --check --msa archaea_16S_centroids_ssu_align_best_reduced_0.8.fna --model GTR+G --prefix archaea_0.8_raxml-check
#this didn't need fixing!
#esl-reformat -o archaea_raxml-check.raxml.reduced.phy phylip archaea_16S_centroids_ssu_align_best_reduced.fna
#but for some reason, the place_seqs step didn't work when I'd used this so I've just made the steps the same for both of them

raxml-ng --check --msa bacteria_16S_centroids_ssu_align_best_reduced_0.8.fna --model GTR+G --prefix bacteria_0.8_raxml-check
#this had some duplicates removed still apparently (copied this from previously, but outcome is the same!!)

Convert archaea alignment to phylip so we have the same for both:

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment

seq_records = []
seqs, lengths = 0, []
for record in SeqIO.parse('archaea_16S_centroids_ssu_align_best_reduced_0.8.fna', 'fasta'):
  seq_records.append(record)
  seqs += 1
  lengths.append(len(str(record.seq)))
  
# count = 0
# for row in open('bacteria_0.8_raxml-check.raxml.reduced.phy', 'r'):
#   if count < 2: print([row])
#   count += 1
  
with open('archaea_0.8_raxml-check.raxml.reduced.phy', 'w') as f:
  f.write(str(seqs)+' '+str(int(lengths[0]))+'\n')
  for record in seq_records:
    f.write(record.id+' '+str(record.seq)+'\n')
  

Filter the tree files to include only these

from ete3 import Tree
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
#from Bio.Alphabet import IUPAC

fixed_alignments = ['bacteria_0.8_raxml-check.raxml.reduced.phy', 'archaea_0.8_raxml-check.raxml.reduced.phy']
tree_files = ['bacteria_16S_centroids_ssu_align_best_reduced_0.8.tre', 'archaea_16S_centroids_ssu_align_best_reduced_0.8.tre']

for f in range(len(fixed_alignments)):
  fixed_alignment = fixed_alignments[f]
  tree_file = tree_files[f]
  genomes = []
  for row in open(fixed_alignment, 'r'):
    if 'GB_' in row or 'GCA_' in row or 'GCF_' in row:
      genomes.append(row.split(' ')[0].replace('>', '').replace('\n', ''))
  
  genomes = set(genomes)
  print(len(genomes))
  #29311, 
  tree = Tree(tree_file, format=1, quoted_node_names=True)
  genome_nodes = []
  for node in tree.traverse("postorder"):
    if node.name in genomes:
      genome_nodes.append(node.name)
  
  tree.prune(genome_nodes)
  tree.write(outfile=tree_file.replace('.tre', '_fixed.tre'), format=1)

Run raxml

mkdir archaea_first_raxml
mv archaea_raxml.raxml* archaea_first_raxml/

raxml-ng --evaluate --msa archaea_0.8_raxml-check.raxml.reduced.phy --tree archaea_16S_centroids_ssu_align_best_reduced_0.8_fixed.tre --prefix archaea_0.8_raxml --model GTR+G —threads 24

raxml-ng --evaluate --msa bacteria_0.8_raxml-check.raxml.reduced.phy --tree bacteria_16S_centroids_ssu_align_best_reduced_0.8_fixed.tre --prefix bacteria_0.8_raxml --model GTR+G —threads 24

Resave the phylip files and reformat for the hmm files

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment

files = ['bacteria_0.8_raxml-check.raxml.reduced.phy', 'archaea_0.8_raxml-check.raxml.reduced.phy']
new_names = ['bacteria_0.8_raxml-check.raxml.reduced.fna', 'archaea_0.8_raxml-check.raxml.reduced.fna']

for f in range(len(files)):
  sequences = {}
  seq_records = []
  count = 0
  for row in open(files[f], 'r'):
    #if count > 10: break
    if '_' in row:
      name, seq = row.replace('\n', '').split(' ')
      sequences[name] = seq
      seq_records.append(SeqRecord(Seq(seq),id=name, description=''))
    count += 1
  msa = MultipleSeqAlignment(seq_records)
  AlignIO.write(msa, new_names[f], "fasta")

Convert the fasta file to DNA and reformat to stockholm:

esl-reformat -d -o bacteria_0.8_raxml-check.raxml.reduced_dna.fna afa bacteria_0.8_raxml-check.raxml.reduced.fna

esl-reformat -o bacteria_0.8_raxml-check.raxml.reduced_dna.sto stockholm bacteria_0.8_raxml-check.raxml.reduced_dna.fna

esl-reformat -d -o archaea_0.8_raxml-check.raxml.reduced_dna.fna afa archaea_0.8_raxml-check.raxml.reduced.fna

esl-reformat -o archaea_0.8_raxml-check.raxml.reduced_dna.sto stockholm archaea_0.8_raxml-check.raxml.reduced_dna.fna

Resave the archaea file and reformat for the hmm files

# from Bio import SeqIO
# from Bio.SeqRecord import SeqRecord
# from Bio.Seq import Seq
# from Bio.Alphabet import IUPAC
# from Bio import AlignIO
# from Bio.Align import MultipleSeqAlignment
# 
# files = ['bacteria_raxml-check.raxml.reduced.phy']
# new_names = ['bacteria_raxml-check.raxml.reduced.fna']
# 
# for f in range(len(files)):
#   sequences = {}
#   seq_records = []
#   count = 0
#   for row in open(files[f], 'r'):
#     #if count > 10: break
#     if '_' in row:
#       name, seq = row.replace('\n', '').split(' ')
#       sequences[name] = seq
#       seq_records.append(SeqRecord(Seq(seq),id=name, description=''))
#     count += 1
#   msa = MultipleSeqAlignment(seq_records)
#   AlignIO.write(msa, new_names[f], "fasta")

Reformat for the hmm files

#esl-reformat -d -o archaea_16S_centroids_ssu_align_best_reduced_dna.fna afa archaea_16S_centroids_ssu_align_best_reduced.fna

#esl-reformat -o archaea_16S_centroids_ssu_align_best_reduced_dna.sto stockholm archaea_16S_centroids_ssu_align_best_reduced_dna.fna

Run the HMMs

#hmmbuild --cpu 24 archaea_16S_centroids_ssu_align_best_reduced_dna.hmm archaea_16S_centroids_ssu_align_best_reduced_dna.sto

#esl-reformat -o archaea_16S_centroids_ssu_align_best_reduced_dna_reformat.fna fasta archaea_16S_centroids_ssu_align_best_reduced_dna.sto

hmmbuild --cpu 24 bacteria_0.8_raxml-check.raxml.reduced_dna.hmm bacteria_0.8_raxml-check.raxml.reduced_dna.sto
hmmbuild --cpu 24 archaea_0.8_raxml-check.raxml.reduced_dna.hmm archaea_0.8_raxml-check.raxml.reduced_dna.sto

Copy and rename the resulting files

cd ..
mv gtdb_picrust_ref/ gtdb_0.5_picrust_ref
mkdir gtdb_picrust_ref
#mkdir gtdb_picrust_ref/arc_ref
mkdir gtdb_picrust_ref/bac_ref
#cp gtdb_picrust_files/archaea_16S_centroids_ssu_align_best_reduced_dna_reformat.fna gtdb_picrust_ref/arc_ref/arc_ref.fna
#cp gtdb_picrust_files/archaea_16S_centroids_ssu_align_best_reduced_dna.hmm gtdb_picrust_ref/arc_ref/arc_ref.hmm
#cp gtdb_picrust_files/archaea_16S_centroids_ssu_align_best_reduced.tre gtdb_picrust_ref/arc_ref/arc_ref.tre
#cp gtdb_picrust_files/archaea_raxml.raxml.bestModel gtdb_picrust_ref/arc_ref/arc_ref.model
#mv gtdb_picrust_ref/arc_ref gtdb_picrust_ref/arc_ref_initial

cp gtdb_picrust_files/bacteria_0.8_raxml-check.raxml.reduced_dna.fna gtdb_picrust_ref/bac_ref/bac_ref.fna
cp gtdb_picrust_files/bacteria_0.8_raxml-check.raxml.reduced_dna.hmm gtdb_picrust_ref/bac_ref/bac_ref.hmm
cp gtdb_picrust_files/bacteria_16S_centroids_ssu_align_best_reduced_0.8_fixed.tre gtdb_picrust_ref/bac_ref/bac_ref.tre
cp gtdb_picrust_files/bacteria_0.8_raxml.raxml.bestModel gtdb_picrust_ref/bac_ref/bac_ref.model

mkdir gtdb_picrust_ref/arc_ref
cp gtdb_picrust_files/archaea_0.8_raxml-check.raxml.reduced_dna.fna gtdb_picrust_ref/arc_ref/arc_ref.fna
cp gtdb_picrust_files/archaea_0.8_raxml-check.raxml.reduced_dna.hmm gtdb_picrust_ref/arc_ref/arc_ref.hmm
cp gtdb_picrust_files/archaea_16S_centroids_ssu_align_best_reduced_0.8_fixed.tre gtdb_picrust_ref/arc_ref/arc_ref.tre
cp gtdb_picrust_files/archaea_0.8_raxml.raxml.bestModel gtdb_picrust_ref/arc_ref/arc_ref.model
Clone this wiki locally