forked from enormandeau/Scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
blast_cluster_sequences.py
executable file
·56 lines (43 loc) · 1.39 KB
/
blast_cluster_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Cluster sequence IDs into groups of IDs whose sequences blast together.
Input file is the result file (outfmt 0) of fasta file blasted against itself.
Usage:
%program <input_file> <output_file>"""
import sys
from collections import defaultdict
from copy import deepcopy
try:
in_file = sys.argv[1]
out_file = sys.argv[2]
except:
print __doc__
sys.exit(0)
in_dict = defaultdict(set)
all_ids = set()
clusters = []
MAX_DEPTH = 100
with open(in_file) as f:
for line in f:
all_ids.update(line.strip().split())
k = line.strip().split()[0]
in_dict[k].update(line.strip().split())
total_len = len(in_dict)
print "There are", len(all_ids), "unique identifiers"
while len(in_dict.items()) > 0:
sys.stdout.write("\r" + str(len (in_dict)) + " of " + str(total_len) + ": " + str(100 * len(in_dict) / total_len) + "% ")
sys.stdout.flush()
temp = in_dict.popitem()
items = set(temp[1])
for d in range(MAX_DEPTH):
temp_items = deepcopy(items)
for i in items:
temp_items.update(in_dict[i])
in_dict.pop(i)
items = deepcopy(temp_items)
clusters.append(items)
clusters.sort(key=len, reverse=True)
print "which regroup into", len(clusters), "clusters"
with open(out_file, "w") as f:
for c in sorted(clusters):
f.write("\t".join(c) + "\n")