1
+ import os
2
+ import csv
3
+ from sqlalchemy import create_engine
4
+ from sqlalchemy import text
5
+ from constants import Constants
6
+ from utils import Utils
7
+
8
# Header rows written at the top of the generated gene files. The GRN header
# is the protein (PPI) header plus one trailing "Regulator" column.
# These are plain strings: neither contains a replacement field, so no f-prefix
# is needed.
# NOTE(review): the literal contents are kept exactly as they appear in the
# source, including the space after each tab - confirm that spacing is intended.
PROTEIN_GENE_HEADER = 'Gene ID\t Display Gene ID\t Species\t Taxon ID'
GRN_GENE_HEADER = 'Gene ID\t Display Gene ID\t Species\t Taxon ID\t Regulator'
10
+
11
def _get_all_data_from_database_table(database_namespace, table_name):
    """Return every row of the given table in the given database namespace.

    The connection URL is read from the ``DB_URL`` environment variable, so a
    ``KeyError`` is raised when that variable is not set.

    The namespace and table name are interpolated directly into the SQL text
    (identifiers cannot be sent as bound parameters), so both arguments must
    be trusted values. The callers in this module pass ``Constants``
    namespaces and literal table names.

    Args:
        database_namespace: Schema/namespace that qualifies the table.
        table_name: Name of the table to read.

    Returns:
        The result of ``fetchall()`` on ``SELECT *`` for that table.
    """
    engine = create_engine(os.environ['DB_URL'])
    try:
        with engine.connect() as connection:
            query = text(f"SELECT * FROM {database_namespace} .{table_name} ")
            return connection.execute(query).fetchall()
    finally:
        # A fresh engine is built on every call; dispose of it so its pooled
        # connections are closed instead of lingering until garbage collection.
        engine.dispose()
16
+
17
def _get_all_db_genes(database_namespace):
    """Index the ``gene`` table of a database namespace by a two-part key.

    Each row is keyed by ``(row[0], row[3])`` - presumably gene ID and taxon
    ID, matching how the rest of this module builds its keys. The value is
    ``(row[1], row[2])``, extended with ``row[4]`` when the row has more than
    four columns.

    Args:
        database_namespace: Namespace whose ``gene`` table is read.

    Returns:
        dict mapping the key tuple to the value tuple described above.
    """
    rows = _get_all_data_from_database_table(database_namespace, "gene")
    indexed = {}
    for row in rows:
        identity = (row[0], row[3])
        # Only rows wider than four columns carry the extra fifth field.
        details = (row[1], row[2], row[4]) if len(row) > 4 else (row[1], row[2])
        indexed[identity] = details
    return indexed
28
+
29
def _get_all_db_grn_genes():
    """Return the indexed ``gene`` table of the GRN database namespace."""
    namespace = Constants.GRN_DATABASE_NAMESPACE
    return _get_all_db_genes(namespace)
31
+
32
def _get_all_db_ppi_genes():
    """Return the indexed ``gene`` table of the PPI database namespace."""
    namespace = Constants.PPI_DATABASE_NAMESPACE
    return _get_all_db_genes(namespace)
34
+
35
def _get_all_genes():
    """Merge the genes of both databases with the rows of the union gene file.

    The result starts as the GRN database genes. PPI database genes that are
    not already present are added with a ``False`` regulator flag. Then every
    data row of the file at ``Constants.GENE_DATA_DIRECTORY`` (written by
    ``Utils.create_union_file``) is folded in: unknown keys are added, and a
    known key is overwritten when the file's display gene ID differs from the
    stored one and is not the string ``"None"``.

    Returns:
        dict keyed by ``(gene_id, taxon_id)``. Entries added here are
        ``(display_gene_id, species, regulator)`` tuples; entries that come
        straight from the GRN database keep whatever shape
        ``_get_all_db_genes`` gave them.
    """
    genes = _get_all_db_grn_genes()
    db_ppi_genes = _get_all_db_ppi_genes()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('union-gene-data', exist_ok=True)
    # NOTE(review): the available copy of this function had lost its
    # indentation; the union file is assumed to be regenerated on every call
    # rather than only when the directory is first created - confirm.
    Utils.create_union_file(
        [Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE],
        Constants.GENE_DATA_DIRECTORY,
    )

    for gene in db_ppi_genes:
        if gene not in genes:
            display_gene_id, species = db_ppi_genes[gene]
            # Stored as a tuple so every value in the result has the same type.
            genes[gene] = (display_gene_id, species, False)

    # The file is only read, so plain 'r' is enough (no write access needed).
    with open(Constants.GENE_DATA_DIRECTORY, 'r', encoding="UTF-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to parse.
                continue
            # The csv reader splits on commas, so the whole record is expected
            # in the first cell and is split into its columns here.
            # NOTE(review): delimiter literal kept exactly as in the source.
            fields = row[0].split('\t ')
            gene_id = fields[0]
            display_gene_id = fields[1]
            species = fields[2]
            taxon_id = fields[3]
            regulator = fields[4].capitalize()
            key = (gene_id, taxon_id)
            value = (display_gene_id, species, regulator)
            if key not in genes:
                genes[key] = value
            elif genes[key][0] != display_gene_id and display_gene_id != "None":
                genes[key] = value
    return genes
69
+
70
+
71
def get_all_proteins():
    """Index the ``protein`` table of the PPI database namespace.

    Returns:
        dict mapping ``(row[0], row[5])`` to
        ``(row[1], row[2], row[3], row[4])`` for every row of the table.
    """
    rows = _get_all_data_from_database_table(Constants.PPI_DATABASE_NAMESPACE, "protein")
    return {
        (row[0], row[5]): (row[1], row[2], row[3], row[4])
        for row in rows
    }
79
+
80
def processing_grn_gene_file():
    """Return ``(missing_genes, genes_to_update)`` for the GRN database."""
    grn_db_genes = _get_all_db_grn_genes()
    return _processing_gene_file(grn_db_genes, is_protein=False)
82
+
83
def processing_ppi_gene_file():
    """Return ``(missing_genes, genes_to_update)`` for the PPI database."""
    ppi_db_genes = _get_all_db_ppi_genes()
    return _processing_gene_file(ppi_db_genes, is_protein=True)
85
+
86
def _processing_gene_file(db_genes, is_protein=True):
    """Split the merged gene set into genes to insert and genes to update.

    Args:
        db_genes: Genes currently in the target database, keyed the same way
            as the result of ``_get_all_genes``.
        is_protein: When true, recorded values are
            ``(display_gene_id, species)``; otherwise they also carry the
            regulator flag as a third element.

    Returns:
        ``(missing_genes, genes_to_update)``. The first holds genes absent
        from ``db_genes``. The second holds genes whose display gene ID in
        ``db_genes`` differs from the merged one and is not the string
        ``"None"``.
    """
    print('Processing gene')
    missing_genes = {}
    genes_to_update = {}
    for gene, (display_gene_id, species, regulator) in _get_all_genes().items():
        if is_protein:
            record = (display_gene_id, species)
        else:
            record = (display_gene_id, species, regulator)

        if gene not in db_genes:
            missing_genes[gene] = record
            continue

        # NOTE(review): this guard tests the database's display ID against
        # "None", whereas _get_all_genes tests the incoming ID - confirm
        # which side is meant to be checked.
        stored_display_id = db_genes[gene][0]
        if stored_display_id != display_gene_id and stored_display_id != "None":
            genes_to_update[gene] = record
    return missing_genes, genes_to_update
107
+
108
def processing_protein_file(file_path, db_proteins):
    """Compare a protein data file against the proteins already stored.

    The first row of the file is treated as a header and skipped, and blank
    rows are ignored. In the numeric columns (length, molecular weight, PI)
    the string ``"None"`` is read as ``0``; anything else is converted with
    ``float``.

    Args:
        file_path: Path of the protein data file to read.
        db_proteins: dict mapping ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.

    Returns:
        ``(ppi_missing_proteins, ppi_proteins_to_update)``. The first holds
        file entries whose key is not in ``db_proteins``; the second holds
        entries whose key is present but whose value differs.

    Raises:
        IndexError: If a non-blank row has fewer than six columns.
        ValueError: If a numeric column is neither ``"None"`` nor a number.
    """
    print(f'Processing file {file_path} ')
    ppi_missing_proteins = {}
    ppi_proteins_to_update = {}
    # The file is only read, so plain 'r' is enough (no write access needed).
    with open(file_path, 'r', encoding="UTF-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to parse.
                continue
            # The csv reader splits on commas, so the whole record is expected
            # in the first cell and is split into its columns here.
            # NOTE(review): delimiter literal kept exactly as in the source.
            fields = row[0].split('\t ')
            standard_name = fields[0]
            gene_systematic_name = fields[1]
            length = float(fields[2]) if fields[2] != "None" else 0
            molecular_weight = float(fields[3]) if fields[3] != "None" else 0
            pi = float(fields[4]) if fields[4] != "None" else 0
            taxon_id = fields[5]
            key = (standard_name, taxon_id)
            value = (gene_systematic_name, length, molecular_weight, pi)
            if key not in db_proteins:
                ppi_missing_proteins[key] = value
            elif db_proteins[key] != value:
                ppi_proteins_to_update[key] = value
    return ppi_missing_proteins, ppi_proteins_to_update
132
+
133
def create_grn_gene_file(file_path, data):
    """Write ``data`` to ``file_path`` in the GRN gene layout (with regulator)."""
    _create_gene_file(file_path, headers=GRN_GENE_HEADER, data=data, is_protein=False)
135
+
136
def create_ppi_gene_file(file_path, data):
    """Write ``data`` to ``file_path`` in the PPI gene layout (no regulator)."""
    _create_gene_file(file_path, headers=PROTEIN_GENE_HEADER, data=data, is_protein=True)
138
+
139
def _create_gene_file(file_path, headers, data, is_protein=True):
    """Write a gene file: a header line followed by one line per gene.

    Args:
        file_path: Destination path; an existing file is overwritten.
        headers: Header text written as the first line.
        data: dict mapping ``(gene_id, taxon_id)`` to a sequence whose first
            two items are display gene ID and species, and whose third item
            (read only when ``is_protein`` is false) is the regulator flag.
        is_protein: When true, four columns are written per gene; otherwise
            the regulator value is appended as a fifth column.
    """
    print(f'Creating {file_path} \n ')
    # The context manager closes the file even if a row fails to format.
    # UTF-8 matches the encoding used when this module reads its data files.
    with open(file_path, 'w', encoding="UTF-8") as gene_file:
        gene_file.write(f'{headers} \n ')
        for gene, values in data.items():
            if is_protein:
                gene_file.write(f'{gene[0]} \t {values[0]} \t {values[1]} \t {gene[1]} \n ')
            else:
                gene_file.write(f'{gene[0]} \t {values[0]} \t {values[1]} \t {gene[1]} \t {values[2]} \n ')
149
+
150
def create_ppi_protein_file(file_path, data):
    """Write a protein file: a header line followed by one line per protein.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: dict mapping ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.
    """
    print(f'Creating {file_path} \n ')
    headers = 'Standard Name\t Gene Systematic Name\t Length\t Molecular Weight\t PI\t Taxon ID'
    # The context manager closes the file even if a row fails to format.
    # UTF-8 matches the encoding used when this module reads its data files.
    with open(file_path, 'w', encoding="UTF-8") as protein_file:
        protein_file.write(f'{headers} \n ')
        for protein, values in data.items():
            protein_file.write(
                f'{protein[0]} \t {values[0]} \t {values[1]} \t {values[2]} \t {values[3]} \t {protein[1]} \n '
            )
158
+
159
# Driver: runs when the module is executed or imported.
# Step 1 - work out, for genes in each database and for PPI proteins, which
# entries are missing and which need updating.
ppi_missing_genes, ppi_genes_to_update = processing_ppi_gene_file()
grn_missing_genes, grn_genes_to_update = processing_grn_gene_file()
ppi_missing_proteins, ppi_proteins_to_update = processing_protein_file(
    Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY,
    get_all_proteins(),
)

# Step 2 - write one "missing" file and one "update" file per data set.
create_grn_gene_file(Constants.GRN_MISSING_GENE_DIRECTORY, grn_missing_genes)
create_grn_gene_file(Constants.GRN_UPDATE_GENE_DIRECTORY, grn_genes_to_update)

create_ppi_gene_file(Constants.PPI_MISSING_GENE_DIRECTORY, ppi_missing_genes)
create_ppi_gene_file(Constants.PPI_UPDATE_GENE_DIRECTORY, ppi_genes_to_update)

create_ppi_protein_file(Constants.PPI_MISSING_PROTEIN_DIRECTORY, ppi_missing_proteins)
create_ppi_protein_file(Constants.PPI_UPDATE_PROTEIN_DIRECTORY, ppi_proteins_to_update)
0 commit comments