Skip to content

Commit 265d7e3

Browse files
committed
Merge branch 'beta'
2 parents f4b4c72 + feb269b commit 265d7e3

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

44 files changed

+1788
-858
lines changed

database/README.md

+207-149
Large diffs are not rendered by default.

database/constants.py

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
class Constants:
    """Folder paths, data-file paths and database schema names shared by the loader scripts.

    Every path is a relative string assembled by plain concatenation from one
    of the ``*_FOLDER_PATH`` roots defined at the top of the class.
    """

    GRN_FOLDER_PATH = 'network-database'
    PPI_FOLDER_PATH = 'protein-protein-database'
    EXPRESSION_FOLDER_PATH = 'expression-database'
    # NOTE: unlike the three roots above, this one already ends with a slash,
    # so the union file names below are appended without a separator.
    UNION_GENE_FOLDER_PATH = 'union-gene-data/'

    # Gene data source file path
    GRN_GENE_SOURCE = GRN_FOLDER_PATH + "/script-results/processed-loader-files/gene.csv"
    PPI_GENE_SOURCE = PPI_FOLDER_PATH + "/script-results/processed-loader-files/gene.csv"
    EXPRESSION_GENE_SOURCE = EXPRESSION_FOLDER_PATH + "/script-results/processed-expression/genes.csv"

    # Union gene data
    GENE_DATA_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union_genes.csv'
    MISSING_GENE_UNION_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union-missing-genes.csv'
    UPDATE_GENE_UNION_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union-update-genes.csv'

    # Constants name: NETWORK_<table_name>_DATA_DIRECTORY
    GRN_DATABASE_NAMESPACE = 'gene_regulatory_network'
    GRN_SOURCE_TABLE_DATA_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/source.csv'
    GRN_NETWORK_TABLE_DATA_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/network.csv'

    # Protein-protein-interactions
    PPI_DATABASE_NAMESPACE = 'protein_protein_interactions'
    PPI_SOURCE_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/source.csv'
    PPI_NETWORK_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/physical_interaction_no_dupe.csv'
    PPI_PROTEIN_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/protein.csv'

    # Expression data
    EXPRESSION_DATABASE_NAMESPACE = 'gene_expression'
    # Misspelled historical name, kept as an alias so existing callers keep working.
    EXPRESISON_DATABASE_NAMESPACE = EXPRESSION_DATABASE_NAMESPACE
    EXPRESSION_REFS_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/refs.csv'
    EXPRESSION_METADATA_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/expression-metadata.csv'
    EXPRESSION_EXPRESSION_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/expression-data.csv'
    EXPRESSION_PRODUCTION_RATE_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/production-rates.csv'
    EXPRESSION_DEGRADATION_RATE_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/degradation-rates.csv'

    # Paths for update files
    PPI_MISSING_GENE_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/missing-genes.csv'
    PPI_UPDATE_GENE_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/update-genes.csv'
    PPI_MISSING_PROTEIN_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/missing-proteins.csv'
    PPI_UPDATE_PROTEIN_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/update-proteins.csv'
    GRN_MISSING_GENE_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/missing-genes.csv'
    GRN_UPDATE_GENE_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/update-genes.csv'

database/filter_update.py

+168
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
import os
2+
import csv
3+
from sqlalchemy import create_engine
4+
from sqlalchemy import text
5+
from constants import Constants
6+
from utils import Utils
7+
8+
# Tab-separated header rows passed to _create_gene_file for the PPI and GRN
# gene files; the GRN layout carries one extra "Regulator" column.
PROTEIN_GENE_HEADER = 'Gene ID\tDisplay Gene ID\tSpecies\tTaxon ID'
GRN_GENE_HEADER = 'Gene ID\tDisplay Gene ID\tSpecies\tTaxon ID\tRegulator'
10+
11+
def _get_all_data_from_database_table(database_namespace, table_name):
    """Return every row of ``<database_namespace>.<table_name>``.

    The connection URL is read from the ``DB_URL`` environment variable.

    Args:
        database_namespace: Schema that qualifies the table name.
        table_name: Name of the table to read.

    Returns:
        The rows produced by ``fetchall()`` on ``SELECT *``.

    Raises:
        KeyError: If ``DB_URL`` is not set in the environment.
    """
    db = create_engine(os.environ['DB_URL'])
    try:
        with db.connect() as connection:
            # Identifiers cannot be sent as bound parameters, so they are
            # interpolated; only pass trusted schema/table names here.
            result_set = connection.execute(text(f"SELECT * FROM {database_namespace}.{table_name}"))
            return result_set.fetchall()
    finally:
        # A fresh engine is built on every call, so release its connection
        # pool once the rows have been materialised.
        db.dispose()
16+
17+
def _get_all_db_genes(database_namespace):
    """Index the ``gene`` table of ``database_namespace`` by ``(column 0, column 3)``.

    Rows with more than four columns map to ``(column 1, column 2, column 4)``;
    shorter rows map to ``(column 1, column 2)``. When two rows share a key the
    later row wins.

    NOTE(review): callers treat the key as (gene id, taxon id) and the value as
    (display gene id, species[, regulator]) -- presumably the table's column
    order; confirm against the schema.
    """
    def _payload(record):
        # The fifth column only exists in some schemas, hence the length test.
        if len(record) > 4:
            return (record[1], record[2], record[4])
        return (record[1], record[2])

    rows = _get_all_data_from_database_table(database_namespace, "gene")
    return {(record[0], record[3]): _payload(record) for record in rows}
28+
29+
def _get_all_db_grn_genes():
    """Return the gene lookup built from the GRN database namespace."""
    grn_namespace = Constants.GRN_DATABASE_NAMESPACE
    return _get_all_db_genes(grn_namespace)
31+
32+
def _get_all_db_ppi_genes():
    """Return the gene lookup built from the PPI database namespace."""
    ppi_namespace = Constants.PPI_DATABASE_NAMESPACE
    return _get_all_db_genes(ppi_namespace)
34+
35+
def _get_all_genes():
    """Merge database genes with the genes listed in the union gene file.

    Steps:
      1. Read the GRN and PPI gene lookups from the database.
      2. Ensure ``union-gene-data`` exists and call ``Utils.create_union_file``
         with the PPI and GRN gene sources, targeting
         ``Constants.GENE_DATA_DIRECTORY``.
      3. Start from the GRN genes, then add PPI-only genes with ``False`` as
         their third value.
      4. Read the union file: add unknown genes, and replace a known gene when
         the file's display gene id differs and is not the string ``"None"``.

    Returns:
        dict mapping ``(gene_id, taxon_id)`` to a 3-tuple
        ``(display_gene_id, species, regulator)``.
    """
    db_grn_genes = _get_all_db_grn_genes()
    db_ppi_genes = _get_all_db_ppi_genes()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('union-gene-data', exist_ok=True)
    Utils.create_union_file([Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE], Constants.GENE_DATA_DIRECTORY)

    genes = db_grn_genes
    for gene, ppi_value in db_ppi_genes.items():
        if gene not in genes:
            display_gene_id, species = ppi_value
            # Stored as a tuple so every value in ``genes`` has the same type.
            genes[gene] = (display_gene_id, species, False)

    # The file is tab-separated: parse it on tabs directly so a comma inside a
    # field no longer truncates the row. Read-only access is sufficient.
    with open(Constants.GENE_DATA_DIRECTORY, 'r', encoding="UTF-8", newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # tolerate blank lines
            gene_id = row[0]
            display_gene_id = row[1]
            species = row[2]
            taxon_id = row[3]
            regulator = row[4].capitalize()
            key = (gene_id, taxon_id)
            value = (display_gene_id, species, regulator)
            if key not in genes:
                genes[key] = value
            elif genes[key][0] != display_gene_id and display_gene_id != "None":
                genes[key] = value
    return genes
69+
70+
71+
def get_all_proteins():
    """Index the PPI ``protein`` table by ``(column 0, column 5)``.

    Each key maps to the tuple of columns 1 through 4. When two rows share a
    key the later row wins.

    NOTE(review): processing_protein_file compares these values against
    (gene systematic name, length, molecular weight, pI) keyed by
    (standard name, taxon id) -- presumably the table's column order; confirm
    against the schema.
    """
    rows = _get_all_data_from_database_table(Constants.PPI_DATABASE_NAMESPACE, "protein")
    return {
        (record[0], record[5]): (record[1], record[2], record[3], record[4])
        for record in rows
    }
79+
80+
def processing_grn_gene_file():
    """Return ``(missing_genes, genes_to_update)`` computed against the GRN database genes."""
    grn_db_genes = _get_all_db_grn_genes()
    return _processing_gene_file(grn_db_genes, is_protein=False)
82+
83+
def processing_ppi_gene_file():
    """Return ``(missing_genes, genes_to_update)`` computed against the PPI database genes."""
    ppi_db_genes = _get_all_db_ppi_genes()
    return _processing_gene_file(ppi_db_genes)
85+
86+
def _processing_gene_file(db_genes, is_protein=True):
    """Split the merged gene set into genes to insert and genes to update.

    Args:
        db_genes: Mapping of ``(gene_id, taxon_id)`` to a tuple whose first
            element is the display gene id currently stored in the database.
        is_protein: When true, emitted values are ``(display_gene_id, species)``;
            otherwise they also carry the regulator as a third element.

    Returns:
        ``(missing_genes, genes_to_update)`` -- two dicts keyed like ``db_genes``.
    """
    print('Processing gene')
    missing_genes, genes_to_update = {}, {}
    for gene, (display_gene_id, species, regulator) in _get_all_genes().items():
        if is_protein:
            record = (display_gene_id, species)
        else:
            record = (display_gene_id, species, regulator)

        if gene not in db_genes:
            missing_genes[gene] = record
            continue

        stored_display_gene_id = db_genes[gene][0]
        # NOTE(review): the "None" guard tests the value already in the
        # database, whereas _get_all_genes tests the incoming value -- confirm
        # which one is intended.
        if stored_display_gene_id != display_gene_id and stored_display_gene_id != "None":
            genes_to_update[gene] = record
    return missing_genes, genes_to_update
107+
108+
def processing_protein_file(file_path, db_proteins):
    """Compare a tab-separated protein file with the proteins already stored.

    The first row of the file is treated as a header and skipped. Each data row
    supplies, in order: standard name, gene systematic name, length, molecular
    weight, pI and taxon id. Numeric fields holding the string ``"None"``
    become ``0``; all others are parsed with ``float``.

    Args:
        file_path: Path of the protein file to read (UTF-8).
        db_proteins: Mapping of ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.

    Returns:
        ``(ppi_missing_proteins, ppi_proteins_to_update)`` -- rows whose key is
        absent from ``db_proteins``, and rows whose key is present but whose
        value differs.

    Raises:
        ValueError: If a numeric field is neither ``"None"`` nor a float.
        IndexError: If a data row has fewer than six fields.
    """
    def _to_number(field):
        return float(field) if field != "None" else 0

    print(f'Processing file {file_path}')
    ppi_missing_proteins = {}
    ppi_proteins_to_update = {}
    # Parse on tabs directly: going through a comma-delimited reader first
    # truncated any row whose fields contain a comma. Read-only access suffices.
    with open(file_path, 'r', encoding="UTF-8", newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # tolerate blank lines
            standard_name = row[0]
            gene_systematic_name = row[1]
            length = _to_number(row[2])
            molecular_weight = _to_number(row[3])
            pi = _to_number(row[4])
            taxon_id = row[5]
            key = (standard_name, taxon_id)
            value = (gene_systematic_name, length, molecular_weight, pi)
            if key not in db_proteins:
                ppi_missing_proteins[key] = value
            elif db_proteins[key] != value:
                ppi_proteins_to_update[key] = value
    return ppi_missing_proteins, ppi_proteins_to_update
132+
133+
def create_grn_gene_file(file_path, data):
    """Write ``data`` to ``file_path`` using the GRN header and the extra regulator column."""
    _create_gene_file(file_path, headers=GRN_GENE_HEADER, data=data, is_protein=False)
135+
136+
def create_ppi_gene_file(file_path, data):
    """Write ``data`` to ``file_path`` using the four-column PPI gene header."""
    _create_gene_file(file_path, headers=PROTEIN_GENE_HEADER, data=data)
138+
139+
def _create_gene_file(file_path, headers, data, is_protein=True):
    """Write a tab-separated gene file.

    Args:
        file_path: Destination path; an existing file is overwritten.
        headers: Header line written first (without a trailing newline).
        data: Mapping of ``(gene_id, taxon_id)`` to a tuple starting with
            ``(display_gene_id, species)``; a third element is required when
            ``is_protein`` is false.
        is_protein: When true each row has four columns; otherwise the third
            tuple element is appended as a fifth column.
    """
    print(f'Creating {file_path}\n')
    # ``with`` closes the handle even if a row is malformed, and the explicit
    # encoding matches the UTF-8 used when these files are read back.
    with open(file_path, 'w', encoding='UTF-8') as gene_file:
        gene_file.write(f'{headers}\n')
        for gene, values in data.items():
            if is_protein:
                gene_file.write(f'{gene[0]}\t{values[0]}\t{values[1]}\t{gene[1]}\n')
            else:
                gene_file.write(f'{gene[0]}\t{values[0]}\t{values[1]}\t{gene[1]}\t{values[2]}\n')
149+
150+
def create_ppi_protein_file(file_path, data):
    """Write a tab-separated protein file.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Mapping of ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.
    """
    print(f'Creating {file_path}\n')
    headers = 'Standard Name\tGene Systematic Name\tLength\tMolecular Weight\tPI\tTaxon ID'
    # ``with`` closes the handle even if a row is malformed, and the explicit
    # encoding matches the UTF-8 used when protein files are read.
    with open(file_path, 'w', encoding='UTF-8') as protein_file:
        protein_file.write(f'{headers}\n')
        for protein, values in data.items():
            protein_file.write(f'{protein[0]}\t{values[0]}\t{values[1]}\t{values[2]}\t{values[3]}\t{protein[1]}\n')
158+
159+
# Processing gene files
ppi_missing_genes, ppi_genes_to_update = processing_ppi_gene_file()
grn_missing_genes, grn_genes_to_update = processing_grn_gene_file()
ppi_missing_proteins, ppi_proteins_to_update = processing_protein_file(
    Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY,
    get_all_proteins(),
)

# Write every "missing" / "update" file: GRN genes first, then PPI genes,
# then PPI proteins.
for _write_file, _target_path, _records in (
    (create_grn_gene_file, Constants.GRN_MISSING_GENE_DIRECTORY, grn_missing_genes),
    (create_grn_gene_file, Constants.GRN_UPDATE_GENE_DIRECTORY, grn_genes_to_update),
    (create_ppi_gene_file, Constants.PPI_MISSING_GENE_DIRECTORY, ppi_missing_genes),
    (create_ppi_gene_file, Constants.PPI_UPDATE_GENE_DIRECTORY, ppi_genes_to_update),
    (create_ppi_protein_file, Constants.PPI_MISSING_PROTEIN_DIRECTORY, ppi_missing_proteins),
    (create_ppi_protein_file, Constants.PPI_UPDATE_PROTEIN_DIRECTORY, ppi_proteins_to_update),
):
    _write_file(_target_path, _records)

database/loader.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import csv
2+
from utils import *
3+
from constants import Constants
4+
# python3 loader.py | psql postgresql://localhost/postgres
5+
import os
6+
7+
# Make sure the folder that receives the union gene file exists; exist_ok
# avoids the check-then-create race of a separate exists() test.
os.makedirs('union-gene-data', exist_ok=True)

# Get union gene data
Utils.create_union_file(
    [Constants.EXPRESSION_GENE_SOURCE, Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE],
    Constants.GENE_DATA_DIRECTORY,
)

# Regulatory Network
Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
Utils.load_grn_genes(Constants.GRN_GENE_SOURCE, Constants.GRN_DATABASE_NAMESPACE)
Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)

# Protein-protein-interactions
Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
Utils.load_ppi_genes(Constants.PPI_GENE_SOURCE, Constants.PPI_DATABASE_NAMESPACE)
Utils.load_proteins(Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)

# Expression data
# NOTE: EXPRESISON_DATABASE_NAMESPACE is the (misspelled) name this constant
# is declared under in constants.py.
Utils.load_refs(Constants.EXPRESSION_REFS_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_expression_genes(Constants.EXPRESSION_GENE_SOURCE, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_expression_metadata(Constants.EXPRESSION_METADATA_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_expression_data(Constants.EXPRESSION_EXPRESSION_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_production_rates(Constants.EXPRESSION_PRODUCTION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_degradation_rates(Constants.EXPRESSION_DEGRADATION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)

database/loader_update.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import argparse
2+
from constants import Constants
3+
from utils import Utils
4+
5+
def load_grn_data_into_database():
    """Run the GRN load steps in order: sources, gene updates, missing genes, network."""
    namespace = Constants.GRN_DATABASE_NAMESPACE

    Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, namespace)
    # Existing genes are updated before the missing ones are loaded.
    Utils.update_grn_genes(Constants.GRN_UPDATE_GENE_DIRECTORY, namespace)
    Utils.load_grn_genes(Constants.GRN_MISSING_GENE_DIRECTORY, namespace)
    Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, namespace)
10+
11+
def load_ppi_data_into_database():
    """Run the PPI load steps in order: sources, gene and protein updates, missing genes and proteins, network."""
    namespace = Constants.PPI_DATABASE_NAMESPACE

    Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, namespace)
    # Existing genes and proteins are updated before the missing ones are loaded.
    Utils.update_ppi_genes(Constants.PPI_UPDATE_GENE_DIRECTORY, namespace)
    Utils.update_ppi_proteins(Constants.PPI_UPDATE_PROTEIN_DIRECTORY, namespace)
    Utils.load_ppi_genes(Constants.PPI_MISSING_GENE_DIRECTORY, namespace)
    Utils.load_proteins(Constants.PPI_MISSING_PROTEIN_DIRECTORY, namespace)
    Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, namespace)
18+
19+
def main(argv=None):
    """Parse ``--network`` and run the matching database load.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: Raised by argparse when ``--network`` is missing or is not
            one of ``GRN`` / ``PPI``.
    """
    # Set up argument parsing
    parser = argparse.ArgumentParser(description="Load data into database for GRN or PPI networks.")
    parser.add_argument('--network', choices=['GRN', 'PPI'], required=True, help="Specify the network type to load data for")

    # Parse arguments
    args = parser.parse_args(argv)

    # argparse has already rejected every value other than 'GRN' and 'PPI',
    # so no fallback branch is needed here.
    if args.network == 'GRN':
        load_grn_data_into_database()
    else:
        load_ppi_data_into_database()


if __name__ == "__main__":
    main()

database/network-database/schema.sql

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,6 @@ CREATE TABLE gene_regulatory_network.network (
2020
time_stamp TIMESTAMP WITH TIME ZONE,
2121
source VARCHAR,
2222
FOREIGN KEY (regulator_gene_id, taxon_id) REFERENCES gene_regulatory_network.gene(gene_id, taxon_id),
23-
FOREIGN KEY (target_gene_id, taxon_id) REFERENCES gene_regulatory_network.gene(gene_id, taxon_id),
24-
FOREIGN KEY (time_stamp, source) REFERENCES gene_regulatory_network.source(time_stamp, source)
23+
FOREIGN KEY (target_gene_id, taxon_id) REFERENCES gene_regulatory_network_testing.gene(gene_id, taxon_id),
24+
FOREIGN KEY (time_stamp, source) REFERENCES gene_regulatory_network_testing.source(time_stamp, source)
2525
);

database/network-database/scripts/filter_genes.py

-76
This file was deleted.

0 commit comments

Comments
 (0)