-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdatautils.py
111 lines (96 loc) · 3.85 KB
/
datautils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import pyodbc
import codecs
from os.path import join
import datetime
import json
# SQL db setting
db_conn = {
'dsn': 'sqlserverdatasource',
'user': '',
'password': '',
'database': ''
}
# sql query template
txt_id_template = '{CN_Doc_ID}-{table}-{col}'
sql_template = 'select CN_Doc_ID + \'-{table}-{col}\' fn, {col} txt from {table} where CN_Doc_ID in ({ids})'
def query_sqlsvr(query, db_conf):
patients = {}
medicatons = []
ADEs = {}
drugs = {}
con_string = 'DSN=%s;UID=%s;PWD=%s;DATABASE=%s;' % (db_conf['dsn'],
db_conf['user'],
db_conf['password'],
db_conf['database'])
cnxn = pyodbc.connect(con_string)
cursor = cnxn.cursor()
cursor.execute(query)
rows = cursor.fetchall()
return rows
# extract cris text files
def extract_cris_texts(out_path):
q_txt_list = 'select * from ( select top 500 CN_Doc_ID, src_table, src_col ' \
'from sqlcris_user.[13_025].EXPERT_ADR_Data ) a ' \
'order by src_table, CN_Doc_ID'
txt_list = query_sqlsvr(q_txt_list, db_conn)
table = ''
col = ''
ids = ''
sqls = []
for txt in txt_list:
if txt.src_table != table:
if table.strip() != '':
sqls.append(sql_template.format(**{'table': table, 'col':col, 'ids': ids}))
table = txt.src_table
col = txt.src_col
ids = ''
ids += (',' if len(ids) > 0 else '') + "'" + txt.CN_Doc_ID + "'"
sqls.append(sql_template.format({'table': table, 'col': col, 'ids': ids}))
q_data = ' union '.join(sqls)
print 'query [\n%s\n]...' % q_data
rows = query_sqlsvr(q_data, db_conn)
for r in rows:
with codecs.open(join(out_path, r.fn), 'w', encoding='utf-8') as wf:
wf.write(r.txt)
print '{} files saved to {}'.format(len(rows), out_path)
# extract cris text files
def extract_cris_annotation(out_path):
q_txt_list = 'select * from ( select top 500 CN_Doc_ID, src_table, src_col, annotation_start, annotation_end ' \
'from sqlcris_user.[13_025].EXPERT_ADR_Data ) a ' \
'order by src_table, CN_Doc_ID '
txt_list = query_sqlsvr(q_txt_list, db_conn)
table = ''
col = ''
ids = ''
sqls = []
offsets = {}
for txt in txt_list:
if txt.src_table != table:
if table.strip() != '':
sqls.append(sql_template.format(**{'table': table, 'col':col, 'ids': ids}))
table = txt.src_table
col = txt.src_col
ids = ''
offsets = []
ids += (',' if len(ids) > 0 else '') + "'" + txt.CN_Doc_ID + "'"
doc_id = txt_id_template.format(**{'CN_Doc_ID': txt.CN_Doc_ID, 'table': txt.src_table, 'col': txt.src_col})
offsets[doc_id] = [{'start': txt.annotation_start, 'end': txt.annotation_end}] if doc_id not in offsets \
else offsets[doc_id] + [{'start': int(txt.annotation_start), 'end': int(txt.annotation_end)}]
sqls.append(sql_template.format({'table': table, 'col': col, 'ids': ids}))
q_data = ' union '.join(sqls)
print 'query [\n%s\n]... ' % q_data
rows = query_sqlsvr(q_data, db_conn)
for r in rows:
with codecs.open(join(out_path, r.fn), 'w', encoding='utf-8') as wf:
wf.write(r.txt)
data = []
i = 0
for ann in offsets:
data.append([ann['start'], ann['end'], r.txt[ann['start']:ann['end']], 'ADE', i])
i += 1
with codecs.open(join(out_path, r.fn + '.json'), 'w', encoding='utf-8') as wf:
json.dump(data, wf)
print r.fn + '.json done.'
print '{} files saved to {}'.format(len(rows), out_path)
if __name__ == "__main__":
extract_cris_annotation('./cris_docs/')