Medical.py
#!/usr/bin/env python
# coding: utf-8
# # Importing Libraries
# In[1]:
import streamlit as st  # importing the Streamlit library
# In[2]:
import pandas as pd
import numpy as np  # also used for managing NaNs
import gensim
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.decomposition import PCA
# In[3]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go  # our main display package
import string  # used for preprocessing
import re  # used for preprocessing
import nltk  # the Natural Language Toolkit, used for preprocessing
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  # used for preprocessing
from nltk.stem import WordNetLemmatizer  # used for preprocessing
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# # Importing datasets
# In[4]:
df = pd.read_csv('Dimension-covid.csv')   # copy used for preprocessing
df1 = pd.read_csv('Dimension-covid.csv')  # untouched copy used for returning results
# # Preprocessing data
# In[5]:
# remove urls, @-mentions, and any other non-alphanumeric characters
def remove_urls(text):
    new_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())
    return new_text

# make all text lowercase
def text_lowercase(text):
    return text.lower()

# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# tokenize
def tokenize(text):
    text = word_tokenize(text)
    return text

# remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    text = [i for i in text if i not in stop_words]
    return text

# lemmatize words
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text

# one function that applies all of the steps above in order
def preprocessing(text):
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    text = ' '.join(text)
    return text
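
# A quick illustration of the full pipeline (the input string is made up and
# the output is indicative, assuming the standard NLTK English stopword list
# and WordNet lemmatizer downloaded above):
# preprocessing("Results of 2 COVID-19 vaccine trials: https://example.org")
#   -> 'result covid vaccine trial'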
# load the pre-trained embedding models from disk
skipgram = Word2Vec.load('skipgramx11.bin')
fasttext_model = Word2Vec.load('FastText.bin')  # renamed so it no longer shadows the FastText class imported above
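
# A minimal sanity check after loading (assumes the token 'covid' is in the
# trained vocabulary, which is plausible but not guaranteed for these files):
# skipgram.wv.most_similar('covid', topn=5)  # -> 5 nearest terms by cosine similarity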
# In[12]:
vector_size = 100  # dimensionality of each word vector

# average the vectors of all in-vocabulary words to get one vector per text
def get_mean_vector(word2vec_model, words):
    # remove out-of-vocabulary words
    words = [word for word in tokenize(words) if word in word2vec_model.wv.key_to_index]
    if len(words) >= 1:
        return np.mean(word2vec_model.wv[words], axis=0)
    else:
        return np.zeros(vector_size)
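
# Sketch of how a free-text query becomes a single vector (names as above;
# the query string itself is illustrative):
# q_vec = get_mean_vector(skipgram, preprocessing('covid vaccine trial'))
# q_vec.shape  # -> (100,)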
# load the precomputed document vectors (column str(i) holds the vector for document i)
K = pd.read_csv('skipgram-vec.csv')
K2 = []
for i in range(df.shape[0]):
    K2.append(K[str(i)].values)

KK = pd.read_csv('FastText-vec.csv')
K1 = []
for i in range(df.shape[0]):
    K1.append(KK[str(i)].values)
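
# The CSVs are assumed to hold one column per document, indexed '0'..'N-1',
# with vector_size values per column; an optional sanity check of that layout:
# assert len(K2) == df.shape[0] and K2[0].shape[0] == vector_size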
from numpy import dot
from numpy.linalg import norm

# cosine similarity between two vectors
def cos_sim(a, b):
    return dot(a, b) / (norm(a) * norm(b))

pd.set_option("display.max_colwidth", None)  # display full text from each column (-1 is deprecated)
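
# Worked example: identical vectors score 1.0, orthogonal vectors 0.0:
# cos_sim(np.array([1.0, 0.0]), np.array([1.0, 0.0]))  # -> 1.0
# cos_sim(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  # -> 0.0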
# streamlit app
def main():
    # Load data and models
    data = df1  # the dataframe whose rows we display
    st.title("Clinical Trial Search Engine")  # title of our app
    st.write('Select Model')  # text below title
    Vectors = st.selectbox("Model", options=['Skipgram', 'Fasttext'])
    if Vectors == 'Skipgram':
        K = K2
        word2vec_model = skipgram
    elif Vectors == 'Fasttext':
        K = K1
        word2vec_model = fasttext_model
    st.write('Type your query here')
    query = st.text_input("Search box")  # getting input from the user

    # preprocess the query and embed it with the selected model
    def preprocessing_input(query):
        query = preprocessing(query)
        query = query.replace('\n', ' ')
        return get_mean_vector(word2vec_model, query)

    # rank all documents by cosine similarity to the query and return the top 10
    def top_n(query, p, df1):
        query = preprocessing_input(query)
        x = []
        for i in range(len(p)):
            x.append(cos_sim(query, p[i]))
        tmp = list(x)
        res = sorted(range(len(x)), key=lambda sub: x[sub])[-10:]  # indices of the 10 highest scores, ascending
        sim = [tmp[i] for i in reversed(res)]  # scores in descending order
        L = [i for i in reversed(res)]
        return df1.iloc[L, [1, 2, 5, 6]], sim

    model = top_n
    if query:
        P, sim = model(str(query), K, data)  # storing our output dataframe in P
        # Plotly table to display the ranked results
        fig = go.Figure(data=[go.Table(
            header=dict(values=['ID', 'Title', 'Abstract', 'Publication Date', 'Score']),
            cells=dict(values=[list(P['Trial ID'].values),
                               list(P['Title'].values),
                               list(P['Abstract'].values),
                               list(P['Publication date'].values),
                               list(np.around(sim, 4))],
                       align=['center', 'right']))])
        # displaying our plotly table
        fig.update_layout(height=1700, width=700, margin=dict(l=0, r=10, t=20, b=20))
        st.plotly_chart(fig)
if __name__ == "__main__":
    main()
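
# To launch the app locally (assuming Streamlit is installed):
#   streamlit run Medical.py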