Commit d089631
Paul committed on May 16, 2024
1 parent: 8c740f1
Showing 19 changed files with 163 additions and 0 deletions.
@@ -0,0 +1,92 @@
import json
import os
import pickle
import re

import azure.functions as func
import numpy as np
import pandas as pd
import tensorflow as tf
from flask import Flask, request
from pythainlp import word_tokenize

flask_app = Flask(__name__)

# Load the scikit-learn models and their shared vectorizer once at startup.
with open('sentiment_analysis/models/logistic_regression.pkl', 'rb') as f:
    logistic_regression_model = pickle.load(f)
with open('sentiment_analysis/models/naive_bayes.pkl', 'rb') as f:
    naive_bayes_model = pickle.load(f)
with open('sentiment_analysis/models/sklearn_vectorizer.pkl', 'rb') as f:
    sklearn_vectorizer = pickle.load(f)
# Load the Keras models and their standalone TextVectorization SavedModels.
lstm_model = tf.keras.models.load_model('sentiment_analysis/models/lstm.h5', compile=False)
lstm_vectorizer = tf.keras.models.load_model('sentiment_analysis/models/tf_vectorizer')
transformers_vectorizer = tf.keras.models.load_model('sentiment_analysis/models/tf_vectorizer_t')
transformers_model = tf.keras.models.load_model('sentiment_analysis/models/transformers', compile=False)

batch_size = 32


def filter_thai(text):
    '''
    Keep only Thai characters and spaces; strip everything else.
    '''
    pattern = re.compile(r"[^\u0E00-\u0E7F ]|^'|'$|''")
    char_to_remove = re.findall(pattern, text)
    list_with_char_removed = [char for char in text if char not in char_to_remove]
    return ''.join(list_with_char_removed)


def tokenize_text(x):
    # Tokenize with PyThaiNLP and rejoin with single spaces, dropping whitespace-only tokens.
    return ' '.join(list(filter(lambda y: y.replace(' ', ''), word_tokenize(filter_thai(x)))))


@flask_app.get("/return_http")
def run_ai():
    if request.headers.get('aikey') == os.environ['AIKEY']:
        if request.is_json:
            try:
                json_body = request.get_json()
                data = json_body['json_data']
                model_choice = json_body['model_choice']
                # logging.info('successfully retrieved data')
                data = pd.DataFrame.from_dict(json.loads(data))
                # data was reordered by string indexing as it was serialized and sent
                data = data.loc[list(map(str, range(len(data.index))))]
                # logging.info('successfully converted data to data frame')
                if model_choice != 'Vote':
                    predictions = {model_choice: None}
                else:
                    predictions = {'Transformers': None, 'LSTM': None, 'Logistic Regression': None, 'Naive Bayes': None}
                data['0'] = data['0'].apply(filter_thai)
                if 'Logistic Regression' in predictions.keys():
                    predictions['Logistic Regression'] = logistic_regression_model.predict_proba(
                        sklearn_vectorizer.transform(data['0']).toarray()
                    ).tolist()
                if 'Naive Bayes' in predictions.keys():
                    predictions['Naive Bayes'] = naive_bayes_model.predict_proba(
                        sklearn_vectorizer.transform(data['0']).toarray()
                    ).tolist()
                data['0'] = data['0'].apply(tokenize_text)
                # Run the Keras models in slices of batch_size rows and stitch the
                # per-batch probability arrays back together.
                if 'LSTM' in predictions.keys():
                    predictions['LSTM'] = np.concatenate([lstm_model(lstm_vectorizer(
                        data.iloc[(batch_size * i):min((batch_size * (i + 1)), len(data.index))])).numpy() for i in
                        range(1 + len(data.index) // batch_size)], axis=0).tolist()
                if 'Transformers' in predictions.keys():
                    predictions['Transformers'] = np.concatenate([transformers_model(transformers_vectorizer(
                        data.iloc[(batch_size * i):min((batch_size * (i + 1)), len(data.index))])).numpy() for i in
                        range(1 + len(data.index) // batch_size)],
                        axis=0).tolist()
                # logging.info('successfully ran prediction models')
                # Return per-model probabilities plus the list of visible GPUs for debugging.
                return {'predictions': predictions,
                        'message': tf.config.list_physical_devices('GPU')
                        }, 201
            except Exception as e:
                return {"error": str(e)}, 400
        else:
            return {"error": "Request must be JSON"}, 415
    else:
        return {'error': 'Wrong key'}, 401


app = func.WsgiFunctionApp(app=flask_app.wsgi_app,
                           http_auth_level=func.AuthLevel.ANONYMOUS)
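For reference, a hedged sketch of a client call that matches what run_ai() expects: json_data carries a DataFrame serialized with to_json() (hence the string row labels the handler re-orders), model_choice names one model or 'Vote' for all four, and the shared secret travels in the aikey header. The hostname, the example texts and the use of the requests library are illustrative assumptions, not part of this commit.

import os

import pandas as pd
import requests  # assumed client-side dependency

texts = ["อาหารอร่อยมาก", "บริการช้าเกินไป"]  # illustrative Thai sentences
payload = {
    # DataFrame.to_json() turns row labels into the strings '0', '1', ... and the
    # single column into '0', which is what the handler's data.loc re-ordering assumes.
    "json_data": pd.DataFrame(texts).to_json(),
    # One of 'Transformers', 'LSTM', 'Logistic Regression', 'Naive Bayes', or 'Vote'.
    "model_choice": "Vote",
}

# The route is registered as GET but still expects a JSON body and the 'aikey' header.
resp = requests.get(
    "https://<function-app>.azurewebsites.net/return_http",  # placeholder hostname
    json=payload,
    headers={"aikey": os.environ["AIKEY"]},
)
print(resp.status_code)            # 201 on success
print(resp.json()["predictions"])  # per-model lists of class probabilities

A successful response carries one list of class probabilities per requested model under 'predictions'.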
@@ -0,0 +1,26 @@
{
    "version": "2.0",
    "logging":
    {
        "applicationInsights":
        {
            "samplingSettings":
            {
                "isEnabled": true,
                "excludedTypes": "Request"
            }
        }
    },
    "extensionBundle":
    {
        "id": "Microsoft.Azure.Functions.ExtensionBundle",
        "version": "[2.*, 3.0.0)"
    },
    "extensions":
    {
        "http":
        {
            "routePrefix": ""
        }
    }
}
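One detail worth flagging in this file: the empty routePrefix removes Azure Functions' default /api prefix, so the Flask route defined above is served at the site root. A small illustration (the hostname is a placeholder):

# With the default host.json ("routePrefix": "api") the endpoint would be
#   https://<function-app>.azurewebsites.net/api/return_http
# With "routePrefix": "" as configured above it is served at
#   https://<function-app>.azurewebsites.net/return_http
BASE_URL = "https://<function-app>.azurewebsites.net"  # placeholder hostname
RETURN_HTTP_URL = f"{BASE_URL}/return_http"            # no '/api' segment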
@@ -0,0 +1,6 @@
pandas==2.0.3
numpy==1.24.4
pythainlp==4.0.2
scikit-learn==1.3.2
tensorflow==2.10.1
flask==2.2.5
@@ -0,0 +1,4 @@
[Keras SavedModel metadata (binary, summarized): a Keras 2.10.0 Sequential wrapping a single string-input TextVectorization layer with max_tokens 1024, no standardization, whitespace split, integer output mode and output_sequence_length 64, backed by a StringLookup table with an "[UNK]" OOV token. Most likely the metadata for the tf_vectorizer SavedModel, whose variables files appear just below.]
Binary file added (+1.75 KB): sentiment_analysis/models/tf_vectorizer/variables/variables.data-00000-of-00001
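The tf_vectorizer and tf_vectorizer_t directories are standalone TextVectorization SavedModels that the application loads with tf.keras.models.load_model. A minimal sketch of how such a model could be produced, assuming TensorFlow 2.10; the corpus and the adapt() call are illustrative assumptions, only the layer configuration comes from the metadata summarized above:

import tensorflow as tf

# Illustrative corpus only; the real vocabulary was adapted from training data not in this commit.
corpus = tf.data.Dataset.from_tensor_slices([
    "ตัวอย่าง ข้อความ ภาษา ไทย",
    "รีวิว สินค้า ดี มาก",
])

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=1024,             # matches "max_tokens": 1024 in the metadata
    standardize=None,            # text is already cleaned by filter_thai()/tokenize_text()
    split="whitespace",
    output_mode="int",
    output_sequence_length=64,   # 64 for tf_vectorizer; tf_vectorizer_t uses 128
)
vectorizer.adapt(corpus.batch(32))

# Wrap the layer in a Sequential with a string input so it can be saved as a
# SavedModel directory and reloaded the way the app does.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorizer,
])
model.save("sentiment_analysis/models/tf_vectorizer")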
@@ -0,0 +1,4 @@
[Keras SavedModel metadata (binary, summarized): identical in structure to the one above but with output_sequence_length 128, most likely belonging to the tf_vectorizer_t SavedModel loaded as transformers_vectorizer.]
Binary file added (+1.75 KB): sentiment_analysis/models/tf_vectorizer_t/variables/variables.data-00000-of-00001
Binary file added (+256 Bytes): sentiment_analysis/models/tf_vectorizer_t/variables/variables.index