Commit d089631
Paul committed on May 16, 2024
1 parent: 8c740f1
Showing 19 changed files with 163 additions and 0 deletions.
@@ -0,0 +1,92 @@
import json
import os
import pickle
import re

import azure.functions as func
import numpy as np
import pandas as pd
import tensorflow as tf
from flask import Flask, request
from pythainlp import word_tokenize

flask_app = Flask(__name__)

# Load the scikit-learn models and their shared vectorizer once at startup.
with open('sentiment_analysis/models/logistic_regression.pkl', 'rb') as f:
    logistic_regression_model = pickle.load(f)
with open('sentiment_analysis/models/naive_bayes.pkl', 'rb') as f:
    naive_bayes_model = pickle.load(f)
with open('sentiment_analysis/models/sklearn_vectorizer.pkl', 'rb') as f:
    sklearn_vectorizer = pickle.load(f)
# Load the Keras models and their standalone TextVectorization SavedModels.
lstm_model = tf.keras.models.load_model('sentiment_analysis/models/lstm.h5', compile=False)
lstm_vectorizer = tf.keras.models.load_model('sentiment_analysis/models/tf_vectorizer')
transformers_vectorizer = tf.keras.models.load_model('sentiment_analysis/models/tf_vectorizer_t')
transformers_model = tf.keras.models.load_model('sentiment_analysis/models/transformers', compile=False)

batch_size = 32


def filter_thai(text):
    '''
    Keep only Thai characters and spaces; strip everything else.
    '''
    pattern = re.compile(r"[^\u0E00-\u0E7F ]|^'|'$|''")
    char_to_remove = re.findall(pattern, text)
    list_with_char_removed = [char for char in text if char not in char_to_remove]
    return ''.join(list_with_char_removed)


def tokenize_text(x):
    # Tokenize with PyThaiNLP and rejoin with single spaces, dropping whitespace-only tokens.
    return ' '.join(list(filter(lambda y: y.replace(' ', ''), word_tokenize(filter_thai(x)))))


@flask_app.get("/return_http")
def run_ai():
    if request.headers.get('aikey') == os.environ['AIKEY']:
        if request.is_json:
            try:
                json_body = request.get_json()
                data = json_body['json_data']
                model_choice = json_body['model_choice']
                # logging.info('successfully retrieved data')
                data = pd.DataFrame.from_dict(json.loads(data))
                # data was reordered by string indexing as it was serialized and sent
                data = data.loc[list(map(str, range(len(data.index))))]
                # logging.info('successfully converted data to data frame')
                if model_choice != 'Vote':
                    predictions = {model_choice: None}
                else:
                    predictions = {'Transformers': None, 'LSTM': None, 'Logistic Regression': None, 'Naive Bayes': None}
                data['0'] = data['0'].apply(filter_thai)
                if 'Logistic Regression' in predictions.keys():
                    predictions['Logistic Regression'] = logistic_regression_model.predict_proba(
                        sklearn_vectorizer.transform(data['0']).toarray()
                    ).tolist()
                if 'Naive Bayes' in predictions.keys():
                    predictions['Naive Bayes'] = naive_bayes_model.predict_proba(
                        sklearn_vectorizer.transform(data['0']).toarray()
                    ).tolist()
                data['0'] = data['0'].apply(tokenize_text)
                # Run the Keras models in slices of batch_size rows and stitch the
                # per-batch probability arrays back together.
                if 'LSTM' in predictions.keys():
                    predictions['LSTM'] = np.concatenate([lstm_model(lstm_vectorizer(
                        data.iloc[(batch_size * i):min((batch_size * (i + 1)), len(data.index))])).numpy() for i in
                        range(1 + len(data.index) // batch_size)], axis=0).tolist()
                if 'Transformers' in predictions.keys():
                    predictions['Transformers'] = np.concatenate([transformers_model(transformers_vectorizer(
                        data.iloc[(batch_size * i):min((batch_size * (i + 1)), len(data.index))])).numpy() for i in
                        range(1 + len(data.index) // batch_size)],
                        axis=0).tolist()
                # logging.info('successfully ran prediction models')
                # Return per-model probabilities plus the list of visible GPUs for debugging.
                return {'predictions': predictions,
                        'message': tf.config.list_physical_devices('GPU')
                        }, 201
            except Exception as e:
                return {"error": str(e)}, 400
        else:
            return {"error": "Request must be JSON"}, 415
    else:
        return {'error': 'Wrong key'}, 401


app = func.WsgiFunctionApp(app=flask_app.wsgi_app,
                           http_auth_level=func.AuthLevel.ANONYMOUS)
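For reference, a hedged sketch of a client call that matches what run_ai() expects: json_data carries a DataFrame serialized with to_json() (hence the string row labels the handler re-orders), model_choice names one model or 'Vote' for all four, and the shared secret travels in the aikey header. The hostname, the example texts and the use of the requests library are illustrative assumptions, not part of this commit.

import os

import pandas as pd
import requests  # assumed client-side dependency

texts = ["อาหารอร่อยมาก", "บริการช้าเกินไป"]  # illustrative Thai sentences
payload = {
    # DataFrame.to_json() turns row labels into the strings '0', '1', ... and the
    # single column into '0', which is what the handler's data.loc re-ordering assumes.
    "json_data": pd.DataFrame(texts).to_json(),
    # One of 'Transformers', 'LSTM', 'Logistic Regression', 'Naive Bayes', or 'Vote'.
    "model_choice": "Vote",
}

# The route is registered as GET but still expects a JSON body and the 'aikey' header.
resp = requests.get(
    "https://<function-app>.azurewebsites.net/return_http",  # placeholder hostname
    json=payload,
    headers={"aikey": os.environ["AIKEY"]},
)
print(resp.status_code)            # 201 on success
print(resp.json()["predictions"])  # per-model lists of class probabilities

A successful response carries one list of class probabilities per requested model under 'predictions'.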
@@ -0,0 +1,26 @@
{
    "version": "2.0",
    "logging":
    {
        "applicationInsights":
        {
            "samplingSettings":
            {
                "isEnabled": true,
                "excludedTypes": "Request"
            }
        }
    },
    "extensionBundle":
    {
        "id": "Microsoft.Azure.Functions.ExtensionBundle",
        "version": "[2.*, 3.0.0)"
    },
    "extensions":
    {
        "http":
        {
            "routePrefix": ""
        }
    }
}
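One detail worth flagging in this file: the empty routePrefix removes Azure Functions' default /api prefix, so the Flask route defined above is served at the site root. A small illustration (the hostname is a placeholder):

# With the default host.json ("routePrefix": "api") the endpoint would be
#   https://<function-app>.azurewebsites.net/api/return_http
# With "routePrefix": "" as configured above it is served at
#   https://<function-app>.azurewebsites.net/return_http
BASE_URL = "https://<function-app>.azurewebsites.net"  # placeholder hostname
RETURN_HTTP_URL = f"{BASE_URL}/return_http"            # no '/api' segment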
@@ -0,0 +1,6 @@
pandas==2.0.3
numpy==1.24.4
pythainlp==4.0.2
scikit-learn==1.3.2
tensorflow==2.10.1
flask==2.2.5
@@ -0,0 +1,4 @@
[Keras SavedModel metadata (binary, summarized): a Keras 2.10.0 Sequential wrapping a single string-input TextVectorization layer with max_tokens 1024, no standardization, whitespace split, integer output mode and output_sequence_length 64, backed by a StringLookup table with an "[UNK]" OOV token. Most likely the metadata for the tf_vectorizer SavedModel, whose variables files appear just below.]
Binary file added (+1.75 KB): sentiment_analysis/models/tf_vectorizer/variables/variables.data-00000-of-00001
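The tf_vectorizer and tf_vectorizer_t directories are standalone TextVectorization SavedModels that the application loads with tf.keras.models.load_model. A minimal sketch of how such a model could be produced, assuming TensorFlow 2.10; the corpus and the adapt() call are illustrative assumptions, only the layer configuration comes from the metadata summarized above:

import tensorflow as tf

# Illustrative corpus only; the real vocabulary was adapted from training data not in this commit.
corpus = tf.data.Dataset.from_tensor_slices([
    "ตัวอย่าง ข้อความ ภาษา ไทย",
    "รีวิว สินค้า ดี มาก",
])

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=1024,             # matches "max_tokens": 1024 in the metadata
    standardize=None,            # text is already cleaned by filter_thai()/tokenize_text()
    split="whitespace",
    output_mode="int",
    output_sequence_length=64,   # 64 for tf_vectorizer; tf_vectorizer_t uses 128
)
vectorizer.adapt(corpus.batch(32))

# Wrap the layer in a Sequential with a string input so it can be saved as a
# SavedModel directory and reloaded the way the app does.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorizer,
])
model.save("sentiment_analysis/models/tf_vectorizer")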
@@ -0,0 +1,4 @@
[Keras SavedModel metadata (binary, summarized): identical in structure to the one above but with output_sequence_length 128, most likely belonging to the tf_vectorizer_t SavedModel loaded as transformers_vectorizer.]
Binary file added (+1.75 KB): sentiment_analysis/models/tf_vectorizer_t/variables/variables.data-00000-of-00001
Binary file added (+256 Bytes): sentiment_analysis/models/tf_vectorizer_t/variables/variables.index