Yachay-AI · i-am-pluto · Nov 1, 2023 · Nov 1, 2023
diff --git a/baseline.py b/baseline.py
@@ -1,5 +1,8 @@
 import pandas as pd
 import numpy as np
+import pickle
+from sklearn.linear_model import LinearRegression
+import sys
 
 def softmax(x):
     # Compute the exponential values for each element in the input array
@@ -14,22 +17,49 @@ def softmax(x):
 
 # Initialize an empty list to store the maximum confidence values.
 max_confidences = []
-
+probabilities = []
 # Iterate over the DataFrame rows.
 for _, row in data_frame.iterrows():
     # Compute softmax for the 'raw_prediction' column of the current row.
     softmax_values = softmax(row['raw_prediction'])
-
+    probabilities.append(softmax_values)
     # Find the maximum confidence value and append it to the list.
     max_confidences.append(softmax_values.max())
 
 # Add a new column 'confidence' to the DataFrame using the list of maximum confidence values.
 data_frame['confidence'] = max_confidences
 data_frame['pred'] = [x.argmax() for x in data_frame['raw_prediction']]
+data_frame['probabilities'] = probabilities
+
+bins = [0, 0.1, 0.2, 0.3, 1.0]  # histogram bin edges applied to each row's probabilities
+probabilities = data_frame['probabilities'].values
+
+histograms = []
+for row in probabilities:
+    histograms.append(np.histogram(row, bins=bins)[0])  # [0] keeps the per-bin counts, drops the edges
+
+histograms = np.array(histograms)
+histograms = np.delete(histograms, 0, 1)  # drop column 0, i.e. the counts for the first bin (0 to 0.1)
+
+column_names = [f'{bins[i]}_{bins[i+1]}' for i in range(len(bins) - 1)]  # '0_0.1', '0.1_0.2', '0.2_0.3', '0.3_1.0'
+histograms = histograms.T  # one row per remaining bin, so histograms[k] holds that bin's count for every record
+
+data_frame[column_names[1]] = histograms[0]
+data_frame[column_names[2]] = histograms[1]
+data_frame[column_names[3]] = histograms[2]
+
+data_frame['peaks'] = data_frame['0.1_0.2'] + data_frame['0.2_0.3'] + data_frame['0.3_1.0']  # per-row count of entries in the 0.1 to 1.0 bins
+
+data_frame['text'] = data_frame['text'].str.replace(r'@[\w]+', '', regex=True)  # strip @mentions; regex=True is explicit because pandas >= 2.0 defaults to literal matching
+data_frame['text'] = data_frame['text'].str.replace(r'https?://\S+|www\.\S+', '', regex=True)  # strip URLs; same explicit regex flag as above
+
+data_frame['text_length'] = data_frame['text'].str.len()
 
-# Sort the DataFrame by 'confidence' in descending order.
-sorted_data_frame = data_frame.sort_values(by='confidence', ascending=False)
+with open('trained_model.pkl', 'rb') as f:
+    trained_model = pickle.load(f)  # NOTE(review): unpickling can execute arbitrary code; load only a trusted model file
+data_frame['score'] = trained_model.predict(data_frame[['confidence', 'peaks','text_length']])  # one predicted score per row from the three feature columns
 
+sorted_data_frame = data_frame.sort_values(by=['score'], ascending=False)
 # Determine the number of top records to consider for computing mean distance.
 top_records_count = int(0.1 * len(data_frame))
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,27 @@
+certifi==2023.7.22
+cffi==1.16.0
+charset-normalizer==3.3.2
+cramjam==2.7.0
+cryptography==41.0.5
+Deprecated==1.2.14
+fastparquet==2023.10.1
+fsspec==2023.10.0
+idna==3.4
+joblib==1.3.2
+numpy==1.26.1
+packaging==23.2
+pandas==1.5.3
+pyarrow==13.0.0
+pycparser==2.21
 PyGithub==1.58.1
+PyJWT==2.8.0
+PyNaCl==1.5.0
+python-dateutil==2.8.2
+pytz==2023.3.post1
 requests==2.30.0
-pandas==1.5.3
-scikit-learn==1.2.2
+scikit-learn==1.2.2
+scipy==1.11.3
+six==1.16.0
+threadpoolctl==3.2.0
+urllib3==2.0.7
+wrapt==1.15.0