Submission 2 #25

Open · wants to merge 14 commits into base: main
Changes from 11 commits
475 changes: 475 additions & 0 deletions .ipynb_checkpoints/first_notebook-checkpoint.ipynb

Large diffs are not rendered by default.

151 changes: 149 additions & 2 deletions baseline.py
@@ -1,5 +1,8 @@
import pandas as pd
import numpy as np
import sys
from joblib import load
import scipy.stats
import scipy.special

def softmax(x):
    # Compute the exponential values for each element in the input array
@@ -8,6 +11,12 @@ def softmax(x):
    # Compute the softmax values by dividing the exponential of each element by the sum of exponentials
    return exps / np.sum(exps)

def transform_array(arr, length):
    if len(arr) > length:
        return arr[:length]  # Truncate
    else:
        return np.pad(arr, (0, length - len(arr)), 'constant')  # Pad
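# Illustrative behaviour of transform_array (examples only, not part of the pipeline):
#   transform_array(np.array([1, 2, 3]), 5)          -> array([1, 2, 3, 0, 0])
#   transform_array(np.array([1, 2, 3, 4, 5, 6]), 4) -> array([1, 2, 3, 4])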


# Load the data from a Parquet file into a pandas DataFrame.
data_frame = pd.read_parquet(sys.argv[1])
@@ -23,12 +32,150 @@ def softmax(x):
# Find the maximum confidence value and append it to the list.
max_confidences.append(softmax_values.max())

# Load the pre-trained XGBoost model that will re-predict labels from the engineered features.
model = load('xgboost_model.joblib')

# Add a new column 'confidence' to the DataFrame using the list of maximum confidence values.
data_frame['confidence'] = max_confidences
# Preliminary prediction: the argmax class of each raw prediction vector (overwritten by the model further down).
data_frame['pred'] = [x.argmax() for x in data_frame['raw_prediction']]
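# Stack the raw prediction vectors into a single 2-D array (one vector per row) for feature engineering.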
X = np.stack(data_frame['raw_prediction'])


# Basic statistical features
mean_confidence = np.mean(X, axis=1)
std_confidence = np.std(X, axis=1)
max_confidence = np.max(X, axis=1)
min_confidence = np.min(X, axis=1)
sum_confidence = np.sum(X, axis=1)
median_confidence = np.median(X, axis=1)

# Additional percentiles
percentile_25 = np.percentile(X, 25, axis=1)
percentile_75 = np.percentile(X, 75, axis=1)
percentile_10 = np.percentile(X, 10, axis=1)
percentile_90 = np.percentile(X, 90, axis=1)

# Indices (positions) of the max, the min, and the value closest to the median
argmax_confidence = np.argmax(X, axis=1)
argmin_confidence = np.argmin(X, axis=1)
argmedian_confidence = np.argmin(np.abs(X - np.median(X, axis=1, keepdims=True)), axis=1)

# Skewness and kurtosis of each row
skew_confidence = scipy.stats.skew(X, axis=1)
kurtosis_confidence = scipy.stats.kurtosis(X, axis=1)

# Range (max - min)
range_confidence = max_confidence - min_confidence

# Mean Absolute Deviation (MAD)
mad_confidence = np.mean(np.abs(X - np.mean(X, axis=1, keepdims=True)), axis=1)
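# (This is the mean absolute deviation about the row mean, not the median-based MAD.)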



# Cumulative sum across each row, summarized by its mean
cumulative_sum_confidence = np.cumsum(X, axis=1).mean(axis=1)

# Mean difference between consecutive values in each row
difference_confidence = np.diff(X, axis=1).mean(axis=1)


# Softmax operation
softmax_confidence = scipy.special.softmax(X, axis=1)

# Take the five largest softmax values in each row (sorted ascending within the slice)
top5_sorted = np.sort(softmax_confidence, axis=1)[:, -5:]

# Features for each of the top 5 softmax values (top_1 is the largest)
top_1_softmax = top5_sorted[:, -1]
top_2_softmax = top5_sorted[:, -2]
top_3_softmax = top5_sorted[:, -3]
top_4_softmax = top5_sorted[:, -4]
top_5_softmax = top5_sorted[:, -5]


# Combine all features into a single 2D array
new_features = np.column_stack(
    (mean_confidence, std_confidence, max_confidence, min_confidence, sum_confidence,
     median_confidence, percentile_25, percentile_75, percentile_10, percentile_90,
     argmax_confidence, argmin_confidence, argmedian_confidence, skew_confidence, kurtosis_confidence,
     range_confidence, mad_confidence, cumulative_sum_confidence, difference_confidence,
     top_1_softmax, top_2_softmax, top_3_softmax, top_4_softmax, top_5_softmax))
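# new_features now has 24 columns: 19 row-wise distribution statistics plus the 5 largest softmax values.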


import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from textblob import TextBlob
import textstat
from gensim.models import Word2Vec
from collections import Counter

# Ensure the necessary NLTK data is downloaded
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
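# Depending on the NLTK version, additional resources (e.g. 'punkt_tab') may also be required by word_tokenize.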

def extract_features(df, text_column):
    # Helper functions for different features
    def length_based_features(text):
        chars = len(text)
        words = len(text.split())
        avg_word_length = chars / words if words else 0
        return [chars, words, avg_word_length]

    def pos_features(text):
        # Note: defined for experimentation; not used in the feature matrix built below.
        words = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(words)
        pos_counts = Counter(tag for word, tag in pos_tags)
        return list(pos_counts.values())

    def ner_features(text):
        words = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(words)
        named_ents = nltk.ne_chunk(pos_tags, binary=True)
        return len([chunk for chunk in named_ents if hasattr(chunk, 'label') and chunk.label() == 'NE'])

    def sentiment_score(text):
        return TextBlob(text).sentiment.polarity

    def readability_scores(text):
        flesch_reading = textstat.flesch_reading_ease(text)
        gunning_fog = textstat.gunning_fog(text)
        return [flesch_reading, gunning_fog]

    # Initialize lists to store each feature
    lengths, sentiments, readabilities, ners = [], [], [], []

    # Iterate through each text entry and extract features
    for text in df[text_column]:
        lengths.append(length_based_features(text))
        sentiments.append(sentiment_score(text))
        readabilities.append(readability_scores(text))
        ners.append(ner_features(text))

    # Convert lists to NumPy arrays
    lengths = np.array(lengths)
    sentiments = np.array(sentiments).reshape(-1, 1)
    readabilities = np.array(readabilities)
    ners = np.array(ners).reshape(-1, 1)

    # Concatenate all features into a single array
    features = np.concatenate([lengths, sentiments, readabilities, ners], axis=1)

    return features


text_features = extract_features(data_frame, 'text')

new_features = np.column_stack((new_features, text_features))
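# text_features contributes 7 columns (3 length-based, 1 sentiment, 2 readability, 1 named-entity count),
# giving new_features 31 columns in total.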


# Predict labels with the loaded XGBoost model using the engineered features.
data_frame['pred'] = model.predict(new_features)

#data_frame['pred'] = [x.argmax() for x in data_frame['raw_prediction']]

# Sort the DataFrame by predicted label in ascending order.
#sorted_data_frame = data_frame.sort_values(by='confidence', ascending=False)
sorted_data_frame = data_frame.sort_values(by='pred', ascending=True)

# Determine the number of top records to consider for computing mean distance.
top_records_count = int(0.1 * len(data_frame))