Commit
Feat(models): Implemented three models for calculating similarity between licenses
1 parent 4064aaf · commit e991696
Showing 8 changed files with 122 additions and 2 deletions.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,65 @@
import argparse

import joblib
from atarashi.libs.commentPreprocessor import CommentPreprocessor
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


def similarity_calc(filePath, model):
  '''
  Predict the license most similar to the file provided by the user. Three
  different model approaches are available, and each can produce a different
  result. The comments are extracted from the input file and the prediction
  is made with the pre-trained models stored in the data folder.

  :param filePath: Input file path to scan
  :param model: Name of the model used for the prediction
  :return: Result with license shortname, sim_score, sim_type and description
  :rtype: list(JSON Format)
  '''
  # Extract and preprocess the comments from the input file
  commentFile = CommentPreprocessor.extract(filePath)
  with open(commentFile) as file:
    doc = file.read()

  match = []
  processed_comment = CommentPreprocessor.preprocess(doc)

  # Load the vectorizer fitted at training time, then the requested classifier
  loaded_vect = joblib.load("atarashi/agents/models/binaryFiles/vectorizer.pkl")
  if model == "lr_classifier":
    classifier = joblib.load("atarashi/agents/models/binaryFiles/lr_model.pkl")
  elif model == "nb_classifier":
    classifier = joblib.load("atarashi/agents/models/binaryFiles/nb_model.pkl")
  elif model == "svc_classifier":
    classifier = joblib.load("atarashi/agents/models/binaryFiles/svc_model.pkl")

  license_name = classifier.predict(loaded_vect.transform([processed_comment]))

  match.append({
    'shortname': license_name[0],
    'sim_score': 1,
    'sim_type': model,
    'description': "The predicted shortname is the most similar license"
  })

  return match


if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("inputFile", help="Specify the input file which needs to be scanned")
  parser.add_argument("-m", "--modelname", default="lr_classifier",
                      choices=["lr_classifier", "nb_classifier", "svc_classifier"],
                      help="Specify the model name")
  args = parser.parse_args()
  filename = args.inputFile
  model = args.modelname
  scanner = similarity_calc(filename, model)
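For reference, a minimal sketch of calling the new helper directly, assuming the file lives at atarashi/agents/models/test.py (the module path used by the import added to atarashii.py below) and the interpreter runs from the repository root so the relative .pkl paths resolve; the input file path is a placeholder:

from atarashi.agents.models.test import similarity_calc

# "path/to/source_file.c" is a placeholder for any file containing license comments.
result = similarity_calc("path/to/source_file.c", "nb_classifier")
print(result)
# Expected shape: [{'shortname': ..., 'sim_score': 1, 'sim_type': 'nb_classifier', 'description': ...}]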
@@ -0,0 +1,53 @@
import pandas as pd
import re
import joblib
from atarashi.libs.commentPreprocessor import CommentPreprocessor
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


def model_train():
  # Load the license list and keep only the shortname and license text columns
  data = pd.read_csv("atarashi/data/licenses/licenseList.csv")
  data.drop(['parent_shortname', 'report_shortname', 'url', 'notes', 'source', 'risk', 'fullname'],
            axis=1, inplace=True)
  data.dropna(inplace=True)
  data['text'] = data['text'].astype(str)
  data['cleaned'] = data['text'].apply(CommentPreprocessor.preprocess)

  # Bag-of-words counts followed by TF-IDF weighting over the cleaned license texts
  X_train, y_train = data['cleaned'], data['shortname']
  count_vect = CountVectorizer()
  X_train_counts = count_vect.fit_transform(X_train)
  tfidf_transformer = TfidfTransformer()
  X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

  # Initialise the three models
  naive_bayes = MultinomialNB()
  l_regress = LogisticRegression()
  svc_classifier = LinearSVC()

  print("Model training is going on")
  naive_bayes.fit(X_train_tfidf, y_train)
  l_regress.fit(X_train_tfidf, y_train)
  svc_classifier.fit(X_train_tfidf, y_train)

  print("Training completed")
  print("Saving the models into data folder....")
  joblib.dump(naive_bayes, "atarashi/agents/models/binaryFiles/nb_model.pkl")
  joblib.dump(l_regress, "atarashi/agents/models/binaryFiles/lr_model.pkl")
  joblib.dump(svc_classifier, "atarashi/agents/models/binaryFiles/svc_model.pkl")
  joblib.dump(count_vect, "atarashi/agents/models/binaryFiles/vectorizer.pkl")
  print("Done")


if __name__ == "__main__":
  model_train()
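Since the commit also checks in the pickled artifacts (the four binary files listed above), a quick sanity check that they load back is possible without retraining. A minimal sketch, assuming it is run from the repository root with joblib and a compatible scikit-learn version installed:

import joblib

# Paths match the joblib.dump calls in the training script above.
vect = joblib.load("atarashi/agents/models/binaryFiles/vectorizer.pkl")
print("vocabulary size:", len(vect.vocabulary_))
for name in ("nb_model.pkl", "lr_model.pkl", "svc_model.pkl"):
  clf = joblib.load("atarashi/agents/models/binaryFiles/" + name)
  print(name, type(clf).__name__, "license classes:", len(clf.classes_))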
@@ -27,7 +27,7 @@
 from atarashi.agents.dameruLevenDist import DameruLevenDist
 from atarashi.agents.tfidf import TFIDF
 from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
-
+from atarashi.agents.models.test import similarity_calc
 __author__ = "Aman Jain"
 __email__ = "[email protected]"
 __version__ = "0.0.10"
@@ -56,6 +56,8 @@ def atarashii_runner(inputFile, processedLicense, agent_name, similarity="Cosine
   scanner = ""
   if agent_name == "wordFrequencySimilarity":
     scanner = WordFrequencySimilarity(processedLicense)
+  elif agent_name == "lr_classifier" or agent_name == "svc_classifier" or agent_name == "nb_classifier":
+    result = similarity_calc(inputFile, agent_name)
   elif agent_name == "DLD":
     scanner = DameruLevenDist(processedLicense)
   elif agent_name == "tfidf":
@@ -96,7 +98,7 @@ def main():
   parser.add_argument("-l", "--processedLicenseList", required=False,
                       help="Specify the location of processed license list file")
   parser.add_argument("-a", "--agent_name", required=True,
-                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
+                      choices=['wordFrequencySimilarity', 'lr_classifier', 'svc_classifier', 'nb_classifier', 'DLD', 'tfidf', 'Ngram'],
                       help="Name of the agent that needs to be run")
   parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
                       choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
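With the runner and the argparse choices updated, the new classifiers are selected like any other agent. A hedged sketch of invoking the CLI programmatically, assuming the entry point is atarashi/atarashii.py (its file name is not shown in this diff), the command runs from the repository root, and the input path is a placeholder; how the runner uses the classifier result downstream of this hunk is also outside the diff:

import subprocess
import sys

# Equivalent to: python atarashi/atarashii.py -a nb_classifier path/to/source_file.c
subprocess.run(
    [sys.executable, "atarashi/atarashii.py", "-a", "nb_classifier", "path/to/source_file.c"],
    check=True,
)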