Skip to content

Commit

Permalink
Feat(models): Implemented three models for calculating similarity bet…
Browse files Browse the repository at this point in the history
…ween licenses
  • Loading branch information
Kaushl2208 committed Aug 11, 2020
1 parent 4064aaf commit e991696
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 2 deletions.
Empty file.
Binary file added atarashi/agents/models/binaryFiles/lr_model.pkl
Binary file not shown.
Binary file added atarashi/agents/models/binaryFiles/nb_model.pkl
Binary file not shown.
Binary file added atarashi/agents/models/binaryFiles/svc_model.pkl
Binary file not shown.
Binary file not shown.
65 changes: 65 additions & 0 deletions atarashi/agents/models/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import joblib
import argparse
from atarashi.libs.commentPreprocessor import CommentPreprocessor
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


def similarity_calc(filePath, model):
    '''
    Predict the license of a file with one of three pre-trained classifiers.

    The comments of the input file are extracted and preprocessed with
    CommentPreprocessor, vectorized with the persisted vectorizer and then
    classified by the model selected through ``model``.

    :param filePath: Input file path to scan
    :param model: Name of the classifier used for the prediction; one of
                  "lr_classifier", "nb_classifier" or "svc_classifier"
    :return: Single-element list holding a dict with the keys shortname,
             sim_score, sim_type and description
    :rtype: list(JSON Format)
    :raises ValueError: if ``model`` is not one of the supported names
    '''
    # Each model name maps to its own pickle. Previously every branch loaded
    # nb_model.pkl, so the LR and SVC options silently ran Naive Bayes.
    # NOTE: these paths are relative to the current working directory.
    model_files = {
        "lr_classifier": "atarashi/agents/models/binaryFiles/lr_model.pkl",
        "nb_classifier": "atarashi/agents/models/binaryFiles/nb_model.pkl",
        "svc_classifier": "atarashi/agents/models/binaryFiles/svc_model.pkl",
    }
    if model not in model_files:
        # Fail early and explicitly instead of hitting an unbound
        # ``license_name`` after all the extraction work has been done.
        raise ValueError(
            "Unknown model %r; expected one of %s"
            % (model, ", ".join(sorted(model_files)))
        )

    # extract() yields the path of a file with the extracted comments,
    # which is read back in full here.
    commentFile = CommentPreprocessor.extract(filePath)
    with open(commentFile) as file:
        doc = file.read()

    processed_comment = CommentPreprocessor.preprocess(doc)
    loaded_vect = joblib.load("atarashi/agents/models/binaryFiles/vectorizer.pkl")
    classifier = joblib.load(model_files[model])
    license_name = classifier.predict(loaded_vect.transform([processed_comment]))

    # The classifiers return a single label, so sim_score is a fixed 1
    # rather than a computed similarity.
    match = [{
        'shortname': license_name[0],
        'sim_score': 1,
        'sim_type': model,
        'description': "The predicted shortname is the similar license"
    }]

    return match





if __name__ == "__main__":
    # Command-line entry point: scan one file with the chosen classifier.
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument("inputFile", help="Specify the input file which needs to be scanned")
    parser.add_argument("-m", "--modelname", default="lr_classifier",
                        choices=["lr_classifier", "nb_classifier", "svc_classifier"],
                        help="Specify the model name")
    args = parser.parse_args()
    filename = args.inputFile
    model = args.modelname
    scanner = similarity_calc(filename, model)
    # The prediction used to be computed and then dropped, so the script
    # produced no output at all; emit it as JSON for the user.
    print(json.dumps(scanner, indent=2))
53 changes: 53 additions & 0 deletions atarashi/agents/models/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import pandas as pd
import re
import joblib
from atarashi.libs.commentPreprocessor import CommentPreprocessor
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB



def model_train():
    '''
    Train the three license classifiers and persist them together with
    the feature extractor.

    Reads atarashi/data/licenses/licenseList.csv, preprocesses the license
    texts with CommentPreprocessor.preprocess, turns them into TF-IDF
    features and fits a Multinomial Naive Bayes, a Logistic Regression and a
    Linear SVC model on them, using the license shortname as the label.
    The fitted models and the fitted feature extractor are written to
    atarashi/agents/models/binaryFiles/ (paths are relative to the current
    working directory).

    :return: None
    '''
    # Imported locally so the block does not depend on a new top-level import.
    from sklearn.pipeline import Pipeline

    data = pd.read_csv("atarashi/data/licenses/licenseList.csv")
    data.drop(['parent_shortname', 'report_shortname', 'url', 'notes', 'source', 'risk', 'fullname'],
              axis=1, inplace=True)
    data.dropna(inplace=True)
    data['text'] = data['text'].astype(str)
    data['cleaned'] = data['text'].apply(CommentPreprocessor.preprocess)

    X_train, y_train = data['cleaned'], data['shortname']

    # Counts -> TF-IDF is kept as ONE fitted object. Previously only the
    # CountVectorizer was saved, so prediction code fed raw counts to models
    # that had been trained on TF-IDF weights. Persisting the whole pipeline
    # under the same file name makes ``vectorizer.transform(...)`` at
    # prediction time produce exactly the features used here.
    vectorizer = Pipeline([
        ("counts", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
    ])
    X_train_tfidf = vectorizer.fit_transform(X_train)

    # Initialisation of the models
    naive_bayes = MultinomialNB()
    l_regress = LogisticRegression()
    svc_classifier = LinearSVC()

    print("Model training is going on")
    naive_bayes.fit(X_train_tfidf, y_train)
    l_regress.fit(X_train_tfidf, y_train)
    svc_classifier.fit(X_train_tfidf, y_train)

    print("Training completed")
    print("Saving the models into data folder....")
    joblib.dump(naive_bayes, "atarashi/agents/models/binaryFiles/nb_model.pkl")
    joblib.dump(l_regress, "atarashi/agents/models/binaryFiles/lr_model.pkl")
    joblib.dump(svc_classifier, "atarashi/agents/models/binaryFiles/svc_model.pkl")
    joblib.dump(vectorizer, "atarashi/agents/models/binaryFiles/vectorizer.pkl")
    print("Done")



# Running this module as a script retrains and re-saves all models.
if __name__ == "__main__":
    model_train()




6 changes: 4 additions & 2 deletions atarashi/atarashii.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from atarashi.agents.dameruLevenDist import DameruLevenDist
from atarashi.agents.tfidf import TFIDF
from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity

from atarashi.agents.models.test import similarity_calc
__author__ = "Aman Jain"
__email__ = "[email protected]"
__version__ = "0.0.10"
Expand Down Expand Up @@ -56,6 +56,8 @@ def atarashii_runner(inputFile, processedLicense, agent_name, similarity="Cosine
scanner = ""
if agent_name == "wordFrequencySimilarity":
scanner = WordFrequencySimilarity(processedLicense)
elif agent_name == "lr_classifier" or agent_name == "svc_classifier" or agent_name == "nb_classifier":
result = similarity_calc(inputFile,agent_name)
elif agent_name == "DLD":
scanner = DameruLevenDist(processedLicense)
elif agent_name == "tfidf":
Expand Down Expand Up @@ -96,7 +98,7 @@ def main():
parser.add_argument("-l", "--processedLicenseList", required=False,
help="Specify the location of processed license list file")
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity','lr_classifier','svc_classifier','nb_classifier' ,'DLD', 'tfidf', 'Ngram'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
Expand Down

0 comments on commit e991696

Please sign in to comment.