
Commit f214fea

Feat(models): Implemented three models for calculating similarity between licenses
#New changes update
1 parent 23877ee commit f214fea

14 files changed: +293 −7 lines

.travis.yml (+4)

@@ -9,6 +9,7 @@ python:
   - 3.5
   - 3.6
   - 3.7
+  - 3.8
 
 before_install:
   - pip install --upgrade pip
@@ -23,3 +24,6 @@ script:
   - atarashi -a tfidf -s CosineSim ./atarashi/atarashii.py
   - atarashi -a DLD ./atarashi/atarashii.py
   - atarashi -a wordFrequencySimilarity ./atarashi/atarashii.py
+  - atarashi -a lr_classifier ./atarashi/atarashii.py
+  - atarashi -a svc_classifier ./atarashi/atarashii.py
+  - atarashi -a nb_classifier ./atarashi/atarashii.py

MANIFEST.in (+1)

@@ -11,6 +11,7 @@ include requirements*.*
 include pyproject.toml
 include atarashi/data/licenses/processedLicenses.csv
 include atarashi/data/Ngram_keywords.json
+include atarashi/data/models/*
 
 prune .git
 prune venv

README.md (+16)

@@ -81,6 +81,22 @@ Get the help by running `atarashi -h` or `atarashi --help`
 - With **Bigram Cosine similarity**
 
   `atarashi -a Ngram -s BigramCosineSim /path/to/file.c`
+- **Classification models**
+  - **Training** (optional)
+
+    `python3 train.py`
+  - Running **Classification Models**
+
+    - **Logistic Regression**
+
+      `atarashi -a lr_classifier -m /path/to/models/folder/ /path/to/file.c`
+    - **Multinomial Naive Bayes**
+
+      `atarashi -a nb_classifier -m /path/to/models/folder/ /path/to/file.c`
+    - **Linear SVC**
+
+      `atarashi -a svc_classifier -m /path/to/models/folder/ /path/to/file.c`
 - Running in **verbose** mode
 
   `atarashi -a DLD -v /path/to/file.c`
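Note: whichever classifier is chosen, the agent returns the same JSON-style result list as the other atarashi agents. Based on Model.model_predict below, the entry appended by the classifier looks roughly like this (the shortname value is only an illustrative example, not real output):

    [
      {
        "shortname": "MIT",              # license predicted by the model (illustrative)
        "sim_score": 1,                  # classifier agents always report 1
        "sim_type": "lr_classifier",     # the agent that produced the match
        "description": "Shortname is the license predicted by the model"
      }
    ]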

atarashi/agents/models/__init__.py (+4, new file)

@@ -0,0 +1,4 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from .test import Model as Model

atarashi/agents/models/test.py (+147, new file)

@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 Kaushlendra Pratap ([email protected])
+
+SPDX-License-Identifier: GPL-2.0
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+version 2 as published by the Free Software Foundation.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+"""
+
+import joblib
+import os
+import argparse
+from atarashi.agents.atarashiAgent import AtarashiAgent
+from atarashi.libs.initialmatch import spdx_identifer
+
+
+class Model(AtarashiAgent):
+  '''
+  Class Model inherits AtarashiAgent in order to follow the same linear interface.
+  A few methods of the parent class are reused in Model.
+
+  :Inherits: AtarashiAgent
+  :Inherited_Method_1(__init__): Parent class constructor to verify the provided licenseList
+  :Inherited_Method_2(loadFile): Extracts the license text from the source code and returns a pre-processed comment text
+
+  :Derived Class: Model
+  :Method_1(__init__): Initialises the absolute path of the models directory
+  :Method_2(similarity_calc): Classifies the license name from the input processed comment
+  :Method_3(model_predict): Returns a list containing the respective metadata
+  :Method_4(getSimAlgo): Getter method for the selected algorithm
+  :Method_5(setSimAlgo): Setter method for assigning the algorithm to use
+  :Method_6(scan): Control method which dispatches to model_predict once a valid algorithm is set
+  '''
+
+  lr_classifier = "lr_classifier"
+  nb_classifier = "nb_classifier"
+  svc_classifier = "svc_classifier"
+
+  def __init__(self, licenseList, modelsLoc):
+    super().__init__(licenseList)
+    self.models_folder = os.path.abspath(modelsLoc)
+
+  def similarity_calc(self, processed_comment):
+    '''
+    Runs the prediction with the specific model selected by the user. Loading of
+    the three model binaries is handled here.
+
+    :param processed_comment: Pre-processed string derived from the extracted license text
+    :return: A list containing the license name predicted by the selected model
+    :rtype: list()
+    '''
+    with open(os.path.join(self.models_folder, 'vectorizer.pkl'), 'rb') as f:
+      loaded_vect = joblib.load(f)
+
+    if self.getSimAlgo() == self.lr_classifier:
+      classifier = joblib.load(os.path.join(self.models_folder, 'lr_model.pkl'))
+    elif self.getSimAlgo() == self.nb_classifier:
+      classifier = joblib.load(os.path.join(self.models_folder, 'nb_model.pkl'))
+    elif self.getSimAlgo() == self.svc_classifier:
+      classifier = joblib.load(os.path.join(self.models_folder, 'svc_model.pkl'))
+
+    return classifier.predict(loaded_vect.transform([processed_comment]))
+
+  def model_predict(self, filePath):
+    '''
+    Returns the most similar license predicted for the file provided by the user.
+    Three different model approaches are available and can yield different results.
+    The comments are extracted from the file and the prediction is made with the
+    pre-trained models in the data folder.
+
+    :param filePath: Input file path to scan
+    :return: Result with license shortname, sim_score, sim_type and description
+    :rtype: list(JSON Format)
+    '''
+    match = []
+
+    with open(filePath) as file:
+      raw_data = file.read()
+
+    # Match SPDX identifiers
+    spdx_identifiers = spdx_identifer(raw_data, self.licenseList['shortname'])
+    match.extend(spdx_identifiers)
+
+    processed_comment = super().loadFile(filePath)
+    license_name = self.similarity_calc(processed_comment)
+
+    match.append({
+      'shortname': str(license_name[0]),
+      'sim_score': 1,
+      'sim_type': self.getSimAlgo(),
+      'description': "Shortname is the license predicted by the model"
+    })
+    return match
+
+  def getSimAlgo(self):
+    return self.algo
+
+  def setSimAlgo(self, newAlgo):
+    if newAlgo in (Model.lr_classifier, Model.nb_classifier, Model.svc_classifier):
+      self.algo = newAlgo
+
+  def scan(self, filePath):
+    if self.algo in (Model.lr_classifier, Model.nb_classifier, Model.svc_classifier):
+      return self.model_predict(filePath)
+    else:
+      return -1
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser()
+  parser.add_argument("processedLicenseList", help="Specify the processed license list file")
+  parser.add_argument("modelFolder", help="Specify the location of the folder with models")
+  parser.add_argument("inputFile", help="Specify the input file which needs to be scanned")
+  parser.add_argument("-m", "--modelname", default="lr_classifier",
+                      choices=["lr_classifier", "nb_classifier", "svc_classifier"],
+                      help="Specify the model name")
+  args = parser.parse_args()
+
+  licenseList = args.processedLicenseList
+  filename = args.inputFile
+  model = args.modelname
+  modelFolder = args.modelFolder
+
+  scanner = Model(licenseList, modelFolder)
+  scanner.setSimAlgo(model)
+  scanner.scan(filename)
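For reference, a minimal sketch of driving the Model agent from Python rather than the CLI, assuming the packaged processed license list and models folder (both paths below are illustrative):

    from atarashi.agents.models import Model

    # Illustrative paths: substitute your own processed license list and models folder
    scanner = Model("atarashi/data/licenses/processedLicenses.csv",
                    "atarashi/data/models/")
    scanner.setSimAlgo("nb_classifier")   # or "lr_classifier" / "svc_classifier"
    matches = scanner.scan("/path/to/file.c")
    print(matches)  # SPDX matches plus the model's predicted shortname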

atarashi/agents/models/train.py (+89, new file)

@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 Kaushlendra Pratap ([email protected])
+
+SPDX-License-Identifier: GPL-2.0
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+version 2 as published by the Free Software Foundation.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+"""
+
+import pandas as pd
+import os
+import joblib
+from atarashi.libs.commentPreprocessor import CommentPreprocessor
+from sklearn.svm import LinearSVC
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import MultinomialNB
+
+
+def model_train():
+  '''
+  Loads the license dataset into a pandas DataFrame, applies the pre-defined
+  preprocessing and builds a vocabulary over the license texts. All three
+  models are then initialised and trained on the dataset, and the resulting
+  binaries are stored in the models folder for quick classification later.
+  '''
+  current_dir = os.path.dirname(os.path.abspath(__file__))
+  data_dir = os.path.abspath(os.path.join(current_dir, os.path.join(os.pardir, os.pardir)))
+
+  licensepath = os.path.join(data_dir, "data/licenses/licenseList.csv")
+  binary1 = os.path.join(data_dir, 'data/models/lr_model.pkl')
+  binary2 = os.path.join(data_dir, 'data/models/nb_model.pkl')
+  binary3 = os.path.join(data_dir, 'data/models/svc_model.pkl')
+  binary4 = os.path.join(data_dir, 'data/models/vectorizer.pkl')
+
+  data = pd.read_csv(licensepath)
+  data.drop(['parent_shortname', 'report_shortname', 'url', 'notes', 'source', 'risk', 'fullname'], axis=1, inplace=True)
+  data.dropna(inplace=True)
+  data['text'] = data['text'].astype(str)
+  data['cleaned'] = data['text'].apply(CommentPreprocessor.preprocess)
+
+  X_train, y_train = data['cleaned'], data['shortname']
+  count_vect = CountVectorizer()
+  X_train_counts = count_vect.fit_transform(X_train)
+  tfidf_transformer = TfidfTransformer()
+  X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
+
+  # Initialisation of the models
+  naive_bayes = MultinomialNB()
+  l_regress = LogisticRegression()
+  svc_classifier = LinearSVC()
+
+  print("Model training is going on")
+  naive_bayes.fit(X_train_tfidf, y_train)
+  print("First training completed")
+  l_regress.fit(X_train_tfidf, y_train)
+  print("Second training completed")
+  svc_classifier.fit(X_train_tfidf, y_train)
+  print("Third training completed")
+
+  print("All the models have been trained successfully!")
+  print("Saving the models into the data folder....")
+  joblib.dump(naive_bayes, binary2)
+  joblib.dump(l_regress, binary1)
+  joblib.dump(svc_classifier, binary3)
+  joblib.dump(count_vect, binary4)
+  print("Done")
+
+
+if __name__ == "__main__":
+  model_train()
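As a quick sanity check after running train.py, the dumped binaries can be reloaded with joblib and applied to a raw string, mirroring what Model.similarity_calc does (the directory and sample text below are illustrative):

    import os
    import joblib

    models_dir = "atarashi/data/models"  # illustrative location of the dumped binaries

    # Reload the fitted vectorizer and one of the trained classifiers
    vectorizer = joblib.load(os.path.join(models_dir, "vectorizer.pkl"))
    classifier = joblib.load(os.path.join(models_dir, "lr_model.pkl"))

    sample = "permission is hereby granted, free of charge, to any person"
    print(classifier.predict(vectorizer.transform([sample])))  # predicted shortname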

atarashi/atarashii.py (+18 −3)

@@ -27,19 +27,23 @@
 from atarashi.agents.dameruLevenDist import DameruLevenDist
 from atarashi.agents.tfidf import TFIDF
 from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
+from atarashi.agents.models import Model
 
 __author__ = "Aman Jain"
 __email__ = "[email protected]"
 __version__ = "0.0.10"
 
 
-def atarashii_runner(inputFile, processedLicense, agent_name, similarity="CosineSim", ngramJsonLoc=None, verbose=None):
+def atarashii_runner(inputFile, processedLicense, agent_name,
+                     similarity="CosineSim", ngramJsonLoc=None, modelsLoc=None,
+                     verbose=None):
   '''
   :param inputFile: Input File for scanning of license
   :param processedLicense: Processed License List (CSV) path (Default path already provided)
   :param agent_name: Specify the agent that you want to use for scanning
   :param similarity: Specify the similarity type to be used for the particular agent
   :param ngramJsonLoc: Specify N-Gram Json File location
+  :param modelsLoc: Specify folder location of trained models
   :param verbose: Specify if verbose mode is on or not (Default is Off/ None)
   :return: Returns the array of JSON with scan results
 
@@ -56,6 +60,9 @@ def atarashii_runner(inputFile, processedLicense, agent_name, similarity="Cosine
   scanner = ""
   if agent_name == "wordFrequencySimilarity":
     scanner = WordFrequencySimilarity(processedLicense)
+  elif agent_name in ("lr_classifier", "svc_classifier", "nb_classifier"):
+    scanner = Model(processedLicense, modelsLoc)
+    scanner.setSimAlgo(agent_name)
   elif agent_name == "DLD":
     scanner = DameruLevenDist(processedLicense)
   elif agent_name == "tfidf":
@@ -91,19 +98,23 @@ def main():
   '''
   defaultProcessed = resource_filename("atarashi", "data/licenses/processedLicenses.csv")
   defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
+  defaultModels = os.path.dirname(resource_filename("atarashi", "data/models/vectorizer.pkl"))
   parser = argparse.ArgumentParser()
   parser.add_argument("inputFile", help="Specify the input file path to scan")
   parser.add_argument("-l", "--processedLicenseList", required=False,
                       help="Specify the location of processed license list file")
   parser.add_argument("-a", "--agent_name", required=True,
-                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
+                      choices=['wordFrequencySimilarity', 'lr_classifier', 'svc_classifier', 'nb_classifier', 'DLD', 'tfidf', 'Ngram'],
                       help="Name of the agent that needs to be run")
   parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
                       choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
                       help="Specify the similarity algorithm that you want."
                       " First 2 are for TFIDF and last 3 are for Ngram")
   parser.add_argument("-j", "--ngram_json", required=False,
                       help="Specify the location of Ngram JSON (for Ngram agent only)")
+  parser.add_argument("-m", "--models", required=False,
+                      help="Specify the location of models folder (for "
+                      "classifier agents only)", default=defaultModels)
   parser.add_argument("-v", "--verbose", help="increase output verbosity",
                       action="count", default=0)
   parser.add_argument('-V', '--version', action='version', version='%(prog)s ' + __version__)
@@ -114,13 +125,17 @@ def main():
   verbose = args.verbose
   processedLicense = args.processedLicenseList
   ngram_json = args.ngram_json
+  models = args.models
 
   if processedLicense is None:
     processedLicense = defaultProcessed
   if ngram_json is None:
     ngram_json = defaultJSON
+  if models is None:
+    models = defaultModels
 
-  result = atarashii_runner(inputFile, processedLicense, agent_name, similarity, ngram_json, verbose)
+  result = atarashii_runner(inputFile, processedLicense, agent_name, similarity,
+                            ngram_json, models, verbose)
   if agent_name == "wordFrequencySimilarity":
     result = [{
       "shortname": str(result),

atarashi/data/models/lr_model.pkl

22.3 MB
Binary file not shown.

atarashi/data/models/nb_model.pkl

44.5 MB
Binary file not shown.

atarashi/data/models/svc_model.pkl

22.3 MB
Binary file not shown.

atarashi/data/models/vectorizer.pkl

153 KB
Binary file not shown.

atarashi/evaluator/evaluator.py (+8 −2)

@@ -51,6 +51,12 @@ def getCommand(agent_name, similarity):
     command = "atarashi -a wordFrequencySimilarity"
   elif agent_name == "DLD":
     command = "atarashi -a DLD"
+  elif agent_name == "lr_classifier":
+    command = "atarashi -a lr_classifier"
+  elif agent_name == "nb_classifier":
+    command = "atarashi -a nb_classifier"
+  elif agent_name == "svc_classifier":
+    command = "atarashi -a svc_classifier"
   elif agent_name == "tfidf":
     command = "atarashi -a tfidf"
     if similarity == "CosineSim":
@@ -129,9 +135,9 @@ evaluate(command):
 if __name__ == "__main__":
   parser = argparse.ArgumentParser()
   parser.add_argument("-a", "--agent_name", required=True,
-                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'], help="Name of the agent that you want to evaluate")
+                      choices=['wordFrequencySimilarity', 'DLD', 'lr_classifier', 'svc_classifier', 'nb_classifier', 'tfidf', 'Ngram'], help="Name of the agent that you want to evaluate")
   parser.add_argument("-s", "--similarity", required=False,
-                      default=" ", choices=["ScoreSim", "CosineSim", "DiceSim", " ", "BigramCosineSim"], help="Specify the similarity algorithm that you want to evaluate"
+                      default=" ", choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], help="Specify the similarity algorithm that you want to evaluate"
                       " First 2 are for TFIDF and last 3 are for Ngram")
   args = parser.parse_args()
   agent_name = args.agent_name
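With these choices in place, the new agents should be benchmarkable in the same way as the existing ones, e.g. `python3 evaluator.py -a lr_classifier` run from the atarashi/evaluator directory; the -s flag is not needed for the classifier agents.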
