|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +""" |
| 5 | +Copyright 2018 Kaushlendra Pratap ([email protected]) |
| 6 | +
|
| 7 | +SPDX-License-Identifier: GPL-2.0 |
| 8 | +
|
| 9 | +This program is free software; you can redistribute it and/or |
| 10 | +modify it under the terms of the GNU General Public License |
| 11 | +version 2 as published by the Free Software Foundation. |
| 12 | +This program is distributed in the hope that it will be useful, |
| 13 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | +GNU General Public License for more details. |
| 16 | +
|
| 17 | +You should have received a copy of the GNU General Public License along |
| 18 | +with this program; if not, write to the Free Software Foundation, Inc., |
| 19 | +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
| 20 | +""" |
| 21 | + |
| 22 | +import joblib |
| 23 | +import os |
| 24 | +import argparse |
| 25 | +from atarashi.agents.atarashiAgent import AtarashiAgent |
| 26 | +from atarashi.libs.initialmatch import spdx_identifer |
| 27 | + |
| 28 | + |
class Model(AtarashiAgent):
    """
    License classifier agent backed by pre-trained, serialized models.

    Inherits from AtarashiAgent in order to follow the same interface as the
    other agents. AtarashiAgent is defined outside this file; the notes on
    the inherited methods below are assumptions to be confirmed against it.

    :Inherits: AtarashiAgent
    :Inherited_Method_1(__init__): Parent constructor, called with the licenseList.
    :Inherited_Method_2(loadFile): Presumably extracts the license text from the
      source file and returns a pre-processed comment text.

    :Derived Class: Model
    :Method_1(__init__): Stores the absolute path of the models directory.
    :Method_2(similarity_calc): Predicts the license name for a processed comment.
    :Method_3(model_predict): Returns a list of match dictionaries for a file.
    :Method_4(getSimAlgo): Getter for the selected algorithm.
    :Method_5(setSimAlgo): Setter for the algorithm to use.
    :Method_6(scan): Entry point; runs the prediction only when a supported
      algorithm has been selected, otherwise returns -1.
    """

    lr_classifier = "lr_classifier"
    nb_classifier = "nb_classifier"
    svc_classifier = "svc_classifier"

    # Algorithms accepted by setSimAlgo() and scan().
    _SUPPORTED_ALGOS = (lr_classifier, nb_classifier, svc_classifier)

    # File name (inside the models folder) of the serialized classifier
    # belonging to each supported algorithm.
    _MODEL_FILES = {
        lr_classifier: 'lr_model.pkl',
        nb_classifier: 'nb_model.pkl',
        svc_classifier: 'svc_model.pkl',
    }

    # Class-level fallback: getSimAlgo() and scan() stay usable (instead of
    # raising AttributeError) when setSimAlgo() was never called or was
    # called with an unsupported name.
    algo = None

    def __init__(self, licenseList, modelsLoc):
        """
        :param licenseList: License list handed to the AtarashiAgent constructor.
        :param modelsLoc: Path of the folder holding 'vectorizer.pkl' and the
          '*_model.pkl' files; stored as an absolute path.
        """
        super().__init__(licenseList)
        self.models_folder = os.path.abspath(modelsLoc)

    def similarity_calc(self, processed_comment):
        """
        Predict the license for a pre-processed comment with the selected model.

        The vectorizer and the classifier of the selected algorithm are loaded
        from the models folder on every call.

        :param processed_comment: Pre-processed string derived from the
          extracted license text.
        :return: Output of the classifier's ``predict`` for the single input
          (array-like with one predicted license name - TODO confirm type).
        :raises ValueError: If no supported algorithm has been selected.
        """
        selected_algo = self.getSimAlgo()
        model_file = self._MODEL_FILES.get(selected_algo)
        if model_file is None:
            # Fail with a clear message instead of hitting an unbound
            # 'classifier' variable further down.
            raise ValueError(
                "Unsupported or unset model algorithm: {!r}".format(selected_algo)
            )

        # NOTE(review): joblib.load unpickles data; the models folder must
        # only contain trusted files.
        loaded_vect = joblib.load(os.path.join(self.models_folder, 'vectorizer.pkl'))
        classifier = joblib.load(os.path.join(self.models_folder, model_file))

        return classifier.predict(loaded_vect.transform([processed_comment]))

    def model_predict(self, filePath):
        """
        Build the match list for a file.

        The raw file content is first searched for SPDX identifiers, then the
        pre-processed comment text of the file is classified by the selected
        pre-trained model and the prediction is appended to the matches.

        :param filePath: Input file path to scan
        :return: Matches with license shortname, sim_score, sim_type and description
        :rtype: list of dict
        """
        match = []

        with open(filePath) as file:
            raw_data = file.read()

        # Match SPDX identifiers
        spdx_identifiers = spdx_identifer(raw_data, self.licenseList['shortname'])
        match.extend(spdx_identifiers)

        processed_comment = super().loadFile(filePath)
        license_name = self.similarity_calc(processed_comment)

        match.append({
            'shortname': str(license_name[0]),
            # The score is the fixed value 1 for every model prediction.
            'sim_score': 1,
            'sim_type': self.getSimAlgo(),
            'description': "Shortname: is the predicted license by the model"
        })
        return match

    def getSimAlgo(self):
        """Return the selected algorithm name, or None when none has been set."""
        return self.algo

    def setSimAlgo(self, newAlgo):
        """
        Select the algorithm to use.

        Unsupported names are ignored and leave the current selection unchanged.

        :param newAlgo: One of lr_classifier, nb_classifier or svc_classifier.
        """
        if newAlgo in Model._SUPPORTED_ALGOS:
            self.algo = newAlgo

    def scan(self, filePath):
        """
        Scan a file with the selected model.

        :param filePath: Input file path to scan
        :return: The match list of model_predict(), or -1 when no supported
          algorithm has been selected.
        """
        if self.algo in Model._SUPPORTED_ALGOS:
            return self.model_predict(filePath)
        return -1
| 129 | + |
| 130 | + |
def main():
    """
    Command-line entry point.

    Parses the arguments, scans the input file with the chosen model and
    prints the scan result, so that running the script produces visible
    output instead of discarding the result.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("processedLicenseList", help="Specify the processed license list file")
    parser.add_argument("modelFolder", help="Specify the location of folder with models")
    parser.add_argument("inputFile", help="Specify the input file which needs to be scanned")
    parser.add_argument("-m", "--modelname", default="lr_classifier",
                        choices=["lr_classifier", "nb_classifier", "svc_classifier"],
                        help="Specify the model name")
    args = parser.parse_args()

    scanner = Model(args.processedLicenseList, args.modelFolder)
    scanner.setSimAlgo(args.modelname)
    print(scanner.scan(args.inputFile))


if __name__ == "__main__":
    main()
0 commit comments