clean up some code
tobhey committed Sep 21, 2023
1 parent 6ca3f82 commit 02682a0
Showing 25 changed files with 58 additions and 80 deletions.
4 changes: 1 addition & 3 deletions TraceabilityRunner.py
@@ -22,12 +22,10 @@
from precalculating.TraceLinkDataStructure import ElementLevelTraceLinkDataStructure, FileLevelTraceLinkDataStructure
from precalculating.TraceLinkDataStructureFactory import ElementLevelTraceLinkDataStructureFactory, \
FileLevelTraceLinkDataStructureFactory
from preprocessing.CodeASTTokenizer import JavaCodeASTTokenizer, CCodeASTTokenizer
from preprocessing.Preprocessor import CamelCaseSplitter, LowerCaseTransformer, \
NonLetterFilter, UrlRemover, Separator, JavaCodeStopWordRemover, \
StopWordRemover, Lemmatizer, WordLengthFilter, Preprocessor, POSFilter
from preprocessing.Tokenizer import JavaDocDescriptionOnlyTokenizer, \
WordAndSentenceTokenizer, UCTokenizer, WordTokenizer, NameAndDescriptionTokenizer
from preprocessing.Tokenizer import WordAndSentenceTokenizer, UCTokenizer, NameAndDescriptionTokenizer
from traceLinkProcessing.ElementFilter import ElementFilter
from traceLinkProcessing.NeighborHandler import NeighborStrategy
from traceLinkProcessing.SimilarityFilter import SimilarityFilter
3 changes: 2 additions & 1 deletion embeddingCreator/CodeEmbeddingCreator.py
@@ -63,7 +63,8 @@ def _handle_no_method_vectors_case(self, classifier, class_embedding_container):
class_embedding_container.set_non_cg_vector(self._build_class_name_voter_key(classifier), class_name_vector)
return class_embedding_container

def _build_class_name_voter_key(self, classifier):
@staticmethod
def _build_class_name_voter_key(classifier):
return classifier.get_original_name() + "." + ClassEmbeddingContainer.CLASS_NAME_VOTER


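Note: _build_class_name_voter_key never touches instance state, so the commit marks it as a @staticmethod (the same pattern recurs below in OutputService.py, SolutionComparator.py and TraceLinkDataStructureFactory.py). A minimal sketch of the idea, using an invented stand-in class rather than the real ClassEmbeddingContainer:

    class ContainerSketch:
        CLASS_NAME_VOTER = "class_name_voter"  # hypothetical constant mirroring the real container

        @staticmethod
        def _build_class_name_voter_key(name):
            # No self needed: the decorator documents that, and the method can be
            # called without an instance.
            return name + "." + ContainerSketch.CLASS_NAME_VOTER

    print(ContainerSketch._build_class_name_voter_key("Account"))  # Account.class_name_voter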
4 changes: 2 additions & 2 deletions embeddingCreator/EmbeddingCreator.py
@@ -1,4 +1,4 @@
import abc, logging, traceback
import abc, logging

from javalang.parser import JavaSyntaxError, JavaParserError
from javalang.tokenizer import LexerError
@@ -58,7 +58,7 @@ def embedd_all_files_in_directory(self, directory):
except (JavaParserError, LexerError) as j:
log.info(f"SKIPPED: Error on tokenizing {filename} (Note: code files needs to be compilable): {j}")
continue
except (FileExtensionNotSupportedError):
except FileExtensionNotSupportedError:
continue
file_embedding = self._create_embeddings(file_representation)
if file_embedding:
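Note: the parentheses in "except (FileExtensionNotSupportedError):" were redundant; a tuple is only needed when several exception types share one handler. A small self-contained sketch (the exception class here is a stand-in, not the project's real one):

    class FileExtensionNotSupportedError(Exception):
        """Stand-in for the project's exception type."""

    def tokenize(filename):
        if not filename.endswith(".java"):
            raise FileExtensionNotSupportedError(filename)
        return filename.split(".")

    for name in ["Account.java", "notes.txt"]:
        try:
            print(tokenize(name))
        except FileExtensionNotSupportedError:   # single type: no parentheses needed
            continue
        except (ValueError, KeyError):           # tuple form is for multiple types
            raise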
11 changes: 6 additions & 5 deletions embeddingCreator/RequirementEmbeddingCreator.py
@@ -1,10 +1,11 @@
import logging
from abc import ABC

from pathlib import Path

from embeddingCreator.EmbeddingContainer import RequirementEmbeddingContainer, EmbeddingContainer
from embeddingCreator.EmbeddingContainer import RequirementEmbeddingContainer
from embeddingCreator.EmbeddingCreator import EmbeddingCreator
from preprocessing.FileRepresentation import FileRepresentation, UseCaseFileRepresentation
from preprocessing.FileRepresentation import UseCaseFileRepresentation
from utility import Util

logging.basicConfig(level=logging.INFO)
@@ -13,7 +14,7 @@
PREPROCESSED_REQ_OUTPUT_DIR = Path(__file__).parent.parent / "output/Preprocessed_Req"


class RequirementEmbeddingCreator(EmbeddingCreator):
class RequirementEmbeddingCreator(EmbeddingCreator, ABC):

def __init__(self, requirements_word_chooser, preprocessor, wordemb_creator,
tokenizer, preprocessed_token_output_directory=PREPROCESSED_REQ_OUTPUT_DIR):
@@ -46,8 +47,8 @@ class RequirementVectorEmbeddingCreator(RequirementEmbeddingCreator):

def _create_embeddings(self, file_representation):
chosen_word_groups = self._requirements_word_chooser.choose_words_from(file_representation)
chosen_word_groups_embeddings = []
requirement_element_vectors = []
#chosen_word_groups_embeddings = []
#requirement_element_vectors = []
element_vectors = {}
for word_group_id in chosen_word_groups:
word_embeddings = self._word_embedding_creator.create_word_list_embedding(chosen_word_groups[word_group_id])
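Note: RequirementEmbeddingCreator now lists ABC as an explicit base, making it clear the class is meant to stay abstract and leave _create_embeddings to subclasses such as RequirementVectorEmbeddingCreator. A simplified sketch of the hierarchy, with toy bodies that are not the real implementation:

    from abc import ABC, abstractmethod

    class EmbeddingCreator(ABC):
        @abstractmethod
        def _create_embeddings(self, file_representation): ...

    class RequirementEmbeddingCreator(EmbeddingCreator, ABC):
        pass  # shared requirement handling would live here; still abstract

    class RequirementVectorEmbeddingCreator(RequirementEmbeddingCreator):
        def _create_embeddings(self, file_representation):
            return [len(file_representation)]  # toy stand-in for real vectors

    # RequirementEmbeddingCreator() raises TypeError; the concrete subclass works:
    print(RequirementVectorEmbeddingCreator()._create_embeddings("some requirement text"))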
12 changes: 1 addition & 11 deletions embeddingCreator/SimilarityComparator.py
@@ -2,16 +2,6 @@
from utility import Util
from _functools import partial

from scipy import spatial

from pyemd import emd
from gensim.corpora import Dictionary

from numpy import zeros,\
double, sqrt,\
sum as np_sum

import numpy as np
WMD_VALUE_MAP_FUNCTION = partial(Util.map_value_range, 0, 2)
class SimilarityComparator:

@@ -39,7 +29,7 @@ def _wmd(self, boe_1, boe_2):
max = 0
for vector_2 in boe_2:
similarity = self._cosine_similarity(vector_1, vector_2)
if(similarity > max):
if similarity > max:
max = similarity
sum += max

4 changes: 0 additions & 4 deletions embeddingCreator/WordEmbeddingCreator.py
@@ -2,12 +2,8 @@
import logging
import random

from pyemd import emd

import gensim.models.wrappers

from gensim.models.word2vec import Word2Vec

from utility import Util

log = logging.getLogger(__name__)
4 changes: 2 additions & 2 deletions embeddingCreator/unixcoder.py
@@ -157,12 +157,12 @@ def __init__(self, size, eos, device):
self.finished = []

def getCurrentState(self):
"Get the outputs for the current timestep."
"""Get the outputs for the current timestep."""
batch = self.nextYs[-1].view(-1, 1)
return batch

def getCurrentOrigin(self):
"Get the backpointers for the current timestep."
"""Get the backpointers for the current timestep."""
return self.prevKs[-1]

def advance(self, wordLk):
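Note: both forms are valid docstrings, since any string literal that is the first statement of a function becomes its __doc__; the triple-quoted form is simply the convention recommended by PEP 257. A tiny illustration:

    def get_current_state():
        """Get the outputs for the current timestep."""
        return None

    print(get_current_state.__doc__)  # available via __doc__ and help() either way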
7 changes: 3 additions & 4 deletions evaluation/Evaluator.py
@@ -1,7 +1,6 @@
from abc import ABC, abstractmethod

from traceLinkProcessing.TraceLink import TraceLink
from utility import Util


class Evaluator(ABC):
@@ -230,7 +229,7 @@ def get_defining_value(self):
pass


class EmptyResultObject():
class EmptyResultObject(EvalResultObject):

def __init__(self, message: str):
self._message = message
@@ -281,7 +280,7 @@ def build_prec_recall_f1_print_str(precision, recall, f_1, true_positives, num_f
return print_str


class MAPResultObject(ABC):
class MAPResultObject(EvalResultObject):

def __init__(self, mAP, k):
self.mAP = mAP
@@ -299,7 +298,7 @@ def get_defining_value(self):
return self.mAP


class LagResultObject(ABC):
class LagResultObject(EvalResultObject):

def __init__(self, lag):
self.lag = lag
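Note: EmptyResultObject, MAPResultObject and LagResultObject now derive from EvalResultObject instead of from ABC (or from nothing), so all result types sit under one common base and expose its get_defining_value interface. A compact sketch of the pattern (only mAP, k and get_defining_value are taken from the diff; the rest is simplified):

    from abc import ABC, abstractmethod

    class EvalResultObject(ABC):
        @abstractmethod
        def get_defining_value(self): ...

    class MAPResultObject(EvalResultObject):   # was: class MAPResultObject(ABC)
        def __init__(self, mAP, k):
            self.mAP, self.k = mAP, k
        def get_defining_value(self):
            return self.mAP

    result = MAPResultObject(0.42, 5)
    # downstream code can now rely on a single base class:
    print(isinstance(result, EvalResultObject), result.get_defining_value())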
20 changes: 7 additions & 13 deletions evaluation/OutputService.py
@@ -1,7 +1,6 @@
from abc import ABC , abstractmethod
from abc import ABC
from typing import Dict, List
import logging
from pathlib import Path

from autograd.builtins import isinstance

@@ -149,7 +148,8 @@ def _add_best_f1_2D_excel_rows(self, excel_array, print_str_dict, best_eval_resu

return excel_array

def _get_context_thresholds(self, all_threshs, best_thresh):
@staticmethod
def _get_context_thresholds(all_threshs, best_thresh):

if len(all_threshs) > 1:
context_threshs = []
@@ -290,9 +290,8 @@ def process_trace_link_dict(self, trace_link_dict: Dict[float, List[TraceLink]])
excel_array.append([lag_message])

excel_array.append([""]) # Add empty row as divider
csv_array = []
csv_array.append(["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"])
csv_array.append(["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"])
csv_array = [["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"],
["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"]]
if isinstance(best_eval_result, F1ResultObject):
default_prec = 0
default_rec = 0
@@ -363,9 +362,8 @@ def process_trace_link_2D_dict(self, trace_link_2D_dict: Dict[float, Dict[float,
excel_array.append([lag_message])

excel_array.append([""]) # Add empty row as divider
csv_array = []
csv_array.append(["Best:","","","","","Default:","","","MAP","LAG"])
csv_array.append(["Maj","Final","Precision","Recall","F1","Precision","Recall","F1","MAP","LAG"])
csv_array = [["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"],
["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"]]
if isinstance(best_eval_result, F1ResultObject):
default_prec = 0
default_rec = 0
@@ -399,10 +397,6 @@ def __init__(self, dataset, excel_output_file_path, also_print_eval=True):
def process_trace_link_2D_dict(self, trace_link_2D_dict: Dict[float, Dict[float, List[TraceLink]]]):
results = {}
for maj_thresh in trace_link_2D_dict:
best_eval_result = None
best_thresh = None

print_str_dict = {}
for final_threshold in trace_link_2D_dict[maj_thresh]:
# header_row.append(self.FILE_LEVEL_DROP_THRESH_PATTERN.format(final_threshold))
eval_result_object = self._evaluator.evaluate(trace_link_2D_dict[maj_thresh][final_threshold])
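Note: besides the @staticmethod conversion, the two CSV header rows are now built with a single list literal instead of an empty list plus repeated append calls, and some apparently unneeded initializations (best_eval_result, best_thresh, print_str_dict) are removed. A minimal before/after sketch of the list construction (header strings copied from the diff, surrounding code omitted):

    # before: incremental construction
    csv_array = []
    csv_array.append(["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"])
    csv_array.append(["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"])

    # after: one literal, same result
    csv_array = [["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"],
                 ["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"]]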
9 changes: 5 additions & 4 deletions evaluation/SolutionComparator.py
@@ -32,7 +32,7 @@ def get_true_positives(self, trace_link_candidates):
elif self._print_false_positives:
false_positives_matrix.add_trace_pair(trace_link.req_key, trace_link.code_key)
if self._print_false_negatives:
self._print_false_negatives(sol_matrix_copy)
self.print_false_negatives(sol_matrix_copy)
if self._print_false_positives:
log.info("\n\nFalse Positives: {} Links, {} unique Reqs, {} unique Code".format(false_positives_matrix._number_of_trace_links,
false_positives_matrix.num_unique_reqs(), false_positives_matrix.num_unique_code()))
@@ -60,10 +60,11 @@ def get_similarity_relevance_dict(self, trace_links):
req_dict[req_name] = [sim_rel_tuple_to_add]

if self._print_false_negatives:
self._print_false_negatives(sol_matrix_copy)
self.print_false_negatives(sol_matrix_copy)

return req_dict

def _print_false_negatives(self, sol_matrix_with_false_negatives):

@staticmethod
def print_false_negatives(sol_matrix_with_false_negatives):
log.info(f"\nFalse Negatives: {sol_matrix_with_false_negatives._number_of_trace_links} Links, {sol_matrix_with_false_negatives.num_unique_reqs()} unique Reqs, {sol_matrix_with_false_negatives.num_unique_code()} unique Code")
log.info("\n" + sol_matrix_with_false_negatives.print_str())
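Note: the helper for logging false negatives is renamed from _print_false_negatives to print_false_negatives and made static. The old name was identical to the boolean flag self._print_false_negatives that guards the call; if that flag is stored as an instance attribute (as the surrounding code suggests), it shadows the method, and self._print_false_negatives(sol_matrix_copy) would fail with "'bool' object is not callable". A reduced sketch of the collision and the fix (class body invented for illustration):

    class ComparatorSketch:
        def __init__(self, print_false_negatives=True):
            # boolean option stored under the method's old name
            self._print_false_negatives = print_false_negatives

        @staticmethod
        def print_false_negatives(matrix):          # renamed: no longer collides with the flag
            print("False Negatives:", matrix)

        def run(self, matrix):
            if self._print_false_negatives:         # reads the flag
                self.print_false_negatives(matrix)  # calls the renamed helper

    ComparatorSketch().run({"UC1": ["Account.java"]})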
3 changes: 2 additions & 1 deletion precalculating/TraceLinkDataStructureFactory.py
@@ -93,6 +93,7 @@ def _calculate_similarities_to_all_req_elements(self, similarity_matrix, req_emb
similarity_matrix.set_value(key, element_key, similarity)
return similarity_matrix

def _build_req_element_key(self, req_file_name, index):
@staticmethod
def _build_req_element_key(req_file_name, index):
return f"{req_file_name}.{index}"

10 changes: 5 additions & 5 deletions preprocessing/CallGraphUtil.py
@@ -7,7 +7,7 @@
import logging
import re

from datasets.Dataset import Dataset, Smos
from datasets.Dataset import Dataset
from utility import FileUtil

log = logging.getLogger(__name__)
@@ -103,8 +103,8 @@ def remove_external_calls():
continue # Leave out inner classes

if create_class_callgraph:
insert_class(caller_class_name, set([callee_class_name]), set())
insert_class(callee_class_name, set(), set([caller_class_name]))
insert_class(caller_class_name, {callee_class_name}, set())
insert_class(callee_class_name, set(), {caller_class_name})

elif row_split[0] == "M": # method level call
# row_split[1] = Class of caller method
@@ -143,8 +143,8 @@ def remove_external_calls():
# called_by = caller_dict_key
# calls = callee_dict_key

insert_entry(caller_dict_key, caller_class, caller_name, caller_param, set(), set([callee_dict_key]))
insert_entry(callee_dict_key, callee_class, callee_name, callee_param, set([caller_dict_key]), set())
insert_entry(caller_dict_key, caller_class, caller_name, caller_param, set(), {callee_dict_key})
insert_entry(callee_dict_key, callee_class, callee_name, callee_param, {caller_dict_key}, set())

else:
log.error("Unknow start character: " + row_split[0])
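Note: set([x]) builds a one-element list first and then converts it; the set literal {x} expresses the same thing directly. The empty set still has to be written set(), because {} on its own is an empty dict. A quick check of the equivalence (variable names borrowed from the diff, values invented):

    caller_class_name = "OrderService"
    callee_class_name = "PaymentService"

    assert set([callee_class_name]) == {callee_class_name}   # same one-element set
    assert set([caller_class_name]) == {caller_class_name}
    assert isinstance({}, dict)                               # {} is an empty dict, not an empty set
    print({callee_class_name}, {caller_class_name}, set())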
1 change: 0 additions & 1 deletion preprocessing/CodeASTTokenizer.py
@@ -7,7 +7,6 @@

from preprocessing import Tokenizer, JavaLangUtil, PycparserUtil, JSParserUtil
from pycparser import parse_file
from esprima import parse
from preprocessing.CodeFileRepresentation import CodeFileRepresentation, IdentifierString, Classifier
from preprocessing.CommentParserUtil import parse_and_add_comments_to_file
from utility import FileUtil
6 changes: 3 additions & 3 deletions preprocessing/CodeFileRepresentation.py
@@ -205,15 +205,15 @@ def get_param_tuples(self) -> [([str], [str])]: # returns [([param_type], [para
[([boolean], [is, active]), ([type, tokens], [name, tokens]), ...]
The type and name string list can contain multiple tokens if a camel case splitter was applied beforehand.
"""
self.__check_and_clean_param_list
self.__check_and_clean_param_list()
return [param.get_param_tuple() for param in self.parameters]

def get_param_plain_list(self) -> [str]:
"""
Returns param tokens as plain string list (No assumptions about order):
[type1, name1, type2, name2...]
"""
self.__check_and_clean_param_list
self.__check_and_clean_param_list()
param_words = []
for param in self.parameters:
param_words.extend(param.get_param_words())
@@ -224,7 +224,7 @@ def get_param_names_plain_list(self) -> [str]:
Returns param names tokens as plain string list (No assumptions about order):
[name1, name2...]
"""
self.__check_and_clean_param_list
self.__check_and_clean_param_list()
param_words = []
for param in self.parameters:
param_words.extend(param.get_param_name_words())
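Note: in the three parameter getters, the line self.__check_and_clean_param_list referenced the bound method without calling it, which evaluates to a method object and discards it, so the cleanup silently never ran; the commit adds the missing parentheses. A stripped-down illustration of the difference (the cleanup body is invented for the example):

    class ParamListSketch:
        def __init__(self):
            self.parameters = ["  int ", "", " count "]

        def __check_and_clean_param_list(self):
            self.parameters = [p.strip() for p in self.parameters if p.strip()]

        def get_params_buggy(self):
            self.__check_and_clean_param_list     # no call: a no-op statement
            return self.parameters

        def get_params_fixed(self):
            self.__check_and_clean_param_list()   # actually cleans the list
            return self.parameters

    print(ParamListSketch().get_params_buggy())   # ['  int ', '', ' count ']
    print(ParamListSketch().get_params_fixed())   # ['int', 'count']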
1 change: 0 additions & 1 deletion preprocessing/CommentParserUtil.py
@@ -4,7 +4,6 @@
import logging
from comment_parser import comment_parser
from preprocessing.CodeFileRepresentation import IdentifierString, Enum_
from preprocessing.Tokenizer import WordTokenizer

log = logging.getLogger(__name__)

12 changes: 5 additions & 7 deletions preprocessing/FileRepresentation.py
@@ -122,13 +122,11 @@ def preprocess(self, preprocessor):
self.__clean_up_strings()

def get_printable_string(self):
print_str = ["UC_NAME || " + "|".join(self.name_words)]
print_str.append("DESCRIPTION || " + "|".join(self.description_words))
print_str.append("ACTOR || " + "|".join(self.actor_words))
print_str.append("PRECOND || " + "|".join(self.precondition_words))
print_str.append("EVENTFLOW || " + ", ".join(["|".join(sent_group) for sent_group in self.flow_of_events_words]))
print_str.append("POSTCOND || " + "|".join(self.postcondition_words))
print_str.append("QUALIREQ || " + "|".join(self.quality_requirement_words))
print_str = ["UC_NAME || " + "|".join(self.name_words), "DESCRIPTION || " + "|".join(self.description_words),
"ACTOR || " + "|".join(self.actor_words), "PRECOND || " + "|".join(self.precondition_words),
"EVENTFLOW || " + ", ".join(["|".join(sent_group) for sent_group in self.flow_of_events_words]),
"POSTCOND || " + "|".join(self.postcondition_words),
"QUALIREQ || " + "|".join(self.quality_requirement_words)]
return "\n".join(print_str)

def get_csv_string(self):
1 change: 0 additions & 1 deletion preprocessing/JSParserUtil.py
@@ -17,7 +17,6 @@ def extract_FileAST(fileAST, filepath):
file_name = FileUtil.get_filename_from_path(filepath)
# Use mock class to hold the methods
class_name = IdentifierString(file_name, file_name.replace(".jsp", ""))
super_classifiers = []
class_object = Classifier(class_name, IdentifierString(file_name, ""))
attributes = []
functions = []
6 changes: 3 additions & 3 deletions preprocessing/JavaLangUtil.py
@@ -105,7 +105,7 @@ def _extract_attributes(attribute_nodes, file_name):
"""Return an attribute object list."""
attr_list = []
for attr in attribute_nodes:
if not(ONLY_PUBLIC_ATTRIBUTES) or attr.modifiers.contains("public"): # logical Implication
if not ONLY_PUBLIC_ATTRIBUTES or attr.modifiers.contains("public"): # logical Implication
attr_init_value, left_side_identifier = _extract_children_strings(attr.declarators[0].initializer, file_name)
attr_obj = Attribute(IdentifierString(file_name, attr.type.name),
IdentifierString(file_name, attr.declarators[0].name), attr_init_value, IdentifierString(file_name, ""))
@@ -120,7 +120,7 @@ def _extract_methods(method_nodes, file_name):
meth_list = []
for meth in method_nodes:
#if not(ONLY_PUBLIC_METHODS) or not("private" in meth.modifiers or "protected" in meth.modifiers):
if not(ONLY_PUBLIC_METHODS) or "public" in meth.modifiers: # logical Implication
if not ONLY_PUBLIC_METHODS or "public" in meth.modifiers: # logical Implication
meth_obj = Method(IdentifierString(file_name, ""), IdentifierString(file_name, meth.name), IdentifierString(file_name, ""),
IdentifierString(file_name, ""), IdentifierString(file_name, ""))
if meth.return_type:
@@ -144,7 +144,7 @@ def _extract_parameters(parameter_nodes, file_name):
return param_list


def _extract_children_strings(body_node, file_name) -> IdentifierString:
def _extract_children_strings(body_node, file_name) -> tuple[IdentifierString, IdentifierString]:
"""Returns an IdentifierString that contains all identifiers in the given body node"""

strings_in_body, left_side_identifiers = _traverse_node(body_node, file_name)
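Note: the return annotation of _extract_children_strings is corrected from a single IdentifierString to tuple[IdentifierString, IdentifierString], matching the two values the function actually returns. The built-in tuple[...] syntax works as an annotation on Python 3.9+ (older versions would use typing.Tuple). A small sketch of the shape, with plain strings standing in for IdentifierString:

    def extract_children_strings(body_node: str, file_name: str) -> tuple[str, str]:
        # toy version: collect the "identifiers" and the left-hand-side names separately
        strings_in_body = f"{file_name}:{body_node}"
        left_side_identifiers = body_node.split("=")[0].strip()
        return strings_in_body, left_side_identifiers

    body, lhs = extract_children_strings("total = price + tax", "Order.java")
    print(body, "|", lhs)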
1 change: 0 additions & 1 deletion preprocessing/PycparserUtil.py
@@ -17,7 +17,6 @@ def extract_FileAST(fileAST, filepath):
file_name = FileUtil.get_filename_from_path(filepath)
# Use mock class to hold the methods
class_name = IdentifierString(file_name, file_name.replace(".c", ""))
super_classifiers = []
class_object = Classifier(class_name, IdentifierString(file_name, ""))
attributes = []
functions = []
(Diffs for the remaining 6 changed files are not shown here.)

0 comments on commit 02682a0
