clean up some code
tobhey committed Sep 21, 2023
1 parent 6ca3f82 commit 02682a0
Showing 25 changed files with 58 additions and 80 deletions.
4 changes: 1 addition & 3 deletions TraceabilityRunner.py
@@ -22,12 +22,10 @@
from precalculating.TraceLinkDataStructure import ElementLevelTraceLinkDataStructure, FileLevelTraceLinkDataStructure
from precalculating.TraceLinkDataStructureFactory import ElementLevelTraceLinkDataStructureFactory, \
FileLevelTraceLinkDataStructureFactory
from preprocessing.CodeASTTokenizer import JavaCodeASTTokenizer, CCodeASTTokenizer
from preprocessing.Preprocessor import CamelCaseSplitter, LowerCaseTransformer, \
NonLetterFilter, UrlRemover, Separator, JavaCodeStopWordRemover, \
StopWordRemover, Lemmatizer, WordLengthFilter, Preprocessor, POSFilter
from preprocessing.Tokenizer import JavaDocDescriptionOnlyTokenizer, \
WordAndSentenceTokenizer, UCTokenizer, WordTokenizer, NameAndDescriptionTokenizer
from preprocessing.Tokenizer import WordAndSentenceTokenizer, UCTokenizer, NameAndDescriptionTokenizer
from traceLinkProcessing.ElementFilter import ElementFilter
from traceLinkProcessing.NeighborHandler import NeighborStrategy
from traceLinkProcessing.SimilarityFilter import SimilarityFilter
3 changes: 2 additions & 1 deletion embeddingCreator/CodeEmbeddingCreator.py
@@ -63,7 +63,8 @@ def _handle_no_method_vectors_case(self, classifier, class_embedding_container):
class_embedding_container.set_non_cg_vector(self._build_class_name_voter_key(classifier), class_name_vector)
return class_embedding_container

def _build_class_name_voter_key(self, classifier):
@staticmethod
def _build_class_name_voter_key(classifier):
return classifier.get_original_name() + "." + ClassEmbeddingContainer.CLASS_NAME_VOTER


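Note: _build_class_name_voter_key never touches instance state, so the commit marks it as a @staticmethod (the same pattern recurs below in OutputService.py, SolutionComparator.py and TraceLinkDataStructureFactory.py). A minimal sketch of the idea, using an invented stand-in class rather than the real ClassEmbeddingContainer:

    class ContainerSketch:
        CLASS_NAME_VOTER = "class_name_voter"  # hypothetical constant mirroring the real container

        @staticmethod
        def _build_class_name_voter_key(name):
            # No self needed: the decorator documents that, and the method can be
            # called without an instance.
            return name + "." + ContainerSketch.CLASS_NAME_VOTER

    print(ContainerSketch._build_class_name_voter_key("Account"))  # Account.class_name_voter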
4 changes: 2 additions & 2 deletions embeddingCreator/EmbeddingCreator.py
@@ -1,4 +1,4 @@
import abc, logging, traceback
import abc, logging

from javalang.parser import JavaSyntaxError, JavaParserError
from javalang.tokenizer import LexerError
@@ -58,7 +58,7 @@ def embedd_all_files_in_directory(self, directory):
except (JavaParserError, LexerError) as j:
log.info(f"SKIPPED: Error on tokenizing {filename} (Note: code files needs to be compilable): {j}")
continue
except (FileExtensionNotSupportedError):
except FileExtensionNotSupportedError:
continue
file_embedding = self._create_embeddings(file_representation)
if file_embedding:
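Note: the parentheses in "except (FileExtensionNotSupportedError):" were redundant; a tuple is only needed when several exception types share one handler. A small self-contained sketch (the exception class here is a stand-in, not the project's real one):

    class FileExtensionNotSupportedError(Exception):
        """Stand-in for the project's exception type."""

    def tokenize(filename):
        if not filename.endswith(".java"):
            raise FileExtensionNotSupportedError(filename)
        return filename.split(".")

    for name in ["Account.java", "notes.txt"]:
        try:
            print(tokenize(name))
        except FileExtensionNotSupportedError:   # single type: no parentheses needed
            continue
        except (ValueError, KeyError):           # tuple form is for multiple types
            raise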
11 changes: 6 additions & 5 deletions embeddingCreator/RequirementEmbeddingCreator.py
@@ -1,10 +1,11 @@
import logging
from abc import ABC

from pathlib import Path

from embeddingCreator.EmbeddingContainer import RequirementEmbeddingContainer, EmbeddingContainer
from embeddingCreator.EmbeddingContainer import RequirementEmbeddingContainer
from embeddingCreator.EmbeddingCreator import EmbeddingCreator
from preprocessing.FileRepresentation import FileRepresentation, UseCaseFileRepresentation
from preprocessing.FileRepresentation import UseCaseFileRepresentation
from utility import Util

logging.basicConfig(level=logging.INFO)
@@ -13,7 +14,7 @@
PREPROCESSED_REQ_OUTPUT_DIR = Path(__file__).parent.parent / "output/Preprocessed_Req"


class RequirementEmbeddingCreator(EmbeddingCreator):
class RequirementEmbeddingCreator(EmbeddingCreator, ABC):

def __init__(self, requirements_word_chooser, preprocessor, wordemb_creator,
tokenizer, preprocessed_token_output_directory=PREPROCESSED_REQ_OUTPUT_DIR):
@@ -46,8 +47,8 @@ class RequirementVectorEmbeddingCreator(RequirementEmbeddingCreator):

def _create_embeddings(self, file_representation):
chosen_word_groups = self._requirements_word_chooser.choose_words_from(file_representation)
chosen_word_groups_embeddings = []
requirement_element_vectors = []
#chosen_word_groups_embeddings = []
#requirement_element_vectors = []
element_vectors = {}
for word_group_id in chosen_word_groups:
word_embeddings = self._word_embedding_creator.create_word_list_embedding(chosen_word_groups[word_group_id])
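Note: RequirementEmbeddingCreator now lists ABC as an explicit base, making it clear the class is meant to stay abstract and leave _create_embeddings to subclasses such as RequirementVectorEmbeddingCreator. A simplified sketch of the hierarchy, with toy bodies that are not the real implementation:

    from abc import ABC, abstractmethod

    class EmbeddingCreator(ABC):
        @abstractmethod
        def _create_embeddings(self, file_representation): ...

    class RequirementEmbeddingCreator(EmbeddingCreator, ABC):
        pass  # shared requirement handling would live here; still abstract

    class RequirementVectorEmbeddingCreator(RequirementEmbeddingCreator):
        def _create_embeddings(self, file_representation):
            return [len(file_representation)]  # toy stand-in for real vectors

    # RequirementEmbeddingCreator() raises TypeError; the concrete subclass works:
    print(RequirementVectorEmbeddingCreator()._create_embeddings("some requirement text"))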
12 changes: 1 addition & 11 deletions embeddingCreator/SimilarityComparator.py
@@ -2,16 +2,6 @@
from utility import Util
from _functools import partial

from scipy import spatial

from pyemd import emd
from gensim.corpora import Dictionary

from numpy import zeros,\
double, sqrt,\
sum as np_sum

import numpy as np
WMD_VALUE_MAP_FUNCTION = partial(Util.map_value_range, 0, 2)
class SimilarityComparator:

@@ -39,7 +29,7 @@ def _wmd(self, boe_1, boe_2):
max = 0
for vector_2 in boe_2:
similarity = self._cosine_similarity(vector_1, vector_2)
if(similarity > max):
if similarity > max:
max = similarity
sum += max

4 changes: 0 additions & 4 deletions embeddingCreator/WordEmbeddingCreator.py
@@ -2,12 +2,8 @@
import logging
import random

from pyemd import emd

import gensim.models.wrappers

from gensim.models.word2vec import Word2Vec

from utility import Util

log = logging.getLogger(__name__)
4 changes: 2 additions & 2 deletions embeddingCreator/unixcoder.py
@@ -157,12 +157,12 @@ def __init__(self, size, eos, device):
self.finished = []

def getCurrentState(self):
"Get the outputs for the current timestep."
"""Get the outputs for the current timestep."""
batch = self.nextYs[-1].view(-1, 1)
return batch

def getCurrentOrigin(self):
"Get the backpointers for the current timestep."
"""Get the backpointers for the current timestep."""
return self.prevKs[-1]

def advance(self, wordLk):
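Note: both forms are valid docstrings, since any string literal that is the first statement of a function becomes its __doc__; the triple-quoted form is simply the convention recommended by PEP 257. A tiny illustration:

    def get_current_state():
        """Get the outputs for the current timestep."""
        return None

    print(get_current_state.__doc__)  # available via __doc__ and help() either way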
7 changes: 3 additions & 4 deletions evaluation/Evaluator.py
@@ -1,7 +1,6 @@
from abc import ABC, abstractmethod

from traceLinkProcessing.TraceLink import TraceLink
from utility import Util


class Evaluator(ABC):
@@ -230,7 +229,7 @@ def get_defining_value(self):
pass


class EmptyResultObject():
class EmptyResultObject(EvalResultObject):

def __init__(self, message: str):
self._message = message
@@ -281,7 +280,7 @@ def build_prec_recall_f1_print_str(precision, recall, f_1, true_positives, num_f
return print_str


class MAPResultObject(ABC):
class MAPResultObject(EvalResultObject):

def __init__(self, mAP, k):
self.mAP = mAP
@@ -299,7 +298,7 @@ def get_defining_value(self):
return self.mAP


class LagResultObject(ABC):
class LagResultObject(EvalResultObject):

def __init__(self, lag):
self.lag = lag
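Note: EmptyResultObject, MAPResultObject and LagResultObject now derive from EvalResultObject instead of from ABC (or from nothing), so all result types sit under one common base and expose its get_defining_value interface. A compact sketch of the pattern (only mAP, k and get_defining_value are taken from the diff; the rest is simplified):

    from abc import ABC, abstractmethod

    class EvalResultObject(ABC):
        @abstractmethod
        def get_defining_value(self): ...

    class MAPResultObject(EvalResultObject):   # was: class MAPResultObject(ABC)
        def __init__(self, mAP, k):
            self.mAP, self.k = mAP, k
        def get_defining_value(self):
            return self.mAP

    result = MAPResultObject(0.42, 5)
    # downstream code can now rely on a single base class:
    print(isinstance(result, EvalResultObject), result.get_defining_value())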
20 changes: 7 additions & 13 deletions evaluation/OutputService.py
@@ -1,7 +1,6 @@
from abc import ABC , abstractmethod
from abc import ABC
from typing import Dict, List
import logging
from pathlib import Path

from autograd.builtins import isinstance

@@ -149,7 +148,8 @@ def _add_best_f1_2D_excel_rows(self, excel_array, print_str_dict, best_eval_resu

return excel_array

def _get_context_thresholds(self, all_threshs, best_thresh):
@staticmethod
def _get_context_thresholds(all_threshs, best_thresh):

if len(all_threshs) > 1:
context_threshs = []
@@ -290,9 +290,8 @@ def process_trace_link_dict(self, trace_link_dict: Dict[float, List[TraceLink]])
excel_array.append([lag_message])

excel_array.append([""]) # Add empty row as divider
csv_array = []
csv_array.append(["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"])
csv_array.append(["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"])
csv_array = [["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"],
["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"]]
if isinstance(best_eval_result, F1ResultObject):
default_prec = 0
default_rec = 0
@@ -363,9 +362,8 @@ def process_trace_link_2D_dict(self, trace_link_2D_dict: Dict[float, Dict[float,
excel_array.append([lag_message])

excel_array.append([""]) # Add empty row as divider
csv_array = []
csv_array.append(["Best:","","","","","Default:","","","MAP","LAG"])
csv_array.append(["Maj","Final","Precision","Recall","F1","Precision","Recall","F1","MAP","LAG"])
csv_array = [["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"],
["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"]]
if isinstance(best_eval_result, F1ResultObject):
default_prec = 0
default_rec = 0
@@ -399,10 +397,6 @@ def __init__(self, dataset, excel_output_file_path, also_print_eval=True):
def process_trace_link_2D_dict(self, trace_link_2D_dict: Dict[float, Dict[float, List[TraceLink]]]):
results = {}
for maj_thresh in trace_link_2D_dict:
best_eval_result = None
best_thresh = None

print_str_dict = {}
for final_threshold in trace_link_2D_dict[maj_thresh]:
# header_row.append(self.FILE_LEVEL_DROP_THRESH_PATTERN.format(final_threshold))
eval_result_object = self._evaluator.evaluate(trace_link_2D_dict[maj_thresh][final_threshold])
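Note: besides the @staticmethod conversion, the two CSV header rows are now built with a single list literal instead of an empty list plus repeated append calls, and some apparently unneeded initializations (best_eval_result, best_thresh, print_str_dict) are removed. A minimal before/after sketch of the list construction (header strings copied from the diff, surrounding code omitted):

    # before: incremental construction
    csv_array = []
    csv_array.append(["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"])
    csv_array.append(["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"])

    # after: one literal, same result
    csv_array = [["Best:", "", "", "", "", "Default:", "", "", "MAP", "LAG"],
                 ["Maj", "Final", "Precision", "Recall", "F1", "Precision", "Recall", "F1", "MAP", "LAG"]]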
9 changes: 5 additions & 4 deletions evaluation/SolutionComparator.py
@@ -32,7 +32,7 @@ def get_true_positives(self, trace_link_candidates):
elif self._print_false_positives:
false_positives_matrix.add_trace_pair(trace_link.req_key, trace_link.code_key)
if self._print_false_negatives:
self._print_false_negatives(sol_matrix_copy)
self.print_false_negatives(sol_matrix_copy)
if self._print_false_positives:
log.info("\n\nFalse Positives: {} Links, {} unique Reqs, {} unique Code".format(false_positives_matrix._number_of_trace_links,
false_positives_matrix.num_unique_reqs(), false_positives_matrix.num_unique_code()))
@@ -60,10 +60,11 @@ def get_similarity_relevance_dict(self, trace_links):
req_dict[req_name] = [sim_rel_tuple_to_add]

if self._print_false_negatives:
self._print_false_negatives(sol_matrix_copy)
self.print_false_negatives(sol_matrix_copy)

return req_dict

def _print_false_negatives(self, sol_matrix_with_false_negatives):

@staticmethod
def print_false_negatives(sol_matrix_with_false_negatives):
log.info(f"\nFalse Negatives: {sol_matrix_with_false_negatives._number_of_trace_links} Links, {sol_matrix_with_false_negatives.num_unique_reqs()} unique Reqs, {sol_matrix_with_false_negatives.num_unique_code()} unique Code")
log.info("\n" + sol_matrix_with_false_negatives.print_str())
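Note: the helper for logging false negatives is renamed from _print_false_negatives to print_false_negatives and made static. The old name was identical to the boolean flag self._print_false_negatives that guards the call; if that flag is stored as an instance attribute (as the surrounding code suggests), it shadows the method, and self._print_false_negatives(sol_matrix_copy) would fail with "'bool' object is not callable". A reduced sketch of the collision and the fix (class body invented for illustration):

    class ComparatorSketch:
        def __init__(self, print_false_negatives=True):
            # boolean option stored under the method's old name
            self._print_false_negatives = print_false_negatives

        @staticmethod
        def print_false_negatives(matrix):          # renamed: no longer collides with the flag
            print("False Negatives:", matrix)

        def run(self, matrix):
            if self._print_false_negatives:         # reads the flag
                self.print_false_negatives(matrix)  # calls the renamed helper

    ComparatorSketch().run({"UC1": ["Account.java"]})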
3 changes: 2 additions & 1 deletion precalculating/TraceLinkDataStructureFactory.py
@@ -93,6 +93,7 @@ def _calculate_similarities_to_all_req_elements(self, similarity_matrix, req_emb
similarity_matrix.set_value(key, element_key, similarity)
return similarity_matrix

def _build_req_element_key(self, req_file_name, index):
@staticmethod
def _build_req_element_key(req_file_name, index):
return f"{req_file_name}.{index}"

10 changes: 5 additions & 5 deletions preprocessing/CallGraphUtil.py
@@ -7,7 +7,7 @@
import logging
import re

from datasets.Dataset import Dataset, Smos
from datasets.Dataset import Dataset
from utility import FileUtil

log = logging.getLogger(__name__)
@@ -103,8 +103,8 @@ def remove_external_calls():
continue # Leave out inner classes

if create_class_callgraph:
insert_class(caller_class_name, set([callee_class_name]), set())
insert_class(callee_class_name, set(), set([caller_class_name]))
insert_class(caller_class_name, {callee_class_name}, set())
insert_class(callee_class_name, set(), {caller_class_name})

elif row_split[0] == "M": # method level call
# row_split[1] = Class of caller method
@@ -143,8 +143,8 @@ def remove_external_calls():
# called_by = caller_dict_key
# calls = callee_dict_key

insert_entry(caller_dict_key, caller_class, caller_name, caller_param, set(), set([callee_dict_key]))
insert_entry(callee_dict_key, callee_class, callee_name, callee_param, set([caller_dict_key]), set())
insert_entry(caller_dict_key, caller_class, caller_name, caller_param, set(), {callee_dict_key})
insert_entry(callee_dict_key, callee_class, callee_name, callee_param, {caller_dict_key}, set())

else:
log.error("Unknow start character: " + row_split[0])
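Note: set([x]) builds a one-element list first and then converts it; the set literal {x} expresses the same thing directly. The empty set still has to be written set(), because {} on its own is an empty dict. A quick check of the equivalence (variable names borrowed from the diff, values invented):

    caller_class_name = "OrderService"
    callee_class_name = "PaymentService"

    assert set([callee_class_name]) == {callee_class_name}   # same one-element set
    assert set([caller_class_name]) == {caller_class_name}
    assert isinstance({}, dict)                               # {} is an empty dict, not an empty set
    print({callee_class_name}, {caller_class_name}, set())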
1 change: 0 additions & 1 deletion preprocessing/CodeASTTokenizer.py
@@ -7,7 +7,6 @@

from preprocessing import Tokenizer, JavaLangUtil, PycparserUtil, JSParserUtil
from pycparser import parse_file
from esprima import parse
from preprocessing.CodeFileRepresentation import CodeFileRepresentation, IdentifierString, Classifier
from preprocessing.CommentParserUtil import parse_and_add_comments_to_file
from utility import FileUtil
6 changes: 3 additions & 3 deletions preprocessing/CodeFileRepresentation.py
@@ -205,15 +205,15 @@ def get_param_tuples(self) -> [([str], [str])]: # returns [([param_type], [para
[([boolean], [is, active]), ([type, tokens], [name, tokens]), ...]
The type and name string list can contain multiple tokens if a camel case splitter was applied beforehand.
"""
self.__check_and_clean_param_list
self.__check_and_clean_param_list()
return [param.get_param_tuple() for param in self.parameters]

def get_param_plain_list(self) -> [str]:
"""
Returns param tokens as plain string list (No assumptions about order):
[type1, name1, type2, name2...]
"""
self.__check_and_clean_param_list
self.__check_and_clean_param_list()
param_words = []
for param in self.parameters:
param_words.extend(param.get_param_words())
@@ -224,7 +224,7 @@ def get_param_names_plain_list(self) -> [str]:
Returns param names tokens as plain string list (No assumptions about order):
[name1, name2...]
"""
self.__check_and_clean_param_list
self.__check_and_clean_param_list()
param_words = []
for param in self.parameters:
param_words.extend(param.get_param_name_words())
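Note: in the three parameter getters, the line self.__check_and_clean_param_list referenced the bound method without calling it, which evaluates to a method object and discards it, so the cleanup silently never ran; the commit adds the missing parentheses. A stripped-down illustration of the difference (the cleanup body is invented for the example):

    class ParamListSketch:
        def __init__(self):
            self.parameters = ["  int ", "", " count "]

        def __check_and_clean_param_list(self):
            self.parameters = [p.strip() for p in self.parameters if p.strip()]

        def get_params_buggy(self):
            self.__check_and_clean_param_list     # no call: a no-op statement
            return self.parameters

        def get_params_fixed(self):
            self.__check_and_clean_param_list()   # actually cleans the list
            return self.parameters

    print(ParamListSketch().get_params_buggy())   # ['  int ', '', ' count ']
    print(ParamListSketch().get_params_fixed())   # ['int', 'count']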
1 change: 0 additions & 1 deletion preprocessing/CommentParserUtil.py
@@ -4,7 +4,6 @@
import logging
from comment_parser import comment_parser
from preprocessing.CodeFileRepresentation import IdentifierString, Enum_
from preprocessing.Tokenizer import WordTokenizer

log = logging.getLogger(__name__)

12 changes: 5 additions & 7 deletions preprocessing/FileRepresentation.py
@@ -122,13 +122,11 @@ def preprocess(self, preprocessor):
self.__clean_up_strings()

def get_printable_string(self):
print_str = ["UC_NAME || " + "|".join(self.name_words)]
print_str.append("DESCRIPTION || " + "|".join(self.description_words))
print_str.append("ACTOR || " + "|".join(self.actor_words))
print_str.append("PRECOND || " + "|".join(self.precondition_words))
print_str.append("EVENTFLOW || " + ", ".join(["|".join(sent_group) for sent_group in self.flow_of_events_words]))
print_str.append("POSTCOND || " + "|".join(self.postcondition_words))
print_str.append("QUALIREQ || " + "|".join(self.quality_requirement_words))
print_str = ["UC_NAME || " + "|".join(self.name_words), "DESCRIPTION || " + "|".join(self.description_words),
"ACTOR || " + "|".join(self.actor_words), "PRECOND || " + "|".join(self.precondition_words),
"EVENTFLOW || " + ", ".join(["|".join(sent_group) for sent_group in self.flow_of_events_words]),
"POSTCOND || " + "|".join(self.postcondition_words),
"QUALIREQ || " + "|".join(self.quality_requirement_words)]
return "\n".join(print_str)

def get_csv_string(self):
1 change: 0 additions & 1 deletion preprocessing/JSParserUtil.py
@@ -17,7 +17,6 @@ def extract_FileAST(fileAST, filepath):
file_name = FileUtil.get_filename_from_path(filepath)
# Use mock class to hold the methods
class_name = IdentifierString(file_name, file_name.replace(".jsp", ""))
super_classifiers = []
class_object = Classifier(class_name, IdentifierString(file_name, ""))
attributes = []
functions = []
6 changes: 3 additions & 3 deletions preprocessing/JavaLangUtil.py
@@ -105,7 +105,7 @@ def _extract_attributes(attribute_nodes, file_name):
"""Return an attribute object list."""
attr_list = []
for attr in attribute_nodes:
if not(ONLY_PUBLIC_ATTRIBUTES) or attr.modifiers.contains("public"): # logical Implication
if not ONLY_PUBLIC_ATTRIBUTES or attr.modifiers.contains("public"): # logical Implication
attr_init_value, left_side_identifier = _extract_children_strings(attr.declarators[0].initializer, file_name)
attr_obj = Attribute(IdentifierString(file_name, attr.type.name),
IdentifierString(file_name, attr.declarators[0].name), attr_init_value, IdentifierString(file_name, ""))
@@ -120,7 +120,7 @@ def _extract_methods(method_nodes, file_name):
meth_list = []
for meth in method_nodes:
#if not(ONLY_PUBLIC_METHODS) or not("private" in meth.modifiers or "protected" in meth.modifiers):
if not(ONLY_PUBLIC_METHODS) or "public" in meth.modifiers: # logical Implication
if not ONLY_PUBLIC_METHODS or "public" in meth.modifiers: # logical Implication
meth_obj = Method(IdentifierString(file_name, ""), IdentifierString(file_name, meth.name), IdentifierString(file_name, ""),
IdentifierString(file_name, ""), IdentifierString(file_name, ""))
if meth.return_type:
@@ -144,7 +144,7 @@ def _extract_parameters(parameter_nodes, file_name):
return param_list


def _extract_children_strings(body_node, file_name) -> IdentifierString:
def _extract_children_strings(body_node, file_name) -> tuple[IdentifierString, IdentifierString]:
"""Returns an IdentifierString that contains all identifiers in the given body node"""

strings_in_body, left_side_identifiers = _traverse_node(body_node, file_name)
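Note: the return annotation of _extract_children_strings is corrected from a single IdentifierString to tuple[IdentifierString, IdentifierString], matching the two values the function actually returns. The built-in tuple[...] syntax works as an annotation on Python 3.9+ (older versions would use typing.Tuple). A small sketch of the shape, with plain strings standing in for IdentifierString:

    def extract_children_strings(body_node: str, file_name: str) -> tuple[str, str]:
        # toy version: collect the "identifiers" and the left-hand-side names separately
        strings_in_body = f"{file_name}:{body_node}"
        left_side_identifiers = body_node.split("=")[0].strip()
        return strings_in_body, left_side_identifiers

    body, lhs = extract_children_strings("total = price + tax", "Order.java")
    print(body, "|", lhs)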
1 change: 0 additions & 1 deletion preprocessing/PycparserUtil.py
@@ -17,7 +17,6 @@ def extract_FileAST(fileAST, filepath):
file_name = FileUtil.get_filename_from_path(filepath)
# Use mock class to hold the methods
class_name = IdentifierString(file_name, file_name.replace(".c", ""))
super_classifiers = []
class_object = Classifier(class_name, IdentifierString(file_name, ""))
attributes = []
functions = []
(Diffs for the remaining 6 changed files are not shown here.)

0 comments on commit 02682a0
