From f7c5242d7ef1181d6122ad26d4de18b9d5bee3b5 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Thu, 4 May 2023 18:09:29 -0400 Subject: [PATCH 01/39] adding support for verbosity test output to a user specified file --- configs/default_filter_config.json | 4 ++ src/diagnose_github_repo.py | 7 +- src/diagnose_npm_package.py | 7 +- src/test_JS_repo_lib.py | 108 +++++++++++++++++++++++++++-- 4 files changed, 120 insertions(+), 6 deletions(-) diff --git a/configs/default_filter_config.json b/configs/default_filter_config.json index 872b2bb..14fdabb 100644 --- a/configs/default_filter_config.json +++ b/configs/default_filter_config.json @@ -14,6 +14,10 @@ }, "test": { "track_tests": true, + "test_verbose_all_output": { + "do_verbose_tracking": false, + "verbose_json_output_file": "verbose_test_report.json" + }, "tracked_test_commands": ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench", "mocha", "jest", "ava", "tap", "jasmine"], "timeout": 1000 diff --git a/src/diagnose_github_repo.py b/src/diagnose_github_repo.py index 040db10..7e7b44c 100644 --- a/src/diagnose_github_repo.py +++ b/src/diagnose_github_repo.py @@ -33,6 +33,8 @@ class RepoWalker(): COMPUTE_DEP_LISTS = False TRACK_BUILD = True TRACK_TESTS = True + TEST_VERBOSE_ALL_OUTPUT = False + TEST_VERBOSE_OUTPUT_JSON = "verbose_test_report.json" TRACKED_TEST_COMMANDS = ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench", "mocha", "jest", "ava", "tap", "jasmine"] @@ -42,7 +44,7 @@ class RepoWalker(): # timeouts for stages, in seconds INSTALL_TIMEOUT = 1000 - # note: these are timeouts pers *script* in the stage of the process + # note: these are timeouts per *script* in the stage of the process BUILD_TIMEOUT = 1000 TEST_TIMEOUT = 1000 @@ -97,6 +99,9 @@ def set_up_config( self, config_file): self.TEST_TIMEOUT = cf_dict.get("timeout", self.TEST_TIMEOUT) self.TRACKED_TEST_COMMANDS = cf_dict.get("tracked_test_commands", self.TRACKED_TEST_COMMANDS) self.TRACK_TESTS = cf_dict.get("track_tests", self.TRACK_TESTS) + test_verbose_config = cf_dict.get("test_verbose_all_output", {}) + self.TEST_VERBOSE_ALL_OUTPUT = test_verbose_config.get("do_verbose_tracking", self.TEST_VERBOSE_ALL_OUTPUT) + self.TEST_VERBOSE_OUTPUT_JSON = test_verbose_config.get("verbose_json_output_file", self.TEST_VERBOSE_OUTPUT_JSON) cf_dict = config_json.get("QL_output", {}) self.QL_CUTOFF = cf_dict.get("QL_cutoff", self.QL_CUTOFF) diff --git a/src/diagnose_npm_package.py b/src/diagnose_npm_package.py index 4e4cd78..cc09f8a 100644 --- a/src/diagnose_npm_package.py +++ b/src/diagnose_npm_package.py @@ -26,6 +26,8 @@ class NPMSpider(scrapy.Spider): COMPUTE_DEP_LISTS = False TRACK_BUILD = True TRACK_TESTS = True + TEST_VERBOSE_ALL_OUTPUT = False + TEST_VERBOSE_OUTPUT_JSON = "verbose_test_report.json" TRACKED_TEST_COMMANDS = ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench", "mocha", "jest", "ava", "tap", "jasmine"] @@ -35,7 +37,7 @@ class NPMSpider(scrapy.Spider): # timeouts for stages, in seconds INSTALL_TIMEOUT = 1000 - # note: these are timeouts pers *script* in the stage of the process + # note: these are timeouts per *script* in the stage of the process BUILD_TIMEOUT = 1000 TEST_TIMEOUT = 1000 @@ -89,6 +91,9 @@ def set_up_config( self, config_file): self.TEST_TIMEOUT = cf_dict.get("timeout", self.TEST_TIMEOUT) self.TRACKED_TEST_COMMANDS = cf_dict.get("tracked_test_commands", self.TRACKED_TEST_COMMANDS) self.TRACK_TESTS = cf_dict.get("track_tests", self.TRACK_TESTS) + test_verbose_config = 
cf_dict.get("test_verbose_all_output", {}) + self.TEST_VERBOSE_ALL_OUTPUT = test_verbose_config.get("do_verbose_tracking", self.TEST_VERBOSE_ALL_OUTPUT) + self.TEST_VERBOSE_OUTPUT_JSON = test_verbose_config.get("verbose_json_output_file", self.TEST_VERBOSE_OUTPUT_JSON) def parse(self, response): # TODO should we handle specific response codes? diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index f93d3af..2719b0d 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -116,11 +116,90 @@ def run_tests( manager, pkg_json, crawler): test_info.compute_test_infras() test_info.compute_nested_test_commands( test_scripts) test_info.compute_test_stats() - # print( test_info[t]) - # print( get_test_info(error, output)) + # if we're in verbose testing mode (i.e. getting all timing info for each test, etc) + # then, we rerun the test commands with all the commands for adding verbose_mode to + # each of the test infras involved (individually) + if crawler.TEST_VERBOSE_ALL_OUTPUT: + # we're gonna be adding our new custom scripts for verbosity testing + run_command( "mv package.json TEMP_package.json_TEMP") + verbosity_index = 0 + test_verbosity_output = {} + for test_infra in test_info.test_infras: + verbose_test_json = ("" if verbosity_index == 0 else "infra_" + str(verbosity_index) + "_") + crawler.TEST_VERBOSE_OUTPUT_JSON + infra_verbosity_config = TestInfo.VERBOSE_TESTS_EXTRA_ARGS[test_infra] + if not infra_verbosity_config: # checks if it's an empty object + print("TEST VERBOSE MODE: unsupported test infra " + test_infra) + test_verbosity_output[test_infra] = { "error": True } + continue + infra_verbosity_args = infra_verbosity_config.get("args", "") + infra_verbosity_args_pos = infra_verbosity_config.get("position", -1) # default position is at the end + infra_verbosity_command = instrument_test_command_for_verbose(test_info.test_command, test_infra, infra_verbosity_args, + verbose_test_json, infra_verbosity_args_pos) + verbosity_script_name = "instrumented_verbosity_command_" + str(verbosity_index) + pkg_json["scripts"][verbosity_script_name] = infra_verbosity_command + with open("package.json", 'w') as f: + json.dump( pkg_json, f) + print("Running verbosity: " + manager + infra_verbosity_command) + verb_error, verb_output, verb_retcode = run_command( manager + verbosity_script_name, crawler.TEST_TIMEOUT) + verbosity_index += 1 + # get the output + test_verbosity_infra = {} + test_verbosity_infra["command"] = infra_verbosity_command + test_verbosity_infra["output_files"] = verbose_test_json + if crawler.VERBOSE_MODE: + test_verbosity_infra["test_debug"] = "\nError output: " + verb_error.decode('utf-8') \ + + "\nOutput stream: " + verb_output.decode('utf-8') + test_verbosity_output[test_infra] = test_verbosity_infra + test_info.set_test_verbosity_output(test_verbosity_output) + # put the package.json back + run_command( "mv TEMP_package.json_TEMP package.json") test_json_summary[t] = test_info.get_json_rep() return( retcode, test_json_summary) +def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity_args, verbose_test_json, infra_verbosity_args_pos): + # replace the output file name with the custom output filename + # add an index to the filename for the 2nd,+ time the filename shows up + # so as to avoid overwriting the files + num_files = 0 + new_infra_verbosity_args = "" + for i, sub in enumerate(infra_verbosity_args.split("$PLACEHOLDER_OUTPUT_FILE_NAME$")): + # not the file name + if sub != "": + new_infra_verbosity_args += sub + else: + 
new_infra_verbosity_args += ("" if num_files == 0 else ("out_" + str(num_files) + "_")) + verbose_test_json + num_files += 1 + infra_verbosity_args = new_infra_verbosity_args + # split into sub-commands + command_split_chars = [ "&&", ";"] + infra_calls = test_script.split(test_infra) + instrumented_test_command = [] + for i, infra_call in enumerate(infra_calls): + # if the current call is empty string and the next is non-empty + # then this is the call to the testing infra and the next is the arguments + # so, skip this one + # if there are no args (i.e. no next non-empty string), then just instrument this one + if infra_call == "" and i < len(infra_calls) - 1 and infra_calls[i + 1] != "": + instrumented_test_command += [ "" ] + continue + # if the first call is non-empty, then it's pre-test-infra and we skip it too + elif infra_call != "" and i == 0: + instrumented_test_command += [ "" ] + continue + # get the arguments, splitting off from any other non-test commands that might be + # in this command (note: we know all the commands started with test_infra) + end_command_pos = re.search(r'|'.join(command_split_chars), infra_call) + end_command_pos = end_command_pos.start() if not end_command_pos is None else -1 + sub_command_args = (infra_call[0:end_command_pos] if end_command_pos > -1 else infra_call).split(" ") + if infra_verbosity_args_pos != -1: + sub_command_args.insert(infra_verbosity_args_pos, infra_verbosity_args) + else: + sub_command_args.append(infra_verbosity_args) + # rebuild the command, re-attaching any extra sub-commands + instrumented_test_command += [ " ".join(sub_command_args) + (infra_call[end_command_pos:] if end_command_pos > -1 else "") ] + return(test_infra.join(instrumented_test_command)) + + def called_in_command( str_comm, command, manager): # command ends with command terminator (this list includes \0 end-of-string, # but this is not available to check in Python so we use endswith) @@ -195,14 +274,29 @@ class TestInfo: "failing": ("failed", -1) }, } + # extra args, their position in the arg list, and any post-processing required + VERBOSE_TESTS_EXTRA_ARGS = { + "jest": { + "args": " --verbose --json --outputFile=$PLACEHOLDER_OUTPUT_FILE_NAME$", + "position": -1, + "post_processing": None + }, + "mocha": { + "args": " -- --reporter xunit --reporter-option output=$PLACEHOLDER_OUTPUT_FILE_NAME$", + "position": -1, + "post_processing": None #TODO change this to the xml-to-json parser + } + } TRACKED_INFRAS = { "mocha": { "name": "mocha", - "output_checkers": [ "mocha", "tap" ] + "output_checkers": [ "mocha", "tap" ], + "verbose_tests_extra_args": [ "mocha" ] }, "jest": { "name": "jest", - "output_checkers": [ "jest" ] + "output_checkers": [ "jest" ], + "verbose_tests_extra_args": [ "jest" ] }, "jasmine": { "name": "jasmine", @@ -256,10 +350,14 @@ def __init__(self, success, error_stream, output_stream, manager, VERBOSE_MODE): self.num_failing = None self.timed_out = False self.VERBOSE_MODE = VERBOSE_MODE + self.test_verbosity_output = None def set_test_command( self, test_command): self.test_command = test_command + def set_test_verbosity_output( self, verbose_output): + self.test_verbosity_output = verbose_output + def compute_test_infras( self): self.test_infras = [] self.test_covs = [] @@ -323,6 +421,8 @@ def get_json_rep( self): json_rep["nested_test_commands"] = self.nested_test_commands if "test_infras" not in json_rep: json_rep["RUNS_NEW_USER_TESTS"] = False + if self.test_verbosity_output: + json_rep["test_verbosity_output"] = self.test_verbosity_output 
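+            # test_verbosity_output: per-infra dict holding the instrumented command, its verbose report file, and (in VERBOSE_MODE) the raw error/output streams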
json_rep["timed_out"] = self.timed_out return( json_rep) From 38df6aa9a66e035056e18f7db70b3834253c2cf3 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Fri, 5 May 2023 15:36:14 -0400 Subject: [PATCH 02/39] rename output file so no overwrite with multiple test scripts; move test reports to specified output dir; add parser for mocha output post-processing (xml to json) --- .gitignore | 1 + src/TestInfo.py | 251 +++++++++++++++++++++ src/diagnose_github_repo.py | 2 +- src/diagnose_npm_package.py | 2 +- src/output_parsing/test_output_proc.py | 13 ++ src/test_JS_repo_lib.py | 293 +++---------------------- 6 files changed, 298 insertions(+), 264 deletions(-) create mode 100644 src/TestInfo.py create mode 100644 src/output_parsing/test_output_proc.py diff --git a/.gitignore b/.gitignore index 0679a00..34f1701 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ QLDBs/* items.json *__page_data.html *__results.json +*_verbose_test_report.json local_mount/* **/node_modules diff --git a/src/TestInfo.py b/src/TestInfo.py new file mode 100644 index 0000000..47a832f --- /dev/null +++ b/src/TestInfo.py @@ -0,0 +1,251 @@ +import re +import output_parsing.test_output_proc as TestOutputProc + +class TestInfo: + OUTPUT_CHECKERS = { + "mocha": + { + "output_regex_fct" : lambda condition: r'.*\d+ ' + condition + '.*', + "passing": ("passing", -1), + "failing": ("failing", -1) + }, + "jest": + { + "output_regex_fct" : lambda condition: r'Tests:.*\d+ ' + condition, + "passing": ("passed", -1), + "failing": ("failed", -1) + }, + "tap": { + "output_regex_fct" : lambda condition: r'# ' + condition + '.*\d+', + "passing": ("pass", 1), + "failing": ("fail", 1) + }, + "tap_raw": { + "output_regex_fct" : lambda condition: r'' + condition + ' \d+ - (?!.*time=).*$', + "passing": (r'^.*(?!not )ok', None), # this "passing" is a regex: count "ok" but not "not ok" + "failing": (r'^.*not ok', None) + }, + "ava": + { + "output_regex_fct": lambda condition: r'.*\d+ tests? 
' + condition, + "passing": ("passed", -2), + "failing": ("failed", -2) + }, + "ava_2": + { + "output_regex_fct" : lambda condition: r'.*\d+ ' + condition + '$', + "passing": ("passed", -1), + "failing": ("failed", -1) + }, + } + # extra args, their position in the arg list, and any post-processing required + # post-processing is a function that takes 2 arguments: input file and output file + VERBOSE_TESTS_EXTRA_ARGS = { + "jest": { + "args": " --verbose --json --outputFile=$PLACEHOLDER_OUTPUT_FILE_NAME$", + "position": -1, + "post_processing": None + }, + "mocha": { + "args": " -- --reporter xunit --reporter-option output=$PLACEHOLDER_OUTPUT_FILE_NAME$", + "position": -1, + "post_processing": TestOutputProc.xml_to_json + } + } + TRACKED_INFRAS = { + "mocha": { + "name": "mocha", + "output_checkers": [ "mocha", "tap" ], + "verbose_tests_extra_args": [ "mocha" ] + }, + "jest": { + "name": "jest", + "output_checkers": [ "jest" ], + "verbose_tests_extra_args": [ "jest" ] + }, + "jasmine": { + "name": "jasmine", + "output_checkers": [ "mocha" ] + }, + "tap": { + "name": "tap", + "output_checkers": [ "tap", "tap_raw" ] + }, + "lab": { + "name": "lab", + "output_checkers": [] + }, + "ava": { + "name": "ava", + "output_checkers": [ "ava", "ava_2" ] + }, + "gulp": { + "name": "gulp", + "output_checkers": [ "mocha" ] + }, + } + TRACKED_COVERAGE = { + "istanbul": "istanbul -- coverage testing", + "nyc": "nyc -- coverage testing", + "coveralls": "coveralls -- coverage testing", + "c8": "c8 -- coverage testing" + } + TRACKED_LINTERS = { + "eslint": "eslint -- linter", + "tslint": "tslint -- linter", + "xx": "xx -- linter", + "standard": "standard -- linter", + "prettier": "prettier -- linter", + "gulp lint": "gulp lint -- linter" + } + + TRACKED_RUNNERS = [ "node", "babel-node", "grunt" ] + + def __init__(self, success, error_stream, output_stream, manager, VERBOSE_MODE): + self.success = success + self.error_stream = error_stream + self.output_stream = output_stream + self.manager = manager + # start all other fields as None + self.test_infras = None + self.test_covs = None + self.test_lints = None + self.nested_test_commands = None + self.num_passing = None + self.num_failing = None + self.timed_out = False + self.VERBOSE_MODE = VERBOSE_MODE + self.test_verbosity_output = None + + def set_test_command( self, test_command): + self.test_command = test_command + + def set_test_verbosity_output( self, verbose_output): + self.test_verbosity_output = verbose_output + + def compute_test_infras( self): + self.test_infras = [] + self.test_covs = [] + self.test_lints = [] + self.nested_test_commands = [] + if self.test_command: + self.test_infras += [ ti for ti in TestInfo.TRACKED_INFRAS if called_in_command(ti, self.test_command, self.manager) ] + self.test_infras += [ ri for ri in TestInfo.TRACKED_RUNNERS if called_in_command(ri, self.test_command, self.manager) ] + self.test_covs += [ TestInfo.TRACKED_COVERAGE[ti] for ti in TestInfo.TRACKED_COVERAGE if called_in_command(ti, self.test_command, self.manager) ] + self.test_lints += [ TestInfo.TRACKED_LINTERS[ti] for ti in TestInfo.TRACKED_LINTERS if called_in_command(ti, self.test_command, self.manager) ] + self.test_infras = list(set(self.test_infras)) + self.test_covs = list(set(self.test_covs)) + self.test_lints = list(set(self.test_lints)) + # TODO: maybe we can also figure it out from the output stream + + def compute_nested_test_commands( self, test_commands): + # one might think that we should only check the package's own manager + # however, it's 
common to mix and match (esp. to run commands with "npm run" even if the package manager is yarn) + self.nested_test_commands += [ tc for tc in test_commands if called_in_command( "npm run " + tc, self.test_command, self.manager) ] + self.nested_test_commands += [ tc for tc in test_commands if called_in_command( "yarn " + tc, self.test_command, self.manager) ] + + def compute_test_stats( self): + if not self.test_infras or self.test_infras == []: + return + test_output = self.output_stream.decode('utf-8') + self.error_stream.decode('utf-8') + ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') + test_output = ansi_escape.sub('', test_output) + self.num_passing = 0 + self.num_failing = 0 + self.timed_out = (self.error_stream.decode('utf-8') == "TIMEOUT ERROR") + for infra in self.test_infras: + output_checker_names = TestInfo.TRACKED_INFRAS.get(infra, {}).get("output_checkers", []) + if infra in TestInfo.TRACKED_RUNNERS and output_checker_names == []: + output_checker_names = self.OUTPUT_CHECKERS.keys() # all the checkers + for checker_name in output_checker_names: + div_factor = 2 if checker_name == "ava_2" else 1 + checker = self.OUTPUT_CHECKERS[ checker_name] + self.num_passing += int(test_cond_count( test_output, checker["output_regex_fct"], checker["passing"][0], checker["passing"][1]) / div_factor) + self.num_failing += int(test_cond_count( test_output, checker["output_regex_fct"], checker["failing"][0], checker["failing"][1]) / div_factor) + + def get_json_rep( self): + json_rep = {} + if self.VERBOSE_MODE: + json_rep["test_debug"] = "" + if not self.success: + json_rep["ERROR"] = True + if self.VERBOSE_MODE: + json_rep["test_debug"] += "\nError output: " + self.error_stream.decode('utf-8') + if self.num_passing is not None and self.num_failing is not None: + json_rep["num_passing"] = self.num_passing + json_rep["num_failing"] = self.num_failing + if self.VERBOSE_MODE: + json_rep["test_debug"] += "\nOutput stream: " + self.output_stream.decode('utf-8') + if self.test_infras and self.test_infras != []: + json_rep["test_infras"] = [TestInfo.TRACKED_INFRAS.get(infra, {}).get("name", "Custom Testing: " + infra) for infra in self.test_infras] + if self.test_covs and self.test_covs != []: + json_rep["test_coverage_tools"] = self.test_covs + if self.test_lints and self.test_lints != []: + json_rep["test_linters"] = self.test_lints + if self.nested_test_commands and self.nested_test_commands != []: + json_rep["nested_test_commands"] = self.nested_test_commands + if "test_infras" not in json_rep: + json_rep["RUNS_NEW_USER_TESTS"] = False + if self.test_verbosity_output: + json_rep["test_verbosity_output"] = self.test_verbosity_output + json_rep["timed_out"] = self.timed_out + return( json_rep) + + def __str__(self): + to_ret = "" + if not self.success: + to_ret += "ERROR" + if self.VERBOSE_MODE: + to_ret += "\nError output: " + self.error_stream.decode('utf-8') + else: + to_ret += "SUCCESS" + if self.num_passing is not None and self.num_failing is not None: + to_ret += "\nPassing tests: " + str(self.num_passing) + "\nFailing tests: " + str(self.num_failing) + if self.VERBOSE_MODE: + to_ret += "\nOutput stream: " + self.output_stream.decode('utf-8') + if self.test_infras and self.test_infras != []: + to_ret += "\nTest infras: " + str([TestInfo.TRACKED_INFRAS[infra]["name"] for infra in self.test_infras]) + if self.test_covs and self.test_covs != []: + to_ret += "\nCoverage testing: " + str(self.test_covs) + if self.test_lints and self.test_lints != []: + to_ret += "\nLinter: " + 
str(self.test_lints) + if self.nested_test_commands and self.nested_test_commands != []: + to_ret += "\nNested test commands: " + str(self.nested_test_commands) + to_ret += "\nTimed out: " + str(self.timed_out) + return( to_ret) + +def called_in_command( str_comm, command, manager): + # command ends with command terminator (this list includes \0 end-of-string, + # but this is not available to check in Python so we use endswith) + post_command_chars = [ "" ] if command.endswith(str_comm) else [ " ", "\t", ";"] + for pcc in post_command_chars: + check_comm = str_comm + pcc + if command.find( check_comm) == 0: + return( True) + if command.find( "&&" + check_comm) > -1 or command.find( "&& " + check_comm) > -1: + return( True) + if command.find( "cross-env NODE_ENV=test " + check_comm) > -1 or command.find( "cross-env NODE_ENV=production " + check_comm) > -1: + return( True) + if command.find( "cross-env CI=true " + check_comm) > -1: + return( True) + if command.find( "opener " + check_comm) > -1: + return( True) + if command.find( "gulp " + check_comm) > -1: + return( True) + if command.find( "nyc " + check_comm) > -1: + return( True) + return( False) + +def test_cond_count( test_output, regex_fct, condition, offset): + ptrn = re.compile( regex_fct(condition), re.MULTILINE) + results = ptrn.findall( test_output) + if offset is None: + return( len( results)) # just count the number of hits, each hit is an individual test (example: tap "ok" vs "not ok") + num_cond = 0 + for r in results: + temp = r.split() + try: + num_cond += int( temp[temp.index(condition) + offset]) + except ValueError: + num_cond += 0 + return( num_cond) \ No newline at end of file diff --git a/src/diagnose_github_repo.py b/src/diagnose_github_repo.py index 7e7b44c..5d65c3a 100644 --- a/src/diagnose_github_repo.py +++ b/src/diagnose_github_repo.py @@ -52,7 +52,7 @@ class RepoWalker(): def __init__(self, config_file="", output_dir = "."): self.set_up_config( config_file) - self.output_dir = output_dir + self.output_dir = os.path.abspath(output_dir) def set_repo_links(self, repo_links): self.repo_links = repo_links diff --git a/src/diagnose_npm_package.py b/src/diagnose_npm_package.py index cc09f8a..aa5dcf1 100644 --- a/src/diagnose_npm_package.py +++ b/src/diagnose_npm_package.py @@ -46,7 +46,7 @@ def __init__(self, packages=None, config_file="", output_dir=".", *args, **kwarg self.packages = packages self.start_urls = ['https://www.npmjs.com/package/' + pkg for pkg in self.packages] self.set_up_config( config_file) - self.output_dir = output_dir + self.output_dir = os.path.abspath(output_dir) super(NPMSpider, self).__init__(*args, **kwargs) def set_up_config( self, config_file): diff --git a/src/output_parsing/test_output_proc.py b/src/output_parsing/test_output_proc.py new file mode 100644 index 0000000..c2ff451 --- /dev/null +++ b/src/output_parsing/test_output_proc.py @@ -0,0 +1,13 @@ +import json +import xmltodict + +# convert an xml file to json +# used to convert the xunit reporter output from mocha into json +# note: this overwrites the existing file +# code from https://www.geeksforgeeks.org/python-xml-to-json/ +def xml_to_json(output_file): + with open(output_file) as xml_file: + data_dict = xmltodict.parse(xml_file.read()) + json_data = json.dumps(data_dict) + with open(output_file, 'w') as json_file: + json_file.write(json_data) \ No newline at end of file diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index 2719b0d..a8797d0 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ 
-2,6 +2,7 @@ import subprocess import json import os +from TestInfo import * def run_command( commands, timeout=None): for command in commands.split(";"): @@ -100,7 +101,7 @@ def run_build( manager, pkg_json, crawler): build_script_list += [b] return( retcode, build_script_list, build_debug) -def run_tests( manager, pkg_json, crawler): +def run_tests( manager, pkg_json, crawler, cur_dir="."): test_json_summary = {} retcode = 0 if len(crawler.TRACKED_TEST_COMMANDS) == 0: @@ -108,7 +109,7 @@ def run_tests( manager, pkg_json, crawler): test_scripts = [t for t in pkg_json.get("scripts", {}).keys() if not set([ t.find(t_com) for t_com in crawler.TRACKED_TEST_COMMANDS]) == {-1}] test_scripts = [t for t in test_scripts if set([t.find(ig_com) for ig_com in crawler.IGNORED_COMMANDS]) == {-1}] test_scripts = [t for t in test_scripts if set([pkg_json.get("scripts", {})[t].find(ig_sub) for ig_sub in crawler.IGNORED_SUBSTRINGS]) == {-1}] - for t in test_scripts: + for test_index, t in enumerate(test_scripts): print("Running: " + manager + t) error, output, retcode = run_command( manager + t, crawler.TEST_TIMEOUT) test_info = TestInfo( (retcode == 0), error, output, manager, crawler.VERBOSE_MODE) @@ -122,10 +123,12 @@ def run_tests( manager, pkg_json, crawler): if crawler.TEST_VERBOSE_ALL_OUTPUT: # we're gonna be adding our new custom scripts for verbosity testing run_command( "mv package.json TEMP_package.json_TEMP") - verbosity_index = 0 test_verbosity_output = {} - for test_infra in test_info.test_infras: - verbose_test_json = ("" if verbosity_index == 0 else "infra_" + str(verbosity_index) + "_") + crawler.TEST_VERBOSE_OUTPUT_JSON + for verbosity_index, test_infra in enumerate(test_info.test_infras): + verbose_test_json = crawler.output_dir + "/" \ + + "test_" + str(test_index) + "_"\ + + "infra_" + str(verbosity_index) + "_" \ + + crawler.TEST_VERBOSE_OUTPUT_JSON infra_verbosity_config = TestInfo.VERBOSE_TESTS_EXTRA_ARGS[test_infra] if not infra_verbosity_config: # checks if it's an empty object print("TEST VERBOSE MODE: unsupported test infra " + test_infra) @@ -133,7 +136,8 @@ def run_tests( manager, pkg_json, crawler): continue infra_verbosity_args = infra_verbosity_config.get("args", "") infra_verbosity_args_pos = infra_verbosity_config.get("position", -1) # default position is at the end - infra_verbosity_command = instrument_test_command_for_verbose(test_info.test_command, test_infra, infra_verbosity_args, + infra_verbosity_post_proc = infra_verbosity_config.get("post_processing", None) + infra_verbosity_command, out_files = instrument_test_command_for_verbose(test_info.test_command, test_infra, infra_verbosity_args, verbose_test_json, infra_verbosity_args_pos) verbosity_script_name = "instrumented_verbosity_command_" + str(verbosity_index) pkg_json["scripts"][verbosity_script_name] = infra_verbosity_command @@ -141,6 +145,9 @@ def run_tests( manager, pkg_json, crawler): json.dump( pkg_json, f) print("Running verbosity: " + manager + infra_verbosity_command) verb_error, verb_output, verb_retcode = run_command( manager + verbosity_script_name, crawler.TEST_TIMEOUT) + if not infra_verbosity_post_proc is None: + for out_file in out_files: + infra_verbosity_post_proc(out_file) verbosity_index += 1 # get the output test_verbosity_infra = {} @@ -162,12 +169,21 @@ def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity # so as to avoid overwriting the files num_files = 0 new_infra_verbosity_args = "" + output_files = [] for i, sub in 
enumerate(infra_verbosity_args.split("$PLACEHOLDER_OUTPUT_FILE_NAME$")): # not the file name if sub != "": new_infra_verbosity_args += sub else: - new_infra_verbosity_args += ("" if num_files == 0 else ("out_" + str(num_files) + "_")) + verbose_test_json + path_index = verbose_test_json.rfind("/") + if path_index == -1: + output_file = "out_" + str(num_files) + "_" + verbose_test_json + new_infra_verbosity_args += output_file + output_files += [ output_file ] + else: + output_file = verbose_test_json[:path_index] + "/out_" + str(num_files) + "_" + verbose_test_json[path_index + 1:] + new_infra_verbosity_args += output_file + output_files += [ output_file ] num_files += 1 infra_verbosity_args = new_infra_verbosity_args # split into sub-commands @@ -175,11 +191,11 @@ def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity infra_calls = test_script.split(test_infra) instrumented_test_command = [] for i, infra_call in enumerate(infra_calls): - # if the current call is empty string and the next is non-empty + # if the current call is empty string # then this is the call to the testing infra and the next is the arguments # so, skip this one - # if there are no args (i.e. no next non-empty string), then just instrument this one - if infra_call == "" and i < len(infra_calls) - 1 and infra_calls[i + 1] != "": + # if there are no args (i.e. no next string), then just instrument this one + if infra_call == "" and i < len(infra_calls) - 1: instrumented_test_command += [ "" ] continue # if the first call is non-empty, then it's pre-test-infra and we skip it too @@ -197,259 +213,12 @@ def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity sub_command_args.append(infra_verbosity_args) # rebuild the command, re-attaching any extra sub-commands instrumented_test_command += [ " ".join(sub_command_args) + (infra_call[end_command_pos:] if end_command_pos > -1 else "") ] - return(test_infra.join(instrumented_test_command)) - - -def called_in_command( str_comm, command, manager): - # command ends with command terminator (this list includes \0 end-of-string, - # but this is not available to check in Python so we use endswith) - post_command_chars = [ "" ] if command.endswith(str_comm) else [ " ", "\t", ";"] - for pcc in post_command_chars: - check_comm = str_comm + pcc - if command.find( check_comm) == 0: - return( True) - if command.find( "&&" + check_comm) > -1 or command.find( "&& " + check_comm) > -1: - return( True) - if command.find( "cross-env NODE_ENV=test " + check_comm) > -1 or command.find( "cross-env NODE_ENV=production " + check_comm) > -1: - return( True) - if command.find( "cross-env CI=true " + check_comm) > -1: - return( True) - if command.find( "opener " + check_comm) > -1: - return( True) - if command.find( "gulp " + check_comm) > -1: - return( True) - if command.find( "nyc " + check_comm) > -1: - return( True) - return( False) - -def test_cond_count( test_output, regex_fct, condition, offset): - ptrn = re.compile( regex_fct(condition), re.MULTILINE) - results = ptrn.findall( test_output) - if offset is None: - return( len( results)) # just count the number of hits, each hit is an individual test (example: tap "ok" vs "not ok") - num_cond = 0 - for r in results: - temp = r.split() - try: - num_cond += int( temp[temp.index(condition) + offset]) - except ValueError: - num_cond += 0 - return( num_cond) - - -class TestInfo: - OUTPUT_CHECKERS = { - "mocha": - { - "output_regex_fct" : lambda condition: r'.*\d+ ' + condition + '.*', - "passing": 
("passing", -1), - "failing": ("failing", -1) - }, - "jest": - { - "output_regex_fct" : lambda condition: r'Tests:.*\d+ ' + condition, - "passing": ("passed", -1), - "failing": ("failed", -1) - }, - "tap": { - "output_regex_fct" : lambda condition: r'# ' + condition + '.*\d+', - "passing": ("pass", 1), - "failing": ("fail", 1) - }, - "tap_raw": { - "output_regex_fct" : lambda condition: r'' + condition + ' \d+ - (?!.*time=).*$', - "passing": (r'^.*(?!not )ok', None), # this "passing" is a regex: count "ok" but not "not ok" - "failing": (r'^.*not ok', None) - }, - "ava": - { - "output_regex_fct": lambda condition: r'.*\d+ tests? ' + condition, - "passing": ("passed", -2), - "failing": ("failed", -2) - }, - "ava_2": - { - "output_regex_fct" : lambda condition: r'.*\d+ ' + condition + '$', - "passing": ("passed", -1), - "failing": ("failed", -1) - }, - } - # extra args, their position in the arg list, and any post-processing required - VERBOSE_TESTS_EXTRA_ARGS = { - "jest": { - "args": " --verbose --json --outputFile=$PLACEHOLDER_OUTPUT_FILE_NAME$", - "position": -1, - "post_processing": None - }, - "mocha": { - "args": " -- --reporter xunit --reporter-option output=$PLACEHOLDER_OUTPUT_FILE_NAME$", - "position": -1, - "post_processing": None #TODO change this to the xml-to-json parser - } - } - TRACKED_INFRAS = { - "mocha": { - "name": "mocha", - "output_checkers": [ "mocha", "tap" ], - "verbose_tests_extra_args": [ "mocha" ] - }, - "jest": { - "name": "jest", - "output_checkers": [ "jest" ], - "verbose_tests_extra_args": [ "jest" ] - }, - "jasmine": { - "name": "jasmine", - "output_checkers": [ "mocha" ] - }, - "tap": { - "name": "tap", - "output_checkers": [ "tap", "tap_raw" ] - }, - "lab": { - "name": "lab", - "output_checkers": [] - }, - "ava": { - "name": "ava", - "output_checkers": [ "ava", "ava_2" ] - }, - "gulp": { - "name": "gulp", - "output_checkers": [ "mocha" ] - }, - } - TRACKED_COVERAGE = { - "istanbul": "istanbul -- coverage testing", - "nyc": "nyc -- coverage testing", - "coveralls": "coveralls -- coverage testing", - "c8": "c8 -- coverage testing" - } - TRACKED_LINTERS = { - "eslint": "eslint -- linter", - "tslint": "tslint -- linter", - "xx": "xx -- linter", - "standard": "standard -- linter", - "prettier": "prettier -- linter", - "gulp lint": "gulp lint -- linter" - } - - TRACKED_RUNNERS = [ "node", "babel-node", "grunt" ] - - def __init__(self, success, error_stream, output_stream, manager, VERBOSE_MODE): - self.success = success - self.error_stream = error_stream - self.output_stream = output_stream - self.manager = manager - # start all other fields as None - self.test_infras = None - self.test_covs = None - self.test_lints = None - self.nested_test_commands = None - self.num_passing = None - self.num_failing = None - self.timed_out = False - self.VERBOSE_MODE = VERBOSE_MODE - self.test_verbosity_output = None - - def set_test_command( self, test_command): - self.test_command = test_command - - def set_test_verbosity_output( self, verbose_output): - self.test_verbosity_output = verbose_output - - def compute_test_infras( self): - self.test_infras = [] - self.test_covs = [] - self.test_lints = [] - self.nested_test_commands = [] - if self.test_command: - self.test_infras += [ ti for ti in TestInfo.TRACKED_INFRAS if called_in_command(ti, self.test_command, self.manager) ] - self.test_infras += [ ri for ri in TestInfo.TRACKED_RUNNERS if called_in_command(ri, self.test_command, self.manager) ] - self.test_covs += [ TestInfo.TRACKED_COVERAGE[ti] for ti in 
TestInfo.TRACKED_COVERAGE if called_in_command(ti, self.test_command, self.manager) ] - self.test_lints += [ TestInfo.TRACKED_LINTERS[ti] for ti in TestInfo.TRACKED_LINTERS if called_in_command(ti, self.test_command, self.manager) ] - self.test_infras = list(set(self.test_infras)) - self.test_covs = list(set(self.test_covs)) - self.test_lints = list(set(self.test_lints)) - # TODO: maybe we can also figure it out from the output stream - - def compute_nested_test_commands( self, test_commands): - # one might think that we should only check the package's own manager - # however, it's common to mix and match (esp. to run commands with "npm run" even if the package manager is yarn) - self.nested_test_commands += [ tc for tc in test_commands if called_in_command( "npm run " + tc, self.test_command, self.manager) ] - self.nested_test_commands += [ tc for tc in test_commands if called_in_command( "yarn " + tc, self.test_command, self.manager) ] - - def compute_test_stats( self): - if not self.test_infras or self.test_infras == []: - return - test_output = self.output_stream.decode('utf-8') + self.error_stream.decode('utf-8') - ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') - test_output = ansi_escape.sub('', test_output) - self.num_passing = 0 - self.num_failing = 0 - self.timed_out = (self.error_stream.decode('utf-8') == "TIMEOUT ERROR") - for infra in self.test_infras: - output_checker_names = TestInfo.TRACKED_INFRAS.get(infra, {}).get("output_checkers", []) - if infra in TestInfo.TRACKED_RUNNERS and output_checker_names == []: - output_checker_names = self.OUTPUT_CHECKERS.keys() # all the checkers - for checker_name in output_checker_names: - div_factor = 2 if checker_name == "ava_2" else 1 - checker = self.OUTPUT_CHECKERS[ checker_name] - self.num_passing += int(test_cond_count( test_output, checker["output_regex_fct"], checker["passing"][0], checker["passing"][1]) / div_factor) - self.num_failing += int(test_cond_count( test_output, checker["output_regex_fct"], checker["failing"][0], checker["failing"][1]) / div_factor) - - def get_json_rep( self): - json_rep = {} - if self.VERBOSE_MODE: - json_rep["test_debug"] = "" - if not self.success: - json_rep["ERROR"] = True - if self.VERBOSE_MODE: - json_rep["test_debug"] += "\nError output: " + self.error_stream.decode('utf-8') - if self.num_passing is not None and self.num_failing is not None: - json_rep["num_passing"] = self.num_passing - json_rep["num_failing"] = self.num_failing - if self.VERBOSE_MODE: - json_rep["test_debug"] += "\nOutput stream: " + self.output_stream.decode('utf-8') - if self.test_infras and self.test_infras != []: - json_rep["test_infras"] = [TestInfo.TRACKED_INFRAS.get(infra, {}).get("name", "Custom Testing: " + infra) for infra in self.test_infras] - if self.test_covs and self.test_covs != []: - json_rep["test_coverage_tools"] = self.test_covs - if self.test_lints and self.test_lints != []: - json_rep["test_linters"] = self.test_lints - if self.nested_test_commands and self.nested_test_commands != []: - json_rep["nested_test_commands"] = self.nested_test_commands - if "test_infras" not in json_rep: - json_rep["RUNS_NEW_USER_TESTS"] = False - if self.test_verbosity_output: - json_rep["test_verbosity_output"] = self.test_verbosity_output - json_rep["timed_out"] = self.timed_out - return( json_rep) - - def __str__(self): - to_ret = "" - if not self.success: - to_ret += "ERROR" - if self.VERBOSE_MODE: - to_ret += "\nError output: " + self.error_stream.decode('utf-8') - else: - to_ret += "SUCCESS" - if 
self.num_passing is not None and self.num_failing is not None: - to_ret += "\nPassing tests: " + str(self.num_passing) + "\nFailing tests: " + str(self.num_failing) - if self.VERBOSE_MODE: - to_ret += "\nOutput stream: " + self.output_stream.decode('utf-8') - if self.test_infras and self.test_infras != []: - to_ret += "\nTest infras: " + str([TestInfo.TRACKED_INFRAS[infra]["name"] for infra in self.test_infras]) - if self.test_covs and self.test_covs != []: - to_ret += "\nCoverage testing: " + str(self.test_covs) - if self.test_lints and self.test_lints != []: - to_ret += "\nLinter: " + str(self.test_lints) - if self.nested_test_commands and self.nested_test_commands != []: - to_ret += "\nNested test commands: " + str(self.nested_test_commands) - to_ret += "\nTimed out: " + str(self.timed_out) - return( to_ret) + return(test_infra.join(instrumented_test_command), output_files) def on_diagnose_exit( json_out, crawler, cur_dir, repo_name): + # if we still have the temp package.json, restore it + if os.path.isfile("TEMP_package.json_TEMP"): + run_command( "mv TEMP_package.json_TEMP package.json") # move back to the original working directory if repo_name != "": os.chdir( cur_dir) @@ -562,7 +331,7 @@ def diagnose_package( repo_link, crawler, commit_SHA=None): if not crawler.DO_INSTALL: print("Can't run tests without installing (do_install: false) -- skipping") else: - (retcode, test_json_summary) = run_tests( manager, pkg_json, crawler) + (retcode, test_json_summary) = run_tests( manager, pkg_json, crawler, cur_dir) json_out["testing"] = test_json_summary else: json_out["testing"] = { "track_tests": False } From 49bb029a281411c2ff2daccc3d698e36c2f71bc0 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Fri, 5 May 2023 15:37:28 -0400 Subject: [PATCH 03/39] update: add option for different output file for test postproc (defaults to overwriting original) --- src/output_parsing/test_output_proc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/output_parsing/test_output_proc.py b/src/output_parsing/test_output_proc.py index c2ff451..2c1022b 100644 --- a/src/output_parsing/test_output_proc.py +++ b/src/output_parsing/test_output_proc.py @@ -5,9 +5,11 @@ # used to convert the xunit reporter output from mocha into json # note: this overwrites the existing file # code from https://www.geeksforgeeks.org/python-xml-to-json/ -def xml_to_json(output_file): +def xml_to_json(output_file, new_output_file=None): + if new_output_file is None: + new_output_file = output_file with open(output_file) as xml_file: data_dict = xmltodict.parse(xml_file.read()) json_data = json.dumps(data_dict) - with open(output_file, 'w') as json_file: + with open(new_output_file, 'w') as json_file: json_file.write(json_data) \ No newline at end of file From 89e22ef8b800f99290a73f07bb6d71495b9252e6 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Fri, 5 May 2023 15:41:06 -0400 Subject: [PATCH 04/39] output file for verbosity tests now has repo name --- src/test_JS_repo_lib.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index a8797d0..87bf2f1 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -101,7 +101,7 @@ def run_build( manager, pkg_json, crawler): build_script_list += [b] return( retcode, build_script_list, build_debug) -def run_tests( manager, pkg_json, crawler, cur_dir="."): +def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."): test_json_summary = {} retcode = 0 if 
len(crawler.TRACKED_TEST_COMMANDS) == 0: @@ -126,6 +126,7 @@ def run_tests( manager, pkg_json, crawler, cur_dir="."): test_verbosity_output = {} for verbosity_index, test_infra in enumerate(test_info.test_infras): verbose_test_json = crawler.output_dir + "/" \ + + "repo_" + repo_name + "_" \ + "test_" + str(test_index) + "_"\ + "infra_" + str(verbosity_index) + "_" \ + crawler.TEST_VERBOSE_OUTPUT_JSON @@ -331,7 +332,7 @@ def diagnose_package( repo_link, crawler, commit_SHA=None): if not crawler.DO_INSTALL: print("Can't run tests without installing (do_install: false) -- skipping") else: - (retcode, test_json_summary) = run_tests( manager, pkg_json, crawler, cur_dir) + (retcode, test_json_summary) = run_tests( manager, pkg_json, crawler, repo_name, cur_dir) json_out["testing"] = test_json_summary else: json_out["testing"] = { "track_tests": False } From 8991afd858a89c127becfa1b204b00e9be18d6fd Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Fri, 5 May 2023 20:29:21 -0400 Subject: [PATCH 05/39] add more info on the output files to the json output, for easy post-processing --- src/test_JS_repo_lib.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index 87bf2f1..a3f9c4f 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -146,14 +146,15 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."): json.dump( pkg_json, f) print("Running verbosity: " + manager + infra_verbosity_command) verb_error, verb_output, verb_retcode = run_command( manager + verbosity_script_name, crawler.TEST_TIMEOUT) + # if there's post-processing to be done if not infra_verbosity_post_proc is None: - for out_file in out_files: - infra_verbosity_post_proc(out_file) + for out_file_obj in out_files: + infra_verbosity_post_proc(out_file_obj["output_file"]) verbosity_index += 1 # get the output test_verbosity_infra = {} test_verbosity_infra["command"] = infra_verbosity_command - test_verbosity_infra["output_files"] = verbose_test_json + test_verbosity_infra["output_files"] = out_files if crawler.VERBOSE_MODE: test_verbosity_infra["test_debug"] = "\nError output: " + verb_error.decode('utf-8') \ + "\nOutput stream: " + verb_output.decode('utf-8') @@ -172,6 +173,7 @@ def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity new_infra_verbosity_args = "" output_files = [] for i, sub in enumerate(infra_verbosity_args.split("$PLACEHOLDER_OUTPUT_FILE_NAME$")): + out_file_object = { "test_script": test_script, "test_infra": test_infra } # not the file name if sub != "": new_infra_verbosity_args += sub @@ -180,11 +182,12 @@ def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity if path_index == -1: output_file = "out_" + str(num_files) + "_" + verbose_test_json new_infra_verbosity_args += output_file - output_files += [ output_file ] + out_file_object["output_file"] = output_file else: output_file = verbose_test_json[:path_index] + "/out_" + str(num_files) + "_" + verbose_test_json[path_index + 1:] new_infra_verbosity_args += output_file - output_files += [ output_file ] + out_file_object["output_file"] = output_file + output_files += [ out_file_object ] num_files += 1 infra_verbosity_args = new_infra_verbosity_args # split into sub-commands From 13b1be81b9362642b0e1b4277e51b6f0faeed49f Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Wed, 31 May 2023 15:41:41 -0400 Subject: [PATCH 06/39] script to get the required versions of node and npm for a project; to allow docker 
build --- get_rel_project_reqs.js | 166 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 get_rel_project_reqs.js diff --git a/get_rel_project_reqs.js b/get_rel_project_reqs.js new file mode 100644 index 0000000..9391e69 --- /dev/null +++ b/get_rel_project_reqs.js @@ -0,0 +1,166 @@ +// get the build requirements for the project, if they're present +// these are: +// - npm version +// - node version +// - OS +// +// some notes: +// - devs can specify a range of engines (npm, node) that their project works on. +// If a range is specified we just get one version in the valid range +// - if the project specifically doesn't work on linux, then we're bailing -- this +// only makes linux docker containers + +// also this is in JS instead of python bc the python semver library is garbage + +const semver = require('semver'); +const subproc = require('child_process'); +const fs = require('fs').promises; + +// can specify OS version: https://docs.npmjs.com/cli/v9/configuring-npm/package-json#os +// can specify node/npm version: https://docs.npmjs.com/cli/v9/configuring-npm/package-json#engines +async function get_reqs_from_pkg_json(pkg_json) { + let reqs = {} + + let engines = pkg_json["engines"] || {}; + // if not specified, "*" any version + let npm_req = engines["npm"] || "*"; + let node_req = engines["node"] || "*"; + + // if a range is specified, get a version in the valid range + let { node_version, npm_version } = await get_versions_in_range(node_req, npm_req); + reqs["node"] = node_version; + reqs["npm"] = npm_version; + + + oss = engines["os"] || []; + // explicit versions and linux is not listed + if (oss.length > 0 && oss.indexOf("linux") == -1) + reqs["linux"] = false + // explicitly excluding linux :'( + else if (oss.indexOf("!linux") != -1) + reqs["linux"] = false + else + reqs["linux"] = true + + return reqs +} + +const BANNED_VERSION_SUBSTRINGS = ["beta", "alpha", "pre"] + +// using semver, let's get a version that matches our specs +async function get_versions_in_range(node_version, npm_version) { + let node_npm_version_pairs = []; + try { + node_npm_version_pairs = await get_node_npm_version_pairs(); + } catch(e) { + console.log("Error getting npm/node pairs -- proceeding blind: " + e); + } + + // normal route: we have the data. 
+ // now just need to find a pair that matches + if (node_npm_version_pairs.length > 0) { + for (const pair of node_npm_version_pairs) { + if (is_banned(pair["npm"]) || is_banned(pair["node"])) { + continue; + } + if (semver.satisfies(pair["npm"], npm_version) && semver.satisfies(pair["node"], node_version)) { + return { "node_version": pair["node"], "npm_version": pair["npm"] } + } + } + } + + // if we get here we didn't return in the if above + // we don't have the data: get the list of all node versions from nvm: `nvm ls-remote` + // and all npm versions from npm itself: `npm view npm versions` + // NOTE: node version takes precedence over the npm version bc it's more commonly specified, + // and because it's more important + if (node_version !== "*" ) { + // then we care about the node version + subproc.exec('nvm ls-remote', { shell: '/bin/bash'}, (err, stdout, stderr) => { + let versions = stdout.split("\n").map(v => v.trim().split(" ")[0]); // strip formatting and any space-delimited labels (LTS, etc) + for (vers of versions) { + if (is_banned(vers)) { + continue; + } + if (semver.satisfies(vers, node_version)) { + return { "node_version": vers, "npm_version": "*" } + } + } + }) + } + + // if we get here, then we didn't have the version pair data, and we also didn't care about the node version + // so let's get an npm version + if (npm_version !== "*") { + // then we care about the npm version + subproc.exec('npm view npm versions --json', { shell: '/bin/bash'}, (err, stdout, stderr) => { + let versions = JSON.parse(stdout); + for (vers of versions) { + if (is_banned(vers)) { + continue; + } + if (semver.satisfies(vers, npm_version)) { + return { "node_version": "*", "npm_version": vers } + } + } + }) + } + + // no matching pairs: we're flying blind folks + return { "node_version": "*", "npm_version": "*" } +} + +// versions of node and the versions of npm they are bundled with +// see: https://stackoverflow.com/questions/51238643/which-versions-of-npm-came-with-which-versions-of-node +// read this file in -- from it we can get all the valid versions of npm and node +// for fetch usage: https://stackoverflow.com/questions/2499567/how-to-make-a-json-call-to-an-url/2499647#2499647 +const NODE_NPM_VERSIONS_URL = 'https://nodejs.org/dist/index.json'; +async function get_node_npm_version_pairs() { + let resp = await fetch(NODE_NPM_VERSIONS_URL); + // look for errors: + if (!resp.ok) { + throw new Error("Uh oh: error reaching npm/node version pairs"); + } + let all_data = await resp.json(); + let node_npm_pairs = []; + for (const vers_data of all_data) { + let node_version = vers_data["version"]; + let npm_version = vers_data["npm"]; + // if both were in the version data + if (node_version && npm_version) + node_npm_pairs.push({node: node_version, npm: npm_version}) + } + return node_npm_pairs; +} + +// check if a version is banned +function is_banned(vers) { + for (const banned of BANNED_VERSION_SUBSTRINGS) { + if (vers.indexOf(banned) > -1) { + return true; + } + } + return false; +} + +async function main(proj_dir) { + let pkg_json = {}; + try { + pkg_json = JSON.parse(await fs.readFile(proj_dir + "/package.json", 'utf8')); + } catch(e) { + console.error(e);//"Error, bailing out: " + proj_dir + " invalid directory, could not load package.json"); + process.exit(); + } + // get the node and npm versions + let reqs = await get_reqs_from_pkg_json(pkg_json); + console.log(reqs); +} + +if (process.argv.length != 3) { + console.error("Usage: node get_rel_project_req.js path_to_project_dir") + 
process.exit() +} + +let proj_dir = process.argv[2]; +console.log(proj_dir); +main(proj_dir); From 7f4f935858ed02cb95e91bf09a167380e929959c Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Wed, 31 May 2023 22:12:51 -0400 Subject: [PATCH 07/39] support for custom docker based on node/npm config of a project; and, installs deps of a project --- Dockerfile | 12 +++++--- build.sh | 62 ++++++++++++++++++++++++++++++++++++++--- get_rel_project_reqs.js | 15 ++++++---- 3 files changed, 76 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 983305e..8e080cc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,11 @@ FROM ubuntu:latest ARG DEBIAN_FRONTEND=noninteractive +# build arg: setting up for a specific repo? +ARG REPO_LINK + RUN apt-get update \ - && apt-get -y install --no-install-recommends python3 git unzip vim curl gnupg nodejs npm xz-utils parallel + && apt-get -y install --no-install-recommends python3 git unzip vim curl gnupg xz-utils parallel RUN apt update RUN apt -y install python3-pip @@ -10,10 +13,11 @@ RUN pip3 install bs4 scrapy RUN mkdir -p /home/npm-filter/results -COPY . /home/npm-filter +COPY src /home/npm-filter/ +COPY *.sh /home/npm-filter/ +COPY get_rel_project_reqs.js /home/npm-filter WORKDIR /home/npm-filter RUN git config --global http.sslVerify "false" -RUN npm config set strict-ssl false -RUN ./build.sh +RUN ./build.sh $REPO_LINK diff --git a/build.sh b/build.sh index 58b83af..e99329a 100755 --- a/build.sh +++ b/build.sh @@ -1,7 +1,15 @@ #!/bin/bash +# can be building for one specific repo +repo_link=$1 + +# install nvm, so we can then use specific versions of node and npm +curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.37.2/install.sh | /usr/bin/bash +export NVM_DIR="$HOME/.nvm" +[ -s "$NVM_DIR/nvm.sh" ] && \. 
"$NVM_DIR/nvm.sh" # this loads nvm + + rm build.sh -rm Dockerfile rm runDocker.sh if [ -d local_mount ]; then rm -r local_mount @@ -16,7 +24,6 @@ unzip codeql-linux64.zip git clone https://github.com/github/codeql.git --branch v1.26.0 codeql-repo apt -y install curl dirmngr apt-transport-https lsb-release ca-certificates gnupg build-essential -curl -sL https://deb.nodesource.com/setup_12.x | bash - apt-get update curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - @@ -28,11 +35,58 @@ source $HOME/.cargo/env pip3 install --upgrade setuptools setuptools_rust wheel -npm install -g jest mocha tap ava nyc yarn next - echo "export PATH=/home/codeql_home/codeql:$PATH" >> /root/.bashrc echo "alias python=python3" >> /root/.bashrc echo "alias ipython=ipython3" >> /root/.bashrc echo "alias vi=vim" >> /root/.bashrc cd /home/npm-filter + +repo_dir_name=SPEC_REPO_DIR +node_version='node' # default to just the latest version +npm_version='*' +# if there's a repo_link specified +if [ -n $repo_link ]; then + git clone $repo_link $repo_dir_name + # this will make the node_version and npm_version variables + set_req_vars=`node get_rel_project_reqs.js $repo_dir_name 2>/dev/null` + `$set_req_vars` + + if [[ $node_version == "*" ]]; then + node_version=node + fi +fi + +# set up node and npm, and also add this node/npm config to the bashrc +# so that it runs on docker startup too + +nvm install $node_version +nvm use $node_version +echo "nvm use $node_version" >> /root/.bashrc + +if [[ $npm_version == "*" ]]; then + nvm install-latest-npm + echo "nvm install-latest-npm" >> /root/.bashrc +else + npm install -g npm@${npm_version} + echo "npm install -g npm@${npm_version}" >> /root/.bashrc +fi + + +# permissive +npm config set strict-ssl false + +# install the dependencies: but use the current version of npm +npm install -g jest mocha tap ava nyc yarn next semver + +if [ -n $repo_link ]; then + cd $repo_dir_name + # setup the project + if [ -f "yarn.lock" ]; then + yarn > /dev/null + else + npm install > /dev/null + fi + cd .. 
+fi + diff --git a/get_rel_project_reqs.js b/get_rel_project_reqs.js index 9391e69..314b233 100644 --- a/get_rel_project_reqs.js +++ b/get_rel_project_reqs.js @@ -28,8 +28,8 @@ async function get_reqs_from_pkg_json(pkg_json) { // if a range is specified, get a version in the valid range let { node_version, npm_version } = await get_versions_in_range(node_req, npm_req); - reqs["node"] = node_version; - reqs["npm"] = npm_version; + reqs["node_version"] = node_version; + reqs["npm_version"] = npm_version; oss = engines["os"] || []; @@ -142,18 +142,24 @@ function is_banned(vers) { } return false; } + +function print_as_bash_vars(reqs) { + for ( key in reqs) { + console.log(key + "=" + reqs[key]); + } +} async function main(proj_dir) { let pkg_json = {}; try { pkg_json = JSON.parse(await fs.readFile(proj_dir + "/package.json", 'utf8')); } catch(e) { - console.error(e);//"Error, bailing out: " + proj_dir + " invalid directory, could not load package.json"); + console.error("Error, bailing out: " + proj_dir + " invalid directory, could not load package.json"); process.exit(); } // get the node and npm versions let reqs = await get_reqs_from_pkg_json(pkg_json); - console.log(reqs); + print_as_bash_vars(reqs); } if (process.argv.length != 3) { @@ -162,5 +168,4 @@ if (process.argv.length != 3) { } let proj_dir = process.argv[2]; -console.log(proj_dir); main(proj_dir); From de485ea114988b90632ed1577d96f5e8d30d3c20 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Wed, 14 Jun 2023 00:11:38 -0400 Subject: [PATCH 08/39] add build to docker, and fix a few dumb bugs --- Dockerfile | 7 +++++-- build.sh | 46 +++++++++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8e080cc..3ad85f7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,11 +9,14 @@ RUN apt-get update \ RUN apt update RUN apt -y install python3-pip -RUN pip3 install bs4 scrapy +RUN pip3 install bs4 scrapy xmltodict RUN mkdir -p /home/npm-filter/results +RUN mkdir /home/npm-filter/src +RUN mkdir /home/npm-filter/configs -COPY src /home/npm-filter/ +COPY src /home/npm-filter/src +COPY configs /home/npm-filter/configs COPY *.sh /home/npm-filter/ COPY get_rel_project_reqs.js /home/npm-filter diff --git a/build.sh b/build.sh index e99329a..559869d 100755 --- a/build.sh +++ b/build.sh @@ -17,11 +17,11 @@ fi mkdir -p /home/codeql_home -cd /home/codeql_home -curl -L -o codeql-linux64.zip https://github.com/github/codeql-cli-binaries/releases/download/v2.3.4/codeql-linux64.zip -unzip codeql-linux64.zip -# clone stable version -git clone https://github.com/github/codeql.git --branch v1.26.0 codeql-repo +# cd /home/codeql_home +# curl -L -o codeql-linux64.zip https://github.com/github/codeql-cli-binaries/releases/download/v2.3.4/codeql-linux64.zip +# unzip codeql-linux64.zip +# # clone stable version +# git clone https://github.com/github/codeql.git --branch v1.26.0 codeql-repo apt -y install curl dirmngr apt-transport-https lsb-release ca-certificates gnupg build-essential apt-get update @@ -35,19 +35,25 @@ source $HOME/.cargo/env pip3 install --upgrade setuptools setuptools_rust wheel -echo "export PATH=/home/codeql_home/codeql:$PATH" >> /root/.bashrc echo "alias python=python3" >> /root/.bashrc echo "alias ipython=ipython3" >> /root/.bashrc echo "alias vi=vim" >> /root/.bashrc cd /home/npm-filter -repo_dir_name=SPEC_REPO_DIR +if [ -d TESTING_REPOS ]; then + rm -rf TESTING_REPOS +fi +mkdir TESTING_REPOS + node_version='node' # default to just the latest version npm_version='*' 
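+# 'node' is nvm's alias for the newest release; npm_version '*' falls through to nvm install-latest-npm below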
# if there's a repo_link specified if [ -n $repo_link ]; then - git clone $repo_link $repo_dir_name + cd TESTING_REPOS + git clone $repo_link + # repo dir will be the only thing in TESTING_REPOS + repo_dir_name=`ls` # this will make the node_version and npm_version variables set_req_vars=`node get_rel_project_reqs.js $repo_dir_name 2>/dev/null` `$set_req_vars` @@ -62,14 +68,21 @@ fi nvm install $node_version nvm use $node_version -echo "nvm use $node_version" >> /root/.bashrc + +NVM_DIR=/root/.nvm +NODE_VERSION=`node --version` + +echo "export NODE_VERSION=\"$NODE_VERSION\"" >> /root/.bashrc +echo "export NVM_DIR=$NVM_DIR" >> /root/.bashrc +echo "export NODE_PATH=$NVM_DIR/$NODE_VERSION/lib/node_modules" >> /root/.bashrc +echo "export PATH=$NVM_DIR/$NODE_VERSION/bin:/home/codeql_home/codeql:$PATH" >> /root/.bashrc + +# echo "nvm use $node_version" >> /root/.bashrc if [[ $npm_version == "*" ]]; then nvm install-latest-npm - echo "nvm install-latest-npm" >> /root/.bashrc else npm install -g npm@${npm_version} - echo "npm install -g npm@${npm_version}" >> /root/.bashrc fi @@ -80,13 +93,8 @@ npm config set strict-ssl false npm install -g jest mocha tap ava nyc yarn next semver if [ -n $repo_link ]; then - cd $repo_dir_name - # setup the project - if [ -f "yarn.lock" ]; then - yarn > /dev/null - else - npm install > /dev/null - fi - cd .. + cd /home/npm-filter + # do the install and build + python3 src/diagnose_github_repo.py --repo_link $repo_link --config configs/build_only_config.json --output_dir results fi From e7a0e0eef74a00b9095f10ca2e5536e8eb60e294 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Wed, 14 Jun 2023 01:41:16 -0400 Subject: [PATCH 09/39] output parsing to a csv --- Dockerfile | 2 +- src/TestInfo.py | 2 +- src/output_parsing/test_output_proc.py | 37 +++++++++++++++++++------- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3ad85f7..6f32d98 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ RUN apt-get update \ RUN apt update RUN apt -y install python3-pip -RUN pip3 install bs4 scrapy xmltodict +RUN pip3 install bs4 scrapy xmltodict pandas RUN mkdir -p /home/npm-filter/results RUN mkdir /home/npm-filter/src diff --git a/src/TestInfo.py b/src/TestInfo.py index 47a832f..6cf20af 100644 --- a/src/TestInfo.py +++ b/src/TestInfo.py @@ -49,7 +49,7 @@ class TestInfo: "mocha": { "args": " -- --reporter xunit --reporter-option output=$PLACEHOLDER_OUTPUT_FILE_NAME$", "position": -1, - "post_processing": TestOutputProc.xml_to_json + "post_processing": TestOutputProc.parse_mocha_json_to_csv } } TRACKED_INFRAS = { diff --git a/src/output_parsing/test_output_proc.py b/src/output_parsing/test_output_proc.py index 2c1022b..fcf98ec 100644 --- a/src/output_parsing/test_output_proc.py +++ b/src/output_parsing/test_output_proc.py @@ -1,15 +1,32 @@ import json import xmltodict +import pandas as pd -# convert an xml file to json -# used to convert the xunit reporter output from mocha into json -# note: this overwrites the existing file -# code from https://www.geeksforgeeks.org/python-xml-to-json/ -def xml_to_json(output_file, new_output_file=None): +def parse_mocha_json_to_csv(output_file, new_output_file=None): if new_output_file is None: - new_output_file = output_file + new_output_file = output_file.split(".")[0] + ".csv" # same name, csv file extension + # convert an xml file to json + # used to convert the xunit reporter output from mocha into json + # code from https://www.geeksforgeeks.org/python-xml-to-json/ with open(output_file) as 
xml_file: - data_dict = xmltodict.parse(xml_file.read()) - json_data = json.dumps(data_dict) - with open(new_output_file, 'w') as json_file: - json_file.write(json_data) \ No newline at end of file + data_dict = xmltodict.parse(xml_file.read()).get("testsuite", {}) + # the format: all the tests are in a top-level list called "testcase" + test_suites = [] + test_names = [] + test_runtimes = [] + test_stdout = [] + test_pass_fail = [] + for test in data_dict.get("testcase", []): + test_suites += [test.get("@classname", "").strip()] + test_names += [test.get("@name", "").strip()] + test_runtimes += [float(test.get("@time", "NaN"))] + if test.get("failure", False): + test_stdout += [test["failure"]] + test_pass_fail += ["Fail"] + else: + test_stdout += [""] + test_pass_fail += ["Pass"] + res_df = pd.DataFrame(list(zip(test_suites, test_names, test_runtimes, test_stdout, test_pass_fail))) + res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"] + with open(new_output_file, 'w') as csv_file: + csv_file.write(res_df.to_csv()) \ No newline at end of file From ecd7b9b10b3ee794d3eda80097b946eebf4d37ef Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Wed, 14 Jun 2023 15:56:48 +0000 Subject: [PATCH 10/39] Adds CI workflow for end-to-end testing of NPM Filter --- .github/workflows/barbosa23.yml | 285 +++++++++++++++++++++++++ .github/workflows/barbosa23flaky.yml | 36 ++++ .github/workflows/end2end.yml | 99 +++++++++ .github/workflows/smoketest.yml | 31 +++ output_proc_scripts/count_tests_run.py | 71 ++++++ runDocker.sh | 2 +- 6 files changed, 523 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/barbosa23.yml create mode 100644 .github/workflows/barbosa23flaky.yml create mode 100644 .github/workflows/end2end.yml create mode 100644 .github/workflows/smoketest.yml create mode 100644 output_proc_scripts/count_tests_run.py diff --git a/.github/workflows/barbosa23.yml b/.github/workflows/barbosa23.yml new file mode 100644 index 0000000..df065d9 --- /dev/null +++ b/.github/workflows/barbosa23.yml @@ -0,0 +1,285 @@ +name: Test on all of Barbosa23 JS (well, first 250ish) + +on: + workflow_dispatch + +env: + PROJECTS_JSON: | + { projects: [ + {"project_url": "adriantoine/enzyme-to-json", "project_sha": "7d90cdf5f1878815a46b3a53f4e1e1b63418b38f"}, + {"project_url": "agenda/agenda", "project_sha": "41a2b3793400073f564c37f7d2d0ec2d7e237bf2"}, + {"project_url": "airbnb/mocha-wrap", "project_sha": "e6bf4f6cff6d40425b2af323186cc1e69d05a270"}, + {"project_url": "allenmyao/canvas-graph-creator", "project_sha": "fadcd223a82ff665ee34685a1845d8087b997ee3"}, + {"project_url": "americanexpress/one-app-cli", "project_sha": "23a992558cc32cdc8a51c11e4fe80c2e2924aaf9"}, + {"project_url": "amireh/happypack", "project_sha": "e45926e9754f42098d882ff129269b15907ef00e"}, + {"project_url": "andreypopov/node-red-contrib-deconz", "project_sha": "7a7cdb10e4c9430a10dfe28fc9295abeaf107af5"}, + {"project_url": "andyholmes/gnome-shell-extension-gsconnect", "project_sha": "370493b76ab4ee7f30ba154b1e5b554a02413703"}, + {"project_url": "angular-translate/angular-translate", "project_sha": "1114534c064eddfb77fc4243b0deb61c37f5f41f"}, + {"project_url": "angular-ui/ui-sortable", "project_sha": "e763b5765eea87743c8463ddf045a53015193c20"}, + {"project_url": "apache/cordova-lib", "project_sha": "797286963eb526a2f5ad673291ff5733d6fb275b"}, + {"project_url": "apache/incubator-ponymail-foal", "project_sha": "f5addb5824e0c4d08474b22840ce556deade48f6"}, + {"project_url": "apiaryio/dredd", "project_sha": 
"5ab7b162afbbd8881cd716f27627dc2d05213eb7"}, + {"project_url": "apiaryio/dredd-transactions", "project_sha": "57477169b82a2980cb279c80a9caae5825754826"}, + {"project_url": "appium/appium", "project_sha": "2d124323c5973ef9d3e190f7401e67106886ffd4"}, + {"project_url": "appium/appium-desktop", "project_sha": "12a988aa08b9822e97056a09486c9bebb3aad8fe"}, + {"project_url": "atom-community/atom", "project_sha": "0f7c5c14eaad9643bdc16cf80579b457baa2dd8a"}, + {"project_url": "atom/atom", "project_sha": "1c3bd35ce238dc0491def9e1780d04748d8e18af"}, + {"project_url": "atom/find-and-replace", "project_sha": "7871ad213e2c09f99e003c8f97cd7d4b7f9f2d82"}, + {"project_url": "aurelia/cli", "project_sha": "82091bbeebcc4b08c9929e37a8cd91c5b5025791"}, + {"project_url": "Automattic/kue", "project_sha": "c5647b1a8890319169fa4ce2cf4ed4122c1c704a"}, + {"project_url": "avajs/ava", "project_sha": "568fe40c987dd6c593dfbcf4144d1d1627955d46"}, + {"project_url": "axa-ch-webhub-cloud/pattern-library", "project_sha": "04d7e0f227f85d7b39eb0a6bfa9911076027e924"}, + {"project_url": "axa-ch/patterns-library", "project_sha": "04d7e0f227f85d7b39eb0a6bfa9911076027e924"}, + {"project_url": "azachar/protractor-screenshoter-plugin", "project_sha": "989f8e0b52b986f7ddb07831b5b92dca6dceeb07"}, + {"project_url": "Azure/azure-iot-sdk-node", "project_sha": "450c672001eb96d99587eaeae5fe75ab0912e5d6"}, + {"project_url": "babel/babel-eslint", "project_sha": "b5b9a09edbac4350e4e51033a4608dd95dad1f67"}, + {"project_url": "badges/shields", "project_sha": "14892e3943a4677332618d8b9f584766f7940ee7"}, + {"project_url": "bbc/simorgh", "project_sha": "4c7e7d1ecc525dd62fb14bd98035a5e739c14290"}, + {"project_url": "bcgov/name-examination", "project_sha": "b55fc1127e0db98dc4fe780ad80831f4b1a2872e"}, + {"project_url": "bcoin-org/bcoin", "project_sha": "b0058696cc10c8f9b17190b31fd2cd907d85d047"}, + {"project_url": "beakerbrowser/beaker", "project_sha": "764bdefeeed9558dbf10aec77df262a896f57236"}, + {"project_url": "bee-queue/bee-queue", "project_sha": "f6d901308f3b6433f2531edc4a9ac354aab434e5"}, + {"project_url": "bkimminich/juice-shop", "project_sha": "b156c969d7bc8f24544f162f482c6285f58b4285"}, + {"project_url": "blocknative/assist", "project_sha": "3fb619e3994752eacbddba4078d2bf0cbc7e2c9c"}, + {"project_url": "bmazzarol/atom-plsql-linter", "project_sha": "02f6a1d48c4b5dbaa375dfb13d52703fc14b90a0"}, + {"project_url": "BookMyComics-Developers/BookMyComics", "project_sha": "1efe6adb3490d7f62e7b31e3d75ac15b3b981875"}, + {"project_url": "brave/brave-browser", "project_sha": "870d381ff8b08cb70d2b9fdea4b320d17bfe68f7"}, + {"project_url": "brion/ogv.js", "project_sha": "5ce404a6aa8f53b7cef220916b89e613ac58fd17"}, + {"project_url": "busterjs/buster", "project_sha": "5e20f3e23aeb7ea996be7a669e520c054b8f1035"}, + {"project_url": "CalebMorris/react-moment-proptypes", "project_sha": "89a61c17250ea7b71d55d2855f6739ae4071529a"}, + {"project_url": "CartoDB/cartodb", "project_sha": "9518ec6917e4091a56dc7b9d5fbf089bcb003271"}, + {"project_url": "cerner/terra-core", "project_sha": "15458289ff022f302144932e047a6669b6c461a5"}, + {"project_url": "cerner/terra-dev-site", "project_sha": "80a0e471548f553b7e58e30a2a0b6e8c0e7682fc"}, + {"project_url": "cloudfoundry-attic/cf-abacus", "project_sha": "68aad9e2d497335d3a2e0da736bb9f01fe54dfb3"}, + {"project_url": "cncf/landscapeapp", "project_sha": "62fa27892cd9e9095567c0c7e5d84fd514149cd9"}, + {"project_url": "codeceptjs/CodeceptJS", "project_sha": "3fb39ae1d4f9b00438b1398cefba0dc677260aeb"}, + {"project_url": 
"codetheweb/tuyapi", "project_sha": "905670c7cf7a8ad5756ea08eeca115178121423b"}, + {"project_url": "covidwatchorg/portal", "project_sha": "95e36eeb777fca76318b5b0680c82f43f502fee3"}, + {"project_url": "cryptee/web-client", "project_sha": "10f96daff7214a0e5afb71e56eed7256e59e17b0"}, + {"project_url": "ctrlplusb/react-universally", "project_sha": "83d533a9c780716d18f034f7fb52dbd3a1c4051b"}, + {"project_url": "cypress-io/cypress-example-recipes", "project_sha": "292325e6638bb4626861bc2f6df99d26ab8e7bff"}, + {"project_url": "DataDog/dd-trace-js", "project_sha": "71a5288dea5df31c6a492ce22ff8169552548d47"}, + {"project_url": "DeComX/pantheon", "project_sha": "deepforge-dev - deepforge"}, + {"project_url": "deepforge-dev/deepforge", "project_sha": "f9cb1ff12644f64c01ca4d71ca66e6e22506b084"}, + {"project_url": "dhis2/ui", "project_sha": "625c9c9391cdc6f625c927d20a39eef37f550a4a"}, + {"project_url": "digidem/mapeo-core", "project_sha": "fd782a55cebb5f54a45f2f042287218c849b5f35"}, + {"project_url": "dmitriz/min-karma", "project_sha": "8f1bcd25315d34a304d0d358166b9cb95a8a7871"}, + {"project_url": "Dogfalo/materialize", "project_sha": "824e78248b3de81e383445e76ffb04cc3264fe7d"}, + {"project_url": "domvm/domvm", "project_sha": "67de1a0cdf1879ad87926dafde0b8961f660c906"}, + {"project_url": "duckduckgo/tracker-radar-collector", "project_sha": "3e9f49e46e5051e9f3d26bcd3be054447af887e4"}, + {"project_url": "dukecon/dukecon_pwa", "project_sha": "127e8425ccff201a394448864407403c4e80d691"}, + {"project_url": "dustinspecker/generator-ng-poly", "project_sha": "53f0beec9ad9a33a9f6b47649ca34a4d6bae95f8"}, + {"project_url": "EFForg/privacybadger", "project_sha": "6f81b217e5717c46867cfec9e9b378da9354a84a"}, + {"project_url": "elastic/apm-agent-nodejs", "project_sha": "9f13472d69523109d69315c6bb212957e46809cb"}, + {"project_url": "elifesciences/elife-xpub", "project_sha": "bccea1e199bd213eef8ad03fca33d66727e34ccd"}, + {"project_url": "ember-app-scheduler/ember-app-scheduler", "project_sha": "fb0b4e0075cf8847664e5459cd59bf74a0a1d379"}, + {"project_url": "ember-batcher/ember-batcher", "project_sha": "231fb12ae51fde5e42704fa0e1daece8dd371532"}, + {"project_url": "ember-cli/ember-cli", "project_sha": "b851c0edcae99701335e3e90efe0c225951c4f0b"}, + {"project_url": "emberjs/ember.js", "project_sha": "3fa9068831b1e3cf8e594647a880adc0809861f3"}, + {"project_url": "eobrain/bajel", "project_sha": "ecbfe18a990e97f677e522a7240617df29d47cd6"}, + {"project_url": "eslint/eslint", "project_sha": "9e3d77cba65d0e38e07996e57961fb04f30d9303"}, + {"project_url": "ether/etherpad-lite", "project_sha": "7656c6b9f195a79bb07bd3b77b55de1393ab71f4"}, + {"project_url": "ethereum/web3.js", "project_sha": "f8a2533c2b09ce0a62f8414f2f6eed83ab78ca1f"}, + {"project_url": "ExpressGateway/express-gateway", "project_sha": "a294cac39c98d66f5750c424a24e0bb8ce351c1c"}, + {"project_url": "facebook/metro", "project_sha": "c6a94bc170cf95a6bb21b5638929ec3311a9a5b7"}, + {"project_url": "facebook/prepack", "project_sha": "5beedbe85bd5b9d2de1264abafbb3b76f8584297"}, + {"project_url": "facebook/react-native", "project_sha": "af99a6890b84713d002fbbd872f10fe2e6304861"}, + {"project_url": "fastify/fastify", "project_sha": "d1ad6c17ce9731f1bc28377318b010966ca339cd"}, + {"project_url": "flow-typed/flow-typed", "project_sha": "0e28de5e8a69def522d61f06ddffb624d465bceb"}, + {"project_url": "FlowCrypt/flowcrypt-browser", "project_sha": "92d0188c66572d2c14ef4ed24602b8a58445630c"}, + {"project_url": "FormidableLabs/nodejs-dashboard", "project_sha": 
"885fc96fec262b668da9282f57374966f7512b76"}, + {"project_url": "freeboardgames/FreeBoardGames.org", "project_sha": "b11dbaa3715d71605bced4c8f04a40a79bd7cfef"}, + {"project_url": "freedomjs/freedom", "project_sha": "9638e840aec9598c4d60383ed22444c525aefbf5"}, + {"project_url": "freedomjs/freedom-for-chrome", "project_sha": "0154d345e99ac781460a790a31772c4352cb41b6"}, + {"project_url": "freedomjs/freedom-for-firefox", "project_sha": "3a2922f378a9dbbb58f302b0216b56ec23cf17b3"}, + {"project_url": "getgauge/taiko", "project_sha": "532c62c69da79852ef3cf8abd2325d2fff903a15"}, + {"project_url": "GioBonvi/GoogleContactsEventsNotifier", "project_sha": "7e657a9e606f449fef22feae68d448d11083122b"}, + {"project_url": "google/shaka-player", "project_sha": "a543b80648f429524c522295b0f4f60039c2e0ea"}, + {"project_url": "googleads/videojs-ima", "project_sha": "11ecbefa37fbdbd6877fece63c38c11338b9e913"}, + {"project_url": "GoogleChrome/lighthouse", "project_sha": "b981a38e7b3becc512f0a7985b1d2a64320da235"}, + {"project_url": "GoogleChrome/workbox", "project_sha": "ee62b5b5b9ed321af457a2d962b2a34196a80263"}, + {"project_url": "hack4impact-uiuc/life-after-hate", "project_sha": "9cad8555b52ff6bd98c7d15fae456e2f8b7a2a8a"}, + {"project_url": "hapijs/lab", "project_sha": "aaaebb95108d3fdcb264a56e836c3459380844b1"}, + {"project_url": "hapijs/nes", "project_sha": "977750a158e0b0105c719e0e2d4bd354154bf0a8"}, + {"project_url": "hapijs/subtext", "project_sha": "ae0a2dd48ab8c6e2b8ebdebbc31baddb6b4c49b7"}, + {"project_url": "hapipal/hpal", "project_sha": "4661f17ac8bdb1d3915695b2f819ff2336730131"}, + {"project_url": "hapipal/schwifty", "project_sha": "088088572e7aac82b77a78d9c8ed05e7f1d5e957"}, + {"project_url": "Haufe-Lexware/wicked.haufe.io", "project_sha": "1efadeabae7b7ccb4b17473e9aa5d0af60796adb"}, + {"project_url": "hden/node-serf", "project_sha": "d176dede5c87e0285c383f7bbda3848584d6a2ad"}, + {"project_url": "HSLdevcom/transitlog-ui", "project_sha": "316a7843c2a8e6d66db7f4c9181f775f95f926ed"}, + {"project_url": "html-next/vertical-collection", "project_sha": "fd928512a33d44155a724ed65c5ba21cf7950d86"}, + {"project_url": "Human-Connection/Human-Connection", "project_sha": "72a8f3d7f567442ca5e191672abfb47ea1b825a6"}, + {"project_url": "hyperledger/cactus", "project_sha": "334612d251c56811a844b3308dc1561dcd6fc460"}, + {"project_url": "IBM-Cloud/gp-js-client", "project_sha": "8ac9e9b0ebee3264d446d68ff487ef995173bff0"}, + {"project_url": "ikydd/blackat", "project_sha": "26a8ba8dac8be027978b5fc046131936aadb76ec"}, + {"project_url": "IMA-WorldHealth/bhima", "project_sha": "f76ac0085b2566d249cdd6ab135950faf0e10da3"}, + {"project_url": "ing-bank/lion", "project_sha": "02e61285ddc83e4cb2ec7d2acc6d6a6620a94924"}, + {"project_url": "iodide-project/iodide", "project_sha": "f9dd78a725ce1a2aa96784a46b527b740605431b"}, + {"project_url": "ipfs-inactive/js-ipfs-http-client", "project_sha": "995abb41b83c8345b16cba67151e9ccb9cbea4de"}, + {"project_url": "israelroldan/grunt-ssh", "project_sha": "7175b5548291bb2105a33a45d772573cb888430d"}, + {"project_url": "istanbuljs/nyc", "project_sha": "ab7c53b2f340b458789a746dff2abd3e2e4790c3"}, + {"project_url": "jaggedsoft/node-binance-api", "project_sha": "950d773a5f2c3a61c6e29b53e3af57594921a239"}, + {"project_url": "jamesshore/quixote", "project_sha": "6b5c07b4d202d44e0ee6ecd99c22df4547558c17"}, + {"project_url": "jamhall/s3rver", "project_sha": "f834192dbb07da4548b48c95066bae50cfaac819"}, + {"project_url": "JeroenDeDauw/Maps", "project_sha": "f9bec919e77d671c4e96f9aa16d0452d17f700c7"}, + 
{"project_url": "jivid/akobi", "project_sha": "ccd8d4de55b2066db9c11f9f00ffeed36ea33673"}, + {"project_url": "jorgebucaran/hyperapp", "project_sha": "c3717e3ff78b6fa8663575d34d330d68929a0974"}, + {"project_url": "jrcasso/mean-demo", "project_sha": "31f3e21420fd5ef13cc7555a56e3106a31dd4a36"}, + {"project_url": "json-schema-faker/json-schema-faker", "project_sha": "9bbe0e895cc9ebce939d5f358385f151d72c739c"}, + {"project_url": "jwplayer/jwplayer", "project_sha": "30353cd1e1f3017a96ef2854ef758fb4f479cd7a"}, + {"project_url": "kaliber5/ember-bootstrap", "project_sha": "c92d1898b715da0ebd534a813a4ce592d1ed115c"}, + {"project_url": "kategengler/ember-cli-code-coverage", "project_sha": "46dc079ab518bddc325fb305790d58adf2c28aae"}, + {"project_url": "keystonejs/keystone", "project_sha": "67f0f2ce7fa58288cf06d198e4b1a5c51d265bcf"}, + {"project_url": "kgiszewski/Archetype", "project_sha": "2e0bce99b9f386aa24a56be02fca8cd7388b39bd"}, + {"project_url": "kiwicom/smart-faq", "project_sha": "2131be6290020a11dc6ad236eb82c5bde75945d8"}, + {"project_url": "Lambda-School-Labs/labs-spa-starter", "project_sha": "2d1bbf41db2a97574c62cc3d6745cc0b2e644ead"}, + {"project_url": "lekoder/consul-kv-object", "project_sha": "5cf3c44f416d28d11c567c9caab86b27e3e0f0a0"}, + {"project_url": "liferay/senna.js", "project_sha": "fd89ca02de0ad57e7697c5088f4e490f8d181958"}, + {"project_url": "linkeddata/dokieli", "project_sha": "52f9c3cc8519d45339996f2a926bae18c37bf5d8"}, + {"project_url": "LLK/scratch-vm", "project_sha": "e4bb21f1817a2b7bbca9be19da6eba529291ed0c"}, + {"project_url": "magda-io/magda", "project_sha": "754ec4cf2aff491549007cd82f676da4c3759061"}, + {"project_url": "magento/pwa-studio", "project_sha": "836aa40608465ccc28066d4fbdddee3a6a560b75"}, + {"project_url": "marcos8896/nutrition-care-node-api", "project_sha": "20b08a443d4d7714dc8ea137b3ffcce51f5524c0"}, + {"project_url": "marionettejs/backbone.marionette", "project_sha": "85936fc518dd7bb0934faf231123172e3eee0169"}, + {"project_url": "marklogic-community/marklogic-samplestack", "project_sha": "5449924fe9abd1712d3ef20ca2f25f2e291578e0"}, + {"project_url": "material-components/material-components-web", "project_sha": "a9ff9866f237fbeebe94e655ae578b68ce675a04"}, + {"project_url": "mbland/custom-links", "project_sha": "3e58bb2b4ea335451489d9b81226a414d7352c3f"}, + {"project_url": "mcollina/autocannon", "project_sha": "ba3a2124fa68be6f263e860001be419d71de39d9"}, + {"project_url": "meteor/meteor", "project_sha": "dc38e4325dcd88fb3c6d1be1639680c6ff6f5e80"}, + {"project_url": "microsoft/ChakraCore", "project_sha": "c3ead3f8a6e0bb8e32e043adc091c68cba5935e9"}, + {"project_url": "mikakaraila/node-red-contrib-opcua", "project_sha": "aec7272f4f7554a7473daf19136e6fa8c9dfc681"}, + {"project_url": "milieuinfo/webcomponent-vl-ui-wizard", "project_sha": "efecc0c4f3659ac1348ae456604534d42e6b90b7"}, + {"project_url": "mishoo/UglifyJS", "project_sha": "f0ca9cfbe65efc919149e7cd74cedd186d6413ee"}, + {"project_url": "mitodl/open-discussions", "project_sha": "462c242eab04f68552e80a6f416c18c4b0b57cb0"}, + {"project_url": "mocha-parallel/mocha-parallel-tests", "project_sha": "d1b2e88fa6bad71d0a5d7487809fcb4be4030b9d"}, + {"project_url": "mohsen1/yawn-yaml", "project_sha": "aab6ee95ead9da9b7f1b1bbfb7325b2e90d7d3f5"}, + {"project_url": "moorara/microservices-demo", "project_sha": "bc16c5eeb6091392e62d0c260d2acfe48aef4b06"}, + {"project_url": "mozilla/blok", "project_sha": "faac2281c48cd226b4fb8c4e22de588a02328c31"}, + {"project_url": "mui-org/material-ui", "project_sha": 
"6e8b99d133025c9e785a778a183fa81383998a42"}, + {"project_url": "n5ro/aframe-extras", "project_sha": "5c20172a159aba54e7b6f7f243a864f76905448e"}, + {"project_url": "nasa-gibs/worldview", "project_sha": "c4769a03394676dd4ec7126cc14a7c67dc7e4eaf"}, + {"project_url": "NativeScript/nativescript-cli", "project_sha": "eb918011d6f0be9a8ccb6b569628e3960fd4f8b9"}, + {"project_url": "nccgroup/tracy", "project_sha": "6ce4714a3b3b407503cecd8c9842132fe4dc37e4"}, + {"project_url": "neffo/earth-view-wallpaper-gnome-extension", "project_sha": "016c982dccd9e7b454b84e9f50b4accc1b4348d6"}, + {"project_url": "NetsBlox/NetsBlox", "project_sha": "419ca83482c562a0cfa5af1d2dd9907b7387f7ef"}, + {"project_url": "nightwatchjs/nightwatch", "project_sha": "4b09cb57c8a9fb29d6b6795e59c64b4942bddf67"}, + {"project_url": "noble/bleno", "project_sha": "72028bc995d55cb9dcf223f9b0ffce563d091212"}, + {"project_url": "nock/nock", "project_sha": "8a38f41a28b36fef50d5723daa94cf21a6490fc5"}, + {"project_url": "node-alarm-dot-com/homebridge-node-alarm-dot-com", "project_sha": "26516177a2324aa53b0cfbb8af52fb1354be78be"}, + {"project_url": "nodejs/citgm", "project_sha": "460c3a008f1c33bda2e136631d0162479419ed36"}, + {"project_url": "nodejs/node-chakracore", "project_sha": "770c8dcd1bc3e0fce2d4497b4eec3fe49d829d43"}, + {"project_url": "nodejs/undici", "project_sha": "c415fbbb59e2b898c5db6a681265cf3da865d02c"}, + {"project_url": "npm/cli", "project_sha": "29622c1349b38173924058a1fb0ede9edf8a5f6f"}, + {"project_url": "NSWSESMembers/availability-poc", "project_sha": "7ebc17b6005a3c1573e6c68bd5411b0657c98f71"}, + {"project_url": "nwjs-community/nw-builder", "project_sha": "a1d4fb5148255e2b6fa5bce4a2167c9be8cc71d6"}, + {"project_url": "observablehq/plot", "project_sha": "4d3cd1586e7412b95687157d12c792fde84a2229"}, + {"project_url": "ocadotechnology/rapid-router", "project_sha": "38adf70a3e76a05fa814a7d3c0e1c61e4ba125c2"}, + {"project_url": "ONSdigital/eq-author-app", "project_sha": "8bb1621cd4973281730a38378765b1718b08ca54"}, + {"project_url": "Ontotext-AD/graphdb.js", "project_sha": "d0880dabf966e82def44537a720bf620d6d29f5e"}, + {"project_url": "open-wc/open-wc", "project_sha": "57ddb3ccfff6b00468d3a7ebabbc15cfe966f7a9"}, + {"project_url": "OpenEnergyPlatform/oeplatform", "project_sha": "1ce978f8faade3effe4cf7d3eec7522e990df910"}, + {"project_url": "openseadragon/openseadragon", "project_sha": "ebab356c207e626b6622f88ffcb0cd28b918f85d"}, + {"project_url": "openstyles/stylus", "project_sha": "50a0a115d1c6587d221f3253feeb4cb88b6f5336"}, + {"project_url": "Opentrons/opentrons", "project_sha": "f8f7e699d512f59e1a2f4a9969428744e86a6a22"}, + {"project_url": "OpenZeppelin/openzeppelin-contracts", "project_sha": "604025400f9be5c32581bb6ab03a46bbc09c5562"}, + {"project_url": "OriginProtocol/origin", "project_sha": "57c55c023188e3a53cb9ee3dfafe0bf3210e0cf8"}, + {"project_url": "owncloud/contacts", "project_sha": "efb06fef530dbf1812cbb98d651ec87680de97a1"}, + {"project_url": "palantir/eclipse-typescript", "project_sha": "007579ba58d2979a5989caf04733a9d5dfcc56de"}, + {"project_url": "particle-iot/particle-cli", "project_sha": "07dfa4e7d928d9641be368881b2216c6fb017c6c"}, + {"project_url": "perfsonar/toolkit", "project_sha": "d4c8906acdf7d8be49cf37b59939748945e526d9"}, + {"project_url": "pingyhq/pingy-cli", "project_sha": "53721434b698f53ba195c4824ca8d1f87ea8b60c"}, + {"project_url": "poanetwork/tokenbridge", "project_sha": "961b12b9f3545830a04044e109762277efcea6ef"}, + {"project_url": "postmanlabs/newman", "project_sha": 
"89941554304362d0cfec2914d134f738348b27c5"}, + {"project_url": "postmanlabs/postman-runtime", "project_sha": "7855b3ae5858734bfb6f0c5985592d8b2957f4d1"}, + {"project_url": "pouchdb/pouchdb", "project_sha": "546c8bb696872f86816574d02d47131ace0b4d18"}, + {"project_url": "PowerlineApp/powerline-mobile", "project_sha": "2030817dc80a07f3cfc2129bd830ce33ab50373d"}, + {"project_url": "premasagar/sqwidget", "project_sha": "7edc6d21997bb18da7daa59068926a082028d6f0"}, + {"project_url": "PrismJS/prism", "project_sha": "59e5a3471377057de1f401ba38337aca27b80e03"}, + {"project_url": "probcomp/metaprob", "project_sha": "43c4bea80772ed8b2baa51cd5ac6c593a34a3a8b"}, + {"project_url": "ProjectMirador/mirador", "project_sha": "3c121dbe99bae4eab910cb2df00e93904bc123ea"}, + {"project_url": "Quicksaver/Tab-Groups", "project_sha": "29ea6517e73eb5d58b2f0b9fc2d65d589d910e8a"}, + {"project_url": "regl-project/regl", "project_sha": "3d90d57d473b5dee6680dc97897f4a9fba465501"}, + {"project_url": "reportportal/service-ui", "project_sha": "049abcb8fc70ee131625914e9da4a748e23d2230"}, + {"project_url": "restify/node-restify", "project_sha": "89e7ac81a4cc885d153df6f07d5cf35ed75fd4d0"}, + {"project_url": "rtfeldman/node-test-runner", "project_sha": "16cd4b9c8e5dab3ce297039f5d72d372bdd63de9"}, + {"project_url": "ruiquelhas/blaine", "project_sha": "a69cdad6e59ebb19493018eacb7b7602f2225ce1"}, + {"project_url": "ruiquelhas/copperfield", "project_sha": "fe5629ed8f5edea740ca4917dfac6a779e644b45"}, + {"project_url": "ruiquelhas/electron-recipes", "project_sha": "9bcfc2520ad383c1e5bebe9c427214cab1d0a0da"}, + {"project_url": "ruiquelhas/fischbacher", "project_sha": "35eb4dcf0225a8899e13a3ab63c3e878d9d434ca"}, + {"project_url": "ruiquelhas/henning", "project_sha": "ac75e0b1cebdbb123eccb05277bc5c663f8e6696"}, + {"project_url": "ruiquelhas/houdin", "project_sha": "4a700f66748b3a57a1c1ab6ee7bbe425ce20c526"}, + {"project_url": "ruiquelhas/lafayette", "project_sha": "038578c360b22ff846daa7b3e6e0aeb712b145b2"}, + {"project_url": "ruiquelhas/thurston", "project_sha": "071f9ee5265f64f47335b428a498df22895e549c"}, + {"project_url": "sampotts/plyr", "project_sha": "0c9759455cbfcce888c66925c3b457ce06cee31e"}, + {"project_url": "scalableminds/webknossos", "project_sha": "b91b15ff4180b2288c40ad9e3a86678258dcd5c9"}, + {"project_url": "scality/Arsenal", "project_sha": "96cbaeb821d8045cbe8eabd00092290e13e46784"}, + {"project_url": "SeleniumBuilder/se-builder", "project_sha": "8230ad58a526d3eb905d32a780daeaea1fb56a55"}, + {"project_url": "serverless/serverless", "project_sha": "17d64e6c94b88a5daf36f28a4fa192c231052cfb"}, + {"project_url": "SGrondin/bottleneck", "project_sha": "b83528333ba4d27cf70b81cc2be12e09d7ff692f"}, + {"project_url": "signalapp/Signal-Desktop", "project_sha": "bd14b74e638dce03928e08ffbe2a83a6c047406e"}, + {"project_url": "sindresorhus/npm-name", "project_sha": "7aef07b69ed35f584e0a8bf6cece96750becaf00"}, + {"project_url": "sindresorhus/serialize-error", "project_sha": "a212a8c3902fa1ff1fdef8f7625dd0cc6d6e89a1"}, + {"project_url": "skarfacegc/FlowTrack2", "project_sha": "990a2566f30b8dd84a61ea1ff6f58076016a7796"}, + {"project_url": "solid/node-solid-server", "project_sha": "bbb8d78df7e8908e20e3052ae6655722aa6fa6de"}, + {"project_url": "SolidarityEconomyAssociation/sea-map", "project_sha": "17fa76b9b4070354c31faae81ba0162b8f27bf1b"}, + {"project_url": "soscripted/sox", "project_sha": "4be396373c06bb8340d740089018e364729bec70"}, + {"project_url": "sourcecred/sourcecred", "project_sha": "3da222ebe44c110f265063cfa99316ed5c1fa0b3"}, + 
{"project_url": "spark-notebook/spark-notebook", "project_sha": "69174f3923d0564d2078c0e0c70125245157d5b5"}, + {"project_url": "stanford-oval/thingengine-core", "project_sha": "b69f7b0166d256428a08ba2dac3fc3ca8dddf611"}, + {"project_url": "stealjs/steal-tools", "project_sha": "05f60d58e3ee56dbb8428c83121fdb6ee2b1825c"}, + {"project_url": "stellar/js-stellar-sdk", "project_sha": "52947e81e487edf179a6003efb40a1425a4f7ff2"}, + {"project_url": "stimulusreflex/stimulus_reflex", "project_sha": "52aa993165a656eccbe2cefaca9f5388509d014d"}, + {"project_url": "streamr-dev/network", "project_sha": "4cdabba71db0a6c531c63368d1a78361fff01dce"}, + {"project_url": "strongloop/loopback", "project_sha": "13371fd2a138a6f39db77e5a455b3170e5d4a0f5"}, + {"project_url": "studentinsights/studentinsights", "project_sha": "4bb09f97eb9c0473a9ac6ee076171de12855e721"}, + {"project_url": "sumup-oss/circuit-ui", "project_sha": "00ceacbd82b6cd3a71592ea9d2da5b95892f965b"}, + {"project_url": "superscriptjs/superscript", "project_sha": "5e3e1b51654a54518dfada17c0cd9dc146c8e48a"}, + {"project_url": "sveltejs/kit", "project_sha": "c4476c6d106b41dd8e6badbbdd0128b78be49d5c"}, + {"project_url": "tarantool/graphql.0", "project_sha": "05f39946299cb2f35a97be326b992aace0205eaf"}, + {"project_url": "testem/testem", "project_sha": "42fe29451b187bd1cd1e546228fa1bfbe11084f3"}, + {"project_url": "thaliproject/jxcore", "project_sha": "d3ccd242a592416b6537dfea8ce539bd6208dd54"}, + {"project_url": "thejoshwolfe/snakefall", "project_sha": "62bdfe3718f86ef85fc8c11e600bf621fa2a586c"}, + {"project_url": "themgoncalves/react-loadable-ssr-addon", "project_sha": "2036a6f12e9048d8a6e3eb0a8097455fa0fe1ebc"}, + {"project_url": "TheScienceMuseum/collectionsonline", "project_sha": "ef486c650bce9f2dccf25b7188dbe986d4b63c3c"}, + {"project_url": "tmijs/tmi.js", "project_sha": "3904ae743a12b984aa1a175740e8b5bae08a03e4"}, + {"project_url": "transloadit/uppy", "project_sha": "f07697e7f45e471ca16bac8751fa7221d9445605"}, + {"project_url": "tristanHessell/mess-around", "project_sha": "19cdf7aa58eaf165a88ac7a3954fc7a33e5685bc"}, + {"project_url": "trufflesuite/truffle", "project_sha": "0f17cf9680ac0dc7aa6a314ad3b78ad569daa896"}, + {"project_url": "TryGhost/Ghost", "project_sha": "4da658e72ad42cf251e4fb100ca651a7d4dca79e"}, + {"project_url": "tubbo/openrct2-benchwarmer", "project_sha": "504d75bfaf1b158dbe23e4bbfb926502189a0ff6"}, + {"project_url": "tulios/kafkajs", "project_sha": "ff3b1117f316d527ae170b550bc0f772614338e9"}, + {"project_url": "TypeStrong/ts-loader", "project_sha": "cf5326d9b5f1b804ff8d817f88fb127bc45ad9d1"}, + {"project_url": "uber/baseweb", "project_sha": "65c791a6b5ac50722f34e2a7b1282b08c539f58a"}, + {"project_url": "usdigitalresponse/neighbor-express", "project_sha": "130d9edd9ac09f2a8aa947b0d21f054d4dfc0462"}, + {"project_url": "vega/vega", "project_sha": "b45cf431cd6c0d0c0e1567f087f9b3b55bc236fa"}, + {"project_url": "video-dev/hls.js", "project_sha": "59d421479b5002993a5f3b36d4505adff3209fb5"}, + {"project_url": "visgl/luma.gl", "project_sha": "044c0ef5f767cd56974e30475a30dd3f24305983"}, + {"project_url": "w3c/aria-practices", "project_sha": "4adb78ea96b22db559577aa6ed64c9059596ab4a"}, + {"project_url": "waiterio/api", "project_sha": "9948b542f5da1957c3f656d959c4f5957d364eb1"}, + {"project_url": "web-animations/web-animations-js-legacy", "project_sha": "6a1c45473f9ba2db1ccad34f879bca829f77264d"}, + {"project_url": "webdriverio/cucumber-boilerplate", "project_sha": "f91d34ff0bf9112d02830dc474f1a97ff6e8d9d3"}, + {"project_url": 
"webex/webex-js-sdk", "project_sha": "cc743f187c646290dab21322431cbf8f1ce771a2"}, + {"project_url": "webpack/webpack", "project_sha": "16143f5fa835ad8c7181b8aeedc52f9cdd0fd39d"}, + {"project_url": "webpack/webpack-cli", "project_sha": "4e1c45ad8de888dea13247855c78848632475653"}, + ]} + +jobs: + build-matrix: + runs-on: ubuntu-latest + outputs: + matrix-projects: ${{ steps.set-matrix.outputs.matrix-projects }} + steps: + - id: set-matrix + run: | + echo "matrix-projects<<__EOF__" >> $GITHUB_OUTPUT + echo $PROJECTS_JSON >> $GITHUB_OUTPUT + echo "__EOF__" >> $GITHUB_OUTPUT + execute: + needs: [build-matrix] + strategy: + matrix: ${{ fromJSON(needs.build-matrix.outputs.matrix-projects) }} + fail-fast: false + uses: ./.github/workflows/end2end.yml + with: + project_url: ${{ matrix.projects.project_url }} + project_sha: ${{ matrix.projects.project_sha }} diff --git a/.github/workflows/barbosa23flaky.yml b/.github/workflows/barbosa23flaky.yml new file mode 100644 index 0000000..e16758a --- /dev/null +++ b/.github/workflows/barbosa23flaky.yml @@ -0,0 +1,36 @@ +name: Test on Barbosa23 JS projects with >=5 flaky tests + +on: + push + +env: + PROJECTS_JSON: | + { projects: [ + {"project_url": "appium/appium", "project_sha": "2d124323c5973ef9d3e190f7401e67106886ffd4"}, + {"project_url": "badges/shields", "project_sha": "14892e3943a4677332618d8b9f584766f7940ee7"}, + {"project_url": "facebook/react-native", "project_sha": "af99a6890b84713d002fbbd872f10fe2e6304861"}, + {"project_url": "FlowCrypt/flowcrypt-browser", "project_sha": "92d0188c66572d2c14ef4ed24602b8a58445630c"}, + {"project_url": "meteor/meteor", "project_sha": "dc38e4325dcd88fb3c6d1be1639680c6ff6f5e80"}, + {"project_url": "yui/yui3", "project_sha": "25264e3629b1c07fb779d203c4a25c0879ec862c"} + ]} + +jobs: + build-matrix: + runs-on: ubuntu-latest + outputs: + matrix-projects: ${{ steps.set-matrix.outputs.matrix-projects }} + steps: + - id: set-matrix + run: | + echo "matrix-projects<<__EOF__" >> $GITHUB_OUTPUT + echo $PROJECTS_JSON >> $GITHUB_OUTPUT + echo "__EOF__" >> $GITHUB_OUTPUT + execute: + needs: [build-matrix] + strategy: + matrix: ${{ fromJSON(needs.build-matrix.outputs.matrix-projects) }} + fail-fast: false + uses: ./.github/workflows/end2end.yml + with: + project_url: ${{ matrix.projects.project_url }} + project_sha: ${{ matrix.projects.project_sha }} diff --git a/.github/workflows/end2end.yml b/.github/workflows/end2end.yml new file mode 100644 index 0000000..7238098 --- /dev/null +++ b/.github/workflows/end2end.yml @@ -0,0 +1,99 @@ +name: Test NPMFilter End to End on a Project + +on: + workflow_dispatch: + inputs: + project_url: + description: 'GitHub suffix of project to test (username/project)' + required: true + type: string + project_sha: + description: 'SHA of project to test' + required: true + type: string + workflow_call: + inputs: + project_url: + description: 'GitHub suffix of project to test (username/project)' + required: true + type: string + project_sha: + description: 'SHA of project to test' + required: true + type: string +jobs: + execute: + runs-on: self-hosted + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Run NPMFilter + id: run-npm-filter + env: + SHA: ${{ inputs.project_sha }} + URL: ${{ inputs.project_url }} + run: | + IFS="/" read -r -a projectArray <<< "$URL" + OrgName=${projectArray[0]} + ProjectName=${projectArray[1]} + LogDir=${URL//\//-} + echo "LogDir=$LogDir" >> $GITHUB_OUTPUT + + echo "Running NPMFilter on $OrgName/$ProjectName@$SHA" + + mkdir -p docker_configs/ + cat 
>docker_configs/debug_filter_config.json < tests-overview.csv + + # Check if tests were found + TestData=$(cat tests-overview.csv) + IFS="," read -r -a testCount <<< $(python3 output_proc_scripts/count_tests_run.py npm_filter_docker_results/) + TestsRun=${testCount[0]} + if [ $TestsRun -le 2 ]; then + echo "ERROR: No tests found." + exit -1 + else + echo "OK: ${TestsRun} tests found!" + fi + - name: Upload output + uses: actions/upload-artifact@v2 + with: + name: npm_filter_docker_results + path: npm_filter_docker_results \ No newline at end of file diff --git a/.github/workflows/smoketest.yml b/.github/workflows/smoketest.yml new file mode 100644 index 0000000..d1fa45e --- /dev/null +++ b/.github/workflows/smoketest.yml @@ -0,0 +1,31 @@ +name: Test NPMFilter End to End on a toy project + +on: + push: + +env: + PROJECTS_JSON: | + { projects: [ + {"project_url": "mtiller/ts-jest-sample", "project_sha": "6739c576d4590c53296f3e4fcdf3074e582ae297"}, + ]} + +jobs: + build-matrix: + runs-on: ubuntu-latest + outputs: + matrix-projects: ${{ steps.set-matrix.outputs.matrix-projects }} + steps: + - id: set-matrix + run: | + echo "matrix-projects<<__EOF__" >> $GITHUB_OUTPUT + echo $PROJECTS_JSON >> $GITHUB_OUTPUT + echo "__EOF__" >> $GITHUB_OUTPUT + execute: + needs: [build-matrix] + strategy: + matrix: ${{ fromJSON(needs.build-matrix.outputs.matrix-projects) }} + fail-fast: false + uses: ./.github/workflows/end2end.yml + with: + project_url: ${{ matrix.projects.project_url }} + project_sha: ${{ matrix.projects.project_sha }} diff --git a/output_proc_scripts/count_tests_run.py b/output_proc_scripts/count_tests_run.py new file mode 100644 index 0000000..03cba8d --- /dev/null +++ b/output_proc_scripts/count_tests_run.py @@ -0,0 +1,71 @@ +import json +import os +import sys + +# simple, unrefined script for parsing npm-filter output files +# for the current directory, get all files named *__results.json +# (wildcard represents the project name) +# prints out (Number of tests passing),(Number of tests failing) + + +# JSON specifying possible errors +# that should be avoided if an input JSON will pass the filter check + +JSON_filter = { + "setup": { + "repo_cloning_ERROR": True, + "pkg_json_ERROR": True + }, + "installation": { + "ERROR": True + }, +} + +# input to the function is a JSON of undesirable elements +# return true if the JSON to be filtered has any of the filter elements +def json_contains_issues(json_check, json_filter): + contains_issues = False + for filter_key, filter_val in json_filter.items(): + # recursive case + if isinstance( filter_val, dict): + contains_issues = contains_issues or json_contains_issues( json_check.get(filter_key, {}), filter_val) + # base case + contains_issues = contains_issues or (json_check.get(filter_key, {}) == filter_val) + return( contains_issues) + +# by default, there needs to be at least one passing test +def get_num_tests_run(json_check): + test_dict = json_check.get("testing", {}) + num_passing = 0 + num_failing = 0 + passing_commands = [] + for test_com, test_out in test_dict.items(): + if test_out.get("timed_out", False) or (not test_out.get("RUNS_NEW_USER_TESTS", True)) or test_out.get("ERROR", False): + continue + num_passing += test_out.get("num_passing") + num_failing += test_out.get("num_failing") + return [num_passing, num_failing] + +output_proc_dir = "." 
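# Illustrative worked example, not part of the patch: for a hypothetical
# results file pkg__results.json containing
#   {"setup": {}, "installation": {}, "testing": {"test": {"num_passing": 3, "num_failing": 1}}}
# json_contains_issues() returns False (none of the error keys in JSON_filter
# are set), get_num_tests_run() returns [3, 1], and the script prints "3,1".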
+if len(sys.argv) == 2:
+    output_proc_dir = sys.argv[1]
+else:
+    print("No output directory specified: looking at current directory")
+
+# get all relevant files
+all_files = [ output_proc_dir + "/" + fname for fname in os.listdir(output_proc_dir) if fname.find("__results.json") != -1]
+passing_files = []
+total_passing_tests = 0
+total_failing_tests = 0
+for file in all_files:
+    with open(file) as f:
+        json_check = json.load(f)
+    proj_name = file[ : file.index("__results.json")]
+    if json_contains_issues( json_check, JSON_filter):
+        # print(proj_name + " has setup/install errors")
+        continue
+    num_tests = get_num_tests_run( json_check)
+    total_passing_tests += num_tests[0]
+    total_failing_tests += num_tests[1]
+
+print(f"{total_passing_tests},{total_failing_tests}")
\ No newline at end of file
diff --git a/runDocker.sh b/runDocker.sh
index e4ec817..4d3f8a6 100755
--- a/runDocker.sh
+++ b/runDocker.sh
@@ -15,6 +15,6 @@ docker run --mount type=bind,source=`pwd`/local_mount,destination=/mount \
 	--volume `pwd`/npm_filter_docker_results:/home/npm-filter/results \
 	--volume `pwd`/docker_configs:/home/npm-filter/docker_configs\
 	-w /home/npm-filter \
-	-it emarteca/npm-filter:latest \
+	emarteca/npm-filter:latest \
 	bash -c "PATH=/home/codeql_home/codeql:$PATH; $npm_filter_command --output_dir results"
 rm -r local_mount

From 1671e31d70449299fa4d00c3fd7cf38248097249 Mon Sep 17 00:00:00 2001
From: Ellen Arteca
Date: Wed, 14 Jun 2023 13:04:25 -0400
Subject: [PATCH 11/39] jest parsing; also making default node latest LTS so
 it doesn't break yarn

---
 build.sh                               |  2 +-
 src/TestInfo.py                        |  2 +-
 src/output_parsing/test_output_proc.py | 38 ++++++++++++++++++++++++--
 3 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/build.sh b/build.sh
index 559869d..3d8aa8f 100755
--- a/build.sh
+++ b/build.sh
@@ -46,7 +46,7 @@ if [ -d TESTING_REPOS ]; then
 fi
 mkdir TESTING_REPOS
 
-node_version='node' # default to just the latest version
+node_version='v18.16.0' # default to just the latest LTS version
 npm_version='*'
 # if there's a repo_link specified
 if [ -n $repo_link ]; then
diff --git a/src/TestInfo.py b/src/TestInfo.py
index 6cf20af..61f08f0 100644
--- a/src/TestInfo.py
+++ b/src/TestInfo.py
@@ -44,7 +44,7 @@ class TestInfo:
 		"jest": {
 			"args": " --verbose --json --outputFile=$PLACEHOLDER_OUTPUT_FILE_NAME$",
 			"position": -1,
-			"post_processing": None
+			"post_processing": TestOutputProc.parse_jest_json_to_csv
 		},
 		"mocha": {
 			"args": " -- --reporter xunit --reporter-option output=$PLACEHOLDER_OUTPUT_FILE_NAME$",
 			"position": -1,
diff --git a/src/output_parsing/test_output_proc.py b/src/output_parsing/test_output_proc.py
index fcf98ec..bd52da5 100644
--- a/src/output_parsing/test_output_proc.py
+++ b/src/output_parsing/test_output_proc.py
@@ -22,10 +22,44 @@ def parse_mocha_json_to_csv(output_file, new_output_file=None):
 		test_runtimes += [float(test.get("@time", "NaN"))]
 		if test.get("failure", False):
 			test_stdout += [test["failure"]]
-			test_pass_fail += ["Fail"]
+			test_pass_fail += ["failed"]
 		else:
 			test_stdout += [""]
-			test_pass_fail += ["Pass"]
+			test_pass_fail += ["passed"]
+	res_df = pd.DataFrame(list(zip(test_suites, test_names, test_runtimes, test_stdout, test_pass_fail)))
+	res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"]
+	with open(new_output_file, 'w') as csv_file:
+		csv_file.write(res_df.to_csv())
+
+def parse_jest_json_to_csv(output_file, new_output_file=None):
+	if new_output_file is None:
+		new_output_file = output_file.split(".")[0] + ".csv" # same name, csv file extension
+	with open(output_file) as json_file:
+		data_dict = json.loads(json_file.read())
+	# the format: all tests are in a top level list called "testResults"
+	# this is a list of objects that have "assertionResults" representing the test suites
+	# "assertionResults" is a list of objects that have the test data
+	test_suites = []
+	test_names = []
+	test_runtimes = []
+	test_stdout = []
+	test_pass_fail = []
+	for test_suite in data_dict.get("testResults", []):
+		test_suite_results = test_suite.get("assertionResults", [])
+		test_suite_name = test_suite.get("name", "")
+		for test_results in test_suite_results:
+			test_status = test_results.get("status", "failed")
+			test_duration = test_results.get("duration")
+			# if it can't convert to a float, it could be missing/nonetype (None duration for pending tests)
+			try:
+				test_duration = float(test_duration)
+			except:
+				test_duration = float("NaN")
+			test_suites += [test_suite_name]
+			test_names += [test_results.get("fullName", "")]
+			test_runtimes += [test_duration]
+			test_stdout += [";".join(test_results.get("failureMessages", []))]
+			test_pass_fail += [test_status] # passed/failed/pending -- if not present assume failed
 	res_df = pd.DataFrame(list(zip(test_suites, test_names, test_runtimes, test_stdout, test_pass_fail)))
 	res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"]
 	with open(new_output_file, 'w') as csv_file:
 		csv_file.write(res_df.to_csv())

From 2ef737f3c78abfb074be77eb032c344ab07e8116 Mon Sep 17 00:00:00 2001
From: Ellen Arteca
Date: Wed, 14 Jun 2023 13:10:46 -0400
Subject: [PATCH 12/39] verbose config file

---
 configs/verbose.json | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 configs/verbose.json

diff --git a/configs/verbose.json b/configs/verbose.json
new file mode 100644
index 0000000..b29a4f1
--- /dev/null
+++ b/configs/verbose.json
@@ -0,0 +1,5 @@
+{
+	"test": {
+		"test_verbose_all_output": { "do_verbose_tracking": true }
+	}
+}

From 528dc055fad85bc4bad01cc9576334bbc53f74d1 Mon Sep 17 00:00:00 2001
From: Ellen Arteca
Date: Wed, 14 Jun 2023 17:00:52 -0400
Subject: [PATCH 13/39] more bug fixes :')

---
 Dockerfile              |  5 ++--
 build.sh                | 56 ++++++++++++++++++++++++++++++-----------
 get_rel_project_reqs.js |  2 +-
 3 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6f32d98..8508639 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,9 @@
 FROM ubuntu:latest
 ARG DEBIAN_FRONTEND=noninteractive
 
-# build arg: setting up for a specific repo?
+# build arg: setting up for a specific repo? at a specific commit?
 ARG REPO_LINK
+ARG REPO_COMMIT
 
 RUN apt-get update \
 	&& apt-get -y install --no-install-recommends python3 git unzip vim curl gnupg xz-utils parallel
@@ -23,4 +24,4 @@ COPY get_rel_project_reqs.js /home/npm-filter
 WORKDIR /home/npm-filter
 
 RUN git config --global http.sslVerify "false"
-RUN ./build.sh $REPO_LINK
+RUN ./build.sh $REPO_LINK $REPO_COMMIT
diff --git a/build.sh b/build.sh
index 3d8aa8f..0c6260b 100755
--- a/build.sh
+++ b/build.sh
@@ -1,7 +1,9 @@
 #!/bin/bash
 
-# can be building for one specific repo
+# can be building for one specific repo, at a specific commit
+# (if they're not specified they're just empty string, that's fine)
 repo_link=$1
+repo_commit=$2
 
 # install nvm, so we can then use specific versions of node and npm
 curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.37.2/install.sh | /usr/bin/bash
@@ -49,14 +51,37 @@ mkdir TESTING_REPOS
 node_version='v18.16.0' # default to just the latest LTS version
 npm_version='*'
 # if there's a repo_link specified
-if [ -n $repo_link ]; then
+if [ ! 
-z "$repo_link" ]; then cd TESTING_REPOS git clone $repo_link # repo dir will be the only thing in TESTING_REPOS repo_dir_name=`ls` + if [ ! -z "$repo_commit" ]; then + cd $repo_dir_name + git checkout $repo_commit + fi + cd /home/npm-filter + # this will make the node_version and npm_version variables - set_req_vars=`node get_rel_project_reqs.js $repo_dir_name 2>/dev/null` + # it's ok to use the generic version here -- just using it for the vars + # need these dependencies for my get_rel_project_reqs.js script + nvm install $node_version + nvm use $node_version + nvm install-latest-npm + + npm install semver node-fetch + + # script to set the env variables for node_version etc + echo "#!/bin/bash" > req_vars.sh + node get_rel_project_reqs.js TESTING_REPOS/${repo_dir_name} >> req_vars.sh + chmod 700 req_vars.sh + # source in current shell: so we set the variables in the current shell + . req_vars.sh + rm req_vars.sh + + echo $node_version `$set_req_vars` + rm -r node_modules if [[ $node_version == "*" ]]; then node_version=node @@ -69,6 +94,12 @@ fi nvm install $node_version nvm use $node_version +if [[ $npm_version == "*" ]]; then + nvm install-latest-npm +else + npm install -g npm@${npm_version} +fi + NVM_DIR=/root/.nvm NODE_VERSION=`node --version` @@ -77,24 +108,19 @@ echo "export NVM_DIR=$NVM_DIR" >> /root/.bashrc echo "export NODE_PATH=$NVM_DIR/$NODE_VERSION/lib/node_modules" >> /root/.bashrc echo "export PATH=$NVM_DIR/$NODE_VERSION/bin:/home/codeql_home/codeql:$PATH" >> /root/.bashrc -# echo "nvm use $node_version" >> /root/.bashrc - -if [[ $npm_version == "*" ]]; then - nvm install-latest-npm -else - npm install -g npm@${npm_version} -fi - - # permissive npm config set strict-ssl false # install the dependencies: but use the current version of npm -npm install -g jest mocha tap ava nyc yarn next semver +npm install -g jest mocha tap ava nyc yarn next -if [ -n $repo_link ]; then +if [ ! -z "$repo_link" ]; then cd /home/npm-filter # do the install and build - python3 src/diagnose_github_repo.py --repo_link $repo_link --config configs/build_only_config.json --output_dir results + if [ ! 
-z "$repo_commit" ]; then + python3 src/diagnose_github_repo.py --repo_link_and_SHA $repo_link $repo_commit --config configs/build_only_config.json --output_dir results + else + python3 src/diagnose_github_repo.py --repo_link $repo_link --config configs/build_only_config.json --output_dir results + fi fi diff --git a/get_rel_project_reqs.js b/get_rel_project_reqs.js index 314b233..5a4bd6d 100644 --- a/get_rel_project_reqs.js +++ b/get_rel_project_reqs.js @@ -145,7 +145,7 @@ function is_banned(vers) { function print_as_bash_vars(reqs) { for ( key in reqs) { - console.log(key + "=" + reqs[key]); + console.log("export " + key + "=" + reqs[key]); } } From c9de12410c078402177eb192f82b80a6d60bcc74 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Thu, 15 Jun 2023 21:40:16 -0400 Subject: [PATCH 14/39] adding docker default command (run tests for repo link and commit provided); adding option to still diagnose tests if we skip install as long as there is node_modules (prev install); and adding config option for pre-install scripts --- Dockerfile | 10 ++++++++ build.sh | 30 ++++++++++++----------- configs/build_only_config.json | 5 ++++ configs/verbose_only.json | 11 +++++++++ run_verbose_for_repo_and_config.sh | 18 ++++++++++++++ src/diagnose_github_repo.py | 5 +++- src/diagnose_npm_package.py | 3 +++ src/test_JS_repo_lib.py | 39 ++++++++++++++++++++++++------ 8 files changed, 99 insertions(+), 22 deletions(-) create mode 100644 configs/build_only_config.json create mode 100644 configs/verbose_only.json create mode 100755 run_verbose_for_repo_and_config.sh diff --git a/Dockerfile b/Dockerfile index 8508639..72c3292 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,3 +25,13 @@ WORKDIR /home/npm-filter RUN git config --global http.sslVerify "false" RUN ./build.sh $REPO_LINK $REPO_COMMIT +# source the env variables produced by the build script (node version, etc) +RUN . /envfile + +# add a default command for running the tests for repo_link and commit provided +# this runs in verbose mode +# need to use ENV instead of ARG in the CMD b/c docker is 10/10 +ENV ENV_REPO_COMMIT=$REPO_COMMIT +ENV ENV_REPO_LINK=$REPO_LINK +# gotta source our env vars so the command can run and use npm/node/etc :-) +CMD . 
/envfile; ./run_verbose_for_repo_and_config.sh $ENV_REPO_LINK $ENV_REPO_COMMIT
\ No newline at end of file
diff --git a/build.sh b/build.sh
index 0c6260b..85863e7 100755
--- a/build.sh
+++ b/build.sh
@@ -19,11 +19,11 @@ fi
 
 mkdir -p /home/codeql_home
 
-# cd /home/codeql_home
-# curl -L -o codeql-linux64.zip https://github.com/github/codeql-cli-binaries/releases/download/v2.3.4/codeql-linux64.zip
-# unzip codeql-linux64.zip
-# # clone stable version
-# git clone https://github.com/github/codeql.git --branch v1.26.0 codeql-repo
+cd /home/codeql_home
+curl -L -o codeql-linux64.zip https://github.com/github/codeql-cli-binaries/releases/download/v2.3.4/codeql-linux64.zip
+unzip codeql-linux64.zip
+# clone stable version
+git clone https://github.com/github/codeql.git --branch v1.26.0 codeql-repo
 
 apt -y install curl dirmngr apt-transport-https lsb-release ca-certificates gnupg build-essential
 apt-get update
@@ -103,10 +103,12 @@ fi
 
 NVM_DIR=/root/.nvm
 NODE_VERSION=`node --version`
 
-echo "export NODE_VERSION=\"$NODE_VERSION\"" >> /root/.bashrc
-echo "export NVM_DIR=$NVM_DIR" >> /root/.bashrc
-echo "export NODE_PATH=$NVM_DIR/$NODE_VERSION/lib/node_modules" >> /root/.bashrc
-echo "export PATH=$NVM_DIR/$NODE_VERSION/bin:/home/codeql_home/codeql:$PATH" >> /root/.bashrc
+echo "export NODE_VERSION=\"$NODE_VERSION\"" >> /envfile
+echo "export NVM_DIR=$NVM_DIR" >> /envfile
+echo "export NODE_PATH=$NVM_DIR/$NODE_VERSION/lib/node_modules" >> /envfile
+echo "export PATH=$NVM_DIR/$NODE_VERSION/bin:/home/codeql_home/codeql:$PATH" >> /envfile
+
+cat /envfile >> /root/.bashrc
 
 # permissive
 npm config set strict-ssl false
@@ -116,11 +118,11 @@ npm install -g jest mocha tap ava nyc yarn next
 
 if [ ! -z "$repo_link" ]; then
 	cd /home/npm-filter
-	# do the install and build
+	# do the install and build only (build_only_config.json config file)
 	if [ ! -z "$repo_commit" ]; then
-		python3 src/diagnose_github_repo.py --repo_link_and_SHA $repo_link $repo_commit --config configs/build_only_config.json --output_dir results
-	else
-		python3 src/diagnose_github_repo.py --repo_link $repo_link --config configs/build_only_config.json --output_dir results
-	fi
+		python3 src/diagnose_github_repo.py --repo_link_and_SHA $repo_link $repo_commit --config configs/build_only_config.json --output_dir results
+	else
+		python3 src/diagnose_github_repo.py --repo_link $repo_link --config configs/build_only_config.json --output_dir results
+	fi
 fi
diff --git a/configs/build_only_config.json b/configs/build_only_config.json
new file mode 100644
index 0000000..aa7cce2
--- /dev/null
+++ b/configs/build_only_config.json
@@ -0,0 +1,5 @@
+{
+	"test": {
+		"track_tests": false
+	}
+}
\ No newline at end of file
diff --git a/configs/verbose_only.json b/configs/verbose_only.json
new file mode 100644
index 0000000..e307f7f
--- /dev/null
+++ b/configs/verbose_only.json
@@ -0,0 +1,11 @@
+{
+	"install": {
+		"do_install": false
+	},
+	"build": {
+		"track_build": false
+	},
+	"test": {
+		"test_verbose_all_output": { "do_verbose_tracking": true }
+	}
+}
diff --git a/run_verbose_for_repo_and_config.sh b/run_verbose_for_repo_and_config.sh
new file mode 100755
index 0000000..6ab7538
--- /dev/null
+++ b/run_verbose_for_repo_and_config.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# run npm-filter in verbose mode on a specified repo, at an optional commit
+# output to the "results" directory
+
+# usage: ./run_verbose_for_repo_and_config.sh repo_link repo_commit
+
+repo_link=$1
+config_file=configs/verbose_only.json
+repo_commit=$2
+
+if [ ! -z "$repo_link" ] && [ ! -z "$config_file" ]; then
+	if [ ! -z "$repo_commit" ]; then
+		python3 src/diagnose_github_repo.py --repo_link_and_SHA $repo_link $repo_commit --config $config_file --output_dir results
+	else
+		python3 src/diagnose_github_repo.py --repo_link $repo_link --config $config_file --output_dir results
+	fi
+fi
\ No newline at end of file
diff --git a/src/diagnose_github_repo.py b/src/diagnose_github_repo.py
index 5d65c3a..4f9af42 100644
--- a/src/diagnose_github_repo.py
+++ b/src/diagnose_github_repo.py
@@ -26,6 +26,7 @@ class RepoWalker():
 	VERBOSE_MODE = False
 	RM_AFTER_CLONING = False
 	SCRIPTS_OVER_CODE = []
+	CUSTOM_SETUP_SCRIPTS = []
 	QL_QUERIES = []
 
 	DO_INSTALL = True
@@ -76,11 +77,13 @@ def set_up_config( self, config_file):
 		self.IGNORED_COMMANDS = cf_dict.get( "ignored_commands", self.IGNORED_COMMANDS)
 		self.IGNORED_SUBSTRINGS = cf_dict.get( "ignored_substrings", self.IGNORED_SUBSTRINGS)
 		self.RM_AFTER_CLONING = cf_dict.get( "rm_after_cloning", self.RM_AFTER_CLONING)
-		# script and query file location is relative to the config file
+		# scripts and query file location is relative to the config file
 		self.SCRIPTS_OVER_CODE = [ os.path.abspath(os.path.dirname(config_file if config_file else __file__)) + "/" + p
 									for p in cf_dict.get( "scripts_over_code", self.SCRIPTS_OVER_CODE)]
 		self.QL_QUERIES = [ os.path.abspath(os.path.dirname(config_file if config_file else __file__)) + "/" + p
 									for p in cf_dict.get( "QL_queries", self.QL_QUERIES)]
+		self.CUSTOM_SETUP_SCRIPTS = [ os.path.abspath(os.path.dirname(config_file if config_file else __file__)) + "/" + p
+									for p in cf_dict.get( "custom_setup_scripts", self.CUSTOM_SETUP_SCRIPTS)]
 
 		cf_dict = config_json.get( "dependencies", {})
 		self.INCLUDE_DEV_DEPS = cf_dict.get("include_dev_deps", self.INCLUDE_DEV_DEPS)
diff --git a/src/diagnose_npm_package.py b/src/diagnose_npm_package.py
index aa5dcf1..bc59d57 100644
--- a/src/diagnose_npm_package.py
+++ b/src/diagnose_npm_package.py
@@ -19,6 +19,7 @@ class NPMSpider(scrapy.Spider):
 	VERBOSE_MODE = False
 	RM_AFTER_CLONING = False
 	SCRIPTS_OVER_CODE = []
+	CUSTOM_SETUP_SCRIPTS = []
 	QL_QUERIES = []
 
 	DO_INSTALL = True
@@ -73,6 +74,8 @@ def set_up_config( self, config_file):
 							for p in cf_dict.get( "scripts_over_code", self.SCRIPTS_OVER_CODE)]
 		self.QL_QUERIES = [ os.path.abspath(os.path.dirname(config_file if config_file else __file__)) + "/" + p
 							for p in cf_dict.get( "QL_queries", self.QL_QUERIES)]
+		self.CUSTOM_SETUP_SCRIPTS = [ os.path.abspath(os.path.dirname(config_file if config_file else __file__)) + "/" + p
+							for p in cf_dict.get( "custom_setup_scripts", self.CUSTOM_SETUP_SCRIPTS)]
 
 		cf_dict = config_json.get( "dependencies", {})
 		self.INCLUDE_DEV_DEPS = cf_dict.get("include_dev_deps", self.INCLUDE_DEV_DEPS)
diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py
index a3f9c4f..a42cc2e 100644
--- a/src/test_JS_repo_lib.py
+++ b/src/test_JS_repo_lib.py
@@ -288,10 +288,31 @@ def diagnose_package( repo_link, crawler, commit_SHA=None):
 			json_out["setup"]["pkg_json_ERROR"] = True
 		return( on_diagnose_exit( json_out, crawler, cur_dir, repo_name))
 
-	# first, the install
 	manager = ""
+	# first, check if there is a custom install
+	# this runs custom scripts the same way as the scripts_over_code below; only
+	# difference is it's before the npm-filter run
+	if crawler.CUSTOM_SETUP_SCRIPTS != []:
+		json_out["custom_setup_scripts"] = {}
+		for script in crawler.CUSTOM_SETUP_SCRIPTS:
+			print("Running custom setup script over code: " + script)
+			json_out["custom_setup_scripts"][script] = {}
+			error, output, retcode = 
run_command( script) + script_output = output.decode('utf-8') + error.decode('utf-8') + ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') + script_output = ansi_escape.sub('', script_output) + json_out["custom_setup_scripts"][script]["output"] = script_output + if retcode != 0: + json_out["custom_setup_scripts"][script]["ERROR"] = True + + # check if the install is done (check if there is a node_modules folder) + already_installed = os.path.isdir("node_modules") + + # then, the install if crawler.DO_INSTALL: - (manager, retcode, installer_command, installer_debug) = run_installation( pkg_json, crawler) + (new_manager, retcode, installer_command, installer_debug) = run_installation( pkg_json, crawler) + if manager == "": + manager = new_manager json_out["installation"] = {} json_out["installation"]["installer_command"] = installer_command if crawler.VERBOSE_MODE: @@ -299,10 +320,14 @@ def diagnose_package( repo_link, crawler, commit_SHA=None): if retcode != 0: print("ERROR -- installation failed") json_out["installation"]["ERROR"] = True - return( on_diagnose_exit( json_out, crawler, cur_dir, repo_name)) + if not already_installed: + return( on_diagnose_exit( json_out, crawler, cur_dir, repo_name)) else: json_out["installation"] = { "do_install": False } + if manager == "": # default the manager to npm if it wasn't already IDd + manager = "npm run " + if crawler.COMPUTE_DEP_LISTS: json_out["dependencies"] = {} if not crawler.DO_INSTALL: @@ -316,8 +341,8 @@ def diagnose_package( repo_link, crawler, commit_SHA=None): # now, proceed with the build if crawler.TRACK_BUILD: json_out["build"] = {} - if not crawler.DO_INSTALL: - print("Can't do build without installing (do_install: false) -- skipping") + if not crawler.DO_INSTALL and not already_installed: + print("Can't do build without installing (do_install: false and not already installed) -- skipping") else: (retcode, build_script_list, build_debug) = run_build( manager, pkg_json, crawler) json_out["build"]["build_script_list"] = build_script_list @@ -332,8 +357,8 @@ def diagnose_package( repo_link, crawler, commit_SHA=None): # then, the testing if crawler.TRACK_TESTS: json_out["testing"] = {} - if not crawler.DO_INSTALL: - print("Can't run tests without installing (do_install: false) -- skipping") + if not crawler.DO_INSTALL and not already_installed: + print("Can't run tests without installing (do_install: false and not already installed) -- skipping") else: (retcode, test_json_summary) = run_tests( manager, pkg_json, crawler, repo_name, cur_dir) json_out["testing"] = test_json_summary From b080de07162cd57c44c9bbf4ea53f3100e75446b Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Thu, 15 Jun 2023 22:59:44 -0400 Subject: [PATCH 15/39] docker build option for custom install script --- Dockerfile | 22 +++++++++++++--------- build.sh | 20 +++++++++++++------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 72c3292..d84717b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,26 +1,30 @@ FROM ubuntu:latest ARG DEBIAN_FRONTEND=noninteractive -# build arg: setting up for a specific repo? at a specific commit? +# build arg: setting up for a specific repo? at a specific commit? custom install script? 
ARG REPO_LINK ARG REPO_COMMIT - -RUN apt-get update \ - && apt-get -y install --no-install-recommends python3 git unzip vim curl gnupg xz-utils parallel - -RUN apt update -RUN apt -y install python3-pip -RUN pip3 install bs4 scrapy xmltodict pandas +ARG CUSTOM_INSTALL_SCRIPT RUN mkdir -p /home/npm-filter/results RUN mkdir /home/npm-filter/src RUN mkdir /home/npm-filter/configs COPY src /home/npm-filter/src -COPY configs /home/npm-filter/configs +# copy the custom install script if it exists +COPY configs/* $CUSTOM_INSTALL_SCRIPT /home/npm-filter/configs/ +# and name it the custom_install_script +RUN if [ -f /home/npm-filter/configs/${CUSTOM_INSTALL_SCRIPT} ] ; then mv /home/npm-filter/configs/${CUSTOM_INSTALL_SCRIPT} /home/npm-filter/configs/custom_install_script ; fi COPY *.sh /home/npm-filter/ COPY get_rel_project_reqs.js /home/npm-filter +RUN apt-get update \ + && apt-get -y install --no-install-recommends python3 git unzip vim curl gnupg xz-utils parallel + +RUN apt update +RUN apt -y install python3-pip +RUN pip3 install bs4 scrapy xmltodict pandas + WORKDIR /home/npm-filter RUN git config --global http.sslVerify "false" diff --git a/build.sh b/build.sh index 85863e7..a3fa31b 100755 --- a/build.sh +++ b/build.sh @@ -19,11 +19,11 @@ fi mkdir -p /home/codeql_home -cd /home/codeql_home -curl -L -o codeql-linux64.zip https://github.com/github/codeql-cli-binaries/releases/download/v2.3.4/codeql-linux64.zip -unzip codeql-linux64.zip -# clone stable version -git clone https://github.com/github/codeql.git --branch v1.26.0 codeql-repo +# cd /home/codeql_home +# curl -L -o codeql-linux64.zip https://github.com/github/codeql-cli-binaries/releases/download/v2.3.4/codeql-linux64.zip +# unzip codeql-linux64.zip +# # clone stable version +# git clone https://github.com/github/codeql.git --branch v1.26.0 codeql-repo apt -y install curl dirmngr apt-transport-https lsb-release ca-certificates gnupg build-essential apt-get update @@ -116,13 +116,19 @@ npm config set strict-ssl false # install the dependencies: but use the current version of npm npm install -g jest mocha tap ava nyc yarn next +config_file=configs/build_only_config.json +if [ -f "/home/npm-filter/configs/custom_install_script" ]; then + chmod +x /home/npm-filter/configs/custom_install_script + config_file=configs/custom_install_only.json +fi + if [ ! -z "$repo_link" ]; then cd /home/npm-filter # do the install and build only (build_only_config.json config file) if [ ! -z "$repo_commit" ]; then - python3 src/diagnose_github_repo.py --repo_link_and_SHA $repo_link $repo_commit --config configs/build_only_config.json --output_dir results + python3 src/diagnose_github_repo.py --repo_link_and_SHA $repo_link $repo_commit --config $config_file --output_dir results else - python3 src/diagnose_github_repo.py --repo_link $repo_link --config configs/build_only_config.json --output_dir results + python3 src/diagnose_github_repo.py --repo_link $repo_link --config $config_file --output_dir results fi fi From c2592626579e7ef4de8675fbf35656e3a2fa6ed7 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Thu, 15 Jun 2023 23:44:15 -0400 Subject: [PATCH 16/39] lil fix --- Dockerfile | 9 +++++++-- configs/custom_install_only.json | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 configs/custom_install_only.json diff --git a/Dockerfile b/Dockerfile index d84717b..3710a79 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,8 @@ ARG DEBIAN_FRONTEND=noninteractive # build arg: setting up for a specific repo? at a specific commit? 
custom install script? ARG REPO_LINK ARG REPO_COMMIT -ARG CUSTOM_INSTALL_SCRIPT +# placeholder: if this arg isn't specified, copy over the readme file in configs (can't copy no source, RIP) +ARG CUSTOM_INSTALL_SCRIPT=configs/README.md RUN mkdir -p /home/npm-filter/results RUN mkdir /home/npm-filter/src @@ -12,7 +13,11 @@ RUN mkdir /home/npm-filter/configs COPY src /home/npm-filter/src # copy the custom install script if it exists -COPY configs/* $CUSTOM_INSTALL_SCRIPT /home/npm-filter/configs/ +RUN echo $CUSTOM_INSTALL_SCRIPT +COPY ${CUSTOM_INSTALL_SCRIPT} configs/ /home/npm-filter/configs/ +# delete the config readme: we don't need this in the docker. and it's a flag for no-custom-install +# since the readme is the default for custom install +RUN rm /home/npm-filter/configs/README.md # and name it the custom_install_script RUN if [ -f /home/npm-filter/configs/${CUSTOM_INSTALL_SCRIPT} ] ; then mv /home/npm-filter/configs/${CUSTOM_INSTALL_SCRIPT} /home/npm-filter/configs/custom_install_script ; fi COPY *.sh /home/npm-filter/ diff --git a/configs/custom_install_only.json b/configs/custom_install_only.json new file mode 100644 index 0000000..fb3df09 --- /dev/null +++ b/configs/custom_install_only.json @@ -0,0 +1,8 @@ +{ + "test": { + "track_tests": false + }, + "meta_info": { + "custom_setup_scripts": [ "custom_install_script" ] + } +} From 70816c6d37bf492bcd28eeda807aac38f9577334 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Thu, 22 Jun 2023 22:06:06 -0400 Subject: [PATCH 17/39] adding option to repeat test command executions --- configs/default_filter_config.json | 1 + src/diagnose_github_repo.py | 2 + src/diagnose_npm_package.py | 2 + src/test_JS_repo_lib.py | 115 ++++++++++++++++------------- 4 files changed, 67 insertions(+), 53 deletions(-) diff --git a/configs/default_filter_config.json b/configs/default_filter_config.json index 14fdabb..56d0149 100644 --- a/configs/default_filter_config.json +++ b/configs/default_filter_config.json @@ -13,6 +13,7 @@ "timeout": 1000 }, "test": { + "test_command_repeats": 1, "track_tests": true, "test_verbose_all_output": { "do_verbose_tracking": false, diff --git a/src/diagnose_github_repo.py b/src/diagnose_github_repo.py index 4f9af42..d2a5843 100644 --- a/src/diagnose_github_repo.py +++ b/src/diagnose_github_repo.py @@ -36,6 +36,7 @@ class RepoWalker(): TRACK_TESTS = True TEST_VERBOSE_ALL_OUTPUT = False TEST_VERBOSE_OUTPUT_JSON = "verbose_test_report.json" + TEST_COMMAND_REPEATS = 1 TRACKED_TEST_COMMANDS = ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench", "mocha", "jest", "ava", "tap", "jasmine"] @@ -102,6 +103,7 @@ def set_up_config( self, config_file): self.TEST_TIMEOUT = cf_dict.get("timeout", self.TEST_TIMEOUT) self.TRACKED_TEST_COMMANDS = cf_dict.get("tracked_test_commands", self.TRACKED_TEST_COMMANDS) self.TRACK_TESTS = cf_dict.get("track_tests", self.TRACK_TESTS) + self.TEST_COMMAND_REPEATS = cf_dict.get("test_command_repeats", self.TEST_COMMAND_REPEATS) test_verbose_config = cf_dict.get("test_verbose_all_output", {}) self.TEST_VERBOSE_ALL_OUTPUT = test_verbose_config.get("do_verbose_tracking", self.TEST_VERBOSE_ALL_OUTPUT) self.TEST_VERBOSE_OUTPUT_JSON = test_verbose_config.get("verbose_json_output_file", self.TEST_VERBOSE_OUTPUT_JSON) diff --git a/src/diagnose_npm_package.py b/src/diagnose_npm_package.py index bc59d57..59daa28 100644 --- a/src/diagnose_npm_package.py +++ b/src/diagnose_npm_package.py @@ -29,6 +29,7 @@ class NPMSpider(scrapy.Spider): TRACK_TESTS = True TEST_VERBOSE_ALL_OUTPUT = False 
TEST_VERBOSE_OUTPUT_JSON = "verbose_test_report.json" + TEST_COMMAND_REPEATS = 1 TRACKED_TEST_COMMANDS = ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench", "mocha", "jest", "ava", "tap", "jasmine"] @@ -94,6 +95,7 @@ def set_up_config( self, config_file): self.TEST_TIMEOUT = cf_dict.get("timeout", self.TEST_TIMEOUT) self.TRACKED_TEST_COMMANDS = cf_dict.get("tracked_test_commands", self.TRACKED_TEST_COMMANDS) self.TRACK_TESTS = cf_dict.get("track_tests", self.TRACK_TESTS) + self.TEST_COMMAND_REPEATS = cf_dict.get("test_command_repeats", self.TEST_COMMAND_REPEATS) test_verbose_config = cf_dict.get("test_verbose_all_output", {}) self.TEST_VERBOSE_ALL_OUTPUT = test_verbose_config.get("do_verbose_tracking", self.TEST_VERBOSE_ALL_OUTPUT) self.TEST_VERBOSE_OUTPUT_JSON = test_verbose_config.get("verbose_json_output_file", self.TEST_VERBOSE_OUTPUT_JSON) diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index a42cc2e..6815be4 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -110,59 +110,68 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."): test_scripts = [t for t in test_scripts if set([t.find(ig_com) for ig_com in crawler.IGNORED_COMMANDS]) == {-1}] test_scripts = [t for t in test_scripts if set([pkg_json.get("scripts", {})[t].find(ig_sub) for ig_sub in crawler.IGNORED_SUBSTRINGS]) == {-1}] for test_index, t in enumerate(test_scripts): - print("Running: " + manager + t) - error, output, retcode = run_command( manager + t, crawler.TEST_TIMEOUT) - test_info = TestInfo( (retcode == 0), error, output, manager, crawler.VERBOSE_MODE) - test_info.set_test_command( pkg_json.get("scripts", {})[t]) - test_info.compute_test_infras() - test_info.compute_nested_test_commands( test_scripts) - test_info.compute_test_stats() - # if we're in verbose testing mode (i.e. 
getting all timing info for each test, etc) - # then, we rerun the test commands with all the commands for adding verbose_mode to - # each of the test infras involved (individually) - if crawler.TEST_VERBOSE_ALL_OUTPUT: - # we're gonna be adding our new custom scripts for verbosity testing - run_command( "mv package.json TEMP_package.json_TEMP") - test_verbosity_output = {} - for verbosity_index, test_infra in enumerate(test_info.test_infras): - verbose_test_json = crawler.output_dir + "/" \ - + "repo_" + repo_name + "_" \ - + "test_" + str(test_index) + "_"\ - + "infra_" + str(verbosity_index) + "_" \ - + crawler.TEST_VERBOSE_OUTPUT_JSON - infra_verbosity_config = TestInfo.VERBOSE_TESTS_EXTRA_ARGS[test_infra] - if not infra_verbosity_config: # checks if it's an empty object - print("TEST VERBOSE MODE: unsupported test infra " + test_infra) - test_verbosity_output[test_infra] = { "error": True } - continue - infra_verbosity_args = infra_verbosity_config.get("args", "") - infra_verbosity_args_pos = infra_verbosity_config.get("position", -1) # default position is at the end - infra_verbosity_post_proc = infra_verbosity_config.get("post_processing", None) - infra_verbosity_command, out_files = instrument_test_command_for_verbose(test_info.test_command, test_infra, infra_verbosity_args, - verbose_test_json, infra_verbosity_args_pos) - verbosity_script_name = "instrumented_verbosity_command_" + str(verbosity_index) - pkg_json["scripts"][verbosity_script_name] = infra_verbosity_command - with open("package.json", 'w') as f: - json.dump( pkg_json, f) - print("Running verbosity: " + manager + infra_verbosity_command) - verb_error, verb_output, verb_retcode = run_command( manager + verbosity_script_name, crawler.TEST_TIMEOUT) - # if there's post-processing to be done - if not infra_verbosity_post_proc is None: - for out_file_obj in out_files: - infra_verbosity_post_proc(out_file_obj["output_file"]) - verbosity_index += 1 - # get the output - test_verbosity_infra = {} - test_verbosity_infra["command"] = infra_verbosity_command - test_verbosity_infra["output_files"] = out_files - if crawler.VERBOSE_MODE: - test_verbosity_infra["test_debug"] = "\nError output: " + verb_error.decode('utf-8') \ - + "\nOutput stream: " + verb_output.decode('utf-8') - test_verbosity_output[test_infra] = test_verbosity_infra - test_info.set_test_verbosity_output(test_verbosity_output) - # put the package.json back - run_command( "mv TEMP_package.json_TEMP package.json") - test_json_summary[t] = test_info.get_json_rep() + test_output_rep = {} + for test_rep_index in range(crawler.TEST_COMMAND_REPEATS): + test_rep_id = "" if crawler.TEST_COMMAND_REPEATS == 1 else "testrep_" + str(test_rep_index) + print("Running rep " + str(test_rep_index) + " of " + str(crawler.TEST_COMMAND_REPEATS - 1) + ": " + manager + t) + error, output, retcode = run_command( manager + t, crawler.TEST_TIMEOUT) + test_info = TestInfo( (retcode == 0), error, output, manager, crawler.VERBOSE_MODE) + test_info.set_test_command( pkg_json.get("scripts", {})[t]) + test_info.compute_test_infras() + test_info.compute_nested_test_commands( test_scripts) + test_info.compute_test_stats() + # if we're in verbose testing mode (i.e. 
getting all timing info for each test, etc) + # then, we rerun the test commands with all the commands for adding verbose_mode to + # each of the test infras involved (individually) + if crawler.TEST_VERBOSE_ALL_OUTPUT: + # we're gonna be adding our new custom scripts for verbosity testing + run_command( "mv package.json TEMP_package.json_TEMP") + test_verbosity_output = {} + for verbosity_index, test_infra in enumerate(test_info.test_infras): + verbose_test_json = crawler.output_dir + "/" \ + + "repo_" + repo_name + "_" \ + + "test_" + str(test_index) + "_"\ + + "infra_" + str(verbosity_index) + "_" \ + + "" if test_rep_id == "" else test_rep_id + "_" \ + + crawler.TEST_VERBOSE_OUTPUT_JSON + infra_verbosity_config = TestInfo.VERBOSE_TESTS_EXTRA_ARGS[test_infra] + if not infra_verbosity_config: # checks if it's an empty object + print("TEST VERBOSE MODE: unsupported test infra " + test_infra) + test_verbosity_output[test_infra] = { "error": True } + continue + infra_verbosity_args = infra_verbosity_config.get("args", "") + infra_verbosity_args_pos = infra_verbosity_config.get("position", -1) # default position is at the end + infra_verbosity_post_proc = infra_verbosity_config.get("post_processing", None) + infra_verbosity_command, out_files = instrument_test_command_for_verbose(test_info.test_command, test_infra, infra_verbosity_args, + verbose_test_json, infra_verbosity_args_pos) + verbosity_script_name = "instrumented_verbosity_command_" + str(verbosity_index) + pkg_json["scripts"][verbosity_script_name] = infra_verbosity_command + with open("package.json", 'w') as f: + json.dump( pkg_json, f) + print("Running verbosity: " + manager + infra_verbosity_command) + verb_error, verb_output, verb_retcode = run_command( manager + verbosity_script_name, crawler.TEST_TIMEOUT) + # if there's post-processing to be done + if not infra_verbosity_post_proc is None: + for out_file_obj in out_files: + infra_verbosity_post_proc(out_file_obj["output_file"]) + verbosity_index += 1 + # get the output + test_verbosity_infra = {} + test_verbosity_infra["command"] = infra_verbosity_command + test_verbosity_infra["output_files"] = out_files + if crawler.VERBOSE_MODE: + test_verbosity_infra["test_debug"] = "\nError output: " + verb_error.decode('utf-8') \ + + "\nOutput stream: " + verb_output.decode('utf-8') + test_verbosity_output[test_infra] = test_verbosity_infra + test_info.set_test_verbosity_output(test_verbosity_output) + # put the package.json back + run_command( "mv TEMP_package.json_TEMP package.json") + # if we're not doing any repeats then don't make another layer of jsons + if crawler.TEST_COMMAND_REPEATS == 1: + test_output_rep = test_info.get_json_rep() + else: + test_output_rep[test_rep_id] = test_info.get_json_rep() + test_json_summary[t] = test_output_rep return( retcode, test_json_summary) def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity_args, verbose_test_json, infra_verbosity_args_pos): From 796dddadccb3f4f17c418a8b46f25c52ee1374c3 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Thu, 22 Jun 2023 23:13:50 -0400 Subject: [PATCH 18/39] wow embarassing string concat FAIL --- src/test_JS_repo_lib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index 6815be4..b231fd9 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -132,7 +132,7 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."): + "repo_" + repo_name + "_" \ + "test_" + str(test_index) + "_"\ + "infra_" 
+ str(verbosity_index) + "_" \ - + "" if test_rep_id == "" else test_rep_id + "_" \ + + ("" if test_rep_id == "" else test_rep_id + "_") \ + crawler.TEST_VERBOSE_OUTPUT_JSON infra_verbosity_config = TestInfo.VERBOSE_TESTS_EXTRA_ARGS[test_infra] if not infra_verbosity_config: # checks if it's an empty object @@ -194,6 +194,7 @@ def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity out_file_object["output_file"] = output_file else: output_file = verbose_test_json[:path_index] + "/out_" + str(num_files) + "_" + verbose_test_json[path_index + 1:] + print(output_file) new_infra_verbosity_args += output_file out_file_object["output_file"] = output_file output_files += [ out_file_object ] From bfa51daafec3a4ccaee63fff69749b73fae3d84f Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Thu, 22 Jun 2023 23:22:22 -0400 Subject: [PATCH 19/39] option for output dir in batch runner --- runParallelGitRepos.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/runParallelGitRepos.sh b/runParallelGitRepos.sh index 7b15acb..7a8240a 100755 --- a/runParallelGitRepos.sh +++ b/runParallelGitRepos.sh @@ -2,10 +2,15 @@ repo_link_file=$1 config_file=$2 +output_dir=$3 if [ ! -f $config_file ]; then config_file="configs/QL_output_config.json" fi +if [ ! -d $output_dir ]; then + output_dir=`pwd` +fi + # you'll probably want to bg this -nohup parallel -j 20 -a $repo_link_file --timeout 600 --joblog job.log python3 src/diagnose_github_repo.py --repo_link {} --config $config_file +nohup parallel -j 20 -a $repo_link_file --timeout 600 --joblog job.log python3 src/diagnose_github_repo.py --repo_link {} --config $config_file --output_dir $output_dir From 6b82cd91aecc5e4d06b220412eb1d99f4ef6bf58 Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Mon, 26 Jun 2023 19:55:49 +0000 Subject: [PATCH 20/39] build docker container in CI --- .github/workflows/end2end.yml | 4 +++- runDocker.sh | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/end2end.yml b/.github/workflows/end2end.yml index 7238098..12171ea 100644 --- a/.github/workflows/end2end.yml +++ b/.github/workflows/end2end.yml @@ -28,12 +28,14 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v3 - + - name: Build NPMFilter container + run: docker build -t npmfilter . - name: Run NPMFilter id: run-npm-filter env: SHA: ${{ inputs.project_sha }} URL: ${{ inputs.project_url }} + DOCKER_IMAGE: npmfilter:latest run: | IFS="/" read -r -a projectArray <<< "$URL" OrgName=${projectArray[0]} diff --git a/runDocker.sh b/runDocker.sh index 4d3f8a6..eee9e76 100755 --- a/runDocker.sh +++ b/runDocker.sh @@ -11,10 +11,14 @@ if [ ! 
-d npm_filter_docker_results ]; then mkdir npm_filter_docker_results fi +if [ -v $DOCKER_IMAGE ]; then + DOCKER_IMAGE=emarteca/npm-filter:latest +fi + docker run --mount type=bind,source=`pwd`/local_mount,destination=/mount \ --volume `pwd`/npm_filter_docker_results:/home/npm-filter/results \ --volume `pwd`/docker_configs:/home/npm-filter/docker_configs\ -w /home/npm-filter \ - emarteca/npm-filter:latest \ + $DOCKER_IMAGE \ bash -c "PATH=/home/codeql_home/codeql:$PATH; $npm_filter_command --output_dir results" -rm -r local_mount +rm -r local_mount \ No newline at end of file From ee4a9192409824dcc1bb5080571f10e66b986a6f Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Mon, 26 Jun 2023 20:53:10 +0000 Subject: [PATCH 21/39] Change CI to build/run the project-specific containers --- .github/workflows/barbosa23flaky.yml | 2 +- .github/workflows/end2endCustomContainers.yml | 107 ++++++++++++++++++ .github/workflows/smoketest.yml | 2 +- 3 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/end2endCustomContainers.yml diff --git a/.github/workflows/barbosa23flaky.yml b/.github/workflows/barbosa23flaky.yml index e16758a..ff38578 100644 --- a/.github/workflows/barbosa23flaky.yml +++ b/.github/workflows/barbosa23flaky.yml @@ -30,7 +30,7 @@ jobs: strategy: matrix: ${{ fromJSON(needs.build-matrix.outputs.matrix-projects) }} fail-fast: false - uses: ./.github/workflows/end2end.yml + uses: ./.github/workflows/end2endCustomContainers.yml with: project_url: ${{ matrix.projects.project_url }} project_sha: ${{ matrix.projects.project_sha }} diff --git a/.github/workflows/end2endCustomContainers.yml b/.github/workflows/end2endCustomContainers.yml new file mode 100644 index 0000000..8dc2c79 --- /dev/null +++ b/.github/workflows/end2endCustomContainers.yml @@ -0,0 +1,107 @@ +name: Test NPMFilter End to End on a Project with custom-built containers per-project + +on: + workflow_dispatch: + inputs: + project_url: + description: 'GitHub suffix of project to test (username/project)' + required: true + type: string + project_sha: + description: 'SHA of project to test' + required: true + type: string + workflow_call: + inputs: + project_url: + description: 'GitHub suffix of project to test (username/project)' + required: true + type: string + project_sha: + description: 'SHA of project to test' + required: true + type: string +jobs: + execute: + runs-on: self-hosted + + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Build NPMFilter container + run: | + if [ -f "project-overrides/${OrgName}-${ProjectName}.sh" ]; then + CUSTOM_INSTALL_SCRIPT="--build-arg CUSTOM_INSTALL_SCRIPT=project-overrides/${OrgName}-${ProjectName}.sh" + fi + docker build -t npmfilter --build-arg REPO_LINK=${{ inputs.project_url }} --REPO_COMMIT=${{ inputs.project_sha }} $CUSTOM_INSTALL_SCRIPT . 
+ - name: Run NPMFilter + id: run-npm-filter + env: + SHA: ${{ inputs.project_sha }} + URL: ${{ inputs.project_url }} + DOCKER_IMAGE: npmfilter:latest + run: | + IFS="/" read -r -a projectArray <<< "$URL" + OrgName=${projectArray[0]} + ProjectName=${projectArray[1]} + LogDir=${URL//\//-} + echo "LogDir=$LogDir" >> $GITHUB_OUTPUT + + echo "Running NPMFilter on $OrgName/$ProjectName@$SHA" + + mkdir -p docker_configs/ + cat >docker_configs/debug_filter_config.json < tests-overview.csv + + # Check if tests were found + TestData=$(cat tests-overview.csv) + IFS="," read -r -a testCount <<< $(python3 output_proc_scripts/count_tests_run.py npm_filter_docker_results/) + TestsRun=${testCount[0]} + if [ $TestsRun -le 2 ]; then + echo "ERROR: No tests found." + exit -1 + else + echo "OK: ${TestsRun} tests found!" + fi + - name: Upload output + uses: actions/upload-artifact@v2 + with: + name: npm_filter_docker_results + path: npm_filter_docker_results \ No newline at end of file diff --git a/.github/workflows/smoketest.yml b/.github/workflows/smoketest.yml index d1fa45e..a9f84e8 100644 --- a/.github/workflows/smoketest.yml +++ b/.github/workflows/smoketest.yml @@ -25,7 +25,7 @@ jobs: strategy: matrix: ${{ fromJSON(needs.build-matrix.outputs.matrix-projects) }} fail-fast: false - uses: ./.github/workflows/end2end.yml + uses: ./.github/workflows/end2endCustomContainers.yml with: project_url: ${{ matrix.projects.project_url }} project_sha: ${{ matrix.projects.project_sha }} From 51d8d0f2b6c66e5b6e42714bf9110956c370b802 Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Mon, 26 Jun 2023 20:54:25 +0000 Subject: [PATCH 22/39] Change CI to build/run the project-specific containers --- .github/workflows/end2endCustomContainers.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/end2endCustomContainers.yml b/.github/workflows/end2endCustomContainers.yml index 8dc2c79..5ca6227 100644 --- a/.github/workflows/end2endCustomContainers.yml +++ b/.github/workflows/end2endCustomContainers.yml @@ -33,7 +33,7 @@ jobs: if [ -f "project-overrides/${OrgName}-${ProjectName}.sh" ]; then CUSTOM_INSTALL_SCRIPT="--build-arg CUSTOM_INSTALL_SCRIPT=project-overrides/${OrgName}-${ProjectName}.sh" fi - docker build -t npmfilter --build-arg REPO_LINK=${{ inputs.project_url }} --REPO_COMMIT=${{ inputs.project_sha }} $CUSTOM_INSTALL_SCRIPT . + docker build -t npmfilter --build-arg REPO_LINK=${{ inputs.project_url }} --build-arg REPO_COMMIT=${{ inputs.project_sha }} $CUSTOM_INSTALL_SCRIPT . - name: Run NPMFilter id: run-npm-filter env: From 4f32ae0bf551825046feb3b8f073d28abdab166b Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Mon, 26 Jun 2023 20:57:58 +0000 Subject: [PATCH 23/39] Change CI to build/run the project-specific containers --- .github/workflows/end2endCustomContainers.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/end2endCustomContainers.yml b/.github/workflows/end2endCustomContainers.yml index 5ca6227..b4388a4 100644 --- a/.github/workflows/end2endCustomContainers.yml +++ b/.github/workflows/end2endCustomContainers.yml @@ -33,7 +33,7 @@ jobs: if [ -f "project-overrides/${OrgName}-${ProjectName}.sh" ]; then CUSTOM_INSTALL_SCRIPT="--build-arg CUSTOM_INSTALL_SCRIPT=project-overrides/${OrgName}-${ProjectName}.sh" fi - docker build -t npmfilter --build-arg REPO_LINK=${{ inputs.project_url }} --build-arg REPO_COMMIT=${{ inputs.project_sha }} $CUSTOM_INSTALL_SCRIPT . 
+ docker build -t npmfilter --build-arg REPO_LINK=https://github.com/${{ inputs.project_url }} --build-arg REPO_COMMIT=${{ inputs.project_sha }} $CUSTOM_INSTALL_SCRIPT . - name: Run NPMFilter id: run-npm-filter env: From c9bd3faf70f71c8560269956af07b6617db26733 Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Mon, 26 Jun 2023 20:59:39 +0000 Subject: [PATCH 24/39] fix bash sourcing in runDocker --- runDocker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runDocker.sh b/runDocker.sh index eee9e76..fb1a3cf 100755 --- a/runDocker.sh +++ b/runDocker.sh @@ -20,5 +20,5 @@ docker run --mount type=bind,source=`pwd`/local_mount,destination=/mount \ --volume `pwd`/docker_configs:/home/npm-filter/docker_configs\ -w /home/npm-filter \ $DOCKER_IMAGE \ - bash -c "PATH=/home/codeql_home/codeql:$PATH; $npm_filter_command --output_dir results" + bash -c "source /envfile; PATH=/home/codeql_home/codeql:\$PATH; $npm_filter_command --output_dir results" rm -r local_mount \ No newline at end of file From d76ac57c04c6c9c3ca38a9ce83f3b5b5361714b0 Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Mon, 26 Jun 2023 21:06:48 +0000 Subject: [PATCH 25/39] back to runDocker.sh in CI --- .github/workflows/end2end.yml | 4 ++++ .github/workflows/end2endCustomContainers.yml | 7 +++++-- .github/workflows/smoketest.yml | 11 ++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/end2end.yml b/.github/workflows/end2end.yml index 12171ea..d3e25f8 100644 --- a/.github/workflows/end2end.yml +++ b/.github/workflows/end2end.yml @@ -63,6 +63,10 @@ jobs: }, "test": { "track_tests": true, + "test_verbose_all_output": { + "do_verbose_tracking": false, + "verbose_json_output_file": "verbose_test_report.json" + }, "tracked_test_commands": ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench", "mocha", "jest", "ava", "tap", "jasmine"], "timeout": 1000 diff --git a/.github/workflows/end2endCustomContainers.yml b/.github/workflows/end2endCustomContainers.yml index b4388a4..fa0ad9a 100644 --- a/.github/workflows/end2endCustomContainers.yml +++ b/.github/workflows/end2endCustomContainers.yml @@ -69,6 +69,10 @@ jobs: "track_tests": true, "tracked_test_commands": ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench", "mocha", "jest", "ava", "tap", "jasmine"], + "test_verbose_all_output": { + "do_verbose_tracking": false, + "verbose_json_output_file": "verbose_test_report.json" + }, "timeout": 1000 }, "meta_info": { @@ -84,8 +88,7 @@ jobs: CUR_DIR=$(pwd) # Run NPMFilter - # ./runDocker.sh python3 src/diagnose_github_repo.py --repo_link_and_SHA https://github.com/$URL $SHA --config docker_configs/debug_filter_config.json - docker run --volume $CUR_DIR/results/:/home/npm-filter/results -w /home/npm-filter npmfilter:latest + ./runDocker.sh python3 src/diagnose_github_repo.py --repo_link_and_SHA https://github.com/$URL $SHA --config docker_configs/debug_filter_config.json # Get tests overview python3 output_proc_scripts/count_tests_run.py npm_filter_docker_results/ > tests-overview.csv diff --git a/.github/workflows/smoketest.yml b/.github/workflows/smoketest.yml index a9f84e8..f6dd45b 100644 --- a/.github/workflows/smoketest.yml +++ b/.github/workflows/smoketest.yml @@ -20,7 +20,7 @@ jobs: echo "matrix-projects<<__EOF__" >> $GITHUB_OUTPUT echo $PROJECTS_JSON >> $GITHUB_OUTPUT echo "__EOF__" >> $GITHUB_OUTPUT - execute: + execute-specialized-container: needs: [build-matrix] strategy: matrix: ${{ 
fromJSON(needs.build-matrix.outputs.matrix-projects) }} @@ -29,3 +29,12 @@ jobs: with: project_url: ${{ matrix.projects.project_url }} project_sha: ${{ matrix.projects.project_sha }} + execute-generic-container: + needs: [build-matrix] + strategy: + matrix: ${{ fromJSON(needs.build-matrix.outputs.matrix-projects) }} + fail-fast: false + uses: ./.github/workflows/end2end.yml + with: + project_url: ${{ matrix.projects.project_url }} + project_sha: ${{ matrix.projects.project_sha }} \ No newline at end of file From e0c26e32359b0d2c6959711990a469be0f7df7a6 Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Tue, 27 Jun 2023 00:46:57 +0000 Subject: [PATCH 26/39] try to fix CI output logging --- .github/workflows/end2end.yml | 4 ++-- .github/workflows/end2endCustomContainers.yml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/end2end.yml b/.github/workflows/end2end.yml index d3e25f8..3b6c3c8 100644 --- a/.github/workflows/end2end.yml +++ b/.github/workflows/end2end.yml @@ -64,7 +64,7 @@ jobs: "test": { "track_tests": true, "test_verbose_all_output": { - "do_verbose_tracking": false, + "do_verbose_tracking": true, "verbose_json_output_file": "verbose_test_report.json" }, "tracked_test_commands": ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench", @@ -101,5 +101,5 @@ jobs: - name: Upload output uses: actions/upload-artifact@v2 with: - name: npm_filter_docker_results + name: results_${{ inputs.project_url }}_${{ inputs.project_sha }} path: npm_filter_docker_results \ No newline at end of file diff --git a/.github/workflows/end2endCustomContainers.yml b/.github/workflows/end2endCustomContainers.yml index fa0ad9a..3fc0dba 100644 --- a/.github/workflows/end2endCustomContainers.yml +++ b/.github/workflows/end2endCustomContainers.yml @@ -70,7 +70,7 @@ jobs: "tracked_test_commands": ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench", "mocha", "jest", "ava", "tap", "jasmine"], "test_verbose_all_output": { - "do_verbose_tracking": false, + "do_verbose_tracking": true, "verbose_json_output_file": "verbose_test_report.json" }, "timeout": 1000 @@ -106,5 +106,5 @@ jobs: - name: Upload output uses: actions/upload-artifact@v2 with: - name: npm_filter_docker_results - path: npm_filter_docker_results \ No newline at end of file + name: results_${{ inputs.project_url }}_${{ inputs.project_sha }} + path: npm_filter_docker_results \ No newline at end of file From 3cdf18bec565bb7dd92d2e5c11c8d0f986918d5d Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Tue, 27 Jun 2023 00:55:31 +0000 Subject: [PATCH 27/39] try to fix CI output logging --- .github/workflows/end2end.yml | 4 +++- .github/workflows/end2endCustomContainers.yml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/end2end.yml b/.github/workflows/end2end.yml index 3b6c3c8..e38c2bd 100644 --- a/.github/workflows/end2end.yml +++ b/.github/workflows/end2end.yml @@ -98,8 +98,10 @@ jobs: else echo "OK: ${TestsRun} tests found!" 
fi + + echo "LOGNAME=results-${OrgName}-${ProjectName}-${SHA}" >> "$GITHUB_OUTPUT" - name: Upload output uses: actions/upload-artifact@v2 with: - name: results_${{ inputs.project_url }}_${{ inputs.project_sha }} + name: ${{ steps.run-npm-filter.outputs.LOGNAME }} path: npm_filter_docker_results \ No newline at end of file diff --git a/.github/workflows/end2endCustomContainers.yml b/.github/workflows/end2endCustomContainers.yml index 3fc0dba..ba8432e 100644 --- a/.github/workflows/end2endCustomContainers.yml +++ b/.github/workflows/end2endCustomContainers.yml @@ -103,8 +103,10 @@ jobs: else echo "OK: ${TestsRun} tests found!" fi + + echo "LOGNAME=results-${OrgName}-${ProjectName}-${SHA}" >> "$GITHUB_OUTPUT" - name: Upload output uses: actions/upload-artifact@v2 with: - name: results_${{ inputs.project_url }}_${{ inputs.project_sha }} + name: ${{ steps.run-npm-filter.outputs.LOGNAME }} path: npm_filter_docker_results \ No newline at end of file From 7d2f03b2c1f301c93cdc60c775731f0874c8d71e Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Tue, 27 Jun 2023 17:16:39 -0400 Subject: [PATCH 28/39] fixing command instrumentation for strings that have test infras but arent a call to them; and a few minor tweaks --- configs/verbose.json | 1 + src/TestInfo.py | 2 +- src/output_parsing/test_output_proc.py | 16 ++++++++++++---- src/test_JS_repo_lib.py | 10 +++++++++- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/configs/verbose.json b/configs/verbose.json index b29a4f1..0a630aa 100644 --- a/configs/verbose.json +++ b/configs/verbose.json @@ -1,5 +1,6 @@ { "test": { + "test_command_repeats": 1, "test_verbose_all_output": { "do_verbose_tracking": true } } } diff --git a/src/TestInfo.py b/src/TestInfo.py index 61f08f0..0cb39a1 100644 --- a/src/TestInfo.py +++ b/src/TestInfo.py @@ -99,7 +99,7 @@ class TestInfo: "gulp lint": "gulp lint -- linter" } - TRACKED_RUNNERS = [ "node", "babel-node", "grunt" ] + TRACKED_RUNNERS = [ "node", "babel-node", "grunt", "lerna" ] def __init__(self, success, error_stream, output_stream, manager, VERBOSE_MODE): self.success = success diff --git a/src/output_parsing/test_output_proc.py b/src/output_parsing/test_output_proc.py index bd52da5..e1b6ee5 100644 --- a/src/output_parsing/test_output_proc.py +++ b/src/output_parsing/test_output_proc.py @@ -8,8 +8,12 @@ def parse_mocha_json_to_csv(output_file, new_output_file=None): # convert an xml file to json # used to convert the xunit reporter output from mocha into json # code from https://www.geeksforgeeks.org/python-xml-to-json/ - with open(output_file) as xml_file: - data_dict = xmltodict.parse(xml_file.read()).get("testsuite", {}) + data_dict = {} + try: + with open(output_file) as xml_file: + data_dict = xmltodict.parse(xml_file.read()).get("testsuite", {}) + except: + data_dict = {} # the format: all the tests are in a top-level list called "testcase" test_suites = [] test_names = [] @@ -34,8 +38,12 @@ def parse_mocha_json_to_csv(output_file, new_output_file=None): def parse_jest_json_to_csv(output_file, new_output_file=None): if new_output_file is None: new_output_file = output_file.split(".")[0] + ".csv" # same name, csv file extension - with open(output_file) as json_file: - data_dict = json.loads(json_file.read()) + data_dict = {} + try: + with open(output_file) as json_file: + data_dict = json.loads(json_file.read()) + except: + data_dict = {} # the format: all tests are in a top level list called "testResults" # this is a list of objects that have "assertionResults" representing the test suites # 
"assertionResults" is a list of objects that have the test data diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index b231fd9..abf56cb 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -134,7 +134,7 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."): + "infra_" + str(verbosity_index) + "_" \ + ("" if test_rep_id == "" else test_rep_id + "_") \ + crawler.TEST_VERBOSE_OUTPUT_JSON - infra_verbosity_config = TestInfo.VERBOSE_TESTS_EXTRA_ARGS[test_infra] + infra_verbosity_config = TestInfo.VERBOSE_TESTS_EXTRA_ARGS.get(test_infra) if not infra_verbosity_config: # checks if it's an empty object print("TEST VERBOSE MODE: unsupported test infra " + test_infra) test_verbosity_output[test_infra] = { "error": True } @@ -205,6 +205,14 @@ def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity infra_calls = test_script.split(test_infra) instrumented_test_command = [] for i, infra_call in enumerate(infra_calls): + # if the last char in the string is not whitespace and not a command delimiter, + # and it's not the last string in the split + # then it's a string that is appended to the front of the name of the infra (e.g., "\"jest\"") + # and not a call + if i < len(infra_calls) - 1 and infra_call != "" and (not infra_call[-1].isspace()) and (not any([infra_call.endswith(s) for s in command_split_chars])): + instrumented_test_command += [ infra_call ] + continue + # if the current call is empty string # then this is the call to the testing infra and the next is the arguments # so, skip this one From 76be9c1e47959f87b90d42b08e5ef9e7bd418bff Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Fri, 7 Jul 2023 16:22:27 -0400 Subject: [PATCH 29/39] adding option to specify a custom lock file to be copied over pre-install --- src/diagnose_github_repo.py | 3 +++ src/diagnose_npm_package.py | 3 +++ src/test_JS_repo_lib.py | 5 +++++ 3 files changed, 11 insertions(+) diff --git a/src/diagnose_github_repo.py b/src/diagnose_github_repo.py index d2a5843..ba2d938 100644 --- a/src/diagnose_github_repo.py +++ b/src/diagnose_github_repo.py @@ -27,6 +27,7 @@ class RepoWalker(): RM_AFTER_CLONING = False SCRIPTS_OVER_CODE = [] CUSTOM_SETUP_SCRIPTS = [] + CUSTOM_LOCK_FILES = [] QL_QUERIES = [] DO_INSTALL = True @@ -93,6 +94,8 @@ def set_up_config( self, config_file): cf_dict = config_json.get( "install", {}) self.DO_INSTALL = cf_dict.get("do_install", self.DO_INSTALL) self.INSTALL_TIMEOUT = cf_dict.get("timeout", self.INSTALL_TIMEOUT) + self.CUSTOM_LOCK_FILES = [ os.path.abspath(os.path.dirname(config_file if config_file else __file__)) + "/" + p + for p in cf_dict.get( "custom_lock_files", self.CUSTOM_LOCK_FILES)] cf_dict = config_json.get( "build", {}) self.TRACK_BUILD = cf_dict.get("track_build", self.TRACK_BUILD) diff --git a/src/diagnose_npm_package.py b/src/diagnose_npm_package.py index 59daa28..efcb4c1 100644 --- a/src/diagnose_npm_package.py +++ b/src/diagnose_npm_package.py @@ -20,6 +20,7 @@ class NPMSpider(scrapy.Spider): RM_AFTER_CLONING = False SCRIPTS_OVER_CODE = [] CUSTOM_SETUP_SCRIPTS = [] + CUSTOM_LOCK_FILES = [] QL_QUERIES = [] DO_INSTALL = True @@ -85,6 +86,8 @@ def set_up_config( self, config_file): cf_dict = config_json.get( "install", {}) self.DO_INSTALL = cf_dict.get("do_install", self.DO_INSTALL) self.INSTALL_TIMEOUT = cf_dict.get("timeout", self.INSTALL_TIMEOUT) + self.CUSTOM_LOCK_FILES = [ os.path.abspath(os.path.dirname(config_file if config_file else __file__)) + "/" + p + for p in cf_dict.get( "custom_lock_files", 
self.CUSTOM_LOCK_FILES)] cf_dict = config_json.get( "build", {}) self.TRACK_BUILD = cf_dict.get("track_build", self.TRACK_BUILD) diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index abf56cb..f65624e 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -307,6 +307,11 @@ def diagnose_package( repo_link, crawler, commit_SHA=None): return( on_diagnose_exit( json_out, crawler, cur_dir, repo_name)) manager = "" + # if there's custom lock files, copy them into the repo (repo is "." since we're in the repo currently) + if crawler.CUSTOM_LOCK_FILES != []: + for custom_lock in crawler.CUSTOM_LOCK_FILES: + run_command("cp " + custom_lock + " .") + # first, check if there is a custom install # this runs custom scripts the same way as the scripts_over_code below; only # difference is it's before the npm-filter run From 83d2179a3f6a8e93a0c8ad182c81007a87d18e96 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Fri, 7 Jul 2023 17:59:49 -0400 Subject: [PATCH 30/39] adding option for diagnosing a local dir --- src/diagnose_github_repo.py | 22 +++++++++++++++++++++- src/test_JS_repo_lib.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/diagnose_github_repo.py b/src/diagnose_github_repo.py index ba2d938..948ebb8 100644 --- a/src/diagnose_github_repo.py +++ b/src/diagnose_github_repo.py @@ -20,6 +20,11 @@ def get_repo_and_SHA_from_repo_link(repo): commit_SHA = split_res[1] return(split_res[0], commit_SHA) +# same format as getting the name from the repo link: we want the name of the dir, +# so after the last slash (and if there's no slash the whole name is returned) +def get_name_from_path(repo_local_path): + return( repo_local_path.split("/")[-1]) + class RepoWalker(): name = "npm-pkgs" @@ -60,6 +65,9 @@ def __init__(self, config_file="", output_dir = "."): def set_repo_links(self, repo_links): self.repo_links = repo_links + def set_local_repo_path(self, repo_local_dir): + self.repo_local_dir = repo_local_dir + def set_up_config( self, config_file): if not os.path.exists(config_file): if config_file != "": @@ -126,22 +134,33 @@ def iterate_over_repos( self): json_results["metadata"]["repo_commit_SHA"] = commit_SHA with open(self.output_dir + "/" + package_name + '__results.json', 'w') as f: json.dump( json_results, f, indent=4) + if self.repo_local_dir: + package_name = get_name_from_path( self.repo_local_dir) + json_results = diagnose_local_dir(self.repo_local_dir, self) + json_results["metadata"] = {} + json_results["metadata"]["repo_local_dir"] = repo_local_dir + with open(self.output_dir + "/" + package_name + '__results.json', 'w') as f: + json.dump( json_results, f, indent=4) argparser = argparse.ArgumentParser(description="Diagnose github repos, from a variety of sources") argparser.add_argument("--repo_list_file", metavar="rlistfile", type=str, nargs='?', help="file with list of github repo links") argparser.add_argument("--repo_link", metavar="rlink", type=str, nargs='?', help="single repo link") +argparser.add_argument("--repo_local_dir", metavar="rlocallink", type=str, nargs='?', help="path to local directory that has the repo code") argparser.add_argument("--repo_link_and_SHA", metavar="rlink_and_SHA", type=str, nargs='*', help="single repo link, with optional commit SHA") argparser.add_argument("--config", metavar="config_file", type=str, nargs='?', help="path to config file") argparser.add_argument("--output_dir", metavar="output_dir", type=str, nargs='?', help="directory for results to be output to") args 
= argparser.parse_args() config = args.config if args.config else "" - output_dir = args.output_dir if args.output_dir else "." walker = RepoWalker(config_file=config, output_dir=output_dir) +repo_local_dir = None +if args.repo_local_dir: + repo_local_dir = os.path.abspath(args.repo_local_dir) + repo_links = [] if args.repo_list_file: try: @@ -159,6 +178,7 @@ def iterate_over_repos( self): # so we join all the repo_link args into a space-delimited string repo_links += [' '.join(args.repo_link_and_SHA)] walker.set_repo_links( repo_links) +walker.set_local_repo_path(repo_local_dir) walker.iterate_over_repos() diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index f65624e..a60eae9 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -281,7 +281,40 @@ def diagnose_package( repo_link, crawler, commit_SHA=None): else: print( "Package repository already exists. Using existing directory: " + repo_name) + # diagnose the repo dir + return( diagnose_repo_name(repo_name, crawler, json_out, cur_dir, commit_SHA=commit_SHA)) +def diagnose_local_dir(repo_dir, crawler): + json_out = {} + repo_name = "" + cur_dir = os.getcwd() + repo_name = repo_dir.split("/")[-1] + if not os.path.isdir(repo_dir): + print("ERROR using local directory: " + repo_dir + " invalid directory path") + json_out["setup"] = {} + json_out["setup"]["local_dir_ERROR"] = True + return( on_diagnose_exit( json_out, crawler, cur_dir, repo_name)) + + print("Diagnosing: " + repo_name + " --- from: " + repo_dir) + if not os.path.isdir("TESTING_REPOS"): + os.mkdir("TESTING_REPOS") + os.chdir("TESTING_REPOS") + + # if the repo already exists, dont clone it + if not os.path.isdir( repo_name): + print( "Copying package directory") + error, output, retcode = run_command( "cp -r " + repo_dir + " " + repo_name) + if retcode != 0: + print("ERROR copying the directory. Exiting now.") + json_out["setup"] = {} + json_out["setup"]["local_dir_ERROR"] = True + return( on_diagnose_exit( json_out, crawler, cur_dir, repo_name)) + else: + print( "Package directory already exists. 
Using existing directory: " + repo_name)
+	# diagnose the repo dir
+	return( diagnose_repo_name(repo_name, crawler, json_out, cur_dir))
+
+def diagnose_repo_name(repo_name, crawler, json_out, cur_dir, commit_SHA=None):
 	# move into the repo and begin testing
 	os.chdir( repo_name)
 

From a6bb11102273f165e7c37461cd70c1c22298ecc4 Mon Sep 17 00:00:00 2001
From: Ellen Arteca
Date: Mon, 10 Jul 2023 21:17:14 -0400
Subject: [PATCH 31/39] fixing bug in verbose test command instrumentation
 when there's args

---
 src/TestInfo.py         |  2 ++
 src/test_JS_repo_lib.py | 26 ++++++++++++++++++++------
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/src/TestInfo.py b/src/TestInfo.py
index 0cb39a1..c5bec7e 100644
--- a/src/TestInfo.py
+++ b/src/TestInfo.py
@@ -228,6 +228,8 @@ def called_in_command( str_comm, command, manager):
 		return( True)
 	if command.find( "cross-env CI=true " + check_comm) > -1:
 		return( True)
+	if command.find( "cross-env TZ=utc " + check_comm) > -1:
+		return( True)
 	if command.find( "opener " + check_comm) > -1:
 		return( True)
 	if command.find( "gulp " + check_comm) > -1:
diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py
index a60eae9..b1e94db 100644
--- a/src/test_JS_repo_lib.py
+++ b/src/test_JS_repo_lib.py
@@ -203,16 +203,30 @@ def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity
 	# split into sub-commands
 	command_split_chars = [ "&&", ";"]
 	infra_calls = test_script.split(test_infra)
-	instrumented_test_command = []
-	for i, infra_call in enumerate(infra_calls):
+	real_calls = []
+	for i, maybe_call in enumerate(infra_calls):
 		# if the last char in the string is not whitespace and not a command delimiter,
 		# and it's not the last string in the split
 		# then it's a string that is appended to the front of the name of the infra (e.g., "\"jest\"")
 		# and not a call
-		if i < len(infra_calls) - 1 and infra_call != "" and (not infra_call[-1].isspace()) and (not any([infra_call.endswith(s) for s in command_split_chars])):
-			instrumented_test_command += [ infra_call ]
-			continue
-
+		# rebuild it
+		if i < len(infra_calls) - 1 and maybe_call != "" and (not maybe_call[-1].isspace()) and (not any([maybe_call.endswith(s) for s in command_split_chars])):
+			if len(real_calls) > 0:
+				real_calls[-1] += test_infra + maybe_call
+			continue
+		# if the first char in the string is not whitespace and not a command delimiter,
+		# and it's not the first string in the split
+		# then it's a string that is appended to the back of the name of the infra (e.g., jest".config.js")
+		# and not a call either
+		# rebuild it
+		if i > 0 and maybe_call != "" and (not maybe_call[0].isspace()) and (not any([maybe_call.startswith(s) for s in command_split_chars])):
+			if len(real_calls) > 0:
+				real_calls[-1] += test_infra + maybe_call
+			continue
+		real_calls += [ maybe_call ]
+	infra_calls = real_calls
+	instrumented_test_command = []
+	for i, infra_call in enumerate(infra_calls):
 		# if the current call is empty string
 		# then this is the call to the testing infra and the next is the arguments
 		# so, skip this one

From 123544e937327da1761a236cacdb874fb8d718cc Mon Sep 17 00:00:00 2001
From: Ellen Arteca
Date: Tue, 11 Jul 2023 00:21:55 -0400
Subject: [PATCH 32/39] another lil bug fix in the verbose test mode
 instrumentation

---
 src/output_parsing/test_output_proc.py | 9 ++++++---
 src/test_JS_repo_lib.py                | 4 ++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/output_parsing/test_output_proc.py b/src/output_parsing/test_output_proc.py
index e1b6ee5..e757291 100644
--- 
a/src/output_parsing/test_output_proc.py +++ b/src/output_parsing/test_output_proc.py @@ -69,6 +69,9 @@ def parse_jest_json_to_csv(output_file, new_output_file=None): test_stdout += [";".join(test_results.get("failureMessages", []))] test_pass_fail += [test_status] # passed/failed/pending -- if not present assume failed res_df = pd.DataFrame(list(zip(test_suites, test_names, test_runtimes, test_stdout, test_pass_fail))) - res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"] - with open(new_output_file, 'w') as csv_file: - csv_file.write(res_df.to_csv()) \ No newline at end of file + try: + res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"] + with open(new_output_file, 'w') as csv_file: + csv_file.write(res_df.to_csv()) + except: + print("ERROR in data for file " + new_output_file + " -- no output printed. skipping to next step...") \ No newline at end of file diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index b1e94db..abb7fbd 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -234,8 +234,8 @@ def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity if infra_call == "" and i < len(infra_calls) - 1: instrumented_test_command += [ "" ] continue - # if the first call is non-empty, then it's pre-test-infra and we skip it too - elif infra_call != "" and i == 0: + # if the first call is non-empty and there's more than one call, then it's pre-test-infra and we skip it too + elif len(infra_calls) > 1 and infra_call != "" and i == 0: instrumented_test_command += [ "" ] continue # get the arguments, splitting off from any other non-test commands that might be From ca0a8a2bc202836a9c1ee469bcb2e1fa427bd275 Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Fri, 4 Aug 2023 13:04:14 +0000 Subject: [PATCH 33/39] Increase timeout from 15 minutes to 3 hours. Running on throttled configs results in a need for a greater timeout... 
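
The 10800-second ceilings below are baked-in defaults; a run that wants different limits can still override them per-run through a custom config file, since the `install`, `build`, and `test` sections each accept a `timeout` field (in seconds, per the comments in the source). A minimal sketch of such an override config:

```json
{
	"install": { "timeout": 10800 },
	"build": { "timeout": 10800 },
	"test": { "timeout": 10800 }
}
```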
--- Dockerfile | 1 - src/diagnose_github_repo.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3710a79..f9427b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,7 +26,6 @@ COPY get_rel_project_reqs.js /home/npm-filter RUN apt-get update \ && apt-get -y install --no-install-recommends python3 git unzip vim curl gnupg xz-utils parallel -RUN apt update RUN apt -y install python3-pip RUN pip3 install bs4 scrapy xmltodict pandas diff --git a/src/diagnose_github_repo.py b/src/diagnose_github_repo.py index 948ebb8..ee7f09b 100644 --- a/src/diagnose_github_repo.py +++ b/src/diagnose_github_repo.py @@ -51,10 +51,10 @@ class RepoWalker(): TRACKED_BUILD_COMMANDS = ["build", "compile", "init"] # timeouts for stages, in seconds - INSTALL_TIMEOUT = 1000 + INSTALL_TIMEOUT = 10800 # 3 hours # note: these are timeouts per *script* in the stage of the process - BUILD_TIMEOUT = 1000 - TEST_TIMEOUT = 1000 + BUILD_TIMEOUT = 10800 # 3 hours + TEST_TIMEOUT = 10800 # 3 hours QL_CUTOFF = 5 # ignore if there are < 5 results From db4382e140b065aa237536dd15d087785c65765c Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Fri, 4 Aug 2023 18:55:00 -0400 Subject: [PATCH 34/39] docs --- README.md | 27 ++++++++++++++++++++++++-- configs/README.md | 17 ++++++++++++++-- configs/default_filter_config.json | 3 ++- src/output_parsing/test_output_proc.py | 17 ++++++++++++++++ src/test_JS_repo_lib.py | 1 + 5 files changed, 60 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 84de04d..11c31da 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ python src/diagnose_github_repo.py [--repo_list_file [rlistfile]] [--repo_link [rlink]] [--repo_link_and_SHA [rlink_and_SHA]] + [--repo_local_dir [path_to_local_dir]] [--config [config_file]] [--output_dir [output_dir]] ``` @@ -35,6 +36,7 @@ All arguments are optional, although the tool will not do anything if no repo li ``` * `--repo_link [rlink]`: a link to a single GitHub repo to be analyzed, e.g., `https://github.com/expressjs/body-parser` * `--repo_link_and_SHA [rlink_and_SHA]`: a link to a single GitHub repo to be analyzed, followed by a space-delimited commit SHA to analyze the repo at, e.g., `https://github.com/expressjs/body-parser d0a214b3beded8a9cd2dcb51d355f92c9ead81d4` +* `repo_local_dir`: path to a local directory containing the source code of a repo/package to be diagnosed * `--config [config_file]`: path to a configuration file for the tool (config options explained in [the config file section](#configuration-file)) * `--output_dir [output_dir]`: path to a directory in which to output the tool's results files (shape of results are explained in [the output section](#output)) @@ -73,6 +75,7 @@ The output is organized into the following top-level fields in the JSON, in orde * if it runs other test commands, then a list of these commands are included (`nested_test_commands`) * whether or not it timed out (`timed_out`) * if it does run new user tests, then the number of passing and number of failing tests (`num_passing`, `num_failing`) + * if verbose testing is specified as an option, then there will be an additional file of extra test output produced * `scripts_over_code`: an object with fields for each of the scripts run over the package source code. For each script, the tool lists its output and if there was an error. * `QL_queries`: an object with fields for each of the QL queries run over the package source code. 
For each script, the tool lists the output (if running in verbose mode), and if there was an error. * `metadata`: an object with fields for some metadata about the package: repository link, commit SHA if one was specified @@ -132,9 +135,29 @@ The output of each QL query is saved to a CSV file in the same directory as the ### Running with docker To be safe, you should probably run any untrusted code in a sandbox. Since the entire point of this tool is to run code from a set of packages/projects you didn't write, we assume most of this code will fall into the untrusted category. -We host the docker container [on DockerHub](https://hub.docker.com/r/emarteca/npm-filter); if you edit the package source code and want to run your version in a docker container, we have included the docker build command below. -#### Building docker (if you've updated the npm-filter source code) +We host the generic docker container [on DockerHub](https://hub.docker.com/r/emarteca/npm-filter); if you edit the package source code and want to run your version in a docker container, we have included the docker build command below. + +The generic docker container runs on any package or repo specified. +However, it is pre-built with default versions of node and npm. +There is also the option to build a _repo-specific_ docker container. +In this case, the container is built with the particular version of node and npm specified in the repo's `package.json` configuration file. +The container is also pre-built with the install and build phases of `npm-filter` run, so that you can then run the tests in the container without waiting for any setup. + +#### Building a container-specific docker +If you want to build a container specific to a particular repo, use the following command: +``` +# general use +docker build -t emarteca/npm-filter --build-arg REPO_LINK=[github link to repo] [--build-arg REPO_COMMIT=[specific commit SHA]] + +# specific example for memfs +docker build -t emarteca/npm-filter --build-arg REPO_LINK=https://github.com/streamich/memfs + +# another example, for memfs at a specific commit +docker build -t emarteca/npm-filter --build-arg REPO_LINK=https://github.com/streamich/memfs --build-arg REPO_COMMIT=863f373185837141504c05ed19f7a253232e0905 +``` + +#### Building generic docker (if you've updated the npm-filter source code) Note: you don't need to do this if you're using npm-filter out of the box. In that case, you'll pull directly from DockerHub. ``` diff --git a/configs/README.md b/configs/README.md index b387231..a79313e 100644 --- a/configs/README.md +++ b/configs/README.md @@ -3,6 +3,7 @@ The configuration file is a JSON, organized by stages of npm-filter analysis. The stages are as follows: * `install`: package installation. Users can specify: * `timeout`: number of millisections after which, if the install is not complete, the process bails and is considered timed out + * `do_install`: if false, skip the install stage * `dependencies`: package dependency tracking (this is the libraries the current package depends on, both directly and transitively). Users can specify: * `track_deps`: if true, this specifies to compute the package dependencies * `include_dev_deps`: if true, this specifies to include the `devDependencies` in the dependency computation @@ -10,10 +11,14 @@ The stages are as follows: * `build`: package compile/build stage. Users can specify: * `tracked_build_commands`: a list of build commands to test (any npm script with one of these commands as a substring will be tested). 
   * `tracked_build_commands`: a list of build commands to test (any npm script with one of these commands as a substring will be tested). Any command not in this list will not be tested for the build stage.
   * `timeout`: timeout in seconds, per build command
+  * `track_build`: if false, skip the build stage
 * `test`: package test stage. Users can specify:
   * `track_tests`: if true, then the tool will run this testing diagnostic stage
   * `tracked_test_commands`: a list of test commands to test (any npm script with one of these commands as a substring will be tested). Any command not in this list will not be tested for the test stage.
   * `timeout`: timeout in seconds, per test command
+  * `test_verbose_all_output`: an object with two fields to configure the "verbose" test tracking option: here, the output and some metrics (runtime, pass/fail status, etc.) for each test are written to a specified file. Note that currently we only support this option for the `jest` and `mocha` test infras.
+    * `do_verbose_tracking`: if true, do this verbose test tracking
+    * `verbose_json_output_file`: name of the file to which to save this verbose output
 * `meta_info`: any analysis-level configurations. Users can specify:
   * `VERBOSE_MODE`: if true, then the output JSON file will include the full output of all the commands run. Mainly for debugging.
   * `ignored_commands`: commands to ignore: if these are present in the npm script name, then they are not run even if they otherwise fall into a category of commands to run (mainly used to exclude any interactive-mode commands, such as tests with `watch`)
@@ -21,6 +26,7 @@ The stages are as follows:
   * `rm_after_cloning`: if true, delete the package source code after the tool is done running. Strongly recommended if running over a large batch of packages.
   * `scripts_over_code`: list of paths to script files to run over the package source code. Note that these paths are relative to the location of **the config file**.
   * `QL_queries`: list of paths to QL query files to run over the package source code. Like the scripts, these paths are relative to the location of the config file.
+  * `custom_setup_scripts`: list of paths to script files to run over the package code after cloning, but before any of the stages of `npm-filter` are actually run. Commonly used to replace the default install stage (i.e., set `do_install` to `false`). Like all the other scripts, these paths are relative to the location of the config file.
 
 Users can customize any of the configuration fields by providing a JSON file with the desired fields modified. Default values are used for any fields not specified.
 
@@ -29,18 +35,24 @@ As a demonstrative example, the default configuration is included below.
 ```
 {
     "install": {
-        "timeout": 1000
+        "timeout": 1000,
+        "do_install": true
     },
     "dependencies": {
         "track_deps": false,
         "include_dev_deps": false
     },
     "build": {
+        "track_build": true,
         "tracked_build_commands": ["build", "compile", "init"],
         "timeout": 1000
     },
     "test": {
         "track_tests": true,
+        "test_verbose_all_output": {
+            "do_verbose_tracking": false,
+            "verbose_json_output_file": "verbose_test_report.json"
+        },
         "tracked_test_commands": ["test", "unit", "cov", "ci", "integration",
             "lint", "travis", "e2e", "bench",
             "mocha", "jest", "ava", "tap", "jasmine"],
         "timeout": 1000
@@ -51,7 +63,8 @@ As a demonstrative example, the default configuration is included below.
     "ignored_substrings": ["--watch", "nodemon"],
     "rm_after_cloning": false,
     "scripts_over_code": [ ],
-    "QL_queries": [ ]
+    "QL_queries": [ ],
+    "custom_setup_scripts": [ ]
     }
 }
 ```
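A minimal custom config exercising the new options documented above might look like the following sketch (not part of the patch; the script and file names are hypothetical). Only fields that differ from the defaults need to appear, since unspecified fields keep their default values:

```
# Hypothetical example: write a custom npm-filter config that skips the
# default install stage, enables the new verbose test tracking, and runs a
# custom setup script instead. File/script names here are made up.
import json

custom_config = {
    "install": { "do_install": False },
    "test": {
        "test_verbose_all_output": {
            "do_verbose_tracking": True,
            "verbose_json_output_file": "verbose_test_report.json"
        }
    },
    "meta_info": {
        # assumed location of custom_setup_scripts, per the docs above
        "custom_setup_scripts": [ "./my_install_setup.sh" ]
    }
}

with open("my_verbose_config.json", 'w') as f:
    json.dump(custom_config, f, indent=4)
```

The resulting file would then be passed to the tool via `--config my_verbose_config.json`.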
"ignored_substrings": ["--watch", "nodemon"], "rm_after_cloning": false, "scripts_over_code": [ ], - "QL_queries": [ ] + "QL_queries": [ ], + "custom_setup_scripts": [ ] } } ``` diff --git a/configs/default_filter_config.json b/configs/default_filter_config.json index 56d0149..d0d8fa4 100644 --- a/configs/default_filter_config.json +++ b/configs/default_filter_config.json @@ -29,6 +29,7 @@ "ignored_substrings": ["--watch", "nodemon"], "rm_after_cloning": false, "scripts_over_code": [ ], - "QL_queries": [ ] + "QL_queries": [ ], + "custom_setup_scripts": [ ] } } \ No newline at end of file diff --git a/src/output_parsing/test_output_proc.py b/src/output_parsing/test_output_proc.py index e757291..5d4bc3b 100644 --- a/src/output_parsing/test_output_proc.py +++ b/src/output_parsing/test_output_proc.py @@ -2,6 +2,14 @@ import xmltodict import pandas as pd +# parse the output of mocha xunit reporter to a csv +# does not delete the original xunit output file +# outputs include, per test (in this order): +# - test suite it's a part of +# - name of the test itself +# - runtime of the test +# - stdout of the test (if any) +# - pass/fail status (could also be "pending") def parse_mocha_json_to_csv(output_file, new_output_file=None): if new_output_file is None: new_output_file = output_file.split(".")[0] + ".csv" # same name, csv file extension @@ -35,6 +43,15 @@ def parse_mocha_json_to_csv(output_file, new_output_file=None): with open(new_output_file, 'w') as csv_file: csv_file.write(res_df.to_csv()) +# parse the output of jest xunit reporter to a csv +# this does the same thing as for mocha, to produce the same data fields +# does not delete the original xunit output file +# outputs include, per test (in this order): +# - test suite it's a part of +# - name of the test itself +# - runtime of the test +# - stdout of the test (if any) +# - pass/fail status (could also be "pending") def parse_jest_json_to_csv(output_file, new_output_file=None): if new_output_file is None: new_output_file = output_file.split(".")[0] + ".csv" # same name, csv file extension diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py index abb7fbd..cd19dd1 100644 --- a/src/test_JS_repo_lib.py +++ b/src/test_JS_repo_lib.py @@ -174,6 +174,7 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."): test_json_summary[t] = test_output_rep return( retcode, test_json_summary) +# instrument the test command specified to make it produce verbose output to a file def instrument_test_command_for_verbose(test_script, test_infra, infra_verbosity_args, verbose_test_json, infra_verbosity_args_pos): # replace the output file name with the custom output filename # add an index to the filename for the 2nd,+ time the filename shows up From 265471ad5c953e282d8904782b077c68987c93f1 Mon Sep 17 00:00:00 2001 From: Ellen Arteca Date: Fri, 4 Aug 2023 18:55:48 -0400 Subject: [PATCH 35/39] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 11c31da..f801837 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ python src/diagnose_github_repo.py [--repo_list_file [rlistfile]] [--repo_link [rlink]] [--repo_link_and_SHA [rlink_and_SHA]] - [--repo_local_dir [path_to_local_dir]] + [--repo_local_dir [path_to_local_dir]] [--config [config_file]] [--output_dir [output_dir]] ``` From 27783bbb71dd88e1ee84c9158612c9d5c514aefe Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Wed, 16 Aug 2023 20:42:52 +0000 Subject: [PATCH 36/39] Add logging for timing of each test 
From 27783bbb71dd88e1ee84c9158612c9d5c514aefe Mon Sep 17 00:00:00 2001
From: Jonathan Bell
Date: Wed, 16 Aug 2023 20:42:52 +0000
Subject: [PATCH 36/39] Add logging for timing of each test target + attempt
 to force jest to run tests in band

---
 src/TestInfo.py         | 8 +++++++-
 src/test_JS_repo_lib.py | 6 ++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/TestInfo.py b/src/TestInfo.py
index c5bec7e..7d99c9f 100644
--- a/src/TestInfo.py
+++ b/src/TestInfo.py
@@ -40,9 +40,11 @@ class TestInfo:
     }
     # extra args, their position in the arg list, and any post-processing required
     # post-processing is a function that takes 2 arguments: input file and output file
+    # CAUTION: DO NOT PUT ANY MORE ARGS AFTER PLACEHOLDER_OUTPUT_FILE_NAME. THE CODE THAT
+    # PARSES THE OUTPUT RELIES ON THIS BEING THE *LAST* ARGUMENT
     VERBOSE_TESTS_EXTRA_ARGS = {
         "jest": {
-            "args": " --verbose --json --outputFile=$PLACEHOLDER_OUTPUT_FILE_NAME$",
+            "args": " --verbose --json -i --outputFile=$PLACEHOLDER_OUTPUT_FILE_NAME$",
             "position": -1,
             "post_processing": TestOutputProc.parse_jest_json_to_csv
         },
@@ -116,6 +118,8 @@ def __init__(self, success, error_stream, output_stream, manager, VERBOSE_MODE):
         self.timed_out = False
         self.VERBOSE_MODE = VERBOSE_MODE
         self.test_verbosity_output = None
+        self.startTime = 0
+        self.endTime = 0
 
     def set_test_command( self, test_command):
         self.test_command = test_command
@@ -189,6 +193,8 @@ def get_json_rep( self):
         if self.test_verbosity_output:
             json_rep["test_verbosity_output"] = self.test_verbosity_output
         json_rep["timed_out"] = self.timed_out
+        json_rep["start_time"] = self.startTime
+        json_rep["end_time"] = self.endTime
         return( json_rep)
 
     def __str__(self):
diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py
index abb7fbd..8113f1e 100644
--- a/src/test_JS_repo_lib.py
+++ b/src/test_JS_repo_lib.py
@@ -2,6 +2,7 @@
 import subprocess
 import json
 import os
+import time
 from TestInfo import *
 
 def run_command( commands, timeout=None):
@@ -114,8 +115,13 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."):
         for test_rep_index in range(crawler.TEST_COMMAND_REPEATS):
             test_rep_id = "" if crawler.TEST_COMMAND_REPEATS == 1 else "testrep_" + str(test_rep_index)
             print("Running rep " + str(test_rep_index) + " of " + str(crawler.TEST_COMMAND_REPEATS - 1) + ": " + manager + t)
+            # time how long the next line takes
+            startTime = time.time()
             error, output, retcode = run_command( manager + t, crawler.TEST_TIMEOUT)
+            endTime = time.time()
             test_info = TestInfo( (retcode == 0), error, output, manager, crawler.VERBOSE_MODE)
+            test_info.startTime = startTime
+            test_info.endTime = endTime
             test_info.set_test_command( pkg_json.get("scripts", {})[t])
             test_info.compute_test_infras()
             test_info.compute_nested_test_commands( test_scripts)
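Two things happen in this patch: jest's `-i` flag (an alias of `--runInBand`) forces tests to run serially in a single process, which makes per-test timings more stable, and the test run itself is bracketed with wall-clock timestamps. The timing pattern, pulled out as a hypothetical standalone helper (not code from the patch):

```
# Hypothetical helper illustrating the timing pattern the patch adds around
# run_command: record wall-clock time before and after the blocking call.
import subprocess
import time

def timed_run(command, timeout=None):
    start_time = time.time()
    proc = subprocess.run(command, shell=True, capture_output=True, timeout=timeout)
    end_time = time.time()
    # the patch stores these on the TestInfo object, and they surface in the
    # output JSON as "start_time" and "end_time" (epoch seconds)
    return proc.returncode, start_time, end_time
```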
From 98a8bda517d73fd9009fed8e32ab1a29c435e4b2 Mon Sep 17 00:00:00 2001
From: Ellen Arteca
Date: Mon, 21 Aug 2023 19:37:32 -0400
Subject: [PATCH 37/39] stop running the non-verbose-instrumented version of
 test commands if we are also running the verbose-instrumented version

---
 src/TestInfo.py         |  9 +++++++--
 src/test_JS_repo_lib.py | 34 +++++++++++++++++++++-------------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/src/TestInfo.py b/src/TestInfo.py
index c5bec7e..0c91d2c 100644
--- a/src/TestInfo.py
+++ b/src/TestInfo.py
@@ -123,14 +123,19 @@ def set_test_command( self, test_command):
     def set_test_verbosity_output( self, verbose_output):
         self.test_verbosity_output = verbose_output
 
+    def get_test_infras_list( test_command, manager):
+        test_infras = []
+        test_infras += [ ti for ti in TestInfo.TRACKED_INFRAS if called_in_command(ti, test_command, manager) ]
+        test_infras += [ ri for ri in TestInfo.TRACKED_RUNNERS if called_in_command(ri, test_command, manager) ]
+        return( test_infras)
+
     def compute_test_infras( self):
         self.test_infras = []
         self.test_covs = []
         self.test_lints = []
         self.nested_test_commands = []
         if self.test_command:
-            self.test_infras += [ ti for ti in TestInfo.TRACKED_INFRAS if called_in_command(ti, self.test_command, self.manager) ]
-            self.test_infras += [ ri for ri in TestInfo.TRACKED_RUNNERS if called_in_command(ri, self.test_command, self.manager) ]
+            self.test_infras += TestInfo.get_test_infras_list(self.test_command, self.manager)
             self.test_covs += [ TestInfo.TRACKED_COVERAGE[ti] for ti in TestInfo.TRACKED_COVERAGE if called_in_command(ti, self.test_command, self.manager) ]
             self.test_lints += [ TestInfo.TRACKED_LINTERS[ti] for ti in TestInfo.TRACKED_LINTERS if called_in_command(ti, self.test_command, self.manager) ]
         self.test_infras = list(set(self.test_infras))
diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py
index cd19dd1..4623306 100644
--- a/src/test_JS_repo_lib.py
+++ b/src/test_JS_repo_lib.py
@@ -114,20 +114,16 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."):
         for test_rep_index in range(crawler.TEST_COMMAND_REPEATS):
             test_rep_id = "" if crawler.TEST_COMMAND_REPEATS == 1 else "testrep_" + str(test_rep_index)
             print("Running rep " + str(test_rep_index) + " of " + str(crawler.TEST_COMMAND_REPEATS - 1) + ": " + manager + t)
-            error, output, retcode = run_command( manager + t, crawler.TEST_TIMEOUT)
-            test_info = TestInfo( (retcode == 0), error, output, manager, crawler.VERBOSE_MODE)
-            test_info.set_test_command( pkg_json.get("scripts", {})[t])
-            test_info.compute_test_infras()
-            test_info.compute_nested_test_commands( test_scripts)
-            test_info.compute_test_stats()
+            test_command = pkg_json.get("scripts", {})[t]
+            test_infras = TestInfo.get_test_infras_list(test_command, manager)
+            test_verbosity_output = {}
             # if we're in verbose testing mode (i.e. getting all timing info for each test, etc)
             # then, we rerun the test commands with all the commands for adding verbose_mode to
             # each of the test infras involved (individually)
             if crawler.TEST_VERBOSE_ALL_OUTPUT:
                 # we're gonna be adding our new custom scripts for verbosity testing
                 run_command( "mv package.json TEMP_package.json_TEMP")
-                test_verbosity_output = {}
-                for verbosity_index, test_infra in enumerate(test_info.test_infras):
+                for verbosity_index, test_infra in enumerate(test_infras):
                     verbose_test_json = crawler.output_dir + "/" \
                         + "repo_" + repo_name + "_" \
                         + "test_" + str(test_index) + "_"\
@@ -142,14 +138,14 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."):
                     infra_verbosity_args = infra_verbosity_config.get("args", "")
                     infra_verbosity_args_pos = infra_verbosity_config.get("position", -1) # default position is at the end
                     infra_verbosity_post_proc = infra_verbosity_config.get("post_processing", None)
-                    infra_verbosity_command, out_files = instrument_test_command_for_verbose(test_info.test_command, test_infra, infra_verbosity_args,
+                    infra_verbosity_command, out_files = instrument_test_command_for_verbose(test_command, test_infra, infra_verbosity_args,
                         verbose_test_json, infra_verbosity_args_pos)
                     verbosity_script_name = "instrumented_verbosity_command_" + str(verbosity_index)
                     pkg_json["scripts"][verbosity_script_name] = infra_verbosity_command
                     with open("package.json", 'w') as f:
                         json.dump( pkg_json, f)
                     print("Running verbosity: " + manager + infra_verbosity_command)
-                    verb_error, verb_output, verb_retcode = run_command( manager + verbosity_script_name, crawler.TEST_TIMEOUT)
+                    error, output, retcode = run_command( manager + verbosity_script_name, crawler.TEST_TIMEOUT)
                     # if there's post-processing to be done
                     if not infra_verbosity_post_proc is None:
                         for out_file_obj in out_files:
@@ -160,12 +156,24 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."):
                     test_verbosity_infra["command"] = infra_verbosity_command
                     test_verbosity_infra["output_files"] = out_files
                     if crawler.VERBOSE_MODE:
-                        test_verbosity_infra["test_debug"] = "\nError output: " + verb_error.decode('utf-8') \
-                            + "\nOutput stream: " + verb_output.decode('utf-8')
+                        test_verbosity_infra["test_debug"] = "\nError output: " + error.decode('utf-8') \
+                            + "\nOutput stream: " + output.decode('utf-8')
                     test_verbosity_output[test_infra] = test_verbosity_infra
-                test_info.set_test_verbosity_output(test_verbosity_output)
                 # put the package.json back
                 run_command( "mv TEMP_package.json_TEMP package.json")
+            # not verbose test mode -- just run the normal test command
+            else:
+                error, output, retcode = run_command( manager + t, crawler.TEST_TIMEOUT)
+            test_info = TestInfo( (retcode == 0), error, output, manager, crawler.VERBOSE_MODE)
+            # the below info on the test infras etc is independent of verbose mode: just based on the command itself
+            test_info.set_test_command( test_command)
+            test_info.compute_test_infras()
+            test_info.compute_nested_test_commands( test_scripts)
+            # note: if we're running in verbose mode, then the stats will be that of the last executed verbose mode
+            # instrumented version of the test command
+            test_info.compute_test_stats()
+            if crawler.TEST_VERBOSE_ALL_OUTPUT:
+                test_info.set_test_verbosity_output(test_verbosity_output)
             # if we're not doing any repeats then don't make another layer of jsons
             if crawler.TEST_COMMAND_REPEATS == 1:
                 test_output_rep = test_info.get_json_rep()
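For context on the `out_files` values threaded through the code above: `instrument_test_command_for_verbose` substitutes the output-file placeholder in the infra's extra args, and (per its comments) indexes repeated file names from the second occurrence on so they don't overwrite each other. A simplified sketch of that substitution, as a hypothetical standalone helper with an assumed naming scheme (the real function also splices the args into the test command at the configured position):

```
# Simplified sketch of the placeholder substitution: each occurrence of the
# placeholder becomes a concrete output file name, indexed from the second
# occurrence on so repeated matches do not clobber each other's files.
PLACEHOLDER = "$PLACEHOLDER_OUTPUT_FILE_NAME$"

def fill_output_file_placeholders(args_template, output_file):
    pieces = args_template.split(PLACEHOLDER)
    filled = pieces[0]
    out_files = []
    for i, piece in enumerate(pieces[1:]):
        if i == 0:
            name = output_file
        elif "." in output_file:
            root, ext = output_file.rsplit(".", 1)
            name = root + "_" + str(i) + "." + ext
        else:
            name = output_file + "_" + str(i)
        out_files.append(name)
        filled += name + piece
    return filled, out_files

# e.g., with the jest args above:
# fill_output_file_placeholders(" --verbose --json -i --outputFile=" + PLACEHOLDER,
#                               "verbose_test_report.json")
# returns (" --verbose --json -i --outputFile=verbose_test_report.json",
#          ["verbose_test_report.json"])
```

This is also why the CAUTION comment in patch 36 insists the placeholder stay the last argument: the output parsing depends on it.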
From a875faa9c74593d8204e13ba399becf864822625 Mon Sep 17 00:00:00 2001
From: Ellen Arteca
Date: Mon, 21 Aug 2023 23:10:25 -0400
Subject: [PATCH 38/39] lil fix

---
 src/test_JS_repo_lib.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/test_JS_repo_lib.py b/src/test_JS_repo_lib.py
index 73ae479..81b415d 100644
--- a/src/test_JS_repo_lib.py
+++ b/src/test_JS_repo_lib.py
@@ -169,7 +169,9 @@ def run_tests( manager, pkg_json, crawler, repo_name, cur_dir="."):
                 # put the package.json back
                 run_command( "mv TEMP_package.json_TEMP package.json")
             # not verbose test mode -- just run the normal test command
-            else:
+            # if start and end time are both still zero then no instrumented test commands ran
+            # and so we also rerun here
+            if (not crawler.TEST_VERBOSE_ALL_OUTPUT) or (start_time == 0 and end_time == 0):
                 start_time = time.time()
                 error, output, retcode = run_command( manager + t, crawler.TEST_TIMEOUT)
                 end_time = time.time()

From 7dc635f59678d1959346b6483569d61badab270b Mon Sep 17 00:00:00 2001
From: Ellen Arteca
Date: Thu, 24 Aug 2023 22:50:00 -0400
Subject: [PATCH 39/39] catch errors in mocha output format (same as jest)

---
 src/output_parsing/test_output_proc.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/output_parsing/test_output_proc.py b/src/output_parsing/test_output_proc.py
index 5d4bc3b..9ab742d 100644
--- a/src/output_parsing/test_output_proc.py
+++ b/src/output_parsing/test_output_proc.py
@@ -39,9 +39,12 @@ def parse_mocha_json_to_csv(output_file, new_output_file=None):
             test_stdout += [""]
             test_pass_fail += ["passed"]
     res_df = pd.DataFrame(list(zip(test_suites, test_names, test_runtimes, test_stdout, test_pass_fail)))
-    res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"]
-    with open(new_output_file, 'w') as csv_file:
-        csv_file.write(res_df.to_csv())
+    try:
+        res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"]
+        with open(new_output_file, 'w') as csv_file:
+            csv_file.write(res_df.to_csv())
+    except:
+        print("ERROR in data for file " + new_output_file + " -- no output printed. skipping to next step...")
 
 # parse the output of jest xunit reporter to a csv
 # this does the same thing as for mocha, to produce the same data fields
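The `try`/`except` in this last patch guards the column assignment: if the report is empty or malformed, the zipped lists produce a DataFrame without exactly five columns, and pandas raises `ValueError` on the assignment. A minimal illustration of the failure mode (not from the tool):

```
# Minimal reproduction of the failure the patch guards against: an empty
# report yields a DataFrame with zero columns, so assigning five column
# names raises ValueError ("Length mismatch: Expected axis has 0 elements...").
import pandas as pd

res_df = pd.DataFrame(list(zip()))  # what the parser builds from an empty report
try:
    res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"]
except ValueError as err:
    print("column mismatch: " + str(err))
```

Catching `ValueError` specifically, rather than the patch's bare `except:`, would keep unrelated errors (e.g., an unwritable output path) visible instead of silently skipping them.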