Merge branch 'raft' into eval-yost-js
jon-bell committed Sep 18, 2023
2 parents 39fe4a8 + 7dc635f commit 588428e
Showing 9 changed files with 212 additions and 40 deletions.
1 change: 0 additions & 1 deletion Dockerfile
@@ -26,7 +26,6 @@ COPY get_rel_project_reqs.js /home/npm-filter
RUN apt-get update \
&& apt-get -y install --no-install-recommends python3 git unzip vim curl gnupg xz-utils parallel

RUN apt update
RUN apt -y install python3-pip
RUN pip3 install bs4 scrapy xmltodict pandas

27 changes: 25 additions & 2 deletions README.md
@@ -20,6 +20,7 @@ python src/diagnose_github_repo.py
[--repo_list_file [rlistfile]]
[--repo_link [rlink]]
[--repo_link_and_SHA [rlink_and_SHA]]
[--repo_local_dir [path_to_local_dir]]
[--config [config_file]]
[--output_dir [output_dir]]
```
@@ -35,6 +36,7 @@ All arguments are optional, although the tool will not do anything if no repo li
```
* `--repo_link [rlink]`: a link to a single GitHub repo to be analyzed, e.g., `https://github.com/expressjs/body-parser`
* `--repo_link_and_SHA [rlink_and_SHA]`: a link to a single GitHub repo to be analyzed, followed by a space-delimited commit SHA to analyze the repo at, e.g., `https://github.com/expressjs/body-parser d0a214b3beded8a9cd2dcb51d355f92c9ead81d4`
* `--repo_local_dir [path_to_local_dir]`: path to a local directory containing the source code of a repo/package to be diagnosed
* `--config [config_file]`: path to a configuration file for the tool (config options explained in [the config file section](#configuration-file))
* `--output_dir [output_dir]`: path to a directory in which to output the tool's results files (the shape of the results is explained in [the output section](#output))
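
For example, here are two possible invocations (the `results` output directory and the local path are illustrative only):

```
# analyze a single repo at a specific commit, using the default config
python src/diagnose_github_repo.py --repo_link_and_SHA https://github.com/expressjs/body-parser d0a214b3beded8a9cd2dcb51d355f92c9ead81d4 --config configs/default_filter_config.json --output_dir results

# analyze a repo/package already checked out on disk (hypothetical local path)
python src/diagnose_github_repo.py --repo_local_dir /home/user/body-parser --output_dir results
```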

@@ -73,6 +75,7 @@ The output is organized into the following top-level fields in the JSON, in orde
* if it runs other test commands, then a list of these commands is included (`nested_test_commands`)
* whether or not it timed out (`timed_out`)
* if it does run new user tests, then the number of passing and number of failing tests (`num_passing`, `num_failing`)
* if verbose testing is enabled, an additional file of extra test output is produced
* `scripts_over_code`: an object with fields for each of the scripts run over the package source code. For each script, the tool lists its output and whether there was an error.
* `QL_queries`: an object with fields for each of the QL queries run over the package source code. For each query, the tool lists the output (if running in verbose mode) and whether there was an error.
* `metadata`: an object with fields for some metadata about the package: repository link, commit SHA if one was specified
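
As a rough sketch, a results JSON might be shaped as below; names not spelled out above (the top-level `test` key and the `repo_link` metadata key) are illustrative only:

```
{
    "test": {
        "npm test": {
            "timed_out": false,
            "num_passing": 120,
            "num_failing": 2
        }
    },
    "scripts_over_code": { },
    "QL_queries": { },
    "metadata": {
        "repo_link": "https://github.com/expressjs/body-parser",
        "repo_commit_SHA": "d0a214b3beded8a9cd2dcb51d355f92c9ead81d4"
    }
}
```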
Expand Down Expand Up @@ -132,9 +135,29 @@ The output of each QL query is saved to a CSV file in the same directory as the
### Running with docker
To be safe, you should probably run any untrusted code in a sandbox.
Since the entire point of this tool is to run code from a set of packages/projects you didn't write, we assume most of this code will fall into the untrusted category.
We host the docker container [on DockerHub](https://hub.docker.com/r/emarteca/npm-filter); if you edit the package source code and want to run your version in a docker container, we have included the docker build command below.

#### Building docker (if you've updated the npm-filter source code)
We host the generic docker container [on DockerHub](https://hub.docker.com/r/emarteca/npm-filter); if you edit the package source code and want to run your version in a docker container, we have included the docker build command below.

The generic docker container runs on any package or repo specified.
However, it is pre-built with default versions of node and npm.
There is also the option to build a _repo-specific_ docker container.
In this case, the container is built with the particular version of node and npm specified in the repo's `package.json` configuration file.
The container is also pre-built with the install and build phases of `npm-filter` run, so that you can then run the tests in the container without waiting for any setup.

#### Building a repo-specific docker container
If you want to build a container specific to a particular repo, use the following command:
```
# general use
docker build -t emarteca/npm-filter --build-arg REPO_LINK=[github link to repo] [--build-arg REPO_COMMIT=[specific commit SHA]]
# specific example for memfs
docker build -t emarteca/npm-filter --build-arg REPO_LINK=https://github.com/streamich/memfs
# another example, for memfs at a specific commit
docker build -t emarteca/npm-filter --build-arg REPO_LINK=https://github.com/streamich/memfs --build-arg REPO_COMMIT=863f373185837141504c05ed19f7a253232e0905
```
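
Once a repo-specific image is built, one way to use it is to start a container and work with the pre-installed project interactively (a minimal sketch; the exact entrypoint depends on how the image was built):

```
# hypothetical: open a shell in the image built above and run the tests from there
docker run --rm -it emarteca/npm-filter bash
```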

#### Building generic docker (if you've updated the npm-filter source code)
Note: you don't need to do this if you're using npm-filter out of the box.
In that case, you'll pull directly from DockerHub.
```
17 changes: 15 additions & 2 deletions configs/README.md
@@ -3,24 +3,30 @@ The configuration file is a JSON, organized by stages of npm-filter analysis.
The stages are as follows:
* `install`: package installation. Users can specify:
* `timeout`: number of milliseconds after which, if the install is not complete, the process bails and is considered timed out
* `do_install`: if false, skip the install stage
* `dependencies`: package dependency tracking (these are the libraries the current package depends on, both directly and transitively). Users can specify:
* `track_deps`: if true, compute the package dependencies
* `include_dev_deps`: if true, include the `devDependencies` in the dependency computation
* `timeout`: timeout in milliseconds
* `build`: package compile/build stage. Users can specify:
* `tracked_build_commands`: a list of build commands to test (any npm script with one of these commands as a substring will be tested). Any command not in this list will not be tested for the build stage.
* `timeout`: timeout in milliseconds, per build command
* `track_build`: if false, skip the build stage
* `test`: package test stage. Users can specify:
* `track_tests`: if true, then the tool will run this testing diagnostic stage
* `tracked_test_commands`: a list of test commands to test (any npm script with one of these commands as a substring will be tested). Any command not in this list will not be tested for the test stage.
* `timeout`: timeout in milliseconds, per test command
* `test_verbose_all_output`: an object with two fields to configure the "verbose" test tracking option: here, the output and some metrics (runtime, pass/fail status, etc.) for each test are written to a specified file. Note that currently we only support this option for the `jest` and `mocha` test infras.
* `do_verbose_tracking`: if true, do this verbose test tracking
* `verbose_json_output_file`: name of the file to which to save this verbose output
* `meta_info`: any analysis-level configurations. Users can specify:
* `VERBOSE_MODE`: if true, then the output JSON file will include the full output of all the commands run. Mainly for debugging.
* `ignored_commands`: commands to ignore: if these strings are present in the npm script name, the script is not run even if it otherwise falls into a category of commands to run (mainly used to exclude interactive-mode commands, such as tests with `watch`)
* `ignored_substrings`: commands to ignore: if these strings are present in the command string itself, the npm script is not run (same as `ignored_commands`, but matching on the command strings instead of the npm script names)
* `rm_after_cloning`: if true, delete the package source code after the tool is done running. Strongly recommended if running over a large batch of packages.
* `scripts_over_code`: list of paths to script files to run over the package source code. Note that these paths are relative to the location of **the config file**.
* `QL_queries`: list of paths to QL query files to run over the package source code. Like the scripts, these paths are relative to the location of the config file.
* `custom_setup_scripts`: list of paths to script files to run over the package code after cloning, but before any of the stages of `npm-filter` are actually run. Commonly used to replace the default install stage (i.e., set `do_install` to `false`). Like all the other scripts, these paths are relative to the location of the config file.

Users can customize any of the configuration fields by providing a JSON file with the desired fields modified.
Default values are used for any fields not specified.
@@ -29,18 +35,24 @@ As a demonstrative example, the default configuration is included below.
```
{
"install": {
"timeout": 1000
"timeout": 1000,
"do_install": true
},
"dependencies": {
"track_deps": false,
"include_dev_deps": false
},
"build": {
"track_build": true,
"tracked_build_commands": ["build", "compile", "init"],
"timeout": 1000
},
"test": {
"track_tests": true,
"test_verbose_all_output": {
"do_verbose_tracking": false,
"verbose_json_output_file": "verbose_test_report.json"
},
"tracked_test_commands": ["test", "unit", "cov", "ci", "integration", "lint", "travis", "e2e", "bench",
"mocha", "jest", "ava", "tap", "jasmine"],
"timeout": 1000
@@ -51,7 +63,8 @@ As a demonstrative example, the default configuration is included below.
"ignored_substrings": ["--watch", "nodemon"],
"rm_after_cloning": false,
"scripts_over_code": [ ],
"QL_queries": [ ]
"QL_queries": [ ],
"custom_setup_scripts": [ ]
}
}
```
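
For instance, a minimal custom config that skips the default install stage (deferring setup to the hypothetical script `my_setup.sh`) and turns on verbose test tracking could look like the sketch below; any field not listed keeps its default value:

```
{
    "install": {
        "do_install": false
    },
    "test": {
        "test_verbose_all_output": {
            "do_verbose_tracking": true,
            "verbose_json_output_file": "verbose_test_report.json"
        }
    },
    "meta_info": {
        "custom_setup_scripts": [ "my_setup.sh" ]
    }
}
```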
3 changes: 2 additions & 1 deletion configs/default_filter_config.json
@@ -29,6 +29,7 @@
"ignored_substrings": ["--watch", "nodemon"],
"rm_after_cloning": false,
"scripts_over_code": [ ],
"QL_queries": [ ]
"QL_queries": [ ],
"custom_setup_scripts": [ ]
}
}
19 changes: 16 additions & 3 deletions src/TestInfo.py
@@ -40,9 +40,11 @@ class TestInfo:
}
# extra args, their position in the arg list, and any post-processing required
# post-processing is a function that takes 2 arguments: input file and output file
# CAUTION: DO NOT PUT ANY MORE ARGS AFTER PLACEHOLDER_OUTPUT_FILE_NAME. THE CODE THAT
# PARSES THE OUTPUT RELIES ON THIS BEING THE *LAST* ARGUMENT
VERBOSE_TESTS_EXTRA_ARGS = {
"jest": {
"args": " --verbose --json --outputFile=$PLACEHOLDER_OUTPUT_FILE_NAME$",
"args": " --verbose --json -i --outputFile=$PLACEHOLDER_OUTPUT_FILE_NAME$",
"position": -1,
"post_processing": TestOutputProc.parse_jest_json_to_csv
},
@@ -116,21 +118,28 @@ def __init__(self, success, error_stream, output_stream, manager, VERBOSE_MODE):
self.timed_out = False
self.VERBOSE_MODE = VERBOSE_MODE
self.test_verbosity_output = None
self.start_time = 0
self.end_time = 0

def set_test_command( self, test_command):
self.test_command = test_command

def set_test_verbosity_output( self, verbose_output):
self.test_verbosity_output = verbose_output

def get_test_infras_list( test_command, manager):
test_infras = []
test_infras += [ ti for ti in TestInfo.TRACKED_INFRAS if called_in_command(ti, test_command, manager) ]
test_infras += [ ri for ri in TestInfo.TRACKED_RUNNERS if called_in_command(ri, test_command, manager) ]
return( test_infras)

def compute_test_infras( self):
self.test_infras = []
self.test_covs = []
self.test_lints = []
self.nested_test_commands = []
if self.test_command:
self.test_infras += [ ti for ti in TestInfo.TRACKED_INFRAS if called_in_command(ti, self.test_command, self.manager) ]
self.test_infras += [ ri for ri in TestInfo.TRACKED_RUNNERS if called_in_command(ri, self.test_command, self.manager) ]
self.test_infras += TestInfo.get_test_infras_list(self.test_command, self.manager)
self.test_covs += [ TestInfo.TRACKED_COVERAGE[ti] for ti in TestInfo.TRACKED_COVERAGE if called_in_command(ti, self.test_command, self.manager) ]
self.test_lints += [ TestInfo.TRACKED_LINTERS[ti] for ti in TestInfo.TRACKED_LINTERS if called_in_command(ti, self.test_command, self.manager) ]
self.test_infras = list(set(self.test_infras))
@@ -189,6 +198,8 @@ def get_json_rep( self):
if self.test_verbosity_output:
json_rep["test_verbosity_output"] = self.test_verbosity_output
json_rep["timed_out"] = self.timed_out
json_rep["start_time"] = self.start_time
json_rep["end_time"] = self.end_time
return( json_rep)

def __str__(self):
@@ -228,6 +239,8 @@ def called_in_command( str_comm, command, manager):
return( True)
if command.find( "cross-env CI=true " + check_comm) > -1:
return( True)
if command.find( "cross-env TZ=utc " + check_comm) > -1:
return( True)
if command.find( "opener " + check_comm) > -1:
return( True)
if command.find( "gulp " + check_comm) > -1:
31 changes: 27 additions & 4 deletions src/diagnose_github_repo.py
@@ -20,13 +20,19 @@ def get_repo_and_SHA_from_repo_link(repo):
commit_SHA = split_res[1]
return(split_res[0], commit_SHA)

# same format as getting the name from the repo link: we want the name of the dir,
# so after the last slash (and if there's no slash the whole name is returned)
def get_name_from_path(repo_local_path):
return( repo_local_path.split("/")[-1])


class RepoWalker():
name = "npm-pkgs"
VERBOSE_MODE = False
RM_AFTER_CLONING = False
SCRIPTS_OVER_CODE = []
CUSTOM_SETUP_SCRIPTS = []
CUSTOM_LOCK_FILES = []
QL_QUERIES = []

DO_INSTALL = True
@@ -45,10 +51,10 @@ class RepoWalker():
TRACKED_BUILD_COMMANDS = ["build", "compile", "init"]

# timeouts for stages, in seconds
INSTALL_TIMEOUT = 1000
INSTALL_TIMEOUT = 10800 # 3 hours
# note: these are timeouts per *script* in the stage of the process
BUILD_TIMEOUT = 1000
TEST_TIMEOUT = 1000
BUILD_TIMEOUT = 10800 # 3 hours
TEST_TIMEOUT = 10800 # 3 hours

QL_CUTOFF = 5 # ignore if there are < 5 results

@@ -59,6 +65,9 @@ def __init__(self, config_file="", output_dir = "."):
def set_repo_links(self, repo_links):
self.repo_links = repo_links

def set_local_repo_path(self, repo_local_dir):
self.repo_local_dir = repo_local_dir

def set_up_config( self, config_file):
if not os.path.exists(config_file):
if config_file != "":
@@ -93,6 +102,8 @@ def set_up_config( self, config_file):
cf_dict = config_json.get( "install", {})
self.DO_INSTALL = cf_dict.get("do_install", self.DO_INSTALL)
self.INSTALL_TIMEOUT = cf_dict.get("timeout", self.INSTALL_TIMEOUT)
self.CUSTOM_LOCK_FILES = [ os.path.abspath(os.path.dirname(config_file if config_file else __file__)) + "/" + p
for p in cf_dict.get( "custom_lock_files", self.CUSTOM_LOCK_FILES)]

cf_dict = config_json.get( "build", {})
self.TRACK_BUILD = cf_dict.get("track_build", self.TRACK_BUILD)
@@ -123,22 +134,33 @@ def iterate_over_repos( self):
json_results["metadata"]["repo_commit_SHA"] = commit_SHA
with open(self.output_dir + "/" + package_name + '__results.json', 'w') as f:
json.dump( json_results, f, indent=4)
if self.repo_local_dir:
package_name = get_name_from_path( self.repo_local_dir)
json_results = diagnose_local_dir(self.repo_local_dir, self)
json_results["metadata"] = {}
json_results["metadata"]["repo_local_dir"] = self.repo_local_dir
with open(self.output_dir + "/" + package_name + '__results.json', 'w') as f:
json.dump( json_results, f, indent=4)


argparser = argparse.ArgumentParser(description="Diagnose github repos, from a variety of sources")
argparser.add_argument("--repo_list_file", metavar="rlistfile", type=str, nargs='?', help="file with list of github repo links")
argparser.add_argument("--repo_link", metavar="rlink", type=str, nargs='?', help="single repo link")
argparser.add_argument("--repo_local_dir", metavar="rlocallink", type=str, nargs='?', help="path to local directory that has the repo code")
argparser.add_argument("--repo_link_and_SHA", metavar="rlink_and_SHA", type=str, nargs='*', help="single repo link, with optional commit SHA")
argparser.add_argument("--config", metavar="config_file", type=str, nargs='?', help="path to config file")
argparser.add_argument("--output_dir", metavar="output_dir", type=str, nargs='?', help="directory for results to be output to")
args = argparser.parse_args()

config = args.config if args.config else ""

output_dir = args.output_dir if args.output_dir else "."

walker = RepoWalker(config_file=config, output_dir=output_dir)

repo_local_dir = None
if args.repo_local_dir:
repo_local_dir = os.path.abspath(args.repo_local_dir)

repo_links = []
if args.repo_list_file:
try:
@@ -156,6 +178,7 @@ def iterate_over_repos( self):
# so we join all the repo_link args into a space-delimited string
repo_links += [' '.join(args.repo_link_and_SHA)]
walker.set_repo_links( repo_links)
walker.set_local_repo_path(repo_local_dir)
walker.iterate_over_repos()


3 changes: 3 additions & 0 deletions src/diagnose_npm_package.py
@@ -20,6 +20,7 @@ class NPMSpider(scrapy.Spider):
RM_AFTER_CLONING = False
SCRIPTS_OVER_CODE = []
CUSTOM_SETUP_SCRIPTS = []
CUSTOM_LOCK_FILES = []
QL_QUERIES = []

DO_INSTALL = True
@@ -85,6 +86,8 @@ def set_up_config( self, config_file):
cf_dict = config_json.get( "install", {})
self.DO_INSTALL = cf_dict.get("do_install", self.DO_INSTALL)
self.INSTALL_TIMEOUT = cf_dict.get("timeout", self.INSTALL_TIMEOUT)
self.CUSTOM_LOCK_FILES = [ os.path.abspath(os.path.dirname(config_file if config_file else __file__)) + "/" + p
for p in cf_dict.get( "custom_lock_files", self.CUSTOM_LOCK_FILES)]

cf_dict = config_json.get( "build", {})
self.TRACK_BUILD = cf_dict.get("track_build", self.TRACK_BUILD)
35 changes: 29 additions & 6 deletions src/output_parsing/test_output_proc.py
@@ -2,6 +2,14 @@
import xmltodict
import pandas as pd

# parse the output of mocha xunit reporter to a csv
# does not delete the original xunit output file
# outputs include, per test (in this order):
# - test suite it's a part of
# - name of the test itself
# - runtime of the test
# - stdout of the test (if any)
# - pass/fail status (could also be "pending")
def parse_mocha_json_to_csv(output_file, new_output_file=None):
if new_output_file is None:
new_output_file = output_file.split(".")[0] + ".csv" # same name, csv file extension
@@ -31,10 +39,22 @@ def parse_mocha_json_to_csv(output_file, new_output_file=None):
test_stdout += [""]
test_pass_fail += ["passed"]
res_df = pd.DataFrame(list(zip(test_suites, test_names, test_runtimes, test_stdout, test_pass_fail)))
res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"]
with open(new_output_file, 'w') as csv_file:
csv_file.write(res_df.to_csv())
try:
res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"]
with open(new_output_file, 'w') as csv_file:
csv_file.write(res_df.to_csv())
except:
print("ERROR in data for file " + new_output_file + " -- no output printed. skipping to next step...")

# parse the output of jest xunit reporter to a csv
# this does the same thing as for mocha, to produce the same data fields
# does not delete the original xunit output file
# outputs include, per test (in this order):
# - test suite it's a part of
# - name of the test itself
# - runtime of the test
# - stdout of the test (if any)
# - pass/fail status (could also be "pending")
def parse_jest_json_to_csv(output_file, new_output_file=None):
if new_output_file is None:
new_output_file = output_file.split(".")[0] + ".csv" # same name, csv file extension
@@ -69,6 +89,9 @@ def parse_jest_json_to_csv(output_file, new_output_file=None):
test_stdout += [";".join(test_results.get("failureMessages", []))]
test_pass_fail += [test_status] # passed/failed/pending -- if not present assume failed
res_df = pd.DataFrame(list(zip(test_suites, test_names, test_runtimes, test_stdout, test_pass_fail)))
res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"]
with open(new_output_file, 'w') as csv_file:
csv_file.write(res_df.to_csv())
try:
res_df.columns = ["test_suite", "name", "runtime", "stdout", "pass_fail"]
with open(new_output_file, 'w') as csv_file:
csv_file.write(res_df.to_csv())
except:
print("ERROR in data for file " + new_output_file + " -- no output printed. skipping to next step...")
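
For reference, both parsers write a CSV with the same columns, so a small output might look like the following (hypothetical values; pandas' `to_csv` also emits a leading index column):

```
,test_suite,name,runtime,stdout,pass_fail
0,body-parser,should parse JSON bodies,12,,passed
1,body-parser,should reject malformed JSON,8,error: unexpected token,failed
```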