diff --git a/util/artifact_downloader.py b/util/artifact_downloader.py
--- a/util/artifact_downloader.py
+++ b/util/artifact_downloader.py
@@ -1,7 +1,14 @@
 import json
+import logging
 import os
 
 import requests
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
 
 
 def download_jar(artifact, version_key, folder="artifacts"):
@@ -9,33 +16,58 @@ def download_jar(artifact, version_key, folder="artifacts"):
     version_info = artifact.get(version_key)
 
     if version_info and "Central" in version_info.get("repositories", []):
-        group_path = artifact["groupId"].replace(".", "/")
+        group_id = artifact["groupId"]
+        group_path = group_id.replace(".", "/")
         artifact_id = artifact["artifactId"]
         version = version_info["version"]
 
         url = f"https://repo1.maven.org/maven2/{group_path}/{artifact_id}/{version}/{artifact_id}-{version}.jar"
-        file_path = os.path.join(folder, f"{artifact_id}-{version}.jar")
+
+        # Probe with HEAD first so a missing JAR is never fetched or saved.
+        response = requests.head(url)
+        if response.status_code == 404:
+            tqdm_log(f"File {url} does not exist")
+            # Swap only the trailing extension; str.replace would also touch
+            # a ".jar" occurring inside the artifact id or version.
+            pom_url = url[: -len(".jar")] + ".pom"
+            response = requests.head(pom_url)
+            if response.status_code == 200:
+                tqdm_log(f"File {pom_url} exists, but is a POM file")
+            # Without a JAR there is nothing to download, POM or not.
+            return
+
+        # Store each JAR in a sub-folder named after its groupId.
+        file_path = os.path.join(folder, group_id, f"{artifact_id}-{version}.jar")
+
+        if os.path.exists(file_path):
+            tqdm_log(f"File {file_path} already exists", logging.INFO)
+            return
 
         response = requests.get(url)
         if response.status_code == 200:
-            os.makedirs(folder, exist_ok=True)
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
             with open(file_path, "wb") as f:
                 f.write(response.content)
         else:
-            print(f"Failed to download {url} with status code {response.status_code}")
+            tqdm_log(
+                f"Failed to download {url} with status code {response.status_code}"
+            )
+
+
+def tqdm_log(msg, level=logging.ERROR):
+    """Log *msg* at *level*, routing console output through tqdm."""
+    with logging_redirect_tqdm():
+        logging.log(level, msg)
 
 
 def main(file_path):
     with open(file_path, "r") as file:
         artifacts = json.load(file)
 
-    for i, artifact in enumerate(artifacts, start=1):
+    for artifact in tqdm(artifacts, desc="Downloading artifacts", unit="artifact"):
         download_jar(artifact, "mostUsedVersion")
         download_jar(artifact, "mostUsedVulnerableVersion")
 
-        if i >= 5:
-            break
-
 
 if __name__ == "__main__":
     import sys
diff --git a/util/infer_and_save.py b/util/infer_and_save.py
new file mode 100644
--- /dev/null
+++ b/util/infer_and_save.py
@@ -0,0 +1,75 @@
+import logging
+import os
+import subprocess
+
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+
+def main(artifacts_path, custom_cwd):
+    """Run the Java corpus app once for every JAR found under a folder.
+
+    Walks *artifacts_path* recursively and, for each ``*.jar`` file, runs
+    the main class with ``-m IDENTIFICATION_MODE``, the JAR as ``-f`` and a
+    sibling ``.json`` path as ``-o``, using *custom_cwd* as working directory.
+    """
+    java_command = "/usr/lib/jvm/java-11-openjdk-amd64/bin/java"
+    java_options = "-Xmx8g"
+    # Resolved against custom_cwd; the JVM expands the "dir/*" wildcard
+    # itself, so no shell is required.
+    classpath = "target/dependency/*:target/thesis-1.0-SNAPSHOT.jar"
+    main_class = "nl.tudelft.cornul11.thesis.corpus.MainApp"
+    mode = "IDENTIFICATION_MODE"
+
+    for root, _dirs, files in tqdm(os.walk(artifacts_path)):
+        for file in files:
+            if not file.endswith(".jar"):
+                continue
+
+            jar_path = os.path.join(root, file)
+            # Swap only the trailing extension; str.replace would also
+            # rewrite a ".jar" inside a directory name.
+            output_path = jar_path[: -len(".jar")] + ".json"
+            command = [
+                java_command,
+                java_options,
+                "-cp",
+                classpath,
+                main_class,
+                "-m",
+                mode,
+                "-f",
+                jar_path,
+                "-o",
+                output_path,
+            ]
+
+            logging.info(f"Running command: {' '.join(command)}")
+            # Pass the argv list directly: combined with shell=True, POSIX
+            # runs only the first item and gives the rest to the shell.
+            result = subprocess.run(command, cwd=custom_cwd)
+            if result.returncode != 0:
+                tqdm_log(f"Failed to run command: {' '.join(command)}")
+            else:
+                tqdm_log(
+                    f"Successfully ran command: {' '.join(command)}",
+                    logging.INFO,
+                )
+
+
+def tqdm_log(msg, level=logging.ERROR):
+    """Log *msg* at *level*, routing console output through tqdm."""
+    with logging_redirect_tqdm():
+        logging.log(level, msg)
+
+
+if __name__ == "__main__":
+    import sys
+
+    artifacts_path = sys.argv[1]
+    custom_cwd = sys.argv[2]
+    main(artifacts_path, custom_cwd)