added general trend collection, parsing and scraping tools from the maven ecosystem
Cornul11 committed Nov 18, 2023
1 parent 6e38692 commit 67d8681
Showing 6 changed files with 487 additions and 25 deletions.
59 changes: 59 additions & 0 deletions util/build_table.py
@@ -0,0 +1,59 @@
import argparse
import json
import os


def generate_latex_table(folder_path):
    folder_path = os.path.abspath(folder_path)
    thresholds = [0.5, 0.75, 0.9, 0.95, 0.99, 1.0]
    latex_table = []

    # five columns: Configuration, Threshold, Precision, Recall, F1 Score
    latex_table.append(r"\begin{tabular}{ccccc}")
    latex_table.append(r"\toprule")
    latex_table.append(r"Configuration & Threshold & Precision & Recall & F1 Score \\")
    latex_table.append(r"\midrule")

    for configuration in [
        "Relocation Disabled",
        "Relocation Enabled",
        "Minimize Jar Disabled",
        "Minimize Jar Enabled",
    ]:
        latex_table.append(f"{configuration} & & & & \\\\")
        for threshold in thresholds:
            filename = os.path.join(folder_path, f"stats_{threshold}.json")
            if not os.path.exists(filename):
                print(f"File not found: {filename}")
                continue

            with open(filename, "r") as f:
                data = json.load(f)
            suffix = configuration.replace(" ", "")

            total_f1_score = data[f"totalF1Score{suffix}"]
            precision = data[f"precision{suffix}"]
            recall = data[f"recall{suffix}"]
            total_projects = data[f"totalProjects{suffix}"]

            if total_projects == 0:
                precision_val = recall_val = f1_score_val = 0
            else:
                precision_val = precision / total_projects
                recall_val = recall / total_projects
                f1_score_val = total_f1_score / total_projects

            row = f"& {threshold} & {precision_val:.3f} & {recall_val:.3f} & {f1_score_val:.3f} \\\\"
            latex_table.append(row)
        latex_table.append(r"\midrule")

    latex_table.append(r"\bottomrule")
    latex_table.append(r"\end{tabular}")

    latex_code = "\n".join(latex_table)
    print(latex_code)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate LaTeX table from JSON files."
    )
    parser.add_argument(
        "folder_path", type=str, help="Path to the folder containing the JSON files"
    )
    args = parser.parse_args()
    generate_latex_table(args.folder_path)
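
The keys read above imply a flat, per-threshold JSON layout. As a minimal sketch (hypothetical values; only the "Relocation Disabled" keys shown), a compatible stats_0.5.json could be produced like this:

import json

sample = {
    "totalF1ScoreRelocationDisabled": 41.2,  # sum of per-project F1 scores
    "precisionRelocationDisabled": 44.7,  # sum of per-project precisions
    "recallRelocationDisabled": 39.9,  # sum of per-project recalls
    "totalProjectsRelocationDisabled": 50,  # divisor for the reported averages
}
with open("stats_0.5.json", "w") as f:
    json.dump(sample, f, indent=4)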
40 changes: 40 additions & 0 deletions util/collect_most_popular_pkgs.py
@@ -0,0 +1,40 @@
import json
import os
import sys
import time
from os.path import join, dirname

import requests
from dotenv import load_dotenv

dotenv_path = join(dirname(__file__), ".env")
load_dotenv(dotenv_path)

LIBRARIES_IO_KEY = os.environ.get("LIBRARIES_IO_KEY")


def get_top_libraries(n):
    # the API allows at most one query per second
    max_page = n // 100 + 1  # e.g. n = 250 -> 3 pages of 100 results each
    url = "https://libraries.io/api/search?platforms=Maven&sort=rank&per_page=100"
    libraries = []
    params = {"api_key": LIBRARIES_IO_KEY}
    for page in range(1, max_page + 1):
        params["page"] = page
        response = requests.get(url, params=params)
        libraries.extend(response.json())
        time.sleep(1)  # throttle to at most 60 queries per minute

    return libraries[:n]


if __name__ == "__main__":
    if len(sys.argv) != 2 or not sys.argv[1].isdigit():
        print("Usage: python collect_most_popular_pkgs.py <number_of_pkgs>")
        sys.exit(1)

    libraries = get_top_libraries(int(sys.argv[1]))
    with open("libraries.json", "w") as file:
        json.dump(libraries, file, indent=4)
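
The script assumes a Libraries.io API key in util/.env (loaded by python-dotenv above); the value below is a placeholder:

LIBRARIES_IO_KEY=<your-libraries.io-api-key>

It is then invoked per the usage string, e.g. python collect_most_popular_pkgs.py 1000 to write the 1000 highest-ranked Maven packages to libraries.json.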


133 changes: 133 additions & 0 deletions util/parse_top_maven_libs.py
@@ -0,0 +1,133 @@
from bs4 import BeautifulSoup
import json


def extract_libraries(html_string):
    soup = BeautifulSoup(html_string, "html.parser")
    libraries = []
    for div in soup.find_all("div", class_="im"):
        if div.find("div", class_="im-header") is None:
            # div is an ad, not a library entry
            continue
        title = (
            div.find("div", class_="im-header")
            .find("h2", class_="im-title")
            .find("a")
            .text.strip()
        )
        subtitle = div.find("div", class_="im-header").find("p", class_="im-subtitle")
        links = subtitle.find_all("a")
        groupId = links[0].text.strip()
        artifactId = links[1].text.strip()
        libraries.append((groupId, artifactId, title))
    return libraries
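
# each entry returned by extract_libraries is a (groupId, artifactId, title) tuple,
# e.g. (hypothetical): ("com.google.guava", "guava", "Guava: Google Core Libraries For Java")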


with open("libraries.json", "r") as file:
    libraries = json.load(file)


def read_html_string():
    lines = []
    while True:
        line = input()
        if line == "END":
            break
        lines.append(line)
    return "\n".join(lines)


if False:
    # disabled one-off step for extracting the top used libraries
    for _ in range(10):
        print("Enter HTML string (type 'END' on a new line to finish):")
        html_string = read_html_string()
        libraries.extend(extract_libraries(html_string))


def extract_versions(html_string):
    soup = BeautifulSoup(html_string, "html.parser")
    version_data = []

    table = soup.select_one("#snippets .grid.versions")
    if not table:
        return version_data

    for tr in table.find_all("tr"):
        version_link = tr.find("a", class_="vbtn release")
        usages_link = tr.find("div", class_="pbt")
        vulnerabilities_link = tr.find("a", class_="vuln")

        if version_link:
            version = version_link.text.strip()
            usages = (
                int(usages_link.text.strip().replace(",", ""))
                if usages_link and usages_link.text.strip().replace(",", "").isdigit()
                else 0
            )
            vulnerabilities = (
                int(vulnerabilities_link.text.split()[0].replace(",", ""))
                if vulnerabilities_link
                else 0
            )

            version_data.append((version, usages, vulnerabilities))

    return version_data


def find_most_popular_and_vulnerable(version_data):
    non_vulnerable_versions = [v for v in version_data if v[2] == 0]
    vulnerable_versions = [v for v in version_data if v[2] > 0]

    most_popular_non_vulnerable = max(
        non_vulnerable_versions, key=lambda x: x[1], default=None
    )
    most_popular_vulnerable = max(vulnerable_versions, key=lambda x: x[1], default=None)

    return most_popular_non_vulnerable, most_popular_vulnerable
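
# example with hypothetical data, entries being (version, usages, vulnerabilities):
#   find_most_popular_and_vulnerable([("1.0", 500, 0), ("1.1", 900, 2), ("1.2", 300, 0)])
#   returns (("1.0", 500, 0), ("1.1", 900, 2))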


def extract_library_info(html_string):
    soup = BeautifulSoup(html_string, "html.parser")
    breadcrumb = soup.select_one("div.breadcrumb")
    if not breadcrumb:
        return None, None
    breadcrumb_parts = breadcrumb.get_text(strip=True).split("»")
    if len(breadcrumb_parts) < 3:
        return None, None

    groupId = breadcrumb_parts[1].strip()
    artifactId = breadcrumb_parts[2].strip()
    return groupId, artifactId
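
# e.g. a (hypothetical) breadcrumb "Home » org.slf4j » slf4j-api » 1.7.36"
# yields ("org.slf4j", "slf4j-api")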


def update_library_versions(
    libraries, groupId, artifactId, most_popular, most_vulnerable
):
    for lib in libraries:
        if lib["groupId"] == groupId and lib["artifactId"] == artifactId:
            lib["mostPopularVersion"] = most_popular
            lib["mostVulnerableVersion"] = most_vulnerable
            break


for _ in range(1):
    print("Enter HTML string (type 'END' on a new line to finish):")
    html_string = read_html_string()
    # read_html_string strips the trailing "END", so the sentinel is just "KEK"
    if html_string == "KEK":
        break

    groupId, artifactId = extract_library_info(html_string)

    most_popular, most_vulnerable = find_most_popular_and_vulnerable(
        extract_versions(html_string)
    )

    update_library_versions(
        libraries, groupId, artifactId, most_popular, most_vulnerable
    )


with open("libraries.json", "w") as file:
    json.dump(libraries, file, indent=4)
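
Each updated entry in libraries.json then carries the two version tuples, serialized as JSON arrays; a sketch of one entry with hypothetical values, matching the fields read and written above:

{
    "groupId": "org.slf4j",
    "artifactId": "slf4j-api",
    "mostPopularVersion": ["1.7.36", 12345, 0],
    "mostVulnerableVersion": ["1.5.8", 2345, 1]
}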
76 changes: 51 additions & 25 deletions util/pom_analysis.py
@@ -43,6 +43,17 @@ def get_pom_files_from_file(file_path):
        yield line.strip()


def get_publication_date_from_maven_repo_header(group_id, artifact_id, version):
    url = f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/{artifact_id}/{version}/{artifact_id}-{version}.pom"
    response = requests.head(url)
    if response.status_code == 200:
        date_text = response.headers["last-modified"]
        publication_date = datetime.strptime(
            date_text, "%a, %d %b %Y %H:%M:%S %Z"
        ).strftime("%Y-%m")  # e.g. "Sat, 18 Nov 2023 12:00:00 GMT" -> "2023-11"
        return publication_date


def get_publication_date_from_maven_repo(group_id, artifact_id, version):
    global total_waiting_for_maven
    start_time = datetime.now()
@@ -136,6 +147,21 @@ def get_publication_date_from_local_maven_index(group_id, artifact_id, version):
    total_waiting_for_maven += (end_time - start_time).total_seconds()


def extract_gav_from_pom_path(pom_file_path):
    # the last folder name is the version, the second to last is the artifactId,
    # and everything after .../.m2/repository/ up to those two is the groupId,
    # with slashes replaced by dots
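    # e.g. .../.m2/repository/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom
    # yields ("org.apache.commons", "commons-lang3", "3.12.0")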

    path_components = pom_file_path.split(os.sep)
    version = path_components[-2]
    artifact_id = path_components[-3]

    m2_index = path_components.index(".m2")
    group_id = ".".join(path_components[m2_index + 2 : -3])

    return group_id, artifact_id, version


def contains_shade_plugin(pom_file_path):
    result_dict = {
        "path": pom_file_path,
@@ -155,28 +181,6 @@ def contains_shade_plugin(pom_file_path):
    root = tree.getroot()
    ns_url = "http://maven.apache.org/POM/4.0.0"

    group_id = root.find(f"{{{ns_url}}}groupId")
    artifact_id = root.find(f"{{{ns_url}}}artifactId")
    version = root.find(f"{{{ns_url}}}version")

    if group_id is None:
        parent = root.find(f"{{{ns_url}}}parent")
        if parent is not None:
            group_id = parent.find(f"{{{ns_url}}}groupId")

    if version is None:
        parent = root.find(f"{{{ns_url}}}parent")
        if parent is not None:
            version = parent.find(f"{{{ns_url}}}version")

    result_dict.update(
        {
            "group_id": group_id.text if group_id is not None else None,
            "artifact_id": artifact_id.text if artifact_id is not None else None,
            "version": version.text if version is not None else None,
        }
    )

    if root.find(f"{{{ns_url}}}parent") is not None:
        result_dict["has_parent"] = True

@@ -275,6 +279,7 @@ def create_archive(pom_files, archive_path, progress_bar):
total_relocations = 0
total_errors = 0
total_not_found_in_index = 0
total_not_found = 0
total_with_parents = 0
total_shade_plugin_no_parent = 0
overall_trends = {}
@@ -303,18 +308,33 @@ def create_archive(pom_files, archive_path, progress_bar):
    total=total_pom_files,
    desc="Processing pom.xml files",
):
    if result["is_error"]:
    group_id, artifact_id, version = extract_gav_from_pom_path(result["path"])

    if (
        result["is_error"]
        or group_id is None
        or artifact_id is None
        or version is None
    ):
        total_errors += 1

    date = get_publication_date_from_local_maven_index(
        result["group_id"], result["artifact_id"], result["version"]
        group_id, artifact_id, version
    )

    if date is None:
        total_not_found_in_index += 1
        date = get_publication_date_from_maven_repo_header(
            group_id, artifact_id, version
        )

    # retain only the year and month for stats
    year_month = date[:7] if date else None

    if date:
        overall_trends[year_month] = overall_trends.get(year_month, 0) + 1
    else:
        total_not_found_in_index += 1
        total_not_found += 1

    if result["has_shade_plugin"]:
        if date:
@@ -375,6 +395,9 @@ def create_archive(pom_files, archive_path, progress_bar):
print(
    f"Total not found in index: {total_not_found_in_index} ({total_not_found_in_index / total_shade_plugins * 100:.2f}%)"
)
print(
    f"Total not found: {total_not_found} ({total_not_found / total_shade_plugins * 100:.2f}%)"
)

if args.save:
    stats = {
@@ -386,6 +409,9 @@ def create_archive(pom_files, archive_path, progress_bar):
"total_minimize_jar": total_minimize_jar,
"total_relocations": total_relocations,
"total_with_parents": total_with_parents,
"total_not_found_in_index": total_not_found_in_index,
"total_not_found": total_not_found,
"general_trends": overall_trends,
"shade_plugin_trends": monthly_trends,
}
