-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added general trend collection, parsing, and scraping tools for the Maven ecosystem
- Loading branch information
Showing
6 changed files
with
487 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import argparse | ||
import json | ||
import os | ||
|
||
|
||
def generate_latex_table(folder_path):
    """Print a LaTeX (booktabs) table of precision, recall and F1 per threshold.

    For each threshold the file ``stats_<threshold>.json`` is read from
    *folder_path*.  For each configuration the entries
    ``precision<Suffix>``, ``recall<Suffix>`` and ``totalF1Score<Suffix>``
    are divided by ``totalProjects<Suffix>`` (the suffix is the configuration
    name without spaces) and printed as one table row with three decimals.
    A configuration with zero projects yields a row of zeros.

    Missing stats files are reported once on stdout and skipped.

    Args:
        folder_path: Directory containing the ``stats_*.json`` files.

    Raises:
        KeyError: If a stats file lacks one of the expected entries.
    """
    folder_path = os.path.abspath(folder_path)
    thresholds = [0.5, 0.75, 0.9, 0.95, 0.99, 1.0]
    configurations = [
        "Relocation Disabled",
        "Relocation Enabled",
        "Minimize Jar Disabled",
        "Minimize Jar Enabled",
    ]

    # Parse every stats file exactly once; previously each file was re-read
    # (and a missing file re-reported) for every configuration.
    stats_by_threshold = {}
    for threshold in thresholds:
        filename = os.path.join(folder_path, f"stats_{threshold}.json")
        if not os.path.exists(filename):
            print(f"File not found: {filename}")
            continue
        with open(filename, "r", encoding="utf-8") as f:
            stats_by_threshold[threshold] = json.load(f)

    latex_table = [
        # Five centred columns, matching the five header cells below
        # (the column spec used to declare seven).
        r"\begin{tabular}{ccccc}",
        r"\toprule",
        r"Configuration & Threshold & Precision & Recall & F1 Score \\",
        r"\midrule",
    ]

    for index, configuration in enumerate(configurations):
        if index:
            # Separate configuration groups; no rule after the last group so
            # that \midrule is never immediately followed by \bottomrule.
            latex_table.append(r"\midrule")
        latex_table.append(f"{configuration} & & & & \\\\")
        suffix = configuration.replace(" ", "")

        for threshold, data in stats_by_threshold.items():
            total_f1_score = data[f"totalF1Score{suffix}"]
            precision = data[f"precision{suffix}"]
            recall = data[f"recall{suffix}"]
            total_projects = data[f"totalProjects{suffix}"]

            if total_projects == 0:
                # Avoid dividing by zero when no project was evaluated.
                precision_val = recall_val = f1_score_val = 0
            else:
                precision_val = precision / total_projects
                recall_val = recall / total_projects
                f1_score_val = total_f1_score / total_projects

            latex_table.append(
                f"& {threshold} & {precision_val:.3f} & {recall_val:.3f} "
                f"& {f1_score_val:.3f} \\\\"
            )

    latex_table.append(r"\bottomrule")
    latex_table.append(r"\end{tabular}")

    latex_code = "\n".join(latex_table)
    print(latex_code)
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: takes the folder holding the JSON stats files
    # as its single positional argument and prints the resulting table.
    cli = argparse.ArgumentParser(description="Generate LaTeX table from JSON files.")
    cli.add_argument(
        "folder_path",
        type=str,
        help="Path to the folder containing the JSON files",
    )
    generate_latex_table(cli.parse_args().folder_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import json | ||
import os | ||
import sys | ||
import time | ||
from os.path import join, dirname | ||
|
||
import requests | ||
from dotenv import load_dotenv | ||
|
||
# Load variables from the ".env" file that sits next to this script before
# the API key is looked up in the environment.
dotenv_path = join(dirname(__file__), ".env")
load_dotenv(dotenv_path)

# API key sent with every libraries.io request; None when the variable is unset.
LIBRARIES_IO_KEY = os.getenv("LIBRARIES_IO_KEY")
|
||
|
||
def get_top_libraries(n):
    """Fetch the *n* top-ranked Maven packages from the libraries.io search API.

    Results are requested 100 per page, sorted by rank, with at most one
    request per second.

    Args:
        n: Number of packages wanted.  Values <= 0 yield an empty list
            without contacting the API.

    Returns:
        A list with at most *n* entries as decoded from the JSON responses
        (fewer if the API runs out of results).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    if n <= 0:
        return []

    per_page = 100
    # Ceiling division: the former ``n // 100 + 1`` fetched one page too many
    # whenever n was an exact multiple of 100.
    max_page = -(-n // per_page)
    url = "https://libraries.io/api/search"
    params = {
        "platforms": "Maven",
        "sort": "rank",
        "per_page": per_page,
        "api_key": LIBRARIES_IO_KEY,
    }

    libraries = []
    for page in range(1, max_page + 1):
        if page > 1:
            time.sleep(1)  # throttle to max 60 queries per minute
        params["page"] = page
        response = requests.get(url, params=params, timeout=30)
        # Fail loudly instead of folding an error payload into the results.
        response.raise_for_status()
        batch = response.json()
        if not batch:
            # The API has no further results; later pages would be empty too.
            break
        libraries.extend(batch)

    return libraries[:n]
|
||
|
||
if __name__ == "__main__":
    # Exactly one argument is expected: the number of packages, digits only.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1 or not cli_args[0].isdigit():
        print("Usage: python collect_most_popular_pkgs.py <number_of_pkgs>")
        sys.exit(1)

    # Fetch the requested number of packages and store them as pretty-printed
    # JSON in the current working directory.
    libraries = get_top_libraries(int(cli_args[0]))
    with open("libraries.json", "w") as file:
        json.dump(libraries, file, indent=4)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
from bs4 import BeautifulSoup | ||
import json | ||
|
||
|
||
def extract_libraries(html_string):
    """Extract library entries from the HTML of a listing page.

    Every ``<div class="im">`` that contains an ``im-header`` div is treated
    as a library card.  The title is the text of the link inside the
    ``h2.im-title`` element; groupId and artifactId are the texts of the
    first two links inside ``p.im-subtitle``.

    Args:
        html_string: HTML source of the page.

    Returns:
        A list of ``(groupId, artifactId, title)`` tuples in page order.
        Divs without a header (ads) and cards missing the title link or the
        two subtitle links are skipped.
    """
    soup = BeautifulSoup(html_string, "html.parser")
    libraries = []
    for div in soup.find_all("div", class_="im"):
        header = div.find("div", class_="im-header")
        if header is None:
            # A div without a header is an ad div, not a library card.
            continue

        title_heading = header.find("h2", class_="im-title")
        title_link = title_heading.find("a") if title_heading is not None else None
        subtitle = header.find("p", class_="im-subtitle")
        links = subtitle.find_all("a") if subtitle is not None else []
        if title_link is None or len(links) < 2:
            # Incomplete card: skip it rather than abort the whole page with
            # an AttributeError or IndexError.
            continue

        groupId = links[0].text.strip()
        artifactId = links[1].text.strip()
        libraries.append((groupId, artifactId, title_link.text.strip()))
    return libraries
|
||
|
||
# Load the library list from the current working directory; the rest of the
# script reads and updates this module-level ``libraries`` object.
with open("libraries.json", "r") as file:
    libraries = json.loads(file.read())
|
||
|
||
def read_html_string():
    """Read lines from stdin until a line equal to ``END`` and join them.

    Returns:
        The collected lines joined with newlines.  The ``END`` line itself is
        not part of the result.  If stdin is exhausted before ``END`` is
        seen, the lines read so far are returned instead of letting
        ``EOFError`` propagate.
    """
    lines = []
    try:
        # iter(callable, sentinel) calls input() until it returns "END".
        for line in iter(input, "END"):
            lines.append(line)
    except EOFError:
        # Piped or truncated input: treat end-of-stream as the terminator.
        pass
    return "\n".join(lines)
|
||
|
||
if False:
    # Disabled step, kept for reference: extracts the top used libraries by
    # reading ten pasted HTML pages and appending the entries found in each.
    for _ in range(10):
        print("Enter HTML string (type 'END' on a new line to finish):")
        libraries.extend(extract_libraries(read_html_string()))
|
||
|
||
def _parse_count(text, default=0):
    """Return *text* as an int, ignoring thousands separators, else *default*."""
    digits = text.replace(",", "")
    return int(digits) if digits.isdecimal() else default


def extract_versions(html_string):
    """Collect version, usage count and vulnerability count per table row.

    Looks up the element matching ``#snippets .grid.versions`` and inspects
    each ``<tr>`` in it.  Rows without an ``a`` element whose class is
    ``vbtn release`` are ignored.

    Args:
        html_string: HTML source of the page.

    Returns:
        A list of ``(version, usages, vulnerabilities)`` tuples in table
        order; empty when the versions table is not found.  ``usages`` is 0
        when the usage cell is missing or not a plain number.
        ``vulnerabilities`` is 0 when the row has no ``a.vuln`` element.
    """
    soup = BeautifulSoup(html_string, "html.parser")
    version_data = []

    table = soup.select_one("#snippets .grid.versions")
    if not table:
        return version_data

    for tr in table.find_all("tr"):
        version_link = tr.find("a", class_="vbtn release")
        if version_link is None:
            continue
        usages_link = tr.find("div", class_="pbt")
        vulnerabilities_link = tr.find("a", class_="vuln")

        usages = _parse_count(usages_link.text.strip()) if usages_link is not None else 0

        if vulnerabilities_link is not None:
            # The count is the first whitespace-separated token of the text.
            tokens = vulnerabilities_link.text.split()
            # NOTE(review): when the marker is present but its count cannot
            # be read, the row is recorded with 1 so it is still classed as
            # vulnerable downstream (previously this raised IndexError or
            # ValueError) -- confirm this fallback is the desired policy.
            vulnerabilities = _parse_count(tokens[0] if tokens else "", default=1)
        else:
            vulnerabilities = 0

        version_data.append((version_link.text.strip(), usages, vulnerabilities))

    return version_data
|
||
|
||
def find_most_popular_and_vulnerable(version_data):
    """Pick the most used version with and without recorded vulnerabilities.

    Args:
        version_data: Entries indexable as ``(version, usages,
            vulnerabilities)``.

    Returns:
        A pair ``(most_used_without_vulnerabilities,
        most_used_with_vulnerabilities)``.  Each element is the entry with
        the highest usage count in its group (the earliest one on ties), or
        ``None`` when the group is empty.
    """

    def usage_count(entry):
        return entry[1]

    without_vulns = list(filter(lambda entry: entry[2] == 0, version_data))
    with_vulns = list(filter(lambda entry: entry[2] > 0, version_data))

    return (
        max(without_vulns, key=usage_count, default=None),
        max(with_vulns, key=usage_count, default=None),
    )
|
||
|
||
def extract_library_info(html_string):
    """Read groupId and artifactId from the page's breadcrumb.

    The text of the first ``div.breadcrumb`` element is split on "»"; the
    second and third parts are taken as groupId and artifactId.

    Args:
        html_string: HTML source of the page.

    Returns:
        ``(groupId, artifactId)``, or ``(None, None)`` when there is no
        breadcrumb or it has fewer than three parts.
    """
    breadcrumb = BeautifulSoup(html_string, "html.parser").select_one("div.breadcrumb")
    if breadcrumb:
        parts = breadcrumb.get_text(strip=True).split("»")
        if len(parts) >= 3:
            return parts[1].strip(), parts[2].strip()
    return None, None
|
||
|
||
def update_library_versions(
    libraries, groupId, artifactId, most_popular, most_vulnerable
):
    """Store the selected versions on the first matching library entry.

    Args:
        libraries: List of dicts with ``groupId`` and ``artifactId`` keys;
            the matching dict is modified in place.
        groupId: Group id to match.
        artifactId: Artifact id to match.
        most_popular: Value stored under ``mostPopularVersion``.
        most_vulnerable: Value stored under ``mostVulnerableVersion``.

    Only the first entry matching both ids is updated; if none matches,
    nothing changes.
    """
    for entry in libraries:
        if entry["groupId"] != groupId or entry["artifactId"] != artifactId:
            continue
        entry["mostPopularVersion"] = most_popular
        entry["mostVulnerableVersion"] = most_vulnerable
        return
|
||
|
||
# Interactive step: read one pasted HTML page, work out which library it
# describes, and record that library's most used version with and without
# recorded vulnerabilities in ``libraries``.
for _ in range(1):
    print("Enter HTML string (type 'END' on a new line to finish):")
    html_string = read_html_string()
    # read_html_string() never includes the terminating "END" line, so the
    # quit sentinel arrives as plain "KEK"; the former comparison against
    # "KEK\nEND" could never be true.
    if html_string.strip() == "KEK":
        break

    groupId, artifactId = extract_library_info(html_string)
    if groupId is None:
        # Without a breadcrumb there is nothing to match against; say so
        # instead of silently doing nothing.
        print("Could not identify the library from the pasted HTML; skipping.")
        continue

    most_popular, most_vulnerable = find_most_popular_and_vulnerable(
        extract_versions(html_string)
    )

    update_library_versions(
        libraries, groupId, artifactId, most_popular, most_vulnerable
    )

# Write the (possibly updated) list back to the file it was loaded from.
with open("libraries.json", "w") as file:
    json.dump(libraries, file, indent=4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.