added general trend collection, parsing and scraping tools from the maven ecosystem
Cornul11 committed Nov 18, 2023
1 parent 6e38692 commit 67d8681
Showing 6 changed files with 487 additions and 25 deletions.
59 changes: 59 additions & 0 deletions util/build_table.py
@@ -0,0 +1,59 @@
import argparse
import json
import os


def generate_latex_table(folder_path):
    folder_path = os.path.abspath(folder_path)
    thresholds = [0.5, 0.75, 0.9, 0.95, 0.99, 1.0]
    latex_table = []

    # five columns: Configuration, Threshold, Precision, Recall, F1 Score
    latex_table.append(r"\begin{tabular}{ccccc}")
    latex_table.append(r"\toprule")
    latex_table.append(r"Configuration & Threshold & Precision & Recall & F1 Score \\")
    latex_table.append(r"\midrule")

    for configuration in [
        "Relocation Disabled",
        "Relocation Enabled",
        "Minimize Jar Disabled",
        "Minimize Jar Enabled",
    ]:
        latex_table.append(f"{configuration} & & & & \\\\")
        for threshold in thresholds:
            filename = os.path.join(folder_path, f"stats_{threshold}.json")
            if not os.path.exists(filename):
                print(f"File not found: {filename}")
                continue

            with open(filename, "r") as f:
                data = json.load(f)
            suffix = configuration.replace(" ", "")

            total_f1_score = data[f"totalF1Score{suffix}"]
            precision = data[f"precision{suffix}"]
            recall = data[f"recall{suffix}"]
            total_projects = data[f"totalProjects{suffix}"]

            if total_projects == 0:
                precision_val = recall_val = f1_score_val = 0
            else:
                precision_val = precision / total_projects
                recall_val = recall / total_projects
                f1_score_val = total_f1_score / total_projects

            row = f"& {threshold} & {precision_val:.3f} & {recall_val:.3f} & {f1_score_val:.3f} \\\\"
            latex_table.append(row)
        latex_table.append(r"\midrule")

    latex_table.append(r"\bottomrule")
    latex_table.append(r"\end{tabular}")

    latex_code = "\n".join(latex_table)
    print(latex_code)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate LaTeX table from JSON files."
    )
    parser.add_argument(
        "folder_path", type=str, help="Path to the folder containing the JSON files"
    )
    args = parser.parse_args()
    generate_latex_table(args.folder_path)
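
The keys read above imply a flat, per-threshold JSON layout. As a minimal sketch (hypothetical values; only the "Relocation Disabled" keys shown), a compatible stats_0.5.json could be produced like this:

import json

sample = {
    "totalF1ScoreRelocationDisabled": 41.2,  # sum of per-project F1 scores
    "precisionRelocationDisabled": 44.7,  # sum of per-project precisions
    "recallRelocationDisabled": 39.9,  # sum of per-project recalls
    "totalProjectsRelocationDisabled": 50,  # divisor for the reported averages
}
with open("stats_0.5.json", "w") as f:
    json.dump(sample, f, indent=4)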
40 changes: 40 additions & 0 deletions util/collect_most_popular_pkgs.py
@@ -0,0 +1,40 @@
import json
import os
import sys
import time
from os.path import join, dirname

import requests
from dotenv import load_dotenv

dotenv_path = join(dirname(__file__), ".env")
load_dotenv(dotenv_path)

LIBRARIES_IO_KEY = os.environ.get("LIBRARIES_IO_KEY")


def get_top_libraries(n):
    # the API allows at most one query per second
    max_page = n // 100 + 1  # e.g. n = 250 -> 3 pages of 100 results each
    url = "https://libraries.io/api/search?platforms=Maven&sort=rank&per_page=100"
    libraries = []
    params = {"api_key": LIBRARIES_IO_KEY}
    for page in range(1, max_page + 1):
        params["page"] = page
        response = requests.get(url, params=params)
        libraries.extend(response.json())
        time.sleep(1)  # throttle to at most 60 queries per minute

    return libraries[:n]


if __name__ == "__main__":
    if len(sys.argv) != 2 or not sys.argv[1].isdigit():
        print("Usage: python collect_most_popular_pkgs.py <number_of_pkgs>")
        sys.exit(1)

    libraries = get_top_libraries(int(sys.argv[1]))
    with open("libraries.json", "w") as file:
        json.dump(libraries, file, indent=4)
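
The script assumes a Libraries.io API key in util/.env (loaded by python-dotenv above); the value below is a placeholder:

LIBRARIES_IO_KEY=<your-libraries.io-api-key>

It is then invoked per the usage string, e.g. python collect_most_popular_pkgs.py 1000 to write the 1000 highest-ranked Maven packages to libraries.json.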


133 changes: 133 additions & 0 deletions util/parse_top_maven_libs.py
@@ -0,0 +1,133 @@
from bs4 import BeautifulSoup
import json


def extract_libraries(html_string):
    soup = BeautifulSoup(html_string, "html.parser")
    libraries = []
    for div in soup.find_all("div", class_="im"):
        if div.find("div", class_="im-header") is None:
            # div is an ad, not a library entry
            continue
        title = (
            div.find("div", class_="im-header")
            .find("h2", class_="im-title")
            .find("a")
            .text.strip()
        )
        subtitle = div.find("div", class_="im-header").find("p", class_="im-subtitle")
        links = subtitle.find_all("a")
        groupId = links[0].text.strip()
        artifactId = links[1].text.strip()
        libraries.append((groupId, artifactId, title))
    return libraries
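
# each entry returned by extract_libraries is a (groupId, artifactId, title) tuple,
# e.g. (hypothetical): ("com.google.guava", "guava", "Guava: Google Core Libraries For Java")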


with open("libraries.json", "r") as file:
    libraries = json.load(file)


def read_html_string():
    lines = []
    while True:
        line = input()
        if line == "END":
            break
        lines.append(line)
    return "\n".join(lines)


if False:
    # disabled one-off step for extracting the top used libraries
    for _ in range(10):
        print("Enter HTML string (type 'END' on a new line to finish):")
        html_string = read_html_string()
        libraries.extend(extract_libraries(html_string))


def extract_versions(html_string):
    soup = BeautifulSoup(html_string, "html.parser")
    version_data = []

    table = soup.select_one("#snippets .grid.versions")
    if not table:
        return version_data

    for tr in table.find_all("tr"):
        version_link = tr.find("a", class_="vbtn release")
        usages_link = tr.find("div", class_="pbt")
        vulnerabilities_link = tr.find("a", class_="vuln")

        if version_link:
            version = version_link.text.strip()
            usages = (
                int(usages_link.text.strip().replace(",", ""))
                if usages_link and usages_link.text.strip().replace(",", "").isdigit()
                else 0
            )
            vulnerabilities = (
                int(vulnerabilities_link.text.split()[0].replace(",", ""))
                if vulnerabilities_link
                else 0
            )

            version_data.append((version, usages, vulnerabilities))

    return version_data


def find_most_popular_and_vulnerable(version_data):
    non_vulnerable_versions = [v for v in version_data if v[2] == 0]
    vulnerable_versions = [v for v in version_data if v[2] > 0]

    most_popular_non_vulnerable = max(
        non_vulnerable_versions, key=lambda x: x[1], default=None
    )
    most_popular_vulnerable = max(vulnerable_versions, key=lambda x: x[1], default=None)

    return most_popular_non_vulnerable, most_popular_vulnerable
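
# example with hypothetical data, entries being (version, usages, vulnerabilities):
#   find_most_popular_and_vulnerable([("1.0", 500, 0), ("1.1", 900, 2), ("1.2", 300, 0)])
#   returns (("1.0", 500, 0), ("1.1", 900, 2))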


def extract_library_info(html_string):
    soup = BeautifulSoup(html_string, "html.parser")
    breadcrumb = soup.select_one("div.breadcrumb")
    if not breadcrumb:
        return None, None
    breadcrumb_parts = breadcrumb.get_text(strip=True).split("»")
    if len(breadcrumb_parts) < 3:
        return None, None

    groupId = breadcrumb_parts[1].strip()
    artifactId = breadcrumb_parts[2].strip()
    return groupId, artifactId
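
# e.g. a (hypothetical) breadcrumb "Home » org.slf4j » slf4j-api » 1.7.36"
# yields ("org.slf4j", "slf4j-api")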


def update_library_versions(
    libraries, groupId, artifactId, most_popular, most_vulnerable
):
    for lib in libraries:
        if lib["groupId"] == groupId and lib["artifactId"] == artifactId:
            lib["mostPopularVersion"] = most_popular
            lib["mostVulnerableVersion"] = most_vulnerable
            break


for _ in range(1):
    print("Enter HTML string (type 'END' on a new line to finish):")
    html_string = read_html_string()
    # read_html_string strips the trailing "END", so the sentinel is just "KEK"
    if html_string == "KEK":
        break

    groupId, artifactId = extract_library_info(html_string)

    most_popular, most_vulnerable = find_most_popular_and_vulnerable(
        extract_versions(html_string)
    )

    update_library_versions(
        libraries, groupId, artifactId, most_popular, most_vulnerable
    )


with open("libraries.json", "w") as file:
    json.dump(libraries, file, indent=4)
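
Each updated entry in libraries.json then carries the two version tuples, serialized as JSON arrays; a sketch of one entry with hypothetical values, matching the fields read and written above:

{
    "groupId": "org.slf4j",
    "artifactId": "slf4j-api",
    "mostPopularVersion": ["1.7.36", 12345, 0],
    "mostVulnerableVersion": ["1.5.8", 2345, 1]
}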
76 changes: 51 additions & 25 deletions util/pom_analysis.py
@@ -43,6 +43,17 @@ def get_pom_files_from_file(file_path):
        yield line.strip()


def get_publication_date_from_maven_repo_header(group_id, artifact_id, version):
    url = f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/{artifact_id}/{version}/{artifact_id}-{version}.pom"
    response = requests.head(url)
    if response.status_code == 200:
        date_text = response.headers["last-modified"]
        publication_date = datetime.strptime(
            date_text, "%a, %d %b %Y %H:%M:%S %Z"
        ).strftime("%Y-%m")  # e.g. "Sat, 18 Nov 2023 12:00:00 GMT" -> "2023-11"
        return publication_date


def get_publication_date_from_maven_repo(group_id, artifact_id, version):
    global total_waiting_for_maven
    start_time = datetime.now()
@@ -136,6 +147,21 @@ def get_publication_date_from_local_maven_index(group_id, artifact_id, version):
    total_waiting_for_maven += (end_time - start_time).total_seconds()


def extract_gav_from_pom_path(pom_file_path):
    # the last folder name is the version, the second to last is the artifactId,
    # and everything after .../.m2/repository/ up to those two is the groupId,
    # with slashes replaced by dots
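    # e.g. .../.m2/repository/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom
    # yields ("org.apache.commons", "commons-lang3", "3.12.0")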

    path_components = pom_file_path.split(os.sep)
    version = path_components[-2]
    artifact_id = path_components[-3]

    m2_index = path_components.index(".m2")
    group_id = ".".join(path_components[m2_index + 2 : -3])

    return group_id, artifact_id, version


def contains_shade_plugin(pom_file_path):
    result_dict = {
        "path": pom_file_path,
@@ -155,28 +181,6 @@ def contains_shade_plugin(pom_file_path):
    root = tree.getroot()
    ns_url = "http://maven.apache.org/POM/4.0.0"

    group_id = root.find(f"{{{ns_url}}}groupId")
    artifact_id = root.find(f"{{{ns_url}}}artifactId")
    version = root.find(f"{{{ns_url}}}version")

    if group_id is None:
        parent = root.find(f"{{{ns_url}}}parent")
        if parent is not None:
            group_id = parent.find(f"{{{ns_url}}}groupId")

    if version is None:
        parent = root.find(f"{{{ns_url}}}parent")
        if parent is not None:
            version = parent.find(f"{{{ns_url}}}version")

    result_dict.update(
        {
            "group_id": group_id.text if group_id is not None else None,
            "artifact_id": artifact_id.text if artifact_id is not None else None,
            "version": version.text if version is not None else None,
        }
    )

    if root.find(f"{{{ns_url}}}parent") is not None:
        result_dict["has_parent"] = True

@@ -275,6 +279,7 @@ def create_archive(pom_files, archive_path, progress_bar):
total_relocations = 0
total_errors = 0
total_not_found_in_index = 0
total_not_found = 0
total_with_parents = 0
total_shade_plugin_no_parent = 0
overall_trends = {}
@@ -303,18 +308,33 @@ def create_archive(pom_files, archive_path, progress_bar):
    total=total_pom_files,
    desc="Processing pom.xml files",
):
    if result["is_error"]:
    group_id, artifact_id, version = extract_gav_from_pom_path(result["path"])

    if (
        result["is_error"]
        or group_id is None
        or artifact_id is None
        or version is None
    ):
        total_errors += 1

    date = get_publication_date_from_local_maven_index(
        result["group_id"], result["artifact_id"], result["version"]
        group_id, artifact_id, version
    )

    if date is None:
        total_not_found_in_index += 1
        date = get_publication_date_from_maven_repo_header(
            group_id, artifact_id, version
        )

    # retain only the year and month for stats
    year_month = date[:7] if date else None

    if date:
        overall_trends[year_month] = overall_trends.get(year_month, 0) + 1
    else:
        total_not_found_in_index += 1
        total_not_found += 1

    if result["has_shade_plugin"]:
        if date:
@@ -375,6 +395,9 @@ def create_archive(pom_files, archive_path, progress_bar):
print(
    f"Total not found in index: {total_not_found_in_index} ({total_not_found_in_index / total_shade_plugins * 100:.2f}%)"
)
print(
    f"Total not found: {total_not_found} ({total_not_found / total_shade_plugins * 100:.2f}%)"
)

if args.save:
    stats = {
@@ -386,6 +409,9 @@ def create_archive(pom_files, archive_path, progress_bar):
"total_minimize_jar": total_minimize_jar,
"total_relocations": total_relocations,
"total_with_parents": total_with_parents,
"total_not_found_in_index": total_not_found_in_index,
"total_not_found": total_not_found,
"general_trends": overall_trends,
"shade_plugin_trends": monthly_trends,
}
