pushed all code used in the experiments

Cornul11 · Mar 18, 2024 · cf6c0c3 · cf6c0c3
1 parent 4fddc48
commit cf6c0c3
Show file tree

Hide file tree

Showing 6 changed files with 32,501 additions and 0 deletions.
diff --git a/util/README.md b/util/README.md
@@ -0,0 +1,31 @@
+Create the pom_info table:
+
+```sql
+CREATE TABLE `pom_info` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `library_id` int(11) NOT NULL,
+  `has_assembly_plugin` tinyint(1) DEFAULT 0,
+  `has_shade_plugin` tinyint(1) DEFAULT 0,
+  `has_dependency_reduced_pom` tinyint(1) DEFAULT 0,
+  `has_minimize_jar` tinyint(1) DEFAULT 0,
+  `has_relocations` tinyint(1) DEFAULT 0,
+  `has_filters` tinyint(1) DEFAULT 0,
+  `has_transformers` tinyint(1) DEFAULT 0,
+  `parent_id` int(11) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  KEY `idx_library_id` (`library_id`),
+  KEY `fk_parent_id` (`parent_id`),
+  CONSTRAINT `fk_library_info` FOREIGN KEY (`library_id`) REFERENCES `libraries` (`id`),
+  CONSTRAINT `fk_parent_id` FOREIGN KEY (`parent_id`) REFERENCES `libraries` (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci
+```
+
+Create the signatures_memory table:
+
+```sql
+CREATE TABLE `signatures_memory` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `library_id` int(11) NOT NULL,
+  `class_hash` bigint(20) NOT NULL,
+  `class_crc` bigint(20) NOT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB AUTO_INCREMENT=1175227255 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci
+```
diff --git a/util/evaluation_results.ipynb b/util/evaluation_results.ipynb
@@ -0,0 +1,184 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-03-10T17:45:32.372659Z",
+     "start_time": "2024-03-10T17:45:32.368224Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os, json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-03-12T13:22:34.058895Z",
+     "start_time": "2024-03-12T13:22:33.171578Z"
+    }
+   },
+   "outputs": [
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m project_dir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m../projects_metadata\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m      2\u001b[0m result_dir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m../evaluation\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 3\u001b[0m project_files \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241m.\u001b[39mlistdir(project_dir)\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompare_results\u001b[39m(expected, actual, threshold\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.95\u001b[39m):\n\u001b[1;32m      7\u001b[0m   true_positives \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
+      "Cell \u001b[0;32mIn[7], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m project_dir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m../projects_metadata\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m      2\u001b[0m result_dir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m../evaluation\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 3\u001b[0m project_files \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241m.\u001b[39mlistdir(project_dir)\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompare_results\u001b[39m(expected, actual, threshold\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.95\u001b[39m):\n\u001b[1;32m      7\u001b[0m   true_positives \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
+      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:1457\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.SafeCallWrapper.__call__\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:701\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:1395\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:1344\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:312\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.PyDBFrame.do_wait_suspend\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32m~/tudelft/master_thesis/jar-vulnerability-detection/util/venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd.py:2070\u001b[0m, in \u001b[0;36mPyDB.do_wait_suspend\u001b[0;34m(self, thread, frame, event, arg, exception_type)\u001b[0m\n\u001b[1;32m   2067\u001b[0m             from_this_thread\u001b[38;5;241m.\u001b[39mappend(frame_custom_thread_id)\n\u001b[1;32m   2069\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_threads_suspended_single_notification\u001b[38;5;241m.\u001b[39mnotify_thread_suspended(thread_id, thread, stop_reason):\n\u001b[0;32m-> 2070\u001b[0m         keep_suspended \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_wait_suspend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mthread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msuspend_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrom_this_thread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframes_tracker\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2072\u001b[0m frames_list \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   2074\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m keep_suspended:\n\u001b[1;32m   2075\u001b[0m     \u001b[38;5;66;03m# This means that we should pause again after a set next statement.\u001b[39;00m\n",
+      "File \u001b[0;32m~/tudelft/master_thesis/jar-vulnerability-detection/util/venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd.py:2106\u001b[0m, in \u001b[0;36mPyDB._do_wait_suspend\u001b[0;34m(self, thread, frame, event, arg, suspend_type, from_this_thread, frames_tracker)\u001b[0m\n\u001b[1;32m   2103\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_input_hook()\n\u001b[1;32m   2105\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprocess_internal_commands()\n\u001b[0;32m-> 2106\u001b[0m     time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m0.01\u001b[39m)\n\u001b[1;32m   2108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcancel_async_evaluation(get_current_thread_id(thread), \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mid\u001b[39m(frame)))\n\u001b[1;32m   2110\u001b[0m \u001b[38;5;66;03m# process any stepping instructions\u001b[39;00m\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "project_dir = \"../projects_metadata\"\n",
+    "result_dir = \"../evaluation\"\n",
+    "project_files = os.listdir(project_dir)\n",
+    "\n",
+    "\n",
+    "def compare_results(expected, actual, threshold=0.95):\n",
+    "  true_positives = 0\n",
+    "  false_positives = 0\n",
+    "  false_negatives = 0\n",
+    "\n",
+    "  for dep in expected['effectiveDependencies']:\n",
+    "    if not dep['presentInDatabase']:\n",
+    "      continue  # skip dependencies that are not present in the database\n",
+    "\n",
+    "    if dep in actual['notFoundLibraries']:\n",
+    "      # not one class file of this dep was found in the uber-jar, then it most probably has no class files\n",
+    "      continue\n",
+    "\n",
+    "    found = False\n",
+    "    gav = dep['groupId'] + \":\" + dep['artifactId'] + \":\" + dep['version']\n",
+    "\n",
+    "    for inferred_dep in actual['inferredLibraries']:\n",
+    "      if inferred_dep['includedRatio'] < threshold:\n",
+    "        continue\n",
+    "      if gav == inferred_dep['gav'] or gav in inferred_dep['alternativeVersions']:\n",
+    "        true_positives += 1\n",
+    "        found = True\n",
+    "        break\n",
+    "\n",
+    "    if not found:\n",
+    "      false_negatives += 1\n",
+    "\n",
+    "  nb_actual = sum(1 for inferred_dep in actual['inferredLibraries'] if inferred_dep['includedRatio'] >= threshold)\n",
+    "  false_positives = nb_actual - true_positives\n",
+    "  return (true_positives, false_positives, false_negatives)\n",
+    "\n",
+    "\n",
+    "shadeConfigurations = [(True, True), (True, False), (False, True), (False, False)]\n",
+    "for shadeConfig in shadeConfigurations:\n",
+    "  precisions = {}\n",
+    "  recalls = {}\n",
+    "  f1s = {}\n",
+    "  for threshold in [0.5, 0.75, 0.9, 0.95, 0.99, 1.0]:\n",
+    "    precisions[threshold] = []\n",
+    "    recalls[threshold] = []\n",
+    "    f1s[threshold] = []\n",
+    "    for project_file in sorted(project_files):\n",
+    "      expected_data = None\n",
+    "      actual_data = None\n",
+    "      with open(os.path.join(project_dir, project_file), \"r\") as f:\n",
+    "        expected_data = json.load(f)\n",
+    "\n",
+    "        if expected_data['shadeConfiguration']['minimizeJar'] and not shadeConfig[0]:\n",
+    "          continue\n",
+    "        if expected_data['shadeConfiguration']['relocation'] and not shadeConfig[1]:\n",
+    "          continue\n",
+    "      actual_file_path = os.path.join(result_dir, project_file.replace(\".json\", \"_libraries.json\"))\n",
+    "      if not os.path.exists(actual_file_path):\n",
+    "        continue\n",
+    "      with open(actual_file_path, \"r\") as f:\n",
+    "        actual_data = json.load(f)\n",
+    "\n",
+    "      results = compare_results(expected_data, actual_data, threshold)\n",
+    "\n",
+    "      precision = results[0] / (results[0] + results[1]) if results[0] + results[1] > 0 else 1\n",
+    "      recall = results[0] / (results[0] + results[2]) if results[0] + results[2] > 0 else 1\n",
+    "      f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0\n",
+    "\n",
+    "      precisions[threshold].append(precision)\n",
+    "      recalls[threshold].append(recall)\n",
+    "      f1s[threshold].append(f1)\n",
+    "      # print(pfile, results, precision, recall, f1)\n",
+    "\n",
+    "  pass\n",
+    "  print(\"minimizeJar:\", shadeConfig[0], \"relocation:\", shadeConfig[1])\n",
+    "  for threshold in [0.5, 0.75, 0.9, 0.95, 0.99, 1.0]:\n",
+    "    precision = sum(precisions[threshold]) / len(precisions[threshold])\n",
+    "    recall = sum(recalls[threshold]) / len(recalls[threshold])\n",
+    "    f1 = sum(f1s[threshold]) / len(f1s[threshold])\n",
+    "    print(\"%.2f & %.3f & %.3f & %.3f \\\\\\\\\" % (threshold, precision, recall, f1))\n",
+    "  print(\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/util/evaluation_results.py b/util/evaluation_results.py
@@ -0,0 +1,105 @@
+import json
+import os
+
+# Directory of per-project JSON files; each one is loaded below as the
+# "expected" data for a project.
+project_dir = "../projects_metadata"
+# Directory searched for the matching "<project>_libraries.json" result files.
+result_dir = "../evaluation"
+# Listed once at start-up; the evaluation loop walks it in sorted order.
+project_files = os.listdir(project_dir)
+
+
+def compare_results(expected, actual, threshold=0.95):
+    """Score the inferred libraries of one project against its expected ones.
+
+    Args:
+        expected: mapping with an ``"effectiveDependencies"`` list whose
+            entries provide ``groupId``, ``artifactId``, ``version`` and
+            ``presentInDatabase``.
+        actual: mapping with a ``"notFoundLibraries"`` collection and an
+            ``"inferredLibraries"`` list whose entries provide ``gav``,
+            ``includedRatio`` and ``alternativeVersions``.
+        threshold: inferred libraries whose ``includedRatio`` is below this
+            value are ignored.
+
+    Returns:
+        A ``(true_positives, false_positives, false_negatives)`` tuple.
+    """
+    matched = 0
+    missed = 0
+
+    for dependency in expected["effectiveDependencies"]:
+        # Dependencies that are absent from the database are not scored.
+        if not dependency["presentInDatabase"]:
+            continue
+
+        # Not one class file of this dependency was found in the uber-jar,
+        # so it most probably ships no class files: do not score it either.
+        if dependency in actual["notFoundLibraries"]:
+            continue
+
+        coordinates = (
+            dependency["groupId"]
+            + ":"
+            + dependency["artifactId"]
+            + ":"
+            + dependency["version"]
+        )
+
+        # A dependency counts as found when any sufficiently included
+        # inferred library names it, directly or as an alternative version.
+        recognised = any(
+            coordinates == candidate["gav"]
+            or coordinates in candidate["alternativeVersions"]
+            for candidate in actual["inferredLibraries"]
+            if not candidate["includedRatio"] < threshold
+        )
+
+        if recognised:
+            matched += 1
+        else:
+            missed += 1
+
+    accepted = [
+        candidate
+        for candidate in actual["inferredLibraries"]
+        if candidate["includedRatio"] >= threshold
+    ]
+    # Every accepted inferred library that was not matched is a false positive.
+    return (matched, len(accepted) - matched, missed)
+
+
+# includedRatio thresholds to evaluate.  One shared tuple drives both the
+# computation and the report, so the report can never ask for a threshold
+# that was not computed (the report previously listed 0.25, which raised
+# KeyError).
+thresholds = (0.5, 0.75, 0.9, 0.95, 0.99, 1.0)
+
+# Each pair is (minimizeJar allowed, relocation allowed): a project that uses
+# one of these features is only evaluated under configurations whose matching
+# flag is True, so (True, True) covers every project.
+shadeConfigurations = [(True, True), (True, False), (False, True), (False, False)]
+for shadeConfig in shadeConfigurations:
+    precisions = {threshold: [] for threshold in thresholds}
+    recalls = {threshold: [] for threshold in thresholds}
+    f1s = {threshold: [] for threshold in thresholds}
+
+    # Read each project's files once and score them at every threshold,
+    # instead of re-reading the same JSON files once per threshold.
+    for project_file in sorted(project_files):
+        with open(os.path.join(project_dir, project_file), "r") as f:
+            expected_data = json.load(f)
+
+        shade = expected_data["shadeConfiguration"]
+        if shade["minimizeJar"] and not shadeConfig[0]:
+            continue
+        if shade["relocation"] and not shadeConfig[1]:
+            continue
+
+        actual_file_path = os.path.join(
+            result_dir, project_file.replace(".json", "_libraries.json")
+        )
+        if not os.path.exists(actual_file_path):
+            continue  # no evaluation output for this project
+        with open(actual_file_path, "r") as f:
+            actual_data = json.load(f)
+
+        for threshold in thresholds:
+            results = compare_results(expected_data, actual_data, threshold)
+            true_pos, false_pos, false_neg = results
+
+            # With an empty denominator, precision and recall default to 1.
+            precision = (
+                true_pos / (true_pos + false_pos)
+                if true_pos + false_pos > 0
+                else 1
+            )
+            recall = (
+                true_pos / (true_pos + false_neg)
+                if true_pos + false_neg > 0
+                else 1
+            )
+            f1 = (
+                2 * precision * recall / (precision + recall)
+                if precision + recall > 0
+                else 0
+            )
+
+            precisions[threshold].append(precision)
+            recalls[threshold].append(recall)
+            f1s[threshold].append(f1)
+
+    # Report the per-project averages, one "a & b & c & d \\" row per threshold.
+    print("minimizeJar:", shadeConfig[0], "relocation:", shadeConfig[1])
+    for threshold in thresholds:
+        if not precisions[threshold]:
+            # No project matched this configuration: skip the row rather
+            # than divide by zero.
+            continue
+        precision = sum(precisions[threshold]) / len(precisions[threshold])
+        recall = sum(recalls[threshold]) / len(recalls[threshold])
+        f1 = sum(f1s[threshold]) / len(f1s[threshold])
+        print("%.2f & %.3f & %.3f & %.3f \\\\" % (threshold, precision, recall, f1))
+    print("")
diff --git a/util/graph_creator.ipynb b/util/graph_creator.ipynb