Skip to content

Commit

Permalink
better pom analysis + persistence and recursive resolution
Browse files Browse the repository at this point in the history
  • Loading branch information
Cornul11 committed Feb 23, 2024
1 parent 2a4cb78 commit 560709c
Show file tree
Hide file tree
Showing 2 changed files with 189 additions and 21 deletions.
89 changes: 68 additions & 21 deletions util/pom_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,27 +146,27 @@ def contains_shade_plugin(pom_file_path):
elif artifact_id is not None and artifact_id.text == "maven-shade-plugin":
result_dict["has_shade_plugin"] = True

def search_configuration_tags(configuration, result_dict):
    """Mark shade-plugin features present in a <configuration> element.

    For each feature tag found as a direct child lookup, the matching
    result_dict flag is set to True; absent tags leave flags untouched.
    """
    # Mapping of configuration tag name -> result_dict flag to set.
    feature_flags = {
        "createDependencyReducedPom": "has_dependency_reduced_pom",
        "minimizeJar": "has_minimize_jar",
        "relocations": "has_relocations",
        "filters": "has_filters",
        "transformers": "has_transformers",
    }

    for tag_name, flag_key in feature_flags.items():
        if configuration.find(f"{{{NS_URL}}}{tag_name}") is not None:
            result_dict[flag_key] = True

# search in <configuration> of <plugin>
for conf in plugin.findall(f".//{{{NS_URL}}}configuration"):
if conf.find(f"{{{NS_URL}}}createDependencyReducedPom") is not None:
result_dict["has_dependency_reduced_pom"] = True
if conf.find(f"{{{NS_URL}}}minimizeJar") is not None:
result_dict["has_minimize_jar"] = True
if conf.find(f"{{{NS_URL}}}relocations") is not None:
result_dict["has_relocations"] = True
search_configuration_tags(conf, result_dict)

# search within <executions> as well
for execution in plugin.findall(f".//{{{NS_URL}}}execution"):
for conf in execution.findall(f".//{{{NS_URL}}}configuration"):
if (
conf.find(f"{{{NS_URL}}}createDependencyReducedPom")
is not None
):
result_dict["has_dependency_reduced_pom"] = True
if conf.find(f"{{{NS_URL}}}minimizeJar") is not None:
result_dict["has_minimize_jar"] = True
if conf.find(f"{{{NS_URL}}}relocations") is not None:
result_dict["has_relocations"] = True
search_configuration_tags(conf, result_dict)

except ET.ParseError as e:
logging.error(f"Error parsing pom file: {pom_file_path}: {e}")
Expand Down Expand Up @@ -205,6 +205,8 @@ def initialize_stats(self):
self.total_dependency_reduced_pom = 0
self.total_minimize_jar = 0
self.total_relocations = 0
self.total_transformers = 0
self.total_filters = 0
self.total_errors = 0
self.total_not_found_in_index_with_assembly = 0
self.total_not_found_in_index_with_shade = 0
Expand All @@ -221,6 +223,8 @@ def initialize_stats(self):
"dependency_reduced_pom": {},
"minimize_jar": {},
"relocations": {},
"transformers": {},
"filters": {},
"shade_plugin_and_no_parent": {},
}

Expand Down Expand Up @@ -267,7 +271,10 @@ def analyze_pom_files(self, pom_files):
f"Inserting abstract POM for {group_id}:{artifact_id}:{version}"
)
library_id = self.db_manager.insert_abstract_pom(
group_id, artifact_id, version
group_id,
artifact_id,
version,
self.get_gav_creation_date(group_id, artifact_id, version),
)
if library_id is None:
logging.error(
Expand All @@ -294,6 +301,11 @@ def analyze_pom_files(self, pom_files):
parent_gav["group_id"],
parent_gav["artifact_id"],
parent_gav["version"],
self.get_gav_creation_date(
parent_gav["group_id"],
parent_gav["artifact_id"],
parent_gav["version"],
),
)
logging.info(f"Inserted parent POM for {pom_file}")

Expand Down Expand Up @@ -323,6 +335,20 @@ def create_archive(self, pom_files):
if not has_parent(pom_file):
zipf.write(pom_file)

def get_gav_creation_date(self, group_id, artifact_id, version) -> datetime:
    """Resolve a GAV's publication date, truncated to year-month.

    Tries the local Maven index first, then falls back to the creation
    date stored in the database.  Returns a datetime at month precision,
    or None when neither source knows the artifact.
    """
    # Primary source: the local Maven index.
    date = get_publication_date_from_local_maven_index(
        group_id, artifact_id, version
    )
    if date is None:
        # Fallback: creation date recorded in the libraries table.
        # NOTE(review): assumes this returns the same "YYYY-MM-..." string
        # format as the index lookup — confirm against DatabaseManager.
        date = self.db_manager.get_creation_date(group_id, artifact_id, version)

    # Dates before 2000 are considered bogus; clamp to the "1800-01"
    # sentinel rather than dropping the record.  Relies on lexicographic
    # comparison of "YYYY-MM..." strings.
    if date is not None and date < "2000-01":
        date = "1800-01"

    # retain only the year and month for stats
    year_month = date[:7] if date else None
    return datetime.strptime(year_month, "%Y-%m") if date else None

def update_stats(self, result):
group_id, artifact_id, version = extract_gav_from_pom_path(result["path"])

Expand All @@ -336,6 +362,8 @@ def update_stats(self, result):
group_id, artifact_id, version
)

result["found_in_index"] = date is not None

if date is None:
self.total_not_found_in_index += 1

Expand All @@ -353,6 +381,8 @@ def update_stats(self, result):
if date is not None:
date = date.strftime("%Y-%m") # retain only year and month

result["found_in_libraries"] = date is not None

# if date is before 2000, it's probably wrong
if date is not None and date < "2000-01":
date = None
Expand Down Expand Up @@ -386,6 +416,14 @@ def update_stats(self, result):
self._update_trend(self.shade_trends["relocations"], year_month)
self.total_relocations += 1

if result["has_filters"]:
self._update_trend(self.shade_trends["filters"], year_month)
self.total_filters += 1

if result["has_transformers"]:
self._update_trend(self.shade_trends["transformers"], year_month)
self.total_transformers += 1

if result["has_parent"]:
self.total_with_parents += 1
if result["has_shade_plugin"] and not result["has_parent"]:
Expand Down Expand Up @@ -613,14 +651,18 @@ def parse_database_url(self, db_url):
except IndexError:
raise ValueError("Invalid database URL format")

def insert_abstract_pom(self, group_id, artifact_id, version):
def insert_abstract_pom(self, group_id, artifact_id, version, date=None):
jar_hash = 0
jar_crc = 0
is_uber_jar = -2
disk_size = 0
total_class_files = 0
unique_signatures = 0
creation_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
creation_date = (
datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if date is None
else date.strftime("%Y-%m-%d %H:%M:%S")
)

query = """INSERT INTO libraries (group_id, artifact_id, version,
jar_hash, jar_crc, is_uber_jar, disk_size, total_class_files,
Expand Down Expand Up @@ -669,8 +711,9 @@ def get_library_id(self, group_id, artifact_id, version):
def insert_or_update_pom_info(self, library_id, pom_data, parent_id=None):
query = """INSERT INTO pom_info (library_id,
has_assembly_plugin, has_shade_plugin, has_dependency_reduced_pom,
has_minimize_jar, has_relocations, has_filters, has_transformers, parent_id)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
has_minimize_jar, has_relocations, has_filters, has_transformers,
parent_id, found_in_index, found_in_libraries)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
has_assembly_plugin = VALUES(has_assembly_plugin),
has_shade_plugin = VALUES(has_shade_plugin),
Expand All @@ -679,7 +722,9 @@ def insert_or_update_pom_info(self, library_id, pom_data, parent_id=None):
has_relocations = VALUES(has_relocations),
has_filters = VALUES(has_filters),
has_transformers = VALUES(has_transformers),
parent_id = VALUES(parent_id)"""
parent_id = VALUES(parent_id),
found_in_index = VALUES(found_in_index),
found_in_libraries = VALUES(found_in_libraries)"""

values = (
library_id,
Expand All @@ -691,6 +736,8 @@ def insert_or_update_pom_info(self, library_id, pom_data, parent_id=None):
pom_data["has_filters"],
pom_data["has_transformers"],
parent_id,
pom_data.get("found_in_index", -1),
pom_data.get("found_in_libraries", -1),
)

with self.connection_pool.get_connection() as conn:
Expand Down
121 changes: 121 additions & 0 deletions util/recursive_pom_resolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
from jproperties import Properties
from mysql.connector import pooling
from tqdm import tqdm


def parse_database_url(db_url):
    """Split a JDBC-style URL into its host and database name.

    e.g. "jdbc:postgresql://localhost:5432/maven" -> ("localhost", "maven").
    Any ":port" suffix on the host part is discarded.

    Raises:
        ValueError: when the URL does not contain "//host[:port]/database".
    """
    try:
        # Everything after "//" holds "host[:port]/database".
        location = db_url.split("//")[1]
        segments = location.split("/")
        hostname = segments[0].split(":")[0]
        return hostname, segments[1]
    except IndexError:
        raise ValueError("Invalid database URL format")


# Load database connection settings from the shared project config file.
# This runs at import time; the module cannot be imported without it.
properties = Properties()
with open("../config.properties", "rb") as properties_file:
    properties.load(properties_file, "utf-8")

# Resolved once here; every pooled connection below reuses these values.
db_host, db_name = parse_database_url(properties.get("database.url").data)


def connect_to_db(pool_size=5):
    """Create a MySQL connection pool from the config-file credentials.

    Args:
        pool_size: number of pooled connections to keep open (default 5,
            matching the previous hard-coded value).

    Returns:
        A MySQLConnectionPool on success, or None when the pool could not
        be created (the error is reported on stdout).
    """
    try:
        connection_pool = pooling.MySQLConnectionPool(
            pool_name="pom_resolution_pool",
            pool_size=pool_size,
            host=db_host,
            database=db_name,
            user=properties.get("database.username").data,
            password=properties.get("database.password").data,
        )
        return connection_pool
    except Exception as e:
        # Broad catch is deliberate: any failure (bad credentials,
        # unreachable host) degrades to a None return so main() can
        # report it and exit instead of crashing with a traceback.
        print(f"Error connecting to the database: {e}")
        return None


def get_top_level_parents(cursor):
    """Fetch pom_info rows that sit at the top of the parent hierarchy.

    A row qualifies when its parent_id is NULL or points at a library
    with no pom_info row of its own.  Returns the raw fetched rows.
    """
    top_level_query = """
        SELECT id, library_id, has_assembly_plugin, has_shade_plugin, has_dependency_reduced_pom,
        has_minimize_jar, has_relocations, has_filters, has_transformers
        FROM pom_info
        WHERE parent_id IS NULL OR parent_id NOT IN (SELECT library_id FROM pom_info)"""
    cursor.execute(top_level_query)
    return cursor.fetchall()


def update_children(pool, parent_id, inherited_props):
    """Recursively push inherited shade/assembly flags down to descendants.

    Any flag that is truthy in inherited_props is forced to 1 on every
    direct child, and the merged flags are propagated depth-first.

    Args:
        pool: MySQLConnectionPool to draw connections from (one per
            recursion level).
        parent_id: library_id whose direct children should be updated.
        inherited_props: mapping of pom_info column name -> 0/1 flag.
            Its insertion order must match the column order of the
            SELECT below — the merge step pairs them positionally.
    """
    cnx = pool.get_connection()
    cursor = cnx.cursor(buffered=True)
    try:
        cursor.execute(
            """
            SELECT id FROM pom_info WHERE parent_id = %s
            """,
            (parent_id,),
        )
        children = cursor.fetchall()

        for (child_id,) in children:
            update_clause = ", ".join(
                [f"{prop} = 1" for prop, value in inherited_props.items() if value]
            )
            if update_clause:
                # Column names come from the fixed inherited_props keys,
                # never from external input, so interpolating them is safe;
                # the row id is bound as a parameter rather than formatted
                # into the statement.
                cursor.execute(
                    f"UPDATE pom_info SET {update_clause} WHERE id = %s",
                    (child_id,),
                )

            cursor.execute(
                """
                SELECT has_assembly_plugin, has_shade_plugin, has_dependency_reduced_pom,
                has_minimize_jar, has_relocations, has_filters, has_transformers
                FROM pom_info WHERE id = %s
                """,
                (child_id,),
            )
            child_props = cursor.fetchone()
            # A flag propagates if set on either the ancestor chain or the
            # child itself (positional pairing with the SELECT columns).
            next_gen_props = {
                prop: max(value, child_props[i])
                for i, (prop, value) in enumerate(inherited_props.items())
            }

            update_children(pool, child_id, next_gen_props)

        cnx.commit()
    finally:
        # Always release the connection back to the pool, even on error.
        cursor.close()
        cnx.close()


def main():
    """Propagate shade-plugin flags from top-level POMs to all descendants."""
    pool = connect_to_db()
    if not pool:
        print("Failed to connect to the database")
        return

    # Collect the starting set of parents with a short-lived connection.
    connection = pool.get_connection()
    db_cursor = connection.cursor(buffered=True)
    roots = get_top_level_parents(db_cursor)
    db_cursor.close()
    connection.close()

    # Column order here must match get_top_level_parents' SELECT list.
    prop_names = (
        "has_assembly_plugin",
        "has_shade_plugin",
        "has_dependency_reduced_pom",
        "has_minimize_jar",
        "has_relocations",
        "has_filters",
        "has_transformers",
    )
    for row in tqdm(roots):
        _, library_id, *flag_values = row
        update_children(pool, library_id, dict(zip(prop_names, flag_values)))


# Run the resolution only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 comments on commit 560709c

Please sign in to comment.