refactor schema names #638

Merged: 1 commit, Jan 31, 2024
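This PR renames cache-schema columns so the application tables and every page that reads them share one consistent naming scheme. The pattern, summarized from the diff below as a hypothetical mapping (not code from the PR itself):

    # old cache column name -> new cache column name
    RENAMES = {
        "id": "repo_id",  # "ID" in pr_response_query gets the same treatment
        "commits": "commit_hash",
        "date": "author_date",
        "issue": "issue_number",
        "pull_request": "pull_request_id",
        "created": "created_at",
        "closed": "closed_at",
        "merged": "merged_at",
    }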
2 changes: 1 addition & 1 deletion 8Knot/cache_manager/cache_facade.py
@@ -219,7 +219,7 @@ def retrieve_from_cache(
"""
SELECT *
FROM {tablename} t
WHERE t.id IN %s;
WHERE t.repo_id IN %s;
""".format(
tablename=tablename
),
Expand Down
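For context, a minimal sketch of how a caller might drive the renamed filter column, assuming a psycopg2-style driver that expands a Python tuple into an IN list; the DSN and repo ids are made up, and 8Knot's actual cache driver and connection handling may differ:

    import psycopg2

    conn = psycopg2.connect("dbname=cache user=postgres")  # hypothetical DSN
    repo_ids = (1, 2, 3)  # made-up repo ids to filter on

    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT *
            FROM commits_query t
            WHERE t.repo_id IN %s;
            """,
            (repo_ids,),  # psycopg2 expands the tuple into (1, 2, 3)
        )
        rows = cur.fetchall()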
60 changes: 30 additions & 30 deletions 8Knot/cache_manager/db_init.py
@@ -121,10 +121,10 @@ def _create_application_tables() -> None:
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS commits_query(
-            id int,
-            commits text, -- this is the commit hash, so it's base64 hash.
+            repo_id int,
+            commit_hash text, -- this is the commit hash, so it's base64 hash.
             author_email text,
-            date text,
+            author_date text,
             author_timestamp text,
             committer_timestamp text)
         """
@@ -134,15 +134,15 @@ def _create_application_tables() -> None:
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS issues_query(
-            id int,
+            repo_id int,
             repo_name text,
-            issue int,
+            issue_number int,
             gh_issue int,
             reporter_id text,
             issue_closer text,
-            created text,
-            closed text
+            created_at text,
+            closed_at text
             )
         """
     )
@@ -151,14 +151,14 @@ def _create_application_tables() -> None:
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS prs_query(
-            id int,
+            repo_id int,
             repo_name text,
-            pull_request int,
+            pull_request_id int,
             pr_src_number int,
             cntrb_id text,
-            created text,
-            closed text,
-            merged text
+            created_at text,
+            closed_at text,
+            merged_at text
             )
         """
     )
@@ -168,8 +168,8 @@
"""
CREATE UNLOGGED TABLE IF NOT EXISTS affiliation_query(
cntrb_id text,
created text,
id int,
created_at text,
repo_id int,
login text,
action text,
rank int,
@@ -183,7 +183,7 @@
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS contributors_query(
-            id int,
+            repo_id int,
             repo_name text,
             cntrb_id text,
             created_at text,
@@ -199,9 +199,9 @@
"""
CREATE UNLOGGED TABLE IF NOT EXISTS issue_assignee_query(
issue_id text,
id int,
created text,
closed text,
repo_id int,
created_at text,
closed_at text,
assign_date text,
assignment_action text,
assignee text
@@ -214,9 +214,9 @@
"""
CREATE UNLOGGED TABLE IF NOT EXISTS pr_assignee_query(
pull_request_id int,
id int,
created text,
closed text,
repo_id int,
created_at text,
closed_at text,
assign_date text,
assignment_action text,
assignee text
@@ -229,7 +229,7 @@
"""
CREATE UNLOGGED TABLE IF NOT EXISTS cntrb_per_file_query(
file_path text,
id int,
repo_id int,
cntrb_ids text
)
"""
@@ -240,8 +240,8 @@
"""
CREATE UNLOGGED TABLE IF NOT EXISTS pr_file_query(
file_path text,
pull_request int,
id int
pull_request_id int,
repo_id int
)
"""
)
@@ -250,7 +250,7 @@
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS repo_files_query(
-            id int,
+            repo_id int,
             repo_name text,
             repo_path text,
             rl_analysis_date text,
@@ -264,7 +264,7 @@
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS repo_languages_query(
-            id int,
+            repo_id int,
             programming_language text,
             code_lines int,
             files int
@@ -276,7 +276,7 @@
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS package_version_query(
-            id int,
+            repo_id int,
             name text,
             current_release_date text,
             latest_release_date text,
@@ -289,7 +289,7 @@
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS repo_releases_query(
-            id int,
+            repo_id int,
             release_name text,
             release_created_at text,
             release_published_at text,
@@ -302,7 +302,7 @@
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS ossf_score_query(
-            id int,
+            repo_id int,
             name text,
             score float4
             )
@@ -313,7 +313,7 @@
     cur.execute(
         """
         CREATE UNLOGGED TABLE IF NOT EXISTS repo_info_query(
-            id int,
+            repo_id int,
             issues_enabled text,
             fork_count int,
             watchers_count int,
@@ -331,7 +331,7 @@
"""
CREATE UNLOGGED TABLE IF NOT EXISTS pr_response_query(
pull_request_id int,
ID int,
repo_id int,
cntrb_id text,
msg_timestamp text,
msg_cntrb_id text,
Expand Down
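As a rough illustration of the renamed schema in use (not code from this PR), a loader writing Augur rows into the commits cache would now name the new columns; all values below are made up:

    rows = [
        (1, "deadbeef", "dev@example.com", "2024-01-10", "1704844800", "1704844900"),
    ]
    cur.executemany(
        """
        INSERT INTO commits_query
            (repo_id, commit_hash, author_email, author_date,
             author_timestamp, committer_timestamp)
        VALUES (%s, %s, %s, %s, %s, %s)
        """,
        rows,
    )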
8 changes: 4 additions & 4 deletions 8Knot/pages/affiliation/visualizations/gh_org_affiliation.py
@@ -168,16 +168,16 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
     requiring no further processing."""

     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)

     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]

     # initial count of same company name in github profile
     result = df.cntrb_company.value_counts(dropna=False)
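The process_data changes in this and the following affiliation pages are mechanical: every read of the old created column becomes created_at. A self-contained sketch of the datetime-and-filter pattern, with made-up data:

    import pandas as pd

    df = pd.DataFrame(
        {
            "cntrb_company": ["Acme", "Globex"],  # made-up rows
            "created_at": ["2023-01-05", "2023-06-01"],
        }
    )
    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
    df = df.sort_values(by="created_at", ascending=True)

    start_date = pd.Timestamp("2023-02-01", tz="UTC")
    df = df[df.created_at >= start_date]  # keeps only the Globex row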
18 changes: 12 additions & 6 deletions 8Knot/pages/affiliation/visualizations/org_associated_activity.py
@@ -82,8 +82,14 @@
     dbc.Checklist(
         id=f"email-filter-{PAGE}-{VIZ_ID}",
         options=[
-            {"label": "Exclude Gmail", "value": "gmail"},
-            {"label": "Exclude GitHub", "value": "github"},
+            {
+                "label": "Exclude Gmail",
+                "value": "gmail",
+            },
+            {
+                "label": "Exclude GitHub",
+                "value": "github",
+            },
         ],
         value=[""],
         inline=True,
@@ -201,16 +207,16 @@ def org_associated_activity_graph(repolist, num, start_date, end_date, email_fil

 def process_data(df: pd.DataFrame, num, start_date, end_date, email_filter):
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)

     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]

     # creates list of emails for each contribution and flattens list result
     emails = df.email_list.str.split(" , ").explode("email_list").tolist()
30 changes: 21 additions & 9 deletions 8Knot/pages/affiliation/visualizations/org_core_contributors.py
@@ -107,8 +107,14 @@
     dbc.Checklist(
         id=f"email-filter-{PAGE}-{VIZ_ID}",
         options=[
-            {"label": "Exclude Gmail", "value": "gmail"},
-            {"label": "Exclude GitHub", "value": "github"},
+            {
+                "label": "Exclude Gmail",
+                "value": "gmail",
+            },
+            {
+                "label": "Exclude GitHub",
+                "value": "github",
+            },
         ],
         value=[""],
         inline=True,
@@ -165,7 +171,7 @@ def toggle_popover(n, is_open):
     background=True,
 )
 def compay_associated_activity_graph(
-    repolist, contributions, contributors, start_date, end_date, email_filter, bot_switch
+    repolist,
+    contributions,
+    contributors,
+    start_date,
+    end_date,
+    email_filter,
+    bot_switch,
 ):
     # wait for data to asynchronously download and become available.
     while not_cached := cf.get_uncached(func_name=aq.__name__, repolist=repolist):
@@ -201,23 +213,23 @@ def compay_associated_activity_graph(

 def process_data(df: pd.DataFrame, contributions, contributors, start_date, end_date, email_filter):
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)

     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]

     # groups contributions by contributor id and counts; the created_at column now holds
     # the number of contributions for its respective contributor
-    df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created"]].count()
+    df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created_at"]].count()

     # filters out contributors that don't meet the core contribution threshold
-    df = df[df.created >= contributions]
+    df = df[df.created_at >= contributions]

     # creates list of unique emails and flattens list result
     emails = df.email_list.str.split(" , ").explode("email_list").tolist()
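A toy run of the groupby/threshold logic above, with made-up data, showing that the count lands in the renamed created_at column:

    import pandas as pd

    df = pd.DataFrame(
        {
            "cntrb_id": ["a", "a", "b"],
            "email_list": ["a@x.io", "a@x.io", "b@y.io"],
            "created_at": pd.to_datetime(
                ["2023-01-01", "2023-02-01", "2023-03-01"], utc=True
            ),
        }
    )

    # count() turns created_at into a per-contributor contribution count
    counts = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created_at"]].count()
    core = counts[counts.created_at >= 2]  # only contributor "a" meets a threshold of 2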
8 changes: 4 additions & 4 deletions 8Knot/pages/affiliation/visualizations/unqiue_domains.py
@@ -165,16 +165,16 @@ def unique_domains_graph(repolist, num, start_date, end_date, bot_switch):

 def process_data(df: pd.DataFrame, num, start_date, end_date):
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)

     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]

     # creates list of unique emails and flattens list result
     emails = df.email_list.str.split(" , ").explode("email_list").unique().tolist()
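The email flattening itself is untouched by this PR; a minimal equivalent with made-up data (written with a bare Series.explode(), which takes no column argument, unlike the call in the page code):

    import pandas as pd

    df = pd.DataFrame({"email_list": ["a@x.io , b@y.io", "c@x.io"]})

    # split each comma-joined string into a list, then flatten to one email per row
    emails = df.email_list.str.split(" , ").explode().unique().tolist()
    domains = {e.split("@")[1] for e in emails}  # {"x.io", "y.io"}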
16 changes: 10 additions & 6 deletions 8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
@@ -180,7 +180,7 @@ def directory_dropdown(repo_id):
     # strings to hold the values for each column (always the same for every row of this query)
     repo_name = df["repo_name"].iloc[0]
     repo_path = df["repo_path"].iloc[0]
-    repo_id = str(df["id"].iloc[0])
+    repo_id = str(df["repo_id"].iloc[0])

     # pattern found in each file path, used to slice to get only the root file path
     path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"
@@ -192,7 +192,7 @@ def directory_dropdown(repo_id):
     # drop unnecessary columns not needed after preprocessing steps
     df = df.reset_index()
     df.drop(
-        ["index", "id", "repo_name", "repo_path", "rl_analysis_date"],
+        ["index", "repo_id", "repo_name", "repo_path", "rl_analysis_date"],
         axis=1,
         inplace=True,
     )
@@ -308,7 +308,7 @@ def process_data(
     # strings to hold the values for each column (always the same for every row of this query)
     repo_name = df_file["repo_name"].iloc[0]
     repo_path = df_file["repo_path"].iloc[0]
-    repo_id = str(df_file["id"].iloc[0])
+    repo_id = str(df_file["repo_id"].iloc[0])

     # pattern found in each file path, used to slice to get only the root file path
     path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"
@@ -322,8 +322,8 @@ def process_data(
     df_file = df_file.join(df_file["file_path"].str.split("/", expand=True))

     # drop unnecessary columns
-    df_file.drop(["id"], axis=1, inplace=True)
-    df_file_cntbs.drop(["id"], axis=1, inplace=True)
+    df_file.drop(["repo_id"], axis=1, inplace=True)
+    df_file_cntbs.drop(["repo_id"], axis=1, inplace=True)

     # Left join on df_files to only get the files that are currently in the repository
     # and the contributors that have ever opened a pr that included edits on the file
@@ -387,7 +387,11 @@

     # drop unnecessary columns not needed after preprocessing steps
     df_actions = df_actions.reset_index()
-    df_actions.drop(["index", "id", "repo_name", "login", "Action", "rank"], axis=1, inplace=True)
+    df_actions.drop(
+        ["index", "repo_id", "repo_name", "login", "Action", "rank"],
+        axis=1,
+        inplace=True,
+    )

     # dictionary of cntrb_ids and their most recent activity on repo
     last_contrb = df_actions.set_index("cntrb_id")["created_at"].to_dict()
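The repo_id read in this file feeds the path_slice prefix shown in the context lines; a made-up example of the trimming it enables (the page's actual slicing code may differ):

    repo_id = "123"  # made-up values throughout
    repo_path = "github.com/example-org"
    repo_name = "example-repo"

    # prefix found at the start of every file path returned by the query
    path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"

    full_path = path_slice + "src/app.py"
    relative = full_path[len(path_slice):]  # "src/app.py"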