
Commit 962b516

update scripts
1 parent 608786c · commit 962b516
5 files changed: +300 −52 lines changed

scripts/aggregate_temporal.py

+144
@@ -0,0 +1,144 @@
import json
from datetime import datetime, timedelta
from collections import defaultdict
from typing import Dict, List, Optional
import argparse
from copy import deepcopy

def parse_timestamp(ts: str) -> datetime:
    """Parse GitHub timestamp format to datetime"""
    return datetime.strptime(ts, "%Y-%m-%dT%H:%M:%SZ")

def get_activity_period(timestamp: str, period: str = "daily") -> str:
    """Convert timestamp to period key (daily/weekly/monthly)"""
    dt = parse_timestamp(timestamp)
    if period == "daily":
        return dt.strftime("%Y-%m-%d")
    elif period == "weekly":
        # Get start of week (Monday)
        start = dt - timedelta(days=dt.weekday())
        return start.strftime("%Y-%m-%d")
    else:  # monthly
        return dt.strftime("%Y-%m")

def aggregate_contributor_data(data: Dict, period: str) -> Dict[str, List]:
    """Aggregate contributor data by time period"""
    period_data = defaultdict(lambda: defaultdict(lambda: {
        "contributor": "",
        "score": 0,
        "summary": "",
        "avatar_url": "",
        "activity": {
            "code": {
                "total_commits": 0,
                "total_prs": 0,
                "commits": [],
                "pull_requests": []
            },
            "issues": {
                "total_opened": 0,
                "opened": []
            },
            "engagement": {
                "total_comments": 0,
                "total_reviews": 0,
                "comments": [],
                "reviews": []
            }
        }
    }))

    # Process each contributor
    for contrib in data:
        username = contrib["contributor"]

        # Process commits
        for commit in contrib["activity"]["code"]["commits"]:
            period_key = get_activity_period(commit["created_at"], period)
            period_data[period_key][username]["contributor"] = username
            period_data[period_key][username]["avatar_url"] = contrib["avatar_url"]
            period_data[period_key][username]["activity"]["code"]["commits"].append(commit)
            period_data[period_key][username]["activity"]["code"]["total_commits"] += 1

        # Process PRs
        for pr in contrib["activity"]["code"]["pull_requests"]:
            period_key = get_activity_period(pr["created_at"], period)
            period_data[period_key][username]["contributor"] = username
            period_data[period_key][username]["avatar_url"] = contrib["avatar_url"]
            period_data[period_key][username]["activity"]["code"]["pull_requests"].append(pr)
            period_data[period_key][username]["activity"]["code"]["total_prs"] += 1

        # Process issues
        for issue in contrib["activity"]["issues"]["opened"]:
            period_key = get_activity_period(issue["created_at"], period)
            period_data[period_key][username]["contributor"] = username
            period_data[period_key][username]["avatar_url"] = contrib["avatar_url"]
            period_data[period_key][username]["activity"]["issues"]["opened"].append(issue)
            period_data[period_key][username]["activity"]["issues"]["total_opened"] += 1

    # Convert defaultdict to regular dict and list structure
    result = {}
    for period_key, contributors in period_data.items():
        result[period_key] = list(contributors.values())

    return result

def save_period_data(data: Dict[str, List], output_dir: str, period: str):
    """Save aggregated data to appropriate directories"""
    import os
    from pathlib import Path

    # Create directory structure
    base_dir = Path(output_dir)
    period_dir = base_dir / period
    history_dir = period_dir / "history"

    os.makedirs(period_dir, exist_ok=True)
    os.makedirs(history_dir, exist_ok=True)

    # Save each period's data
    for date_key, contributors in data.items():
        if not contributors:  # Skip empty periods
            continue

        # Save current data
        current_file = period_dir / "scored.json"
        with open(current_file, 'w') as f:
            json.dump(contributors, f, indent=2)

        # Save historical copy
        history_file = history_dir / f"scored_{date_key}.json"
        with open(history_file, 'w') as f:
            json.dump(contributors, f, indent=2)

def main():
    parser = argparse.ArgumentParser(description="Aggregate GitHub activity data by time period")
    parser.add_argument("input_file", help="Input contributors JSON file")
    parser.add_argument("output_dir", help="Output directory for aggregated data")
    parser.add_argument("--periods", nargs="+", choices=["daily", "weekly", "monthly"],
                        default=["daily", "weekly", "monthly"],
                        help="Time periods to generate")
    args = parser.parse_args()

    # Load data
    print(f"\nLoading data from {args.input_file}...")
    with open(args.input_file) as f:
        data = json.load(f)

    # Process each time period
    for period in args.periods:
        print(f"\nProcessing {period} aggregation...")
        aggregated = aggregate_contributor_data(data, period)

        print(f"Saving {period} data...")
        save_period_data(aggregated, args.output_dir, period)

        # Print some stats
        total_periods = len(aggregated)
        total_contributions = sum(len(contributors) for contributors in aggregated.values())
        print(f"Generated {total_periods} {period} periods with {total_contributions} total contributions")

    print("\nProcessing complete!")

if __name__ == "__main__":
    main()
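
For a quick sanity check of the bucketing logic, here is a minimal, hypothetical example. The record shape mirrors the fields read by aggregate_contributor_data above; the username, avatar URL, and dates are invented, and the input path in the CLI comment is an assumption, not something this commit pins down.

# Hypothetical usage sketch -- not part of this commit.
# Assumed CLI invocation: python scripts/aggregate_temporal.py data/contributors.json data --periods weekly
sample = [{
    "contributor": "octocat",                          # invented
    "avatar_url": "https://example.com/octocat.png",   # invented
    "activity": {
        "code": {
            "commits": [{"created_at": "2024-11-06T12:30:00Z"}],  # a Wednesday
            "pull_requests": []
        },
        "issues": {"opened": []},
        "engagement": {"comments": [], "reviews": []}
    }
}]

weekly = aggregate_contributor_data(sample, "weekly")
print(list(weekly.keys()))                                            # ["2024-11-04"] -- the Monday of that week
print(weekly["2024-11-04"][0]["activity"]["code"]["total_commits"])   # 1

Weekly buckets are keyed by the Monday that starts the week, so the Wednesday commit above lands under "2024-11-04".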

scripts/fetch_github.sh

File mode changed: 100644 → 100755

scripts/generate_history_summaries.sh

+45
@@ -0,0 +1,45 @@
#!/bin/bash

# Create directories if they don't exist
mkdir -p data/{daily,weekly,monthly}/history

# Function to process files for a given period
process_historical() {
    local period=$1
    echo "Processing ${period} historical summaries..."

    for scored_file in data/${period}/history/scored_*.json; do
        if [ -f "$scored_file" ]; then
            # Extract date from filename
            date=$(echo "$scored_file" | grep -o '[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}')

            if [ ! -z "$date" ]; then
                echo "Generating summary for $date..."
                output_file="data/${period}/history/contributors_${date}.json"

                # Generate summary
                python scripts/summarize.py -f \
                    "$scored_file" \
                    "$output_file" \
                    --model ollama

                if [ $? -eq 0 ]; then
                    echo "✓ Processed $scored_file -> $output_file"
                else
                    echo "✗ Failed to process $scored_file"
                fi
            fi
        fi
    done
}

# Process each period type
for period in daily weekly monthly; do
    if ls data/${period}/history/scored_*.json 1> /dev/null 2>&1; then
        process_historical $period
    else
        echo "No scored files found for ${period}"
    fi
done

echo "All historical summaries processed!"

scripts/summarize.py

+6 −6
@@ -6,7 +6,7 @@
 from langchain_core.prompts import PromptTemplate
 from collections import defaultdict
 
-def get_contribution_stats(data: Dict, days: int = 45) -> Dict:
+def get_contribution_stats(data: Dict, days: int = 90) -> Dict:
     """Get high-level contribution statistics for time period"""
     cutoff_date = datetime.utcnow() - timedelta(days=days)
     stats = defaultdict(int)
@@ -71,7 +71,7 @@ def get_contribution_stats(data: Dict, days: int = 45) -> Dict:
         'areas': {k: list(v) for k, v in work_areas.items()}
     }
 
-def get_recent_activity(data: Dict, days: int = 45) -> List[str]:
+def get_recent_activity(data: Dict, days: int = 90) -> List[str]:
     """Get most relevant recent activity"""
     cutoff_date = datetime.utcnow() - timedelta(days=days)
     activity = []
@@ -159,7 +159,7 @@ def get_summary_prompt(data: Dict, activity: List[str], stats: Dict) -> str:
     if stats['areas'].get('issue_areas'):
         areas_str += f"\nIssue areas: {', '.join(stats['areas']['issue_areas'])}"
 
-    return f"""Based on this GitHub activity from the last 45 days, write a 2-3 sentence summary of what {data['contributor']} worked on:
+    return f"""Based on this GitHub activity from the last 90 days, write a 2-3 sentence summary of what {data['contributor']} worked on:
 
 Recent Activity (most significant first):
 {chr(10).join(activity)}
@@ -176,12 +176,12 @@ def get_summary_prompt(data: Dict, activity: List[str], stats: Dict) -> str:
 def generate_summary(data: Dict, model: str, api_key: str = None) -> str:
     """Generate summary using specified model"""
     try:
-        activity = get_recent_activity(data, days=45)
-        stats = get_contribution_stats(data, days=45)
+        activity = get_recent_activity(data, days=90)
+        stats = get_contribution_stats(data, days=90)
 
         # If no activity was found, return early
         if not activity:
-            return f"{data['contributor']} has no significant activity in the last 45 days."
+            return f"{data['contributor']} has no significant activity in the last 90 days."
 
         if model == "openai":
             from openai import OpenAI
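
The only functional change in this file is the lookback window: get_recent_activity and get_contribution_stats now default to 90 days instead of 45, and the prompt and fallback message are updated to match. In terms of the existing cutoff computation shown in the context lines above, the effect is simply:

# cutoff used by both helpers (sketch of the existing line, with the new default)
cutoff_date = datetime.utcnow() - timedelta(days=90)  # previously days=45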
