import re
from datetime import datetime
from urllib.parse import urlparse

import pymysql
import pytz
import yaml
from dateutil.parser import parse

# Own. generate_soup() and has_valid_github_url() come from these modules;
# yaml, pymysql, pytz, datetime and dateutil were previously pulled in
# implicitly through these star imports and are made explicit above.
from parse_awesome_lists import *
from attach_all_github_repo_info import *
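
# Rewrites README.md in place: every plain GitHub repo link gets a <sup>
# badge with its star count and days since last push (read from the
# awesome_augmented MySQL table), and its href is repointed at the
# augmented copy of that list.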
input_dir = './original_awesomes/'

with open('config.yml', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.SafeLoader)

conn = pymysql.connect(
    host='127.0.0.1',
    charset='utf8',
    use_unicode=True,
    unix_socket='/tmp/mysql.sock',
    user=config['database']['user'],
    passwd=None,
    db=config['database']['db'],
    autocommit=True
)
cur = conn.cursor()
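

# Helper (not called from main() below): counts how many <li> entries of a
# parsed list carry a valid GitHub link, returning (github_links, total_lis).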
def generate_urls_ratio(soup):
    lis = soup.find_all('li')
    github_urls = set()
    for li in lis:
        a = li.find_all('a')
        if has_valid_github_url(a):
            github_urls.add(a[0]['href'])
    return (len(github_urls), len(lis))
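

# Loads a locally cached copy of an awesome list ("<name>.md") and reports
# its GitHub-link ratio via generate_urls_ratio(). Also unused by main().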
def count_github_urls(url, input_directory='.'):
    filename = input_directory + urlparse(url).path.split('/')[-1] + '.md'
    with open(filename, 'r') as f:
        content = f.read()
    soup = generate_soup(content)
    return generate_urls_ratio(soup)
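

# Looks up one repo in the awesome_augmented table; returns (stars, days
# since last push), or (None, None) when there is no single matching row.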
def generate_awesome_project_info(url):
    # Parameterised query avoids SQL injection and quoting problems.
    query = "select stargazers_count, pushed_at from awesome_augmented where repo_url=%s"
    cur.execute(query, (url,))
    rows = cur.fetchall()
    if len(rows) == 1:
        stars_count, updated_at = rows[0]
        updated_at_datetime = parse(updated_at)
        updated_days_ago = (datetime.now(pytz.utc) - updated_at_datetime).days
        return (stars_count, updated_days_ago)
    return (None, None)
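

# Maps a repo URL like https://github.com/<user>/<name> onto the augmented
# copy of that list kept under awesomes/<name>.md in this repository.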
def regenerate_hrefs(url):
    prefix = 'https://github.com/chaconnewu/awesome-augmented/blob/master/awesomes/'
    full_name = urlparse(url).path
    new_url = prefix + full_name.split('/')[-1] + '.md'
    return new_url
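

# Walks every list item in README.md, annotates qualifying GitHub links
# with a star/freshness badge, and writes the modified tree back out.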
def main():
    # awesome_url = 'https://raw.githubusercontent.com/sindresorhus/awesome/master/readme.md'
    # content = request_content(awesome_url)
    # f = open('awesome.md', 'w')
    # f.write(content)
    # del f
    with open('README.md', 'r') as f:
        soup = generate_soup(f.read())
    lis = soup.find_all('li')
    visited = set()
    for li in lis:
        a = li.find_all('a')
        if len(a) > 0 and re.search('^https://github.com/[^/]+/[^/]+/?$', a[0]['href']):
            url = a[0]['href']
            stars_count, updated_days_ago = generate_awesome_project_info(url)
            # Compare against None rather than truthiness: a repo pushed
            # today has updated_days_ago == 0, which is falsy but valid.
            if stars_count is None or updated_days_ago is None:
                continue
            if a[0] in visited:
                continue
            visited.add(a[0])
            tag = soup.new_tag('sup')
            tag.string = ' ★ %d, pushed %d days ago ' % (stars_count, updated_days_ago)
            a[0]['href'] = regenerate_hrefs(url)
            a[0].insert_after(tag)
            # li.insert(len(li.contents), tag)
    with open('README.md', 'w') as f:
        f.write(soup.prettify(formatter=None))


if __name__ == '__main__':
    main()