-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdbsi_course_scraper.py
54 lines (49 loc) · 1.97 KB
/
dbsi_course_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os
from pathlib import Path
from urllib.parse import urljoin

import requests
import tqdm
from bs4 import BeautifulSoup
# Course/semester slug that selects which offering's schedule page is scraped
# (presumably course 4420, Fall 2024 -- confirm against the course site).
SEMESTER_PREFIX = "4420-f24"
base_url = f"https://faculty.cc.gatech.edu/~jarulraj/courses/{SEMESTER_PREFIX}/pages/schedule.html"

# Fetch the schedule page. The timeout keeps a stalled connection from hanging
# the script forever, and raise_for_status() stops early on an HTTP error
# instead of parsing an error page as if it were the schedule.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Output directories, created relative to the current working directory:
# downloaded PDFs go to Slides/, the list of .cpp links goes to Code/.
slides_path = Path("Slides")
code_path = Path("Code")
slides_path.mkdir(exist_ok=True)
code_path.mkdir(exist_ok=True)
# Walk every table row of the schedule page. For each cell that contains
# links: write a topic header plus the row's reading to Code/code_links.txt,
# download every linked PDF into Slides/, and record every linked .cpp URL.
trs = soup.find_all("tr")

# Reading carried over from the previous row. Seeded here so that a ditto
# mark in the very first row cannot hit an undefined name.
reading = ""

# Open the link list once (append mode, as before) rather than re-opening it
# for every table cell; this also guarantees the file exists afterwards.
with open(code_path / "code_links.txt", "a") as f:
    for tr in tqdm.tqdm(trs):
        tds = tr.find_all("td")
        # Rows with fewer than three cells carry no topic/reading pair.
        if len(tds) < 3:
            continue

        # The second-to-last cell holds the reading; a lone '"' is a ditto
        # mark meaning "same reading as the previous row".
        cell_reading = tds[-2].text
        if cell_reading.strip() != '"':
            reading = cell_reading

        for td in tds:
            links = td.find_all("a")
            if not links:
                continue

            # Link text starting with "[" is treated as a resource link
            # rather than a topic title, so no header is written for it.
            topic = links[0].text
            if not topic.startswith("["):
                f.write(f"{'-' * 40}\n")
                f.write(f"{topic}\n")
                f.write(f"{'-' * 40}\n")
                if "§" in reading:
                    reading = reading.replace("§", "Chapter(s)")
                f.write(f"Reading: {reading}\n")

            for link in links:
                href = link.get("href")
                # Anchors without an href have nothing to download or record.
                if not href:
                    continue
                # Resolve relative links against the schedule page; absolute
                # URLs pass through urljoin unchanged.
                link_url = urljoin(base_url, href)
                if link_url.endswith("pdf"):
                    slides_name = link_url.split("/")[-1]
                    slides_response = requests.get(link_url, timeout=30)
                    # Do not save an HTTP error body as if it were a PDF.
                    if not slides_response.ok:
                        tqdm.tqdm.write(
                            f"Skipping {link_url}: HTTP {slides_response.status_code}"
                        )
                        continue
                    with open(slides_path / slides_name, "wb") as fs:
                        fs.write(slides_response.content)
                elif link_url.endswith("cpp"):
                    f.write(f"{link_url}\n")
# In the code file, replace multiple newlines with a single newline:
# drop every blank line and leave each remaining line terminated by exactly
# one "\n".
with open(code_path / "code_links.txt", "r") as f:
    lines = f.readlines()

# readlines() keeps each line's trailing "\n", so a newline is only added
# when the final line of the file lacks one. Appending "\n" unconditionally
# would put a blank line after every entry.
new_lines = [
    line if line.endswith("\n") else line + "\n"
    for line in lines
    if line.strip()
]

with open(code_path / "code_links.txt", "w") as f:
    f.writelines(new_lines)