diff --git a/tools/save-freddit/out/.gitignore b/tools/save-freddit/out/.gitignore
deleted file mode 100644
index c96a04f..0000000
--- a/tools/save-freddit/out/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*
-!.gitignore
\ No newline at end of file
diff --git a/tools/save-freddit/requirements.txt b/tools/save-freddit/requirements.txt
deleted file mode 100644
index 9688b8e..0000000
--- a/tools/save-freddit/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-Requests==2.31.0
diff --git a/tools/save-freddit/save-freddit.py b/tools/save-freddit/save-freddit.py
deleted file mode 100644
index fdefe3a..0000000
--- a/tools/save-freddit/save-freddit.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import src.collect_posts as collect_posts
-import src.collect_post_pages as collect_post_pages
-import src.collect_images as collect_images
-
-import os
-import sys
-
-# ARGS: subreddit, output_dir. Optional (--): --save_json, --stages, --collect_posts, --collect_post_pages, --collect_images
-
-subreddit = sys.argv[1]
-output_dir = sys.argv[2] if len(sys.argv) > 2 and sys.argv[2][:2] != '--' else f'out/{subreddit}'
-
-# make output dirs if non-existent
-if not os.path.exists(output_dir):
-    os.makedirs(output_dir)
-if not os.path.exists(f'{output_dir}/pages'):
-    os.makedirs(f'{output_dir}/pages')
-if not os.path.exists(f'{output_dir}/posts'):
-    os.makedirs(f'{output_dir}/posts')
-if not os.path.exists(f'{output_dir}/images'):
-    os.makedirs(f'{output_dir}/images')
-
-# optional args
-save_json = False
-
-if '--save_json' in sys.argv:
-    save_json = True
-
-stage1 = True
-stage2 = True
-stage3 = True
-
-if "--stages" in sys.argv:
-    stage1, stage2, stage3 = False, False, False
-
-if '--collect_posts' in sys.argv:
-    stage1 = True
-
-if '--collect_post_pages' in sys.argv:
-    stage2 = True
-
-if '--collect_images' in sys.argv:
-    stage3 = True
-
-# collect posts
-if stage1:
-    collect_posts.collect_posts(subreddit, output_dir, save_json=True)
-if stage2:
-    collect_post_pages.collect_post_pages(subreddit, output_dir)
-if stage3:
-    collect_images.collect_images(output_dir, subreddit)
diff --git a/tools/save-freddit/src/__pycache__/.gitignore b/tools/save-freddit/src/__pycache__/.gitignore
deleted file mode 100644
index c96a04f..0000000
--- a/tools/save-freddit/src/__pycache__/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*
-!.gitignore
\ No newline at end of file
diff --git a/tools/save-freddit/src/collect_images.py b/tools/save-freddit/src/collect_images.py
deleted file mode 100644
index 8b13789..0000000
--- a/tools/save-freddit/src/collect_images.py
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/tools/save-freddit/src/collect_post_pages.py b/tools/save-freddit/src/collect_post_pages.py
deleted file mode 100644
index cf4d4ff..0000000
--- a/tools/save-freddit/src/collect_post_pages.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import requests
-import json
-import time
-import os
-import random
-
-
-def collect_post_pages(subreddit, output_dir):
-    with open(f'{output_dir}/post_urls.json', 'r') as f:
-        posts = json.load(f)
-
-    sleep_timer = 1 # should be safe
-
-    headers = {'User-Agent': 'Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0'}
-
-    i = 0
-
-    for url in posts:
-        i += 1
-        url = url.replace('https://www.reddit.com', 'https://old.reddit.com')
-
-        # Replace any characters that are not allowed in filenames
-        title = url.replace('/', '_').replace(':', '_').replace('?', '_').replace('&', '_').replace('=', '_').replace(' ', '_')
-
-        # clean title
-        title = title.replace(f'https___old.reddit.com_r_{subreddit}_comments_', '')
-
-        # if out file exists, skip
-        if os.path.exists(f'out/posts/{title}.json'):
-            print('File exists, skipping.')
-            continue
-
-        response = requests.get(url + '.json', headers=headers)
-        json_data = response.json()
-
-        print(url, response)
-
-        with open(f'{output_dir}/posts/{title}.json', 'w') as f:
-            f.write(json.dumps(json_data, indent=4))
-
-        print(f"{i}/{len(posts)}")
-
-        # Delay spoofing
-        sleep_timer_random = sleep_timer + random.uniform(-1, 1)
-        time.sleep(sleep_timer_random)
diff --git a/tools/save-freddit/src/collect_posts.py b/tools/save-freddit/src/collect_posts.py
deleted file mode 100644
index 7ed825c..0000000
--- a/tools/save-freddit/src/collect_posts.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import requests
-import json
-import time
-import random
-import sys
-
-sleep_timer = 3 # should be safe
-headers = {'User-Agent': 'Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0'}
-
-def write_json(data, filename):
-    with open(filename, 'w') as f:
-        f.write(json.dumps(data, indent=4))
-
-def collect_posts(subreddit, output_dir, save_json=False):
-
-    output_filename = f'{output_dir}/post_urls.json'
-    post_urls = []
-    base_reddit_url = f'https://old.reddit.com'
-    reddit_url = base_reddit_url + f'/r/{subreddit}'
-    params = ""
-
-    page = 0
-    pages = 0
-
-    while True:
-        try:
-            page += 1
-
-            response = requests.get(reddit_url + "/.json" + params, headers=headers)
-            json_data = response.json()
-
-            if save_json:
-                with open(f'{output_dir}/pages/{pages}.json', 'w') as f:
-                    f.write(json.dumps(json_data, indent=4))
-
-            for c in json_data['data']['children']:
-                post_urls.append(base_reddit_url + c['data']['permalink'])
-
-            print(f'Page {page} done. {len(post_urls)} links found.')
-
-            pages += 25
-
-            after = json_data['data']['after']
-
-            if after is None:
-                break
-
-            params = f"?count={pages}&after={after}"
-            print(params)
-
-            # Delay spoofing
-            sleep_timer_random = sleep_timer + random.uniform(-1, 1)
-            time.sleep(sleep_timer_random)
-
-        except KeyboardInterrupt:
-            write_json(post_urls, output_filename)
-            sys.exit(0)
-
-    write_json(post_urls, output_filename)
-
-    return True
\ No newline at end of file