Commit
Merge pull request #517 from aliparlakci/development
v2.4
aliparlakci committed Sep 12, 2021
2 parents 900f9a9 + 063caf0 commit afe3b71
Showing 33 changed files with 323 additions and 131 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -76,6 +76,10 @@ The following options are common between both the `archive` and `download` commands
- Can be specified multiple times
- Disables certain modules from being used
- See [Disabling Modules](#disabling-modules) for more information and a list of module names
- `--include-id-file`
  - This will add any submissions with IDs found in the files provided (see the usage example after this list)
- Can be specified multiple times
- Format is one ID per line
- `--log`
- This allows one to specify the location of the logfile
- This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below
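
For context, a hypothetical invocation of the new option (the path, filename, and IDs are illustrative; the command form follows the README's existing examples):

```bash
# ids.txt contains one submission ID per line, e.g.:
#   m3reby
#   p9uynl
python3 -m bdfr download ./output --include-id-file ./ids.txt
```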
11 changes: 6 additions & 5 deletions bdfr/__main__.py
@@ -6,9 +6,9 @@
import click

from bdfr.archiver import Archiver
from bdfr.cloner import RedditCloner
from bdfr.configuration import Configuration
from bdfr.downloader import RedditDownloader
from bdfr.cloner import RedditCloner

logger = logging.getLogger()

@@ -17,6 +17,7 @@
click.option('--authenticate', is_flag=True, default=None),
click.option('--config', type=str, default=None),
click.option('--disable-module', multiple=True, default=None, type=str),
click.option('--include-id-file', multiple=True, default=None),
click.option('--log', type=str, default=None),
click.option('--saved', is_flag=True, default=None),
click.option('--search', default=None, type=str),
@@ -26,12 +27,12 @@
click.option('-L', '--limit', default=None, type=int),
click.option('-l', '--link', multiple=True, default=None, type=str),
click.option('-m', '--multireddit', multiple=True, default=None, type=str),
click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', 'controversial', 'rising', 'relevance')),
default=None),
click.option('-s', '--subreddit', multiple=True, default=None, type=str),
click.option('-v', '--verbose', default=None, count=True),
click.option('-u', '--user', type=str, multiple=True, default=None),
click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None),
click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new',
'controversial', 'rising', 'relevance')), default=None),
click.option('-u', '--user', type=str, multiple=True, default=None),
click.option('-v', '--verbose', default=None, count=True),
]

_downloader_options = [
6 changes: 3 additions & 3 deletions bdfr/archiver.py
@@ -76,17 +76,17 @@ def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)):
logger.info(f'Record for entry item {praw_item.id} written to disk')

def _write_entry_json(self, entry: BaseArchiveEntry):
resource = Resource(entry.source, '', '.json')
resource = Resource(entry.source, '', lambda: None, '.json')
content = json.dumps(entry.compile())
self._write_content_to_disk(resource, content)

def _write_entry_xml(self, entry: BaseArchiveEntry):
resource = Resource(entry.source, '', '.xml')
resource = Resource(entry.source, '', lambda: None, '.xml')
content = dict2xml.dict2xml(entry.compile(), wrap='root')
self._write_content_to_disk(resource, content)

def _write_entry_yaml(self, entry: BaseArchiveEntry):
resource = Resource(entry.source, '', '.yaml')
resource = Resource(entry.source, '', lambda: None, '.yaml')
content = yaml.dump(entry.compile())
self._write_content_to_disk(resource, content)

1 change: 1 addition & 0 deletions bdfr/configuration.py
@@ -18,6 +18,7 @@ def __init__(self):
self.exclude_id_file = []
self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
self.folder_scheme: str = '{SUBREDDIT}'
self.include_id_file = []
self.limit: Optional[int] = None
self.link: list[str] = []
self.log: Optional[str] = None
21 changes: 14 additions & 7 deletions bdfr/connector.py
@@ -3,6 +3,7 @@

import configparser
import importlib.resources
import itertools
import logging
import logging.handlers
import re
@@ -78,7 +79,12 @@ def _setup_internal_objects(self):
self.create_reddit_instance()
self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user]))

self.excluded_submission_ids = self.read_excluded_ids()
self.excluded_submission_ids = set.union(
self.read_id_files(self.args.exclude_id_file),
set(self.args.exclude_id),
)

self.args.link = list(itertools.chain(self.args.link, self.read_id_files(self.args.include_id_file)))

self.master_hash_list = {}
self.authenticator = self.create_authenticator()
@@ -184,8 +190,9 @@ def load_config(self):
logger.debug(f'Loading configuration from {path}')
break
if not self.config_location:
self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0]
shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
with importlib.resources.path('bdfr', 'default_config.cfg') as path:
self.config_location = path
shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
if not self.config_location:
raise errors.BulkDownloaderException('Could not find a configuration file to load')
self.cfg_parser.read(self.config_location)
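
The replaced lookup reached into the private `.gen` attribute of the object returned by `importlib.resources.path`; the new hunk uses it as the context manager it is. A minimal sketch of the pattern, with an illustrative destination directory:

```python
import importlib.resources
import shutil
from pathlib import Path

# importlib.resources.path returns a context manager yielding a concrete
# filesystem path; the path is only guaranteed to exist inside the with
# block, so the copy happens there (the destination is illustrative)
destination = Path('/tmp/bdfr')
destination.mkdir(parents=True, exist_ok=True)
with importlib.resources.path('bdfr', 'default_config.cfg') as path:
    shutil.copy(path, destination / 'default_config.cfg')
```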
@@ -403,13 +410,13 @@ def check_subreddit_status(subreddit: praw.models.Subreddit):
except prawcore.Forbidden:
raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped')

def read_excluded_ids(self) -> set[str]:
@staticmethod
def read_id_files(file_locations: list[str]) -> set[str]:
out = []
out.extend(self.args.exclude_id)
for id_file in self.args.exclude_id_file:
for id_file in file_locations:
id_file = Path(id_file).resolve().expanduser()
if not id_file.exists():
logger.warning(f'ID exclusion file at {id_file} does not exist')
logger.warning(f'ID file at {id_file} does not exist')
continue
with open(id_file, 'r') as file:
for line in file:
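
A sketch of how the new helper composes with the ID options in `_setup_internal_objects` above (the `RedditConnector` class name and the file path are assumptions):

```python
# Hypothetical composition mirroring the hunk above: the exclusion set is
# the union of IDs read from --exclude-id-file files (one ID per line)
# and IDs passed directly via --exclude-id
excluded_submission_ids = set.union(
    RedditConnector.read_id_files(['./exclude_ids.txt']),
    {'abc123', 'def456'},
)
```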
2 changes: 1 addition & 1 deletion bdfr/downloader.py
@@ -82,7 +82,7 @@ def _download_submission(self, submission: praw.models.Submission):
logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
continue
try:
res.download(self.args.max_wait_time)
res.download({'max_wait_time': self.args.max_wait_time})
except errors.BulkDownloaderException as e:
logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
f'with downloader {downloader_class.__name__}: {e}')
56 changes: 34 additions & 22 deletions bdfr/resource.py
@@ -6,7 +6,7 @@
import re
import time
import urllib.parse
from typing import Optional
from typing import Callable, Optional

import _hashlib
import requests
@@ -18,40 +18,52 @@


class Resource:
def __init__(self, source_submission: Submission, url: str, extension: str = None):
def __init__(self, source_submission: Submission, url: str, download_function: Callable, extension: str = None):
self.source_submission = source_submission
self.content: Optional[bytes] = None
self.url = url
self.hash: Optional[_hashlib.HASH] = None
self.extension = extension
self.download_function = download_function
if not self.extension:
self.extension = self._determine_extension()

@staticmethod
def retry_download(url: str, max_wait_time: int, current_wait_time: int = 60) -> Optional[bytes]:
try:
response = requests.get(url)
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
return response.content
elif response.status_code in (408, 429):
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
else:
raise BulkDownloaderException(
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
logger.warning(f'Error occurred downloading from {url}, waiting {current_wait_time} seconds: {e}')
time.sleep(current_wait_time)
if current_wait_time < max_wait_time:
current_wait_time += 60
return Resource.retry_download(url, max_wait_time, current_wait_time)
def retry_download(url: str) -> Callable:
max_wait_time = 300

def http_download(download_parameters: dict) -> Optional[bytes]:
current_wait_time = 60
if 'max_wait_time' in download_parameters:
max_wait_time = download_parameters['max_wait_time']
else:
logger.error(f'Max wait time exceeded for resource at url {url}')
raise
max_wait_time = 300
while True:
try:
response = requests.get(url)
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
return response.content
elif response.status_code in (408, 429):
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
else:
raise BulkDownloaderException(
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
logger.warning(f'Error occurred downloading from {url}, waiting {current_wait_time} seconds: {e}')
time.sleep(current_wait_time)
if current_wait_time < max_wait_time:
current_wait_time += 60
else:
logger.error(f'Max wait time exceeded for resource at url {url}')
raise
return http_download

def download(self, max_wait_time: int):
def download(self, download_parameters: Optional[dict] = None):
if download_parameters is None:
download_parameters = {}
if not self.content:
try:
content = self.retry_download(self.url, max_wait_time)
content = self.download_function(download_parameters)
except requests.exceptions.ConnectionError as e:
raise BulkDownloaderException(f'Could not download resource: {e}')
except BulkDownloaderException:
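
Taken together: `retry_download` now returns a closure bound to the URL, and `download` accepts a parameter dict rather than a bare timeout. A usage sketch under the signatures above (the `submission` object and URL are illustrative):

```python
# Hypothetical caller: the download function is bound at construction
# time, and per-call settings travel in a dict
res = Resource(submission, 'https://example.com/image.jpg',
               Resource.retry_download('https://example.com/image.jpg'))
res.download({'max_wait_time': 300})  # retry waits grow by 60s, capped at max_wait_time
```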
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/direct.py
@@ -14,4 +14,4 @@ def __init__(self, post: Submission):
super().__init__(post)

def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
return [Resource(self.post, self.post.url)]
return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url))]
6 changes: 4 additions & 2 deletions bdfr/site_downloaders/download_factory.py
@@ -16,6 +16,7 @@
from bdfr.site_downloaders.pornhub import PornHub
from bdfr.site_downloaders.redgifs import Redgifs
from bdfr.site_downloaders.self_post import SelfPost
from bdfr.site_downloaders.vidble import Vidble
from bdfr.site_downloaders.youtube import Youtube


@@ -46,11 +47,12 @@ def pull_lever(url: str) -> Type[BaseDownloader]:
return Direct
elif re.match(r'pornhub\.com.*', sanitised_url):
return PornHub
elif re.match(r'vidble\.com', sanitised_url):
return Vidble
elif YoutubeDlFallback.can_handle_link(sanitised_url):
return YoutubeDlFallback
else:
raise NotADownloadableLinkError(
f'No downloader module exists for url {url}')
raise NotADownloadableLinkError(f'No downloader module exists for url {url}')

@staticmethod
def sanitise_url(url: str) -> str:
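
A hypothetical sanity check for the new Vidble mapping (the URL is illustrative, and it assumes `sanitise_url` strips the scheme so the `vidble\.com` pattern matches):

```python
from bdfr.site_downloaders.download_factory import DownloadFactory
from bdfr.site_downloaders.vidble import Vidble

# vidble.com links should now resolve to the new module instead of
# raising NotADownloadableLinkError
assert DownloadFactory.pull_lever('https://vidble.com/album/abc123') is Vidble
```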
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/erome.py
@@ -29,7 +29,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
for link in links:
if not re.match(r'https?://.*', link):
link = 'https://' + link
out.append(Resource(self.post, link))
out.append(Resource(self.post, link, Resource.retry_download(link)))
return out

@staticmethod
26 changes: 11 additions & 15 deletions bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py
@@ -4,7 +4,6 @@
import logging
from typing import Optional

import youtube_dl
from praw.models import Submission

from bdfr.resource import Resource
@@ -20,21 +19,18 @@ def __init__(self, post: Submission):
super(YoutubeDlFallback, self).__init__(post)

def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
out = super()._download_video({})
out = Resource(
self.post,
self.post.url,
super()._download_video({}),
super().get_video_attributes(self.post.url)['ext'],
)
return [out]

@staticmethod
def can_handle_link(url: str) -> bool:
yt_logger = logging.getLogger('youtube-dl')
yt_logger.setLevel(logging.CRITICAL)
with youtube_dl.YoutubeDL({
'logger': yt_logger,
}) as ydl:
try:
result = ydl.extract_info(url, download=False)
if result:
return True
except Exception as e:
logger.exception(e)
return False
return False
attributes = YoutubeDlFallback.get_video_attributes(url)
if attributes:
return True
else:
return False
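
`get_video_attributes` itself is not part of this diff; a sketch of what it presumably centralizes, reconstructed from the removed `can_handle_link` body above (an assumption, not the actual base-class code):

```python
import logging
import youtube_dl

def get_video_attributes(url: str) -> dict:
    # Reconstruction of the removed inline probe: silence youtube-dl's
    # logger, then ask it for metadata without downloading anything
    yt_logger = logging.getLogger('youtube-dl')
    yt_logger.setLevel(logging.CRITICAL)
    with youtube_dl.YoutubeDL({'logger': yt_logger}) as ydl:
        return ydl.extract_info(url, download=False)
```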
4 changes: 2 additions & 2 deletions bdfr/site_downloaders/gallery.py
@@ -21,7 +21,7 @@ def __init__(self, post: Submission):
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
try:
image_urls = self._get_links(self.post.gallery_data['items'])
except AttributeError:
except (AttributeError, TypeError):
try:
image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items'])
except (AttributeError, IndexError, TypeError):
@@ -31,7 +31,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:

if not image_urls:
raise SiteDownloaderError('No images found in Reddit gallery')
return [Resource(self.post, url) for url in image_urls]
return [Resource(self.post, url, Resource.retry_download(url)) for url in image_urls]

@staticmethod
def _get_links(id_dict: list[dict]) -> list[str]:
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/imgur.py
@@ -33,7 +33,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:

def _compute_image_url(self, image: dict) -> Resource:
image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext'])
return Resource(self.post, image_url)
return Resource(self.post, image_url, Resource.retry_download(image_url))

@staticmethod
def _get_data(link: str) -> dict:
7 changes: 6 additions & 1 deletion bdfr/site_downloaders/pornhub.py
@@ -22,5 +22,10 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
'format': 'best',
'nooverwrites': True,
}
out = self._download_video(ytdl_options)
out = Resource(
self.post,
self.post.url,
super()._download_video(ytdl_options),
super().get_video_attributes(self.post.url)['ext'],
)
return [out]
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/redgifs.py
@@ -18,7 +18,7 @@ def __init__(self, post: Submission):

def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
media_url = self._get_link(self.post.url)
return [Resource(self.post, media_url, '.mp4')]
return [Resource(self.post, media_url, Resource.retry_download(media_url), '.mp4')]

@staticmethod
def _get_link(url: str) -> str:
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/self_post.py
@@ -17,7 +17,7 @@ def __init__(self, post: Submission):
super().__init__(post)

def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
out = Resource(self.post, self.post.url, '.txt')
out = Resource(self.post, self.post.url, lambda: None, '.txt')
out.content = self.export_to_string().encode('utf-8')
out.create_hash()
return [out]
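
The `lambda: None` placeholder (also used in `bdfr/archiver.py` above) is safe because locally generated resources attach their content up front, and `Resource.download` only invokes the download function while `content` is unset. A sketch, with `post` standing in for a praw Submission:

```python
# Content is set immediately after construction, so the no-op download
# function is never actually called
out = Resource(post, post.url, lambda: None, '.txt')
out.content = 'selftext body'.encode('utf-8')
out.create_hash()
```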