Commit
Merge pull request #517 from aliparlakci/development
v2.4
aliparlakci committed Sep 12, 2021
2 parents 900f9a9 + 063caf0 commit afe3b71
Showing 33 changed files with 323 additions and 131 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -76,6 +76,10 @@ The following options are common between both the `archive` and `download` commands
- Can be specified multiple times
- Disables certain modules from being used
- See [Disabling Modules](#disabling-modules) for more information and a list of module names
- `--include-id-file`
  - This will add any submissions with IDs found in the files provided (see the usage example after this list)
- Can be specified multiple times
- Format is one ID per line
- `--log`
- This allows one to specify the location of the logfile
- This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below
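
For context, a hypothetical invocation of the new option (the path, filename, and IDs are illustrative; the command form follows the README's existing examples):

```bash
# ids.txt contains one submission ID per line, e.g.:
#   m3reby
#   p9uynl
python3 -m bdfr download ./output --include-id-file ./ids.txt
```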
11 changes: 6 additions & 5 deletions bdfr/__main__.py
@@ -6,9 +6,9 @@
import click

from bdfr.archiver import Archiver
from bdfr.cloner import RedditCloner
from bdfr.configuration import Configuration
from bdfr.downloader import RedditDownloader
from bdfr.cloner import RedditCloner

logger = logging.getLogger()

@@ -17,6 +17,7 @@
click.option('--authenticate', is_flag=True, default=None),
click.option('--config', type=str, default=None),
click.option('--disable-module', multiple=True, default=None, type=str),
click.option('--include-id-file', multiple=True, default=None),
click.option('--log', type=str, default=None),
click.option('--saved', is_flag=True, default=None),
click.option('--search', default=None, type=str),
@@ -26,12 +27,12 @@
click.option('-L', '--limit', default=None, type=int),
click.option('-l', '--link', multiple=True, default=None, type=str),
click.option('-m', '--multireddit', multiple=True, default=None, type=str),
click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', 'controversial', 'rising', 'relevance')),
default=None),
click.option('-s', '--subreddit', multiple=True, default=None, type=str),
click.option('-v', '--verbose', default=None, count=True),
click.option('-u', '--user', type=str, multiple=True, default=None),
click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None),
click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new',
'controversial', 'rising', 'relevance')), default=None),
click.option('-u', '--user', type=str, multiple=True, default=None),
click.option('-v', '--verbose', default=None, count=True),
]

_downloader_options = [
6 changes: 3 additions & 3 deletions bdfr/archiver.py
@@ -76,17 +76,17 @@ def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)):
logger.info(f'Record for entry item {praw_item.id} written to disk')

def _write_entry_json(self, entry: BaseArchiveEntry):
resource = Resource(entry.source, '', '.json')
resource = Resource(entry.source, '', lambda: None, '.json')
content = json.dumps(entry.compile())
self._write_content_to_disk(resource, content)

def _write_entry_xml(self, entry: BaseArchiveEntry):
resource = Resource(entry.source, '', '.xml')
resource = Resource(entry.source, '', lambda: None, '.xml')
content = dict2xml.dict2xml(entry.compile(), wrap='root')
self._write_content_to_disk(resource, content)

def _write_entry_yaml(self, entry: BaseArchiveEntry):
resource = Resource(entry.source, '', '.yaml')
resource = Resource(entry.source, '', lambda: None, '.yaml')
content = yaml.dump(entry.compile())
self._write_content_to_disk(resource, content)

1 change: 1 addition & 0 deletions bdfr/configuration.py
@@ -18,6 +18,7 @@ def __init__(self):
self.exclude_id_file = []
self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
self.folder_scheme: str = '{SUBREDDIT}'
self.include_id_file = []
self.limit: Optional[int] = None
self.link: list[str] = []
self.log: Optional[str] = None
21 changes: 14 additions & 7 deletions bdfr/connector.py
@@ -3,6 +3,7 @@

import configparser
import importlib.resources
import itertools
import logging
import logging.handlers
import re
@@ -78,7 +79,12 @@ def _setup_internal_objects(self):
self.create_reddit_instance()
self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user]))

self.excluded_submission_ids = self.read_excluded_ids()
self.excluded_submission_ids = set.union(
self.read_id_files(self.args.exclude_id_file),
set(self.args.exclude_id),
)

self.args.link = list(itertools.chain(self.args.link, self.read_id_files(self.args.include_id_file)))

self.master_hash_list = {}
self.authenticator = self.create_authenticator()
@@ -184,8 +190,9 @@ def load_config(self):
logger.debug(f'Loading configuration from {path}')
break
if not self.config_location:
self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0]
shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
with importlib.resources.path('bdfr', 'default_config.cfg') as path:
self.config_location = path
shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
if not self.config_location:
raise errors.BulkDownloaderException('Could not find a configuration file to load')
self.cfg_parser.read(self.config_location)
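
The replaced lookup reached into the private `.gen` attribute of the object returned by `importlib.resources.path`; the new hunk uses it as the context manager it is. A minimal sketch of the pattern, with an illustrative destination directory:

```python
import importlib.resources
import shutil
from pathlib import Path

# importlib.resources.path returns a context manager yielding a concrete
# filesystem path; the path is only guaranteed to exist inside the with
# block, so the copy happens there (the destination is illustrative)
destination = Path('/tmp/bdfr')
destination.mkdir(parents=True, exist_ok=True)
with importlib.resources.path('bdfr', 'default_config.cfg') as path:
    shutil.copy(path, destination / 'default_config.cfg')
```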
@@ -403,13 +410,13 @@ def check_subreddit_status(subreddit: praw.models.Subreddit):
except prawcore.Forbidden:
raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped')

def read_excluded_ids(self) -> set[str]:
@staticmethod
def read_id_files(file_locations: list[str]) -> set[str]:
out = []
out.extend(self.args.exclude_id)
for id_file in self.args.exclude_id_file:
for id_file in file_locations:
id_file = Path(id_file).resolve().expanduser()
if not id_file.exists():
logger.warning(f'ID exclusion file at {id_file} does not exist')
logger.warning(f'ID file at {id_file} does not exist')
continue
with open(id_file, 'r') as file:
for line in file:
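
A sketch of how the new helper composes with the ID options in `_setup_internal_objects` above (the `RedditConnector` class name and the file path are assumptions):

```python
# Hypothetical composition mirroring the hunk above: the exclusion set is
# the union of IDs read from --exclude-id-file files (one ID per line)
# and IDs passed directly via --exclude-id
excluded_submission_ids = set.union(
    RedditConnector.read_id_files(['./exclude_ids.txt']),
    {'abc123', 'def456'},
)
```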
2 changes: 1 addition & 1 deletion bdfr/downloader.py
@@ -82,7 +82,7 @@ def _download_submission(self, submission: praw.models.Submission):
logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
continue
try:
res.download(self.args.max_wait_time)
res.download({'max_wait_time': self.args.max_wait_time})
except errors.BulkDownloaderException as e:
logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
f'with downloader {downloader_class.__name__}: {e}')
56 changes: 34 additions & 22 deletions bdfr/resource.py
@@ -6,7 +6,7 @@
import re
import time
import urllib.parse
from typing import Optional
from typing import Callable, Optional

import _hashlib
import requests
@@ -18,40 +18,52 @@


class Resource:
def __init__(self, source_submission: Submission, url: str, extension: str = None):
def __init__(self, source_submission: Submission, url: str, download_function: Callable, extension: str = None):
self.source_submission = source_submission
self.content: Optional[bytes] = None
self.url = url
self.hash: Optional[_hashlib.HASH] = None
self.extension = extension
self.download_function = download_function
if not self.extension:
self.extension = self._determine_extension()

@staticmethod
def retry_download(url: str, max_wait_time: int, current_wait_time: int = 60) -> Optional[bytes]:
try:
response = requests.get(url)
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
return response.content
elif response.status_code in (408, 429):
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
else:
raise BulkDownloaderException(
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
logger.warning(f'Error occurred downloading from {url}, waiting {current_wait_time} seconds: {e}')
time.sleep(current_wait_time)
if current_wait_time < max_wait_time:
current_wait_time += 60
return Resource.retry_download(url, max_wait_time, current_wait_time)
def retry_download(url: str) -> Callable:
max_wait_time = 300

def http_download(download_parameters: dict) -> Optional[bytes]:
current_wait_time = 60
if 'max_wait_time' in download_parameters:
max_wait_time = download_parameters['max_wait_time']
else:
logger.error(f'Max wait time exceeded for resource at url {url}')
raise
max_wait_time = 300
while True:
try:
response = requests.get(url)
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
return response.content
elif response.status_code in (408, 429):
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
else:
raise BulkDownloaderException(
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
logger.warning(f'Error occurred downloading from {url}, waiting {current_wait_time} seconds: {e}')
time.sleep(current_wait_time)
if current_wait_time < max_wait_time:
current_wait_time += 60
else:
logger.error(f'Max wait time exceeded for resource at url {url}')
raise
return http_download

def download(self, max_wait_time: int):
def download(self, download_parameters: Optional[dict] = None):
if download_parameters is None:
download_parameters = {}
if not self.content:
try:
content = self.retry_download(self.url, max_wait_time)
content = self.download_function(download_parameters)
except requests.exceptions.ConnectionError as e:
raise BulkDownloaderException(f'Could not download resource: {e}')
except BulkDownloaderException:
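
Taken together: `retry_download` now returns a closure bound to the URL, and `download` accepts a parameter dict rather than a bare timeout. A usage sketch under the signatures above (the `submission` object and URL are illustrative):

```python
# Hypothetical caller: the download function is bound at construction
# time, and per-call settings travel in a dict
res = Resource(submission, 'https://example.com/image.jpg',
               Resource.retry_download('https://example.com/image.jpg'))
res.download({'max_wait_time': 300})  # retry waits grow by 60s, capped at max_wait_time
```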
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/direct.py
@@ -14,4 +14,4 @@ def __init__(self, post: Submission):
super().__init__(post)

def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
return [Resource(self.post, self.post.url)]
return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url))]
6 changes: 4 additions & 2 deletions bdfr/site_downloaders/download_factory.py
@@ -16,6 +16,7 @@
from bdfr.site_downloaders.pornhub import PornHub
from bdfr.site_downloaders.redgifs import Redgifs
from bdfr.site_downloaders.self_post import SelfPost
from bdfr.site_downloaders.vidble import Vidble
from bdfr.site_downloaders.youtube import Youtube


@@ -46,11 +47,12 @@ def pull_lever(url: str) -> Type[BaseDownloader]:
return Direct
elif re.match(r'pornhub\.com.*', sanitised_url):
return PornHub
elif re.match(r'vidble\.com', sanitised_url):
return Vidble
elif YoutubeDlFallback.can_handle_link(sanitised_url):
return YoutubeDlFallback
else:
raise NotADownloadableLinkError(
f'No downloader module exists for url {url}')
raise NotADownloadableLinkError(f'No downloader module exists for url {url}')

@staticmethod
def sanitise_url(url: str) -> str:
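
A hypothetical sanity check for the new Vidble mapping (the URL is illustrative, and it assumes `sanitise_url` strips the scheme so the `vidble\.com` pattern matches):

```python
from bdfr.site_downloaders.download_factory import DownloadFactory
from bdfr.site_downloaders.vidble import Vidble

# vidble.com links should now resolve to the new module instead of
# raising NotADownloadableLinkError
assert DownloadFactory.pull_lever('https://vidble.com/album/abc123') is Vidble
```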
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/erome.py
@@ -29,7 +29,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
for link in links:
if not re.match(r'https?://.*', link):
link = 'https://' + link
out.append(Resource(self.post, link))
out.append(Resource(self.post, link, Resource.retry_download(link)))
return out

@staticmethod
26 changes: 11 additions & 15 deletions bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py
@@ -4,7 +4,6 @@
import logging
from typing import Optional

import youtube_dl
from praw.models import Submission

from bdfr.resource import Resource
@@ -20,21 +19,18 @@ def __init__(self, post: Submission):
super(YoutubeDlFallback, self).__init__(post)

def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
out = super()._download_video({})
out = Resource(
self.post,
self.post.url,
super()._download_video({}),
super().get_video_attributes(self.post.url)['ext'],
)
return [out]

@staticmethod
def can_handle_link(url: str) -> bool:
yt_logger = logging.getLogger('youtube-dl')
yt_logger.setLevel(logging.CRITICAL)
with youtube_dl.YoutubeDL({
'logger': yt_logger,
}) as ydl:
try:
result = ydl.extract_info(url, download=False)
if result:
return True
except Exception as e:
logger.exception(e)
return False
return False
attributes = YoutubeDlFallback.get_video_attributes(url)
if attributes:
return True
else:
return False
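
`get_video_attributes` itself is not part of this diff; a sketch of what it presumably centralizes, reconstructed from the removed `can_handle_link` body above (an assumption, not the actual base-class code):

```python
import logging
import youtube_dl

def get_video_attributes(url: str) -> dict:
    # Reconstruction of the removed inline probe: silence youtube-dl's
    # logger, then ask it for metadata without downloading anything
    yt_logger = logging.getLogger('youtube-dl')
    yt_logger.setLevel(logging.CRITICAL)
    with youtube_dl.YoutubeDL({'logger': yt_logger}) as ydl:
        return ydl.extract_info(url, download=False)
```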
4 changes: 2 additions & 2 deletions bdfr/site_downloaders/gallery.py
@@ -21,7 +21,7 @@ def __init__(self, post: Submission):
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
try:
image_urls = self._get_links(self.post.gallery_data['items'])
except AttributeError:
except (AttributeError, TypeError):
try:
image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items'])
except (AttributeError, IndexError, TypeError):
@@ -31,7 +31,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:

if not image_urls:
raise SiteDownloaderError('No images found in Reddit gallery')
return [Resource(self.post, url) for url in image_urls]
return [Resource(self.post, url, Resource.retry_download(url)) for url in image_urls]

@staticmethod
def _get_links(id_dict: list[dict]) -> list[str]:
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/imgur.py
@@ -33,7 +33,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:

def _compute_image_url(self, image: dict) -> Resource:
image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext'])
return Resource(self.post, image_url)
return Resource(self.post, image_url, Resource.retry_download(image_url))

@staticmethod
def _get_data(link: str) -> dict:
7 changes: 6 additions & 1 deletion bdfr/site_downloaders/pornhub.py
@@ -22,5 +22,10 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
'format': 'best',
'nooverwrites': True,
}
out = self._download_video(ytdl_options)
out = Resource(
self.post,
self.post.url,
super()._download_video(ytdl_options),
super().get_video_attributes(self.post.url)['ext'],
)
return [out]
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/redgifs.py
@@ -18,7 +18,7 @@ def __init__(self, post: Submission):

def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
media_url = self._get_link(self.post.url)
return [Resource(self.post, media_url, '.mp4')]
return [Resource(self.post, media_url, Resource.retry_download(media_url), '.mp4')]

@staticmethod
def _get_link(url: str) -> str:
2 changes: 1 addition & 1 deletion bdfr/site_downloaders/self_post.py
@@ -17,7 +17,7 @@ def __init__(self, post: Submission):
super().__init__(post)

def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
out = Resource(self.post, self.post.url, '.txt')
out = Resource(self.post, self.post.url, lambda: None, '.txt')
out.content = self.export_to_string().encode('utf-8')
out.create_hash()
return [out]
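
The `lambda: None` placeholder (also used in `bdfr/archiver.py` above) is safe because locally generated resources attach their content up front, and `Resource.download` only invokes the download function while `content` is unset. A sketch, with `post` standing in for a praw Submission:

```python
# Content is set immediately after construction, so the no-op download
# function is never actually called
out = Resource(post, post.url, lambda: None, '.txt')
out.content = 'selftext body'.encode('utf-8')
out.create_hash()
```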