From c1916f3102329f548f2e9624e2f4a422d786662b Mon Sep 17 00:00:00 2001
From: Cebtenzzre <cebtenzzre@gmail.com>
Date: Mon, 21 Sep 2020 00:36:24 -0400
Subject: [PATCH] tumblr_backup: --continue and related new behavior

Included revisions:
- Allow setting options in BACKUP_CHANGING_OPTIONS to match the backup
- Fix incorrectly sorted imports
- Use lists in a few places for consistency with JSON
- Set before to oldest_tstamp even without --period
- Fix argument check to allow --auto 0
- Pass all path parts to open_text in save_post to ensure mkdir

Fixes bbolli/tumblr-utils#115
---
 tumblr_backup.py | 324 ++++++++++++++++++++++++++++++++++-------------
 util.py          |  17 +++
 wget.py          |  22 +---
 3 files changed, 254 insertions(+), 109 deletions(-)

diff --git a/tumblr_backup.py b/tumblr_backup.py
index 4ffd6da..be46341 100755
--- a/tumblr_backup.py
+++ b/tumblr_backup.py
@@ -4,6 +4,7 @@
 from __future__ import absolute_import, division, print_function, with_statement
 
 # standard Python library imports
+import contextlib
 import errno
 import hashlib
 import imghdr
@@ -23,11 +24,12 @@
 from glob import glob
 from os.path import join, split, splitext
 from posixpath import basename as urlbasename, join as urlpathjoin, splitext as urlsplitext
+from tempfile import NamedTemporaryFile
 from xml.sax.saxutils import escape
 
 from util import (AsyncCallable, ConnectionFile, LockedQueue, MultiCondition, PY3, disable_unraisable_hook,
-                  is_dns_working, make_requests_session, no_internet, nullcontext, path_is_on_vfat, to_bytes,
-                  to_unicode)
+                  is_dns_working, make_requests_session, no_internet, nullcontext, opendir, path_is_on_vfat, to_bytes,
+                  to_unicode, try_unlink)
 from wget import HTTPError, HTTP_RETRY, HTTP_TIMEOUT, WGError, WgetRetrieveWrapper, setup_wget, urlopen
 
 try:
@@ -160,6 +162,11 @@ def test_jpg(h, f):
 FILE_ENCODING = 'utf-8'
 TIME_ENCODING = locale.getlocale(locale.LC_TIME)[1] or FILE_ENCODING
 
+MUST_MATCH_OPTIONS = ('dirs', 'likes', 'blosxom', 'hostdirs', 'image_names')  # must equal the existing backup's
+BACKUP_CHANGING_OPTIONS = (  # restored from .first_run_options by --continue
+    'save_images', 'save_video', 'save_video_tumblr', 'save_audio', 'save_notes', 'copy_notes', 'notes_limit', 'json',
+    'count', 'skip', 'period', 'request', 'filter', 'no_reblog', 'exif', 'prev_archives')
+
 main_thread_lock = threading.RLock()
 multicond = MultiCondition(main_thread_lock)
 disable_note_scraper = set()  # type: Set[str]
@@ -167,6 +174,15 @@ def test_jpg(h, f):
 prev_resps = None  # type: Optional[Tuple[str, ...]]
 
 
+def load_bs4(reason):  # reason: verb phrase used in the error message, e.g. 'save notes'
+    sys.modules['soupsieve'] = ()  # type: ignore[assignment]  # NOTE(review): presumably stubs out soupsieve
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        raise RuntimeError("Cannot {} without module 'bs4'".format(reason))
+    return BeautifulSoup  # the class itself, imported on demand by each caller
+
+
 class Logger(object):
     def __init__(self):
         self.lock = threading.Lock()
@@ -228,10 +244,32 @@ def open_file(open_fn, parts):
     return open_fn(path_to(*parts))
 
 
+@contextlib.contextmanager
 def open_text(*parts):
-    return open_file(
-        lambda f: io.open(f, 'w', encoding=FILE_ENCODING, errors='xmlcharrefreplace'), parts
-    )
+    """Atomically write a text file: yield a temp file next to the target, then rename it over the target."""
+    dest_path = open_file(lambda f: f, parts)
+    dest_dirname, dest_basename = split(dest_path)
+    partf = NamedTemporaryFile('w', prefix='.{}.'.format(dest_basename), dir=dest_dirname, delete=False)
+    try:
+        with partf:
+            with io.open(partf.fileno(), 'w', encoding=FILE_ENCODING, errors='xmlcharrefreplace', closefd=False) as f:
+                yield f
+            # NamedTemporaryFile is created 0600, set mode to the usual 0644 (os.fchmod may not exist off Unix)
+            if hasattr(os, 'fchmod'):
+                os.fchmod(partf.fileno(), 0o644)
+            # Flush buffers and sync the inode
+            partf.flush()
+            os.fsync(partf)  # type: ignore
+        # Move to final destination
+        if PY3:
+            os.replace(partf.name, dest_path)
+        else:
+            if os.name == 'nt':
+                try_unlink(dest_path)  # Avoid potential FileExistsError
+            os.rename(partf.name, dest_path)
+    except BaseException:
+        try_unlink(partf.name)  # Never leave a stray temp file behind when writing or renaming fails
+        raise
 
 
 def strftime(fmt, t=None):
@@ -252,24 +290,25 @@ def get_api_url(account):
     )
 
 
-def set_period():
+def set_period(period):  # period: 'YYYY', 'YYYYMM' or 'YYYYMMDD' (PeriodCallback strips dashes first)
     """Prepare the period start and end timestamps"""
     i = 0
-    tm = [int(options.period[:4]), 1, 1, 0, 0, 0, 0, 0, -1]
-    if len(options.period) >= 6:
+    tm = [int(period[:4]), 1, 1, 0, 0, 0, 0, 0, -1]  # time-tuple fields; i indexes the one the period spans
+    if len(period) >= 6:
         i = 1
-        tm[1] = int(options.period[4:6])
-    if len(options.period) == 8:
+        tm[1] = int(period[4:6])
+    if len(period) == 8:
         i = 2
-        tm[2] = int(options.period[6:8])
+        tm[2] = int(period[6:8])
 
     def mktime(tml):
         tmt = tuple(tml)  # type: Any
         return time.mktime(tmt)
 
-    options.p_start = int(mktime(tm))
+    p_start = int(mktime(tm))  # first second of the period, as a local-time epoch value
     tm[i] += 1
-    options.p_stop = int(mktime(tm))
+    p_stop = int(mktime(tm))  # start of the following year/month/day
+    return [p_start, p_stop]  # returned to the caller instead of being stored on options
 
 
 class ApiParser(object):
@@ -565,6 +604,18 @@ def dup(fd): return fd
         return True  # Either we copied it or we didn't need to
 
 
+def check_optional_modules():
+    """Raise RuntimeError if an enabled option depends on an optional module that is missing."""
+    if options.exif and pyexiv2 is None:
+        raise RuntimeError("--exif: module 'pyexiv2' is not installed")
+    if options.exif and not hasattr(pyexiv2, 'ImageMetadata'):
+        raise RuntimeError("--exif: module 'pyexiv2' is missing features, perhaps you need 'py3exiv2'?")
+    if pyjq is None and options.filter is not None:
+        raise RuntimeError("--filter: module 'pyjq' is not installed")
+    if scandir is None and options.prev_archives:
+        raise RuntimeError("--prev-archives: Python is less than 3.5 and module 'scandir' is not installed")
+
+
 class Index(object):
     def __init__(self, blog, body_class='index'):
         self.blog = blog
@@ -650,7 +701,8 @@ def next_month(inc):
 
             archive.append(self.blog.footer(base, pp, np))
 
-            arch.write('\n'.join(archive))
+            with arch as archf:
+                archf.write('\n'.join(archive))
 
         assert first_file is not None
         return first_file
@@ -758,7 +810,8 @@ def footer(base, previous_page, next_page):
         return f
 
     @staticmethod
-    def get_post_timestamps(posts):
+    def get_post_timestamps(posts, reason):
+        BeautifulSoup = load_bs4(reason)
         for post in posts:
             with io.open(post, encoding=FILE_ENCODING) as pf:
                 soup = BeautifulSoup(pf, 'lxml')
@@ -767,6 +820,90 @@ def get_post_timestamps(posts):
             # No datetime.fromisoformat or datetime.timestamp on Python 2
             yield (datetime.strptime(postdate, '%Y-%m-%dT%H:%M:%SZ') - datetime(1970, 1, 1)) // timedelta(seconds=1)
 
+    @classmethod
+    def process_existing_backup(cls, account, prev_archive):
+        """Check options against any existing backup; return the oldest saved post timestamp if resuming, else None."""
+        complete_backup = os.path.exists(path_to('.complete'))
+        if options.resume and complete_backup:
+            raise RuntimeError('{}: Cannot continue complete backup'.format(account))
+        try:
+            with io.open(path_to('.first_run_options'), encoding=FILE_ENCODING) as f:
+                first_run_options = json.load(f)
+        except EnvironmentError as e:
+            if getattr(e, 'errno', None) != errno.ENOENT:
+                raise
+            first_run_options = None
+
+        class Options(object):
+            def __init__(self, fro): self.fro = fro
+            def differs(self, opt): return opt not in self.fro or orig_options[opt] != self.fro[opt]
+            def first(self, opts): return {opt: self.fro.get(opt, '<not present>') for opt in opts}
+            def this(self, opts): return {opt: orig_options[opt] for opt in opts}
+
+        # These options must always match
+        if first_run_options is not None:
+            opts = Options(first_run_options)
+            mustmatchdiff = tuple(filter(opts.differs, MUST_MATCH_OPTIONS))
+            if mustmatchdiff:
+                raise RuntimeError('{}: The script was given {} but the existing backup was made with {}'.format(
+                    account, opts.this(mustmatchdiff), opts.first(mustmatchdiff)))
+
+            backdiff = tuple(filter(opts.differs, BACKUP_CHANGING_OPTIONS))
+            if options.resume:
+                backdiff_nondef = tuple(opt for opt in backdiff if orig_options[opt] != parser.get_default(opt))
+                if backdiff_nondef:
+                    raise RuntimeError('{}: The script was given {} but the existing backup was made with {}'.format(
+                        account, opts.this(backdiff_nondef), opts.first(backdiff_nondef)))
+            elif complete_backup:
+                pass  # Complete archives may be added to with different options
+            elif not backdiff:
+                raise RuntimeError('{}: Found incomplete archive, try --continue'.format(account))
+            elif not options.ignore_resume:
+                raise RuntimeError('{}: Refusing to make a different backup (with {} instead of {}) over an incomplete '
+                                   'archive. Delete the old backup to start fresh, or skip this check with '
+                                   '--continue=ignore.'.format(account, opts.this(backdiff), opts.first(backdiff)))
+
+        if prev_archive is not None:
+            try:
+                with io.open(join(prev_archive, '.first_run_options'), encoding=FILE_ENCODING) as f:
+                    pa_first_run_options = json.load(f)
+            except EnvironmentError as e:
+                if getattr(e, 'errno', None) != errno.ENOENT:
+                    raise
+                pa_first_run_options = None
+
+            # These options must always match
+            if pa_first_run_options is not None:
+                pa_opts = Options(pa_first_run_options)
+                mustmatchdiff = tuple(filter(pa_opts.differs, MUST_MATCH_OPTIONS))
+                if mustmatchdiff:
+                    raise RuntimeError('{}: The script was given {} but the previous archive was made with {}'.format(
+                        account, pa_opts.this(mustmatchdiff), pa_opts.first(mustmatchdiff)))
+
+        oldest_tstamp = None
+        if not complete_backup:
+            # Read every post to find the oldest timestamp we've saved.
+            filter_ = join('*', dir_index) if options.dirs else '*' + post_ext
+            post_glob = glob(path_to(post_dir, filter_))
+            if options.resume and post_glob:
+                log('Found incomplete backup. Finding oldest post (may take a while)\n', account=True)
+                oldest_tstamp = min(cls.get_post_timestamps(post_glob, 'continue incomplete backup'))
+
+        if first_run_options is not None and options.resume:
+            # Load saved options; any the saved file lacks were checked above to be at their defaults
+            for opt in BACKUP_CHANGING_OPTIONS:
+                setattr(options, opt, first_run_options.get(opt, orig_options[opt]))
+        else:
+            # Load original options
+            for opt in BACKUP_CHANGING_OPTIONS:
+                setattr(options, opt, orig_options[opt])
+            if first_run_options is None and not (complete_backup or post_glob):
+                # Presumably this is the initial backup of this blog
+                with open_text('.first_run_options') as f:
+                    f.write(to_unicode(json.dumps(orig_options)))
+
+        return oldest_tstamp
+
     def backup(self, account, prev_archive):
         """makes single files and an index for every post on a public Tumblr blog account"""
 
@@ -794,6 +931,9 @@ def backup(self, account, prev_archive):
         self.post_count = 0
         self.filter_skipped = 0
 
+        oldest_tstamp = self.process_existing_backup(account, prev_archive)
+        check_optional_modules()
+
         # get the highest post id already saved
         ident_max = None
         if options.incremental:
@@ -803,10 +943,8 @@ def backup(self, account, prev_archive):
                 pass  # No posts to read
             elif options.likes:
                 # Read every post to find the newest timestamp we've saved.
-                if BeautifulSoup is None:
-                    raise RuntimeError("Incremental likes backup: module 'bs4' is not installed")
                 log('Finding newest liked post (may take a while)\n', account=True)
-                ident_max = max(self.get_post_timestamps(post_glob))
+                ident_max = max(self.get_post_timestamps(post_glob, 'backup likes incrementally'))
             else:
                 ident_max = max(long(splitext(split(f)[1])[0]) for f in post_glob)
             if ident_max is not None:
@@ -844,6 +982,9 @@ def backup(self, account, prev_archive):
         # use the meta information to create a HTML header
         TumblrPost.post_header = self.header(body_class='post')
 
+        jq_filter = None if options.filter is None else pyjq.compile(options.filter)  # pytype: disable=attribute-error
+        request_sets = None if options.request is None else {typ: set(tags) for typ, tags in options.request.items()}
+
         # start the thread pool
         backup_pool = ThreadPool()
 
@@ -860,17 +1001,17 @@ def sort_key(x): return x[0]['liked_timestamp'] if options.likes else long(x[0][
                     log('Stopping backup: Incremental backup complete\n', account=True)
                     return False
                 if options.period:
-                    if post.date >= options.p_stop:
+                    if post.date >= options.period[1]:
                         raise RuntimeError('Found post with date ({}) older than before param ({})'.format(
-                            post.date, options.p_stop))
-                    if post.date < options.p_start:
+                            post.date, options.period[1]))
+                    if post.date < options.period[0]:
                         log('Stopping backup: Reached end of period\n', account=True)
                         return False
-                if options.request:
-                    if post.typ not in options.request:
+                if request_sets:
+                    if post.typ not in request_sets:
                         continue
-                    tags = options.request[post.typ]
-                    if not (TAG_ANY in tags or tags & post.tags_lower):
+                    tags = request_sets[post.typ]
+                    if not (TAG_ANY in tags or tags & {t.lower() for t in post.tags}):
                         continue
                 if options.no_reblog:
                     if 'reblogged_from_name' in p or 'reblogged_root_name' in p:
@@ -882,7 +1023,7 @@ def sort_key(x): return x[0]['liked_timestamp'] if options.likes else long(x[0][
                         continue
                 if os.path.exists(path_to(*post.get_path())) and options.no_post_clobber:
                     continue  # Post exists and no-clobber enabled
-                if options.filter and not options.filter.first(p):
+                if jq_filter and not jq_filter.first(p):
                     self.filter_skipped += 1
                     continue
 
@@ -903,7 +1044,9 @@ def sort_key(x): return x[0]['liked_timestamp'] if options.likes else long(x[0][
             # Get the JSON entries from the API, which we can only do for MAX_POSTS posts at once.
             # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
             i = options.skip
-            before = options.p_stop if options.period else None
+            before = options.period[1] if options.period else None
+            if oldest_tstamp is not None:
+                before = oldest_tstamp if before is None else min(before, oldest_tstamp)
 
             while True:
                 # find the upper bound
@@ -964,6 +1107,17 @@ def sort_key(x): return x[0]['liked_timestamp'] if options.likes else long(x[0][
             ix.build_index()
             ix.save_index()
 
+        if not os.path.exists(path_to('.complete')):
+            # Make .complete file
+            sf = opendir(save_folder, os.O_RDONLY)
+            try:
+                os.fdatasync(sf)
+                with io.open(open_file(lambda f: f, ('.complete',)), 'wb') as f:
+                    os.fsync(f)  # type: ignore
+                os.fdatasync(sf)
+            finally:
+                os.close(sf)
+
         log.status(None)
         skipped_msg = (', {} did not match filter'.format(self.filter_skipped)) if self.filter_skipped else ''
         log(
@@ -992,7 +1146,7 @@ def __init__(self, post, backup_account, respfile, prev_archive):
         self.isodate = datetime.utcfromtimestamp(self.date).isoformat() + 'Z'
         self.tm = time.localtime(self.date)
         self.title = u''
-        self.tags = post['tags']
+        self.tags = post['tags']  # type: Text
         self.note_count = post.get('note_count')
         if self.note_count is None:
             self.note_count = post.get('notes', {}).get('count')
@@ -1002,9 +1156,6 @@ def __init__(self, post, backup_account, respfile, prev_archive):
         self.reblogged_root = post.get('reblogged_root_url')
         self.source_title = post.get('source_title', '')
         self.source_url = post.get('source_url', '')
-        self.tags_lower = None  # type: Optional[Set[str]]
-        if options.request:
-            self.tags_lower = {t.lower() for t in self.tags}
         self.file_name = join(self.ident, dir_index) if options.dirs else self.ident + post_ext
         self.llink = self.ident if options.dirs else self.file_name
         self.media_dir = join(post_dir, self.ident) if options.dirs else media_dir
@@ -1155,6 +1306,11 @@ def get_youtube_url(self, youtube_url):
         }
         if options.cookiefile is not None:
             ydl_options['cookiefile'] = options.cookiefile
+        try:
+            import youtube_dl
+            from youtube_dl.utils import sanitize_filename
+        except ImportError:
+            raise RuntimeError("--save-video: module 'youtube_dl' is not installed")
         ydl = youtube_dl.YoutubeDL(ydl_options)
         ydl.add_default_info_extractors()
         try:
@@ -1310,6 +1466,9 @@ def get_post(self):
 
         notes_html = u''
 
+        if options.save_notes or options.copy_notes:
+            BeautifulSoup = load_bs4('save notes' if options.save_notes else 'copy notes')
+
         if options.copy_notes:
             # Copy notes from prev_archive
             with io.open(join(self.prev_archive, post_dir, self.ident + post_ext)) as post_file:
@@ -1319,6 +1478,8 @@ def get_post(self):
                 notes_html = u''.join([n.prettify() for n in notes.find_all('li')])
 
         if options.save_notes and self.backup_account not in disable_note_scraper and not notes_html.strip():
+            import note_scraper
+
             # Scrape and save notes
             while True:
                 ns_stdout_rd, ns_stdout_wr = multiprocessing.Pipe(duplex=False)
@@ -1392,9 +1553,10 @@ def get_path(self):
 
     def save_post(self):
         """saves this post locally"""
-        with open_text(*self.get_path()) as f:
+        path_parts = self.get_path()
+        with open_text(*path_parts) as f:
             f.write(self.get_post())
-        os.utime(f.name, (self.date, self.date))
+        os.utime(path_to(*path_parts), (self.date, self.date))
         if options.json:
             with open_text(json_dir, self.ident + '.json') as f:
                 f.write(self.get_json_content())
@@ -1568,10 +1730,6 @@ def handle_term_signal(signum, frame):
     import argparse
 
     class CSVCallback(argparse.Action):
-        def __call__(self, parser, namespace, values, option_string=None):
-            setattr(namespace, self.dest, set(values.split(',')))
-
-    class CSVListCallback(argparse.Action):
         def __call__(self, parser, namespace, values, option_string=None):
             setattr(namespace, self.dest, list(values.split(',')))
 
@@ -1584,10 +1742,12 @@ def __call__(self, parser, namespace, values, option_string=None):
                 if typ != TYPE_ANY and typ not in POST_TYPES:
                     parser.error("{}: invalid post type '{}'".format(option_string, typ))
                 for typ in POST_TYPES if typ == TYPE_ANY else (typ,):
-                    if parts:
-                        request[typ] = request.get(typ, set()).union(parts)
-                    else:
-                        request[typ] = {TAG_ANY}
+                    if not parts:
+                        request[typ] = [TAG_ANY]
+                        continue
+                    if typ not in request:
+                        request[typ] = []
+                    request[typ].extend(parts)
             setattr(namespace, self.dest, request)
 
     class TagsCallback(RequestCallback):
@@ -1596,6 +1756,18 @@ def __call__(self, parser, namespace, values, option_string=None):
                 parser, namespace, TYPE_ANY + ':' + values.replace(',', ':'), option_string,
             )
 
+    class PeriodCallback(argparse.Action):
+        """Parse --period: 'y'/'m'/'d' select the current year/month/day, otherwise YYYY[MM[DD]] with optional dashes."""
+        def __call__(self, parser, namespace, values, option_string=None):
+            pformat = {'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}.get(values)
+            if pformat is not None:
+                period = time.strftime(pformat)
+            else:
+                period = values.replace('-', '')
+                if not re.match(r'^\d{4}(\d\d)?(\d\d)?$', period):
+                    parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]")
+            setattr(namespace, self.dest, set_period(period))
+
     parser = argparse.ArgumentParser(usage='%(prog)s [options] blog-name ...',
                                      description='Makes a local backup of Tumblr blogs.')
     parser.add_argument('-O', '--outdir', help='set the output directory (default: blog-name)')
@@ -1623,7 +1795,8 @@ def __call__(self, parser, namespace, values, option_string=None):
                              ' (useful for cron jobs)')
     parser.add_argument('-n', '--count', type=int, help='save only COUNT posts')
     parser.add_argument('-s', '--skip', type=int, default=0, help='skip the first SKIP posts')
-    parser.add_argument('-p', '--period', help="limit the backup to PERIOD ('y', 'm', 'd' or YYYY[MM[DD]])")
+    parser.add_argument('-p', '--period', action=PeriodCallback,
+                        help="limit the backup to PERIOD ('y', 'm', 'd' or YYYY[MM[DD]])")
     parser.add_argument('-N', '--posts-per-page', type=int, default=50, metavar='COUNT',
                         help='set the number of posts per monthly page, 0 for unlimited')
     parser.add_argument('-Q', '--request', action=RequestCallback,
@@ -1640,11 +1813,11 @@ def __call__(self, parser, namespace, values, option_string=None):
     parser.add_argument('--no-reblog', action='store_true', help="don't save reblogged posts")
     parser.add_argument('-I', '--image-names', choices=('o', 'i', 'bi'), default='o', metavar='FMT',
                         help="image filename format ('o'=original, 'i'=<post-id>, 'bi'=<blog-name>_<post-id>)")
-    parser.add_argument('-e', '--exif', action=CSVCallback, default=set(), metavar='KW',
+    parser.add_argument('-e', '--exif', action=CSVCallback, default=[], metavar='KW',
                         help='add EXIF keyword tags to each picture'
                              " (comma-separated values; '-' to remove all tags, '' to add no extra tags)")
     parser.add_argument('-S', '--no-ssl-verify', action='store_true', help='ignore SSL verification errors')
-    parser.add_argument('--prev-archives', action=CSVListCallback, default=[], metavar='DIRS',
+    parser.add_argument('--prev-archives', action=CSVCallback, default=[], metavar='DIRS',
                         help='comma-separated list of directories (one per blog) containing previous blog archives')
     parser.add_argument('--no-post-clobber', action='store_true', help='Do not re-download existing posts')
     parser.add_argument('-M', '--timestamping', action='store_true',
@@ -1658,27 +1831,21 @@ def __call__(self, parser, namespace, values, option_string=None):
     parser.add_argument('--hostdirs', action='store_true', help='Generate host-prefixed directories for media')
     parser.add_argument('--user-agent', help='User agent string to use with HTTP requests')
     parser.add_argument('--threads', type=int, default=20, help='number of threads to use for post retrieval')
+    parser.add_argument('--continue', action='store_true', dest='resume', help='Continue an incomplete first backup')
+    parser.add_argument('--continue=ignore', action='store_true', dest='ignore_resume',
+                        help='Force backup over an incomplete archive with different options')
     parser.add_argument('blogs', nargs='*')
     options = parser.parse_args()
+    blogs = options.blogs or DEFAULT_BLOGS
+    del options.blogs
+    orig_options = vars(options).copy()
 
+    if not blogs:
+        parser.error('Missing blog-name')
+    if sum((options.resume, options.ignore_resume, bool(options.incremental), options.auto is not None)) > 1:  # --auto 0 counts too
+        parser.error('Only one of --continue, --continue=ignore, --incremental, or --auto may be given')
     if options.auto is not None and options.auto != time.localtime().tm_hour:
         options.incremental = True
-    if options.period:
-        try:
-            pformat = {'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[options.period]
-            options.period = time.strftime(pformat)
-        except KeyError:
-            options.period = options.period.replace('-', '')
-            if not re.match(r'^\d{4}(\d\d)?(\d\d)?$', options.period):
-                parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]")
-        set_period()
-
-    wget_retrieve = WgetRetrieveWrapper(options, log)
-    setup_wget(not options.no_ssl_verify, options.user_agent)
-
-    blogs = options.blogs or DEFAULT_BLOGS
-    if not blogs:
-        parser.error("Missing blog-name")
     if options.count is not None and options.count < 0:
         parser.error('--count: count must not be negative')
     if options.count == 0 and (options.incremental or options.auto is not None):
@@ -1691,44 +1858,16 @@ def __call__(self, parser, namespace, values, option_string=None):
         parser.error("-O can only be used for a single blog-name")
     if options.dirs and options.tag_index:
         parser.error("-D cannot be used with --tag-index")
-    if options.exif:
-        if pyexiv2 is None:
-            parser.error("--exif: module 'pyexiv2' is not installed")
-        if not hasattr(pyexiv2, 'ImageMetadata'):
-            parser.error("--exif: module 'pyexiv2' is missing features, perhaps you need 'py3exiv2'?")
-    if options.save_video:
-        try:
-            import youtube_dl
-            from youtube_dl.utils import sanitize_filename
-        except ImportError:
-            parser.error("--save-video: module 'youtube_dl' is not installed")
-    if options.save_notes or options.copy_notes:
-        sys.modules['soupsieve'] = ()  # type: ignore[assignment]
-        try:
-            from bs4 import BeautifulSoup
-        except ImportError:
-            parser.error("--{}: module 'bs4' is not installed".format(
-                'save-notes' if options.save_notes else 'copy-notes'
-            ))
     if options.cookiefile is not None and not os.access(options.cookiefile, os.R_OK):
         parser.error('--cookiefile: file cannot be read')
-    if options.save_notes:
-        import note_scraper
-    if options.copy_notes:
-        if not options.prev_archives:
-            parser.error('--copy-notes requires --prev-archives')
+    if options.copy_notes and not options.prev_archives:
+        parser.error('--copy-notes requires --prev-archives')
     if options.notes_limit is not None:
         if not options.save_notes:
             parser.error('--notes-limit requires --save-notes')
         if options.notes_limit < 1:
             parser.error('--notes-limit: Value must be at least 1')
-    if options.filter is not None:
-        if pyjq is None:
-            parser.error("--filter: module 'pyjq' is not installed")
-        options.filter = pyjq.compile(options.filter)
     if options.prev_archives:
-        if scandir is None:
-            parser.error("--prev-archives: Python is less than 3.5 and module 'scandir' is not installed")
         if len(options.prev_archives) != len(blogs):
             parser.error('--prev-archives: expected {} directories, got {}'.format(
                 len(blogs), len(options.prev_archives),
@@ -1746,12 +1885,17 @@ def __call__(self, parser, namespace, values, option_string=None):
     if options.threads < 1:
         parser.error('--threads: must use at least one thread')
 
+    check_optional_modules()
+
     if not API_KEY:
         sys.stderr.write('''\
 Missing API_KEY; please get your own API key at
 https://www.tumblr.com/oauth/apps\n''')
         sys.exit(1)
 
+    wget_retrieve = WgetRetrieveWrapper(options, log)
+    setup_wget(not options.no_ssl_verify, options.user_agent)
+
     ApiParser.setup()
     tb = TumblrBackup()
     try:
diff --git a/util.py b/util.py
index 8e9316e..1fdb024 100644
--- a/util.py
+++ b/util.py
@@ -4,6 +4,7 @@
 
 import collections
 import contextlib
+import errno
 import io
 import os
 import socket
@@ -512,3 +513,19 @@ def quit(self):
         except queue.Full:
             pass
         self.thread.join()
+
+
+def opendir(dir_, flags):  # Open directory dir_ with os.open(); returns a raw fd the caller must os.close()
+    try:
+        flags |= os.O_DIRECTORY
+    except AttributeError:
+        dir_ += os.path.sep  # Fallback, some systems don't support O_DIRECTORY
+    return os.open(dir_, flags)
+
+
+def try_unlink(path):  # Remove path; a file that is already gone is not an error
+    try:
+        os.unlink(path)
+    except EnvironmentError as e:
+        if getattr(e, 'errno', None) != errno.ENOENT:
+            raise  # anything other than "no such file" is a real failure
diff --git a/wget.py b/wget.py
index 324b9f7..2b07663 100644
--- a/wget.py
+++ b/wget.py
@@ -13,7 +13,8 @@
 from tempfile import NamedTemporaryFile
 from wsgiref.handlers import format_date_time
 
-from util import PY3, URLLIB3_FROM_PIP, get_supported_encodings, is_dns_working, no_internet, setup_urllib3_ssl
+from util import (PY3, URLLIB3_FROM_PIP, get_supported_encodings, is_dns_working, no_internet, opendir,
+                  setup_urllib3_ssl, try_unlink)
 
 try:
     from urllib.parse import urljoin, urlsplit
@@ -569,14 +570,8 @@ def _retrieve_loop(hstat, url, dest_file, adjust_basename, options, log):
     got_head = False  # used for time-stamping
     dest_dirname, dest_basename = os.path.split(dest_file)
 
-    flags = os.O_RDONLY
-    try:
-        flags |= os.O_DIRECTORY
-    except AttributeError:
-        dest_dirname += os.path.sep  # Fallback, some systems don't support O_DIRECTORY
-
     if os.name == 'posix':  # Opening directories is a POSIX feature
-        hstat.dest_dir = os.open(dest_dirname, flags)
+        hstat.dest_dir = opendir(dest_dirname, os.O_RDONLY)
     hstat.set_part_file_supplier(functools.partial(
         lambda pfx, dir_: NamedTemporaryFile('wb', prefix=pfx, dir=dir_, delete=False),
         '.{}.'.format(dest_basename), dest_dirname,
@@ -732,20 +727,9 @@ def _retrieve_loop(hstat, url, dest_file, adjust_basename, options, log):
             os.replace(os.path.basename(pfname), new_dest_basename,
                        src_dir_fd=hstat.dest_dir, dst_dir_fd=hstat.dest_dir)
 
-        # Sync the directory and return
-        if hstat.dest_dir is not None:
-            os.fdatasync(hstat.dest_dir)
         return
 
 
-def try_unlink(path):
-    try:
-        os.unlink(path)
-    except EnvironmentError as e:
-        if getattr(e, 'errno', None) != errno.ENOENT:
-            raise
-
-
 def setup_wget(ssl_verify, user_agent):
     if not ssl_verify:
         # Hide the InsecureRequestWarning from urllib3