From 3c2712a67d717d4aa29e3b868f955b6c3efd842a Mon Sep 17 00:00:00 2001
From: Ed Summers <ehs@pobox.com>
Date: Mon, 23 Oct 2023 09:39:10 -0400
Subject: [PATCH] Add black formatting

Reformat the Python source code with black, and add a CI step that runs
`black --check .` before the tests so unformatted code fails the build.

Closes #14
---
 .github/workflows/run-tests.yaml |   3 +
 tests/test_warcdb.py             |  40 ++++---
 warcdb/__init__.py               | 194 +++++++++++++++++--------------
 3 files changed, 135 insertions(+), 102 deletions(-)

diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml
index 66962c1..1b03e6f 100644
--- a/.github/workflows/run-tests.yaml
+++ b/.github/workflows/run-tests.yaml
@@ -20,6 +20,9 @@ jobs:
           pip install poetry
           poetry install
 
+      - name: Check formatting
+        run: poetry run black --check .
+
       - name: Run tests
         run: |
           poetry run pytest
diff --git a/tests/test_warcdb.py b/tests/test_warcdb.py
index c4c12fe..bc6b41c 100644
--- a/tests/test_warcdb.py
+++ b/tests/test_warcdb.py
@@ -13,15 +13,18 @@
 # all these WARC files were created with wget except for apod.warc.gz which was
 # created with browsertrix-crawler
 
-@pytest.mark.parametrize("warc_path", [str(tests_dir / "google.warc"),
-                                       str(tests_dir / "google.warc.gz"),
-                                       str(tests_dir / "no-warc-info.warc"),
-                                       str(tests_dir / "scoop.wacz"),
-                                       "https://tselai.com/data/google.warc",
-                                       "https://tselai.com/data/google.warc.gz"
-                                       ])
-
 
+@pytest.mark.parametrize(
+    "warc_path",
+    [
+        str(tests_dir / "google.warc"),
+        str(tests_dir / "google.warc.gz"),
+        str(tests_dir / "no-warc-info.warc"),
+        str(tests_dir / "scoop.wacz"),
+        "https://tselai.com/data/google.warc",
+        "https://tselai.com/data/google.warc.gz",
+    ],
+)
 def test_import(warc_path):
     runner = CliRunner()
     args = ["import", db_file, warc_path]
@@ -29,24 +32,35 @@ def test_import(warc_path):
     assert result.exit_code == 0
     db = sqlite_utils.Database(db_file)
     assert set(db.table_names()) == {
-        'metadata', 'request', 'resource', 'response', 'warcinfo', '_sqlite_migrations'
+        "metadata",
+        "request",
+        "resource",
+        "response",
+        "warcinfo",
+        "_sqlite_migrations",
     }
 
     if warc_path == str(tests_dir / "google.warc"):
-        assert db.table('warcinfo').get('<urn:uuid:7ABED2CA-7CBD-48A0-92E5-0059EBFC111A>')
-        assert db.table('request').get('<urn:uuid:524F62DD-D788-4085-B14D-22B0CDC0AC53>')
+        assert db.table("warcinfo").get(
+            "<urn:uuid:7ABED2CA-7CBD-48A0-92E5-0059EBFC111A>"
+        )
+        assert db.table("request").get(
+            "<urn:uuid:524F62DD-D788-4085-B14D-22B0CDC0AC53>"
+        )
 
     os.remove(db_file)
 
 
 def test_column_names():
     runner = CliRunner()
-    runner.invoke(warcdb_cli, ["import", db_file, str(pathlib.Path('tests/google.warc'))])
+    runner.invoke(
+        warcdb_cli, ["import", db_file, str(pathlib.Path("tests/google.warc"))]
+    )
 
     # make sure that the columns are named correctly (lowercase with underscores)
     db = sqlite_utils.Database(db_file)
     for table in db.tables:
         for col in table.columns:
-            assert re.match(r'^[a-z_]+', col.name), f'column {col.name} named correctly'
+            assert re.match(r"^[a-z_]+", col.name), f"column {col.name} named correctly"
 
     os.remove(db_file)
diff --git a/warcdb/__init__.py b/warcdb/__init__.py
index 315d412..751111d 100644
--- a/warcdb/__init__.py
+++ b/warcdb/__init__.py
@@ -17,7 +17,7 @@
 
 
 def dict_union(*args):
-    """ Utility function to union multiple dicts """
+    """Utility function to union multiple dicts"""
     # https://stackoverflow.com/a/15936211/1333954
     return dict(chain.from_iterable(d.iteritems() for d in args))
 
@@ -26,10 +26,10 @@ def dict_union(*args):
 
 
 def headers_to_json(self):
-    return dumps([{'header': h, 'value': v} for h, v in self.headers])
+    return dumps([{"header": h, "value": v} for h, v in self.headers])
 
 
-setattr(StatusAndHeaders, 'to_json', headers_to_json)
+setattr(StatusAndHeaders, "to_json", headers_to_json)
 
 """ Monkeypatch warcio.ArcWarcRecord.payload """
 
@@ -39,7 +39,7 @@ def record_payload(self: ArcWarcRecord):
     return self.content_stream().read()
 
 
-setattr(ArcWarcRecord, 'payload', record_payload)
+setattr(ArcWarcRecord, "payload", record_payload)
 
 """ Monkeypatch warcio.ArcWarcRecord.as_dict() """
 
@@ -47,10 +47,10 @@ def record_payload(self: ArcWarcRecord):
 @cache
 def record_as_dict(self: ArcWarcRecord):
     """Method to easily represent a record as a dict, to be fed into db_utils.Database.insert()"""
-    return {k.lower().replace('-', '_'): v for k, v in self.rec_headers.headers}
+    return {k.lower().replace("-", "_"): v for k, v in self.rec_headers.headers}
 
 
-setattr(ArcWarcRecord, 'as_dict', record_as_dict)
+setattr(ArcWarcRecord, "as_dict", record_as_dict)
 
 """ Monkeypatch warcio.ArcWarcRecord.to_json() """
 
@@ -78,8 +78,8 @@ class WarcDB(MutableMapping):
 
     def __init__(self, *args, **kwargs):
         # First pop warcdb - specific params
-        self._batch_size = kwargs.pop('batch_size', 1000)
-        self._records_table = kwargs.get('records_table', 'records')
+        self._batch_size = kwargs.pop("batch_size", 1000)
+        self._records_table = kwargs.get("records_table", "records")
 
         # Pass the rest to sqlite_utils
         self._db = sqlite_utils.Database(*args, **kwargs)
@@ -99,16 +99,16 @@ def records(self):
 
     @property
     def http_headers(self):
-        return self.table('http_headers')
+        return self.table("http_headers")
 
     @property
     def payloads(self):
-        return self.table('payloads')
+        return self.table("payloads")
 
     """MutableMapping abstract methods"""
 
     def __setitem__(self, key, value: ArcWarcRecord):
-        """ This is the only client-facing way to mutate the file.
+        """This is the only client-facing way to mutate the file.
         Any normalization should happen here.
         """
         # Any normalizations happens here
@@ -140,103 +140,112 @@ def __iadd__(self, r: ArcWarcRecord):
         All 'warcinfo' and 'metadata' records shall not have a payload.
         """
         col_type_conversions = {
-            'content_length': int,
-            'payload': str,
-            'warc_date': datetime.datetime,
-
+            "content_length": int,
+            "payload": str,
+            "warc_date": datetime.datetime,
         }
         record_dict = r.as_dict()
 
         # Certain rec_types have payload
-        has_payload = r.rec_type in ['warcinfo', 'request', 'response', 'metadata', 'resource']
+        has_payload = r.rec_type in [
+            "warcinfo",
+            "request",
+            "response",
+            "metadata",
+            "resource",
+        ]
         if has_payload:
-            record_dict['payload'] = r.payload()
+            record_dict["payload"] = r.payload()
 
         # Certain rec_types have http_headers
         has_http_headers = r.http_headers is not None
         if has_http_headers:
-            record_dict['http_headers'] = r.http_headers.to_json()
+            record_dict["http_headers"] = r.http_headers.to_json()
 
         """Depending on the record type we insert to appropriate record"""
-        if r.rec_type == 'warcinfo':
-
-            self.db.table('warcinfo').insert(record_dict,
-                                             pk='warc_record_id',
-                                             alter=True,
-                                             ignore=True,
-                                             columns=col_type_conversions)
-        elif r.rec_type == 'request':
-            self.db.table('request').insert(record_dict,
-                                            pk='warc_record_id',
-                                            foreign_keys=[
-                                                ("warc_warcinfo_id", "warcinfo", "warc-record-id")
-                                            ],
-                                            alter=True,
-                                            ignore=True,
-                                            columns=col_type_conversions
-                                            )
-
-        elif r.rec_type == 'response':
-            self.db.table('response').insert(record_dict,
-                                             pk='warc_record_id',
-                                             foreign_keys=[
-                                                 ("warc_warcinfo_id", "warcinfo", "warc_record_id"),
-                                                 ("warc_concurrent_to", "request", "warc_record_id")
-                                             ],
-                                             alter=True,
-                                             ignore=True,
-                                             columns=col_type_conversions
-                                             )
-
-        elif r.rec_type == 'metadata':
-            self.db.table('metadata').insert(record_dict,
-                                             pk='warc_record_id',
-                                             foreign_keys=[
-                                                 ("warc-warcinfo-id", "warcinfo", "warc_record_id"),
-                                                 ("warc_concurrent_to", "response", "warc_record_id")
-                                             ],
-                                             alter=True,
-                                             ignore=True,
-                                             columns=col_type_conversions
-                                             )
-
-        elif r.rec_type == 'resource':
-            self.db.table('resource').insert(record_dict,
-                                             pk='warc_record_id',
-                                             foreign_keys=[
-                                                 ("warc-warcinfo-id", "warcinfo", "warc_record_id"),
-                                                 ("warc_concurrent_to", "metadata", "warc_record_id")
-                                             ],
-                                             alter=True,
-                                             ignore=True,
-                                             columns=col_type_conversions
-                                             )
+        if r.rec_type == "warcinfo":
+            self.db.table("warcinfo").insert(
+                record_dict,
+                pk="warc_record_id",
+                alter=True,
+                ignore=True,
+                columns=col_type_conversions,
+            )
+        elif r.rec_type == "request":
+            self.db.table("request").insert(
+                record_dict,
+                pk="warc_record_id",
+                foreign_keys=[("warc_warcinfo_id", "warcinfo", "warc-record-id")],
+                alter=True,
+                ignore=True,
+                columns=col_type_conversions,
+            )
+
+        elif r.rec_type == "response":
+            self.db.table("response").insert(
+                record_dict,
+                pk="warc_record_id",
+                foreign_keys=[
+                    ("warc_warcinfo_id", "warcinfo", "warc_record_id"),
+                    ("warc_concurrent_to", "request", "warc_record_id"),
+                ],
+                alter=True,
+                ignore=True,
+                columns=col_type_conversions,
+            )
+
+        elif r.rec_type == "metadata":
+            self.db.table("metadata").insert(
+                record_dict,
+                pk="warc_record_id",
+                foreign_keys=[
+                    ("warc-warcinfo-id", "warcinfo", "warc_record_id"),
+                    ("warc_concurrent_to", "response", "warc_record_id"),
+                ],
+                alter=True,
+                ignore=True,
+                columns=col_type_conversions,
+            )
+
+        elif r.rec_type == "resource":
+            self.db.table("resource").insert(
+                record_dict,
+                pk="warc_record_id",
+                foreign_keys=[
+                    ("warc-warcinfo-id", "warcinfo", "warc_record_id"),
+                    ("warc_concurrent_to", "metadata", "warc_record_id"),
+                ],
+                alter=True,
+                ignore=True,
+                columns=col_type_conversions,
+            )
 
         else:
-            raise ValueError(f"Record type <{r.rec_type}> is not supported"
-                             f"Only [warcinfo, request, response, metadata, resource] are.")
+            raise ValueError(
+                f"Record type <{r.rec_type}> is not supported"
+                f"Only [warcinfo, request, response, metadata, resource] are."
+            )
         return self
 
 
 from sqlite_utils import cli as sqlite_utils_cli
 
 warcdb_cli = sqlite_utils_cli.cli
-warcdb_cli.help = \
-    "Commands for interacting with .warcdb files\n\nBased on SQLite-Utils"
+warcdb_cli.help = "Commands for interacting with .warcdb files\n\nBased on SQLite-Utils"
 
 
-@warcdb_cli.command('import')
+@warcdb_cli.command("import")
 @click.argument(
     "db_path",
     type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
 )
-@click.argument('warc_path',
-                type=click.STRING,
-                nargs=-1
-                )
-@click.option('--batch-size',
-              type=click.INT, default=1000,
-              help="Batch size for chunked INSERTs [Note: ignored for now]", )
+@click.argument("warc_path", type=click.STRING, nargs=-1)
+@click.option(
+    "--batch-size",
+    type=click.INT,
+    default=1000,
+    help="Batch size for chunked INSERTs [Note: ignored for now]",
+)
 def import_(db_path, warc_path, batch_size):
     """
     Import a WARC file into the database
@@ -251,16 +260,23 @@ def import_(db_path, warc_path, batch_size):
 
     def to_import():
         for f in always_iterable(warc_path):
-            if f.startswith('http'):
-                yield from tqdm(ArchiveIterator(req.get(f, stream=True).raw, arc2warc=True), desc=f)
-            elif f.endswith('.wacz'):
+            if f.startswith("http"):
+                yield from tqdm(
+                    ArchiveIterator(req.get(f, stream=True).raw, arc2warc=True), desc=f
+                )
+            elif f.endswith(".wacz"):
                 # TODO: can we support loading WACZ files by URL?
                 wacz = zipfile.ZipFile(f)
-                warcs = filter(lambda f: f.filename.endswith('warc.gz'), wacz.infolist())
+                warcs = filter(
+                    lambda f: f.filename.endswith("warc.gz"), wacz.infolist()
+                )
                 for warc in warcs:
-                    yield from tqdm(ArchiveIterator(wacz.open(warc.filename, 'r'), arc2warc=True), desc=warc.filename)
+                    yield from tqdm(
+                        ArchiveIterator(wacz.open(warc.filename, "r"), arc2warc=True),
+                        desc=warc.filename,
+                    )
             else:
-                yield from tqdm(ArchiveIterator(open(f, 'rb'), arc2warc=True), desc=f)
+                yield from tqdm(ArchiveIterator(open(f, "rb"), arc2warc=True), desc=f)
 
     for r in to_import():
         db += r