From e73c20f316f54e6d316b24eba1e1b79b74b50994 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 16:21:31 -0400 Subject: [PATCH 01/14] Add script to download counts from regulations.gov, dashboard, and redis --- scripts/get_counts.py | 265 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100755 scripts/get_counts.py diff --git a/scripts/get_counts.py b/scripts/get_counts.py new file mode 100755 index 00000000..589101e5 --- /dev/null +++ b/scripts/get_counts.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 + +import argparse +import datetime as dt +import json +import os +import pathlib +import sys +from typing import Any, TypedDict + +import redis +import requests + +REGULATIONS_BASE_URL = "https://api.regulations.gov/v4/" + + +class EntityCount(TypedDict): + downloaded: int + jobs: int + total: int + last_timestamp: dt.datetime + + +class Output(TypedDict): + creation_timestamp: dt.datetime + dockets: EntityCount + documents: EntityCount + comments: EntityCount + + +class OutputEncoder(json.JSONEncoder): + def default(self, o: Any) -> Any: + if isinstance(o, dt.datetime): + return o.strftime("%Y-%m-%d %H:%M:%S") + return super().default(o) + + +def _download_regulation_count( + url: str, headers: dict[str, str], params: dict[str, str] +) -> int: + response = requests.get( + url, + headers=headers, + params=params, + ) + response.raise_for_status() + return response.json()["meta"]["totalElemnts"] + + +def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Output: + """Get counts from regulations.gov given a last_timestamp + + Exactly 6 Regulations.gov API calls are made during this function + """ + output: Output = { + "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "dockets": { + "downloaded": -1, + "jobs": 0, + "total": -1, + "last_timestamp": last_timestamp, + }, + "documents": { + "downloaded": -1, + "jobs": 0, + "total": -1, + "last_timestamp": last_timestamp, + }, + "comments": { + 
"downloaded": -1, + "jobs": 0, + "total": -1, + "last_timestamp": last_timestamp, + }, + } + + headers = {"X-Api-Key": api_key} + # NOTE: we set pagesize to be 5 since we only care about the metadata + downloaded_filter = { + "filter[lastModifiedDate][le]": last_timestamp.strftime("%Y-%m-%d %H:%M:%S"), + "page[size]": 5, + } + + for entity_type in ("dockets", "documents", "comments"): + downloaded = _download_regulation_count( + REGULATIONS_BASE_URL + entity_type, headers, downloaded_filter + ) + total = _download_regulation_count( + REGULATIONS_BASE_URL + entity_type, headers, {"page[size]": "5"} + ) + output[entity_type]["downloaded"] = downloaded + output[entity_type]["total"] = total + + return output + + +def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Output: + """Get the counts of a running mirrulations instance via it's dashboard""" + response = requests.get(dashboard_url + "/data") + response.raise_for_status() + + content = response.json() + + output: Output = { + "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "dockets": { + "downloaded": content["num_dockets_done"], + "jobs": content["num_jobs_dockets_queued"], + "total": content["regulations_total_dockets"], + "last_timestamp": last_timestamp, + }, + "documents": { + "downloaded": content["num_documents_done"], + "jobs": content["num_jobs_documents_queued"], + "total": content["regulations_total_documents"], + "last_timestamp": last_timestamp, + }, + "comments": { + "downloaded": content["num_comments_done"], + "jobs": content["num_jobs_comments_queued"], + "total": content["regulations_total_comments"], + "last_timestamp": last_timestamp, + }, + } + + return output + + +def get_redis(db: redis.Redis) -> Output: + """Get the counts of a running mirrulations instance via a Redis connection""" + + output: Output = { + "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "dockets": { + "downloaded": int(db.get("num_dockets_done")), + "jobs": 
int(db.get("num_jobs_dockets_waiting")), + "total": int(db.get("regulations_total_dockets")), + "last_timestamp": dt.datetime.strptime( + str(db.get("dockets_last_timestamp")), "%Y-%m-%d %H:%M:%S" + ), + }, + "documents": { + "downloaded": int(db.get("num_documents_done")), + "jobs": int(db.get("num_jobs_documents_waiting")), + "total": int(db.get("regulations_total_documents")), + "last_timestamp": dt.datetime.strptime( + str(db.get("documents_last_timestamp")), "%Y-%m-%d %H:%M:%S" + ), + }, + "comments": { + "downloaded": int(db.get("num_comments_done")), + "jobs": int(db.get("num_jobs_comments_waiting")), + "total": int(db.get("regulations_total_comments")), + "last_timestamp": dt.datetime.strptime( + str(db.get("comments_last_timestamp")), "%Y-%m-%d %H:%M:%S" + ), + }, + } + + return output + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Get Docket, Document, and Comment counts from multiple sources" + ) + parser.add_argument( + "-o", + "--output", + metavar="PATH", + type=str, + default="-", + help="file to output to, use '-' for stdout (default '%(default)s')", + ) + subparsers = parser.add_subparsers( + dest="source", required=True, help="The source to get counts from" + ) + + regulations = subparsers.add_parser( + "regulations", help="download counts from regulations.gov" + ) + regulations.add_argument( + "-a", + "--api-key", + help="Regulations.gov api key, defaults to value of `API_KEY` environment variable", + default=os.getenv("API_KEY"), + type=str, + ) + regulations.add_argument( + "-t", + "--last-timestamp", + metavar="TIMESTAMP", + type=dt.datetime.fromisoformat, + default=dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + help="last timestamp that is assumed to have been downloaded in ISO 8601 format 'YYYY-MM-DDTHH:mm:ssZ' (default '%(default)s')", + ) + + dashboard = subparsers.add_parser( + "dashboard", help="get counts from a mirrulations dashboard" + ) + dashboard.add_argument( + "-u", + 
"--url", + metavar="DASHBOARD_URL", + default="http://localhost", + help="dashboard url (default '%(default)s')", + ) + dashboard.add_argument( + "last_timestamp", + type=dt.datetime.fromisoformat, + default=dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + help="last timestamp that is assumed to have been downloaded in ISO 8601 format 'YYYY-MM-DDTHH:mm:ss' (default '%(default)s')", + ) + + redis_args = subparsers.add_parser("redis", help="get counts from redis") + redis_args.add_argument( + "--hostname", + metavar="HOSTNAME", + default="localhost", + help="redis server hostname (default '%(default)s')", + ) + redis_args.add_argument( + "-p", + "--port", + metavar="PORT", + type=int, + default=6379, + help="port for redis server (default '%(default)s')", + ) + redis_args.add_argument( + "-n", + "--db", + metavar="DB_NUMBER", + type=int, + default=0, + help="redis database number (default '%(default)s')", + ) + + args = parser.parse_args() + print(args) + + path: str = args.output + output_file = sys.stdout if path == "-" else open(pathlib.Path(path), "w") + + source = args.source + if source == "regulations": + api_key = args.api_key + if api_key is None or api_key == "": + print("No api key found, exitting", file=sys.stderr) + sys.exit(1) + output = get_regulation(api_key, args.last_timestamp) + elif source == "dashboard": + output = get_dashboard(args.url, args.last_timestamp) + elif source == "redis": + db = redis.Redis(host=args.hostname, port=args.port, db=args.db) + output = get_redis(db) + else: + print("Unrecognized source, exitting", file=sys.stderr) + sys.exit(1) + + if path == "-": + json.dump(output, sys.stdout, cls=OutputEncoder) + else: + with open(pathlib.Path(path), "w") as fp: + json.dump(output, fp, cls=OutputEncoder) From 7c9c49702f2990fbae2233eb91c52597958847f4 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 16:37:00 -0400 Subject: [PATCH 02/14] Move count types to seperate file --- scripts/counts.py | 24 
++++++++++++++++++++++++ scripts/get_counts.py | 34 +++++----------------------------- 2 files changed, 29 insertions(+), 29 deletions(-) create mode 100644 scripts/counts.py diff --git a/scripts/counts.py b/scripts/counts.py new file mode 100644 index 00000000..3ec162c1 --- /dev/null +++ b/scripts/counts.py @@ -0,0 +1,24 @@ +import json +import datetime as dt +from typing import Any, TypedDict + + +class EntityCount(TypedDict): + downloaded: int + jobs: int + total: int + last_timestamp: dt.datetime + + +class Output(TypedDict): + creation_timestamp: dt.datetime + dockets: EntityCount + documents: EntityCount + comments: EntityCount + + +class OutputEncoder(json.JSONEncoder): + def default(self, o: Any) -> Any: + if isinstance(o, dt.datetime): + return o.strftime("%Y-%m-%d %H:%M:%S") + return super().default(o) diff --git a/scripts/get_counts.py b/scripts/get_counts.py index 589101e5..4335a233 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -6,7 +6,7 @@ import os import pathlib import sys -from typing import Any, TypedDict +from counts import Output, OutputEncoder import redis import requests @@ -14,27 +14,6 @@ REGULATIONS_BASE_URL = "https://api.regulations.gov/v4/" -class EntityCount(TypedDict): - downloaded: int - jobs: int - total: int - last_timestamp: dt.datetime - - -class Output(TypedDict): - creation_timestamp: dt.datetime - dockets: EntityCount - documents: EntityCount - comments: EntityCount - - -class OutputEncoder(json.JSONEncoder): - def default(self, o: Any) -> Any: - if isinstance(o, dt.datetime): - return o.strftime("%Y-%m-%d %H:%M:%S") - return super().default(o) - - def _download_regulation_count( url: str, headers: dict[str, str], params: dict[str, str] ) -> int: @@ -162,7 +141,8 @@ def get_redis(db: redis.Redis) -> Output: if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Get Docket, Document, and Comment counts from multiple sources" + "Get Counts", + description="Get Docket, Document, and Comment 
counts from multiple sources", ) parser.add_argument( "-o", @@ -237,10 +217,6 @@ def get_redis(db: redis.Redis) -> Output: ) args = parser.parse_args() - print(args) - - path: str = args.output - output_file = sys.stdout if path == "-" else open(pathlib.Path(path), "w") source = args.source if source == "regulations": @@ -258,8 +234,8 @@ def get_redis(db: redis.Redis) -> Output: print("Unrecognized source, exitting", file=sys.stderr) sys.exit(1) - if path == "-": + if args.output == "-": json.dump(output, sys.stdout, cls=OutputEncoder) else: - with open(pathlib.Path(path), "w") as fp: + with open(pathlib.Path(args.output), "w") as fp: json.dump(output, fp, cls=OutputEncoder) From 9357d7257145b9de592fddfe36196038be4316a2 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 17:20:50 -0400 Subject: [PATCH 03/14] Rename counts classes and fix typo --- scripts/counts.py | 17 +++++++++++++++-- scripts/get_counts.py | 20 ++++++++++---------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/scripts/counts.py b/scripts/counts.py index 3ec162c1..2423fb58 100644 --- a/scripts/counts.py +++ b/scripts/counts.py @@ -10,15 +10,28 @@ class EntityCount(TypedDict): last_timestamp: dt.datetime -class Output(TypedDict): +class Counts(TypedDict): creation_timestamp: dt.datetime dockets: EntityCount documents: EntityCount comments: EntityCount -class OutputEncoder(json.JSONEncoder): +class CountsEncoder(json.JSONEncoder): def default(self, o: Any) -> Any: if isinstance(o, dt.datetime): return o.strftime("%Y-%m-%d %H:%M:%S") return super().default(o) + + +class CountsDecoder(json.JSONDecoder): + def __init__(self, *args, **kwargs): + super().__init__(object_hook=self.object_hook, *args, **kwargs) + + def object_hook(self, obj: Any) -> Any: + for key, value in obj.items(): + try: + obj[key] = dt.datetime.strptime(value, "%Y-%m-%d %H:%M:%S") + except (ValueError, TypeError): + pass + return obj diff --git a/scripts/get_counts.py b/scripts/get_counts.py index 
4335a233..aacbd1d1 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -6,7 +6,7 @@ import os import pathlib import sys -from counts import Output, OutputEncoder +from counts import Counts, CountsEncoder import redis import requests @@ -23,15 +23,15 @@ def _download_regulation_count( params=params, ) response.raise_for_status() - return response.json()["meta"]["totalElemnts"] + return response.json()["meta"]["totalElements"] -def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Output: +def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Counts: """Get counts from regulations.gov given a last_timestamp Exactly 6 Regulations.gov API calls are made during this function """ - output: Output = { + output: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), "dockets": { "downloaded": -1, @@ -73,14 +73,14 @@ def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Output: return output -def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Output: +def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Counts: """Get the counts of a running mirrulations instance via it's dashboard""" response = requests.get(dashboard_url + "/data") response.raise_for_status() content = response.json() - output: Output = { + output: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), "dockets": { "downloaded": content["num_dockets_done"], @@ -105,10 +105,10 @@ def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Output: return output -def get_redis(db: redis.Redis) -> Output: +def get_redis(db: redis.Redis) -> Counts: """Get the counts of a running mirrulations instance via a Redis connection""" - output: Output = { + output: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), "dockets": { "downloaded": int(db.get("num_dockets_done")), @@ -235,7 +235,7 @@ def get_redis(db: redis.Redis) -> Output: sys.exit(1) if args.output == "-": - json.dump(output, 
sys.stdout, cls=OutputEncoder) + json.dump(output, sys.stdout, cls=CountsEncoder) else: with open(pathlib.Path(args.output), "w") as fp: - json.dump(output, fp, cls=OutputEncoder) + json.dump(output, fp, cls=CountsEncoder) From b25ab8bc59e3a0a18a4714f9099582a727fd21c4 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 17:34:16 -0400 Subject: [PATCH 04/14] Add script to correct counts obtained from --- scripts/correct_counts.py | 107 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100755 scripts/correct_counts.py diff --git a/scripts/correct_counts.py b/scripts/correct_counts.py new file mode 100755 index 00000000..74f2e5fa --- /dev/null +++ b/scripts/correct_counts.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +from copy import deepcopy +import json +import pathlib +import sys +from counts import Counts, CountsEncoder, CountsDecoder + +import argparse + + +class JobsInQueueException(Exception): + pass + + +def strategy_cap(recieved: Counts, ignore_queue: bool) -> Counts: + filtered = deepcopy(recieved) + for entity_type in ("dockets", "documents", "comments"): + total_ = filtered[entity_type]["total"] + downloaded = filtered[entity_type]["downloaded"] + if filtered[entity_type]["jobs"] > 0 and not ignore_queue: + raise JobsInQueueException( + f'{entity_type} has {filtered[entity_type]["jobs"]} in queue' + ) + filtered[entity_type]["downloaded"] = min(total_, downloaded) + + return filtered + + +def strategy_diff(recieved: Counts, ignore_queue: bool) -> Counts: + filtered = deepcopy(recieved) + for entity_type in ("dockets", "documents", "comments"): + total_ = filtered[entity_type]["total"] + downloaded = filtered[entity_type]["downloaded"] + jobs = filtered[entity_type]["jobs"] + if jobs > 0 and not ignore_queue: + raise JobsInQueueException( + f'{entity_type} has {filtered[entity_type]["jobs"]} in queue' + ) + filtered[entity_type]["downloaded"] = min(total_ - jobs, downloaded) + + return filtered + + +if __name__ 
== "__main__": + parser = argparse.ArgumentParser( + "Correct Counts", + description="Correct counts in json format by either capping downloaded with `total` or capping with `total - jobs`", + ) + parser.add_argument( + "-o", + "--output", + metavar="OUTPUT_PATH", + type=str, + default="-", + help="file to output to, use '-' for stdout (default '%(default)s')", + ) + parser.add_argument( + "-i", + "--input", + metavar="INPUT_PATH", + type=str, + default="-", + help="file to read from, use '-' for stdin (default '%(default)s')", + ) + parser.add_argument( + "-s", + "--strategy", + type=str, + default="cap_with_total", + choices=("cap_with_total", "diff_total_with_jobs"), + help="the correction strategy to use (default '%(default)s')", + ) + parser.add_argument( + "--ignore-queue", + action="store_true", + help="continue even if there are queued jobs", + ) + + args = parser.parse_args() + + if args.input == "-": + input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) + else: + with open(pathlib.Path(args.input), "r") as fp: + input_counts = json.load(fp, cls=CountsDecoder) + + try: + if args.strategy == "cap_with_total": + modified_counts = strategy_cap(input_counts, args.ignore_queue) + elif args.strategy == "diff_total_with_jobs": + modified_counts = strategy_diff(input_counts, args.ignore_queue) + else: + print(f"Unrecognized strategy {args.strategy}, exitting", file=sys.stderr) + sys.exit(1) + except JobsInQueueException as e: + print( + f"Found jobs in queue: {e}\nUse `--ignore-queue` to continue", + file=sys.stderr, + ) + sys.exit(2) + + if args.output == "-": + json.dump(modified_counts, sys.stdout, cls=CountsEncoder) + else: + with open(pathlib.Path(args.output), "w") as fp: + json.dump(modified_counts, fp, cls=CountsEncoder) From 691a79ff8bfe3a46cdd6c97933a973e31d0ee7e6 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 19:40:13 -0400 Subject: [PATCH 05/14] Add script to set redis values from json --- scripts/set_counts.py | 146 
++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100755 scripts/set_counts.py diff --git a/scripts/set_counts.py b/scripts/set_counts.py new file mode 100755 index 00000000..d26ae619 --- /dev/null +++ b/scripts/set_counts.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +import argparse +import json +import pathlib +import redis +import sys + +from counts import Counts, CountsDecoder + +ANSI_RESET = "\033[0m" +ANSI_BOLD = "\033[1m" +ANSI_BLINK = "\033[5m" +ANSI_BLINK_OFF = "\033[25m" +ANSI_FG_RED = "\033[31m" + + +def _get_vals(db: redis.Redis, entity_type: str) -> dict[str, int | str]: + done_raw: str | None = db.get(f"num_{entity_type}_done") + if done_raw is not None: + done = int(done_raw) + else: + done = "None" + + total_raw: str | None = db.get(f"regulations_total_{entity_type}") + if total_raw is not None: + total = int(total_raw) + else: + total = "None" + + timestamp: str = db.get(f"{entity_type}_last_timestamp") or "None" + + return {"done": done, "timestamp": timestamp, "total": total} + + +def _print_changes(info: str, original: str, new: str) -> None: + if original != new: + print( + info, + ANSI_FG_RED + ANSI_BOLD + original, + f"{ANSI_BLINK}--->{ANSI_BLINK_OFF}", + new + ANSI_RESET, + ) + else: + print(info, original, "--->", new) + + +def show_changes(db: redis.Redis, counts: Counts) -> None: + for entity_type in ("dockets", "documents", "comments"): + vals = _get_vals(db, entity_type) + _print_changes( + f"num_{entity_type}_done:\n ", + str(vals["done"]), + str(counts[entity_type]["downloaded"]), + ) + _print_changes( + f"regulations_total_{entity_type}:\n ", + str(vals["total"]), + str(counts[entity_type]["total"]), + ) + _print_changes( + f"{entity_type}_last_timestamp:\n ", + str(vals["timestamp"]), + counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), + ) + print() + + +def set_values(db: redis.Redis, counts: Counts): + for entity_type in ("dockets", "documents", "comments"): + 
db.set(f"num_{entity_type}_done", counts[entity_type]["downloaded"]) + db.set(f"regulations_total_{entity_type}", counts[entity_type]["total"]) + db.set( + f"{entity_type}_last_timestamp", + counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + "Set Counts", description="Set counts in Redis database from json" + ) + parser.add_argument( + "-i", + "--input", + metavar="INPUT_PATH", + type=str, + default="-", + help="file to read from, use '-' for stdin (default '%(default)s')", + ) + parser.add_argument( + "-y", + "--yes", + dest="changes_confirmed", + action="store_true", + help="Do not check for confirmation when setting values", + ) + parser.add_argument( + "--host", + metavar="HOSTNAME", + default="localhost", + help="redis server hostname (default '%(default)s')", + ) + parser.add_argument( + "-p", + "--port", + metavar="PORT", + type=int, + default=6379, + help="port for redis server (default '%(default)s')", + ) + parser.add_argument( + "-n", + "--db", + metavar="DB_NUMBER", + type=int, + default=0, + help="redis database number (default '%(default)s')", + ) + + args = parser.parse_args() + + if args.input == "-": + input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) + else: + with open(pathlib.Path(args.input), "r") as fp: + input_counts = json.load(fp, cls=CountsDecoder) + + db = redis.Redis(args.host, args.port, args.db, decode_responses=True) + changes_confirmed: bool = args.changes_confirmed + + if changes_confirmed: + set_values(db, input_counts) + else: + show_changes(db, input_counts) + response = ( + input("Are you sure you want to make the above changes [y/n]: ") + .strip() + .lower() + ) + changes_confirmed = response == "y" or response == "yes" + if changes_confirmed: + set_values(db, input_counts) + else: + print("No values set, exitting") + sys.exit() From 67d4c5a36ce0288f0f56569082d777d03cf3e041 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 
16 Oct 2024 20:07:02 -0400 Subject: [PATCH 06/14] Refacor get_counts script to handle missing redis keys --- scripts/get_counts.py | 68 ++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/scripts/get_counts.py b/scripts/get_counts.py index aacbd1d1..7fb02162 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -14,6 +14,10 @@ REGULATIONS_BASE_URL = "https://api.regulations.gov/v4/" +class MissingRedisKeyException(Exception): + pass + + def _download_regulation_count( url: str, headers: dict[str, str], params: dict[str, str] ) -> int: @@ -80,7 +84,7 @@ def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Counts: content = response.json() - output: Counts = { + counts: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), "dockets": { "downloaded": content["num_dockets_done"], @@ -102,41 +106,39 @@ def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Counts: }, } - return output + return counts + + +def _get_key_or_raise(db: redis.Redis, key: str) -> str: + value: str | None = db.get(key) + if value is None: + raise MissingRedisKeyException(f"missing redis key: {key}") + + return value def get_redis(db: redis.Redis) -> Counts: """Get the counts of a running mirrulations instance via a Redis connection""" - output: Counts = { + counts: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), - "dockets": { - "downloaded": int(db.get("num_dockets_done")), - "jobs": int(db.get("num_jobs_dockets_waiting")), - "total": int(db.get("regulations_total_dockets")), - "last_timestamp": dt.datetime.strptime( - str(db.get("dockets_last_timestamp")), "%Y-%m-%d %H:%M:%S" - ), - }, - "documents": { - "downloaded": int(db.get("num_documents_done")), - "jobs": int(db.get("num_jobs_documents_waiting")), - "total": int(db.get("regulations_total_documents")), - "last_timestamp": dt.datetime.strptime( - str(db.get("documents_last_timestamp")), "%Y-%m-%d %H:%M:%S" - 
), - }, - "comments": { - "downloaded": int(db.get("num_comments_done")), - "jobs": int(db.get("num_jobs_comments_waiting")), - "total": int(db.get("regulations_total_comments")), - "last_timestamp": dt.datetime.strptime( - str(db.get("comments_last_timestamp")), "%Y-%m-%d %H:%M:%S" - ), - }, } - return output + for entity_type in ("dockets", "documents", "comments"): + # Getting any of these values can raise an exception + downloaded = _get_key_or_raise(db, f"num_{entity_type}_done") + jobs = _get_key_or_raise(db, f"num_jobs_{entity_type}_waiting") + total = _get_key_or_raise(db, f"regulations_total_{entity_type}") + last_timestamp = _get_key_or_raise(db, f"{entity_type}_last_timestamp") + + counts[entity_type] = { + "downloaded": int(downloaded), + "jobs": int(jobs), + "total": int(total), + "last_timestamp": dt.datetime.strptime(last_timestamp, "%Y-%m-%d %H:%M:%S"), + } + + return counts if __name__ == "__main__": @@ -228,8 +230,14 @@ def get_redis(db: redis.Redis) -> Counts: elif source == "dashboard": output = get_dashboard(args.url, args.last_timestamp) elif source == "redis": - db = redis.Redis(host=args.hostname, port=args.port, db=args.db) - output = get_redis(db) + db = redis.Redis( + host=args.hostname, port=args.port, db=args.db, decode_responses=True + ) + try: + output = get_redis(db) + except MissingRedisKeyException as e: + print(f"Missing a redis key, exitting\n{e}", file=sys.stderr) + sys.exit(1) else: print("Unrecognized source, exitting", file=sys.stderr) sys.exit(1) From d8f68b15c0122b109037011233accee005dae04d Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 20:12:41 -0400 Subject: [PATCH 07/14] Add requirements.txt for scripts --- scripts/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 scripts/requirements.txt diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 00000000..28bd01f9 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,2 @@ +requests +redis From 
a4bd129b2957427b80d4f8e9587da94ccad0d605 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 21:05:28 -0400 Subject: [PATCH 08/14] Add documentation for scripts --- docs/scripts.md | 104 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 docs/scripts.md diff --git a/docs/scripts.md b/docs/scripts.md new file mode 100644 index 00000000..9b4fd565 --- /dev/null +++ b/docs/scripts.md @@ -0,0 +1,104 @@ +# Script Documentation + +## Summary + +Some tasks are small enough that the project architecture should not change, but the large enough that they should not be performed by hand. +Files in the `scripts` directory exist to fill this space. + +Currently, the following scripts are provided. + +* `get_counts.py` + * get docket, document, and comment counts from regulations.gov, a mirrulations dashboard, or a mirrulations Redis instance as json + * when using regulations.gov a timestamp can be given to make all dockets, documents, and comments before the timestamp count as if they were downloaded +* `correct_counts.py` + * correct possible errors within a counts json file generated by `get_counts.py` +* `set_counts.py` + * set values in a mirrulations Redis instance using json generated by `get_counts.py` + +All of the scripts above share a common format +
+get_counts.py common format + +```json +{ + "creation_timestamp": "2024-10-16 15:00:00", + "dockets": { + "downloaded": 253807, + "jobs": 0, + "total": 253807, + "last_timestamp": "2024-10-13 04:04:18" + }, + "documents": { + "downloaded": 1843774, + "jobs": 0, + "total": 1843774, + "last_timestamp": "2024-10-13 04:04:18" + }, + "comments": { + "downloaded": 22240501, + "jobs": 10, + "total": 22240511, + "last_timestamp": "2024-10-13 04:04:18" + } +} +``` + +
+ +## Description + +### `get_counts.py` + +`get_counts.py` gets counts from one of three sources: regulations.gov, a Mirrulations Redis instance, a Mirrulations dashboard via HTTP. + +When reading from regulations.gov a UTC timestamp can be specified to mock having downloaded all dockets, documents, and comments from before that timestamp. + +When reading from a dashboard a UTC timestamp must be specified since the dashboard API does not provide one. + +### `correct_counts.py` + +`correct_counts.py` corrects counts from `get_counts.py` using one of two strategies: set downloaded counts for a type to the minimum of `downloaded` and `total` for that type, or set downloaded counts to the minimum of `total -jobs` and `downloaded`. +By default any queued jobs will cause the script to exit and output nothing, this behavior can be changed with the `--ignore-queue` flag. + +### `set_counts.py` + +`set_counts.py` sets values from `get_counts.py` in a Redis instance. +By default the script will prompt for user input before changing any values. +This behavior can be changed using the `--yes` flag, which should be used **WITH GREAT CARE, ESPECIALLY IN PRODUCTION!!!**. + +## Setup + +First a virtual environment should be created to download dependencies to. + +```bash +cd scripts +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +Make sure when you are in the correct environment when running scripts. 
+ +## Examples + +### Cap Docket, Document, and Comment downloaded counts by the counts from Regulations.gov + +```bash +./get_counts.py redis | ./correct_counts.py | ./set_counts.py -y +``` + +### Set Docket, Document, Comment downloaded counts while jobs are in the queue + +```bash +./get_counts.py dashboard | ./correct_counts.py --ignore-queue --strategy diff_total_with_jobs | ./set_counts.py -y +``` + +### Download Counts for a Certain Time from Regulations.gov + +```bash +./get_counts.py --api-key $API_KEY -o aug_6_2022.json -t 2024-08-06T06:20:50Z + +EXPORT API_KEY= +./get_counts.py regulations -o oct_01_2024.json --last-timestamp 2024-10-01T15:30:10Z +./set_counts.py -i oct_01_2024.json +``` From 5a2ef93e46aec712d634a69b95ca2a4a5ab2c590 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 28 Oct 2024 10:46:13 -0400 Subject: [PATCH 09/14] Add queue size field to shared output --- scripts/counts.py | 1 + scripts/get_counts.py | 5 +++++ scripts/job_queue.py | 40 ++++++++++++++++++++++++++++++++++++++++ scripts/requirements.txt | 1 + 4 files changed, 47 insertions(+) create mode 100644 scripts/job_queue.py diff --git a/scripts/counts.py b/scripts/counts.py index 2423fb58..28b066ca 100644 --- a/scripts/counts.py +++ b/scripts/counts.py @@ -12,6 +12,7 @@ class EntityCount(TypedDict): class Counts(TypedDict): creation_timestamp: dt.datetime + queue_size: int dockets: EntityCount documents: EntityCount comments: EntityCount diff --git a/scripts/get_counts.py b/scripts/get_counts.py index 7fb02162..4df89929 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -7,6 +7,7 @@ import pathlib import sys from counts import Counts, CountsEncoder +from job_queue import RabbitMQ import redis import requests @@ -37,6 +38,7 @@ def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Counts: """ output: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "queue_size": 0, "dockets": { "downloaded": -1, "jobs": 0, @@ -86,6 +88,7 @@ def 
get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Counts: counts: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "queue_size": content["num_jobs_waiting"], "dockets": { "downloaded": content["num_dockets_done"], "jobs": content["num_jobs_dockets_queued"], @@ -123,6 +126,8 @@ def get_redis(db: redis.Redis) -> Counts: counts: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), } + queue = RabbitMQ("jobs_waiting_queue") + counts["queue_size"] = queue for entity_type in ("dockets", "documents", "comments"): # Getting any of these values can raise an exception diff --git a/scripts/job_queue.py b/scripts/job_queue.py new file mode 100644 index 00000000..6b932978 --- /dev/null +++ b/scripts/job_queue.py @@ -0,0 +1,40 @@ +# pylint: disable=too-many-arguments +import sys +import pika + + +class RabbitMQ: + """ + Encapsulate calls to RabbitMQ in one place + """ + + def __init__(self, queue_name): + """ + Create a new RabbitMQ object + @param queue_name: the name of the queue to use + """ + self.queue_name = queue_name + self.connection = None + self.channel = None + + def _ensure_channel(self): + if self.connection is None or not self.connection.is_open: + connection_parameter = pika.ConnectionParameters("rabbitmq") + self.connection = pika.BlockingConnection(connection_parameter) + self.channel = self.connection.channel() + self.channel.queue_declare(self.queue_name, durable=True) + + def size(self): + """ + Get the number of jobs in the queue. 
+ Can't be sure Channel is active between ensure_channel() + and queue_declare() which is the reasoning for implementation of try + except + @return: a non-negative integer + """ + self._ensure_channel() + try: + queue = self.channel.queue_declare(self.queue_name, durable=True) + return queue.method.message_count + except pika.exceptions.StreamLostError: + print("FAILURE: RabbitMQ Channel Connection Lost", file=sys.stderr) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 28bd01f9..5e8dc051 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,2 +1,3 @@ requests redis +pika From 2cd7490e037c05e35dba08fe431815609da76b9a Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 28 Oct 2024 10:47:39 -0400 Subject: [PATCH 10/14] Change cap strategy to use shared queue size --- scripts/correct_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/correct_counts.py b/scripts/correct_counts.py index 74f2e5fa..870ba38c 100755 --- a/scripts/correct_counts.py +++ b/scripts/correct_counts.py @@ -18,7 +18,7 @@ def strategy_cap(recieved: Counts, ignore_queue: bool) -> Counts: for entity_type in ("dockets", "documents", "comments"): total_ = filtered[entity_type]["total"] downloaded = filtered[entity_type]["downloaded"] - if filtered[entity_type]["jobs"] > 0 and not ignore_queue: + if filtered["queue_size"] != 0 and not ignore_queue: raise JobsInQueueException( f'{entity_type} has {filtered[entity_type]["jobs"]} in queue' ) From 058414c5e71fa374b6f34020c84dbd695e31f6db Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 28 Oct 2024 10:51:37 -0400 Subject: [PATCH 11/14] Add better error logging --- scripts/set_counts.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/scripts/set_counts.py b/scripts/set_counts.py index d26ae619..a5480360 100755 --- a/scripts/set_counts.py +++ b/scripts/set_counts.py @@ -68,12 +68,17 @@ def show_changes(db: redis.Redis, counts: Counts) -> None: def 
set_values(db: redis.Redis, counts: Counts): for entity_type in ("dockets", "documents", "comments"): - db.set(f"num_{entity_type}_done", counts[entity_type]["downloaded"]) - db.set(f"regulations_total_{entity_type}", counts[entity_type]["total"]) - db.set( - f"{entity_type}_last_timestamp", - counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), - ) + try: + db.set(f"num_{entity_type}_done", counts[entity_type]["downloaded"]) + db.set(f"regulations_total_{entity_type}", counts[entity_type]["total"]) + db.set( + f"{entity_type}_last_timestamp", + counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), + ) + except Exception as e: + print(f"Error occurred while setting values for {entity_type}, exitting", file=sys.stderr) + print(e) + return if __name__ == "__main__": From 9a3441cff1be1eb552caeb3b2dc7e58ee87914ed Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 28 Oct 2024 11:06:19 -0400 Subject: [PATCH 12/14] Add script to get, correct, and set counts in redis --- scripts/get_correct_set.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 scripts/get_correct_set.sh diff --git a/scripts/get_correct_set.sh b/scripts/get_correct_set.sh new file mode 100644 index 00000000..6fbca092 --- /dev/null +++ b/scripts/get_correct_set.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +WORK_DIR="/home/cs334/mirrulations/scripts/" +LOG_FILE=/var/log/mirrulations_counts.log +START_TIME=$(date -u -Iseconds) +echo "$START_TIME: Running" > $LOG_FILE +cd $WORK_DIR + +PYTHON=".venv/bin/python3" + +$PYTHON get_counts.py redis -o "/tmp/mirrulations_$START_TIME.json" 2>> $LOG_FILE && + $PYTHON correct_counts.py -i "/tmp/mirrulations_$START_TIME.json" -o "/tmp/mirrulations_${START_TIME}_corrected.json" 2>> $LOG_FILE && + $PYTHON set_counts.py -y -i "/tmp/mirrulations_${START_TIME}_corrected.json" 2>> $LOG_FILE + +rm "/tmp/mirrulations_${START_TIME}_corrected.json" "/tmp/mirrulations_$START_TIME.json" From 88092cbac7d488865a59b5837d2a45a9d39b82ab Mon Sep 17
00:00:00 2001 From: JP Appel Date: Tue, 29 Oct 2024 09:52:15 -0400 Subject: [PATCH 13/14] Improve logging --- scripts/correct_counts.py | 25 ++++++++++++++++--------- scripts/get_counts.py | 2 +- scripts/job_queue.py | 5 +++-- scripts/set_counts.py | 25 +++++++++++++++++++------ 4 files changed, 39 insertions(+), 18 deletions(-) diff --git a/scripts/correct_counts.py b/scripts/correct_counts.py index 870ba38c..e276e994 100755 --- a/scripts/correct_counts.py +++ b/scripts/correct_counts.py @@ -4,6 +4,7 @@ import json import pathlib import sys +from json import JSONDecodeError from counts import Counts, CountsEncoder, CountsDecoder import argparse @@ -15,13 +16,11 @@ class JobsInQueueException(Exception): def strategy_cap(recieved: Counts, ignore_queue: bool) -> Counts: filtered = deepcopy(recieved) + if filtered["queue_size"] != 0 and not ignore_queue: + raise JobsInQueueException(f'Found jobs in job queue: {filtered["queue_size"]}') for entity_type in ("dockets", "documents", "comments"): total_ = filtered[entity_type]["total"] downloaded = filtered[entity_type]["downloaded"] - if filtered["queue_size"] != 0 and not ignore_queue: - raise JobsInQueueException( - f'{entity_type} has {filtered[entity_type]["jobs"]} in queue' - ) filtered[entity_type]["downloaded"] = min(total_, downloaded) return filtered @@ -79,11 +78,19 @@ def strategy_diff(recieved: Counts, ignore_queue: bool) -> Counts: args = parser.parse_args() - if args.input == "-": - input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) - else: - with open(pathlib.Path(args.input), "r") as fp: - input_counts = json.load(fp, cls=CountsDecoder) + try: + if args.input == "-": + input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) + else: + try: + with open(pathlib.Path(args.input), "r") as fp: + input_counts = json.load(fp, cls=CountsDecoder) + except FileNotFoundError: + print(f"Missing file {args.input}, exitting", file=sys.stderr) + sys.exit(2) + except JSONDecodeError: + print(f"Malformed 
input file {args.input}, exitting", file=sys.stderr) + sys.exit(2) try: if args.strategy == "cap_with_total": diff --git a/scripts/get_counts.py b/scripts/get_counts.py index 4df89929..05f2a93a 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -127,7 +127,7 @@ def get_redis(db: redis.Redis) -> Counts: "creation_timestamp": dt.datetime.now(dt.timezone.utc), } queue = RabbitMQ("jobs_waiting_queue") - counts["queue_size"] = queue + counts["queue_size"] = queue.size() for entity_type in ("dockets", "documents", "comments"): # Getting any of these values can raise an exception diff --git a/scripts/job_queue.py b/scripts/job_queue.py index 6b932978..5658210f 100644 --- a/scripts/job_queue.py +++ b/scripts/job_queue.py @@ -19,12 +19,12 @@ def __init__(self, queue_name): def _ensure_channel(self): if self.connection is None or not self.connection.is_open: - connection_parameter = pika.ConnectionParameters("rabbitmq") + connection_parameter = pika.ConnectionParameters("localhost") self.connection = pika.BlockingConnection(connection_parameter) self.channel = self.connection.channel() self.channel.queue_declare(self.queue_name, durable=True) - def size(self): + def size(self) -> int: """ Get the number of jobs in the queue. 
Can't be sure Channel is active between ensure_channel() @@ -38,3 +38,4 @@ def size(self): return queue.method.message_count except pika.exceptions.StreamLostError: print("FAILURE: RabbitMQ Channel Connection Lost", file=sys.stderr) + return 0 diff --git a/scripts/set_counts.py b/scripts/set_counts.py index a5480360..8e2803cd 100755 --- a/scripts/set_counts.py +++ b/scripts/set_counts.py @@ -76,7 +76,10 @@ def set_values(db: redis.Redis, counts: Counts): counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), ) except Exception as e: - print(f"Error occurred while setting values for {entity_type}, exitting", file=sys.stderr) + print( + f"Error occurred while setting values for {entity_type}, exitting", + file=sys.stderr, + ) print(e) return @@ -125,11 +128,21 @@ def set_values(db: redis.Redis, counts: Counts): args = parser.parse_args() - if args.input == "-": - input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) - else: - with open(pathlib.Path(args.input), "r") as fp: - input_counts = json.load(fp, cls=CountsDecoder) + try: + if args.input == "-": + input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) + else: + try: + with open(pathlib.Path(args.input), "r") as fp: + input_counts = json.load(fp, cls=CountsDecoder) + except FileNotFoundError: + print( + f"Input file {args.input} does not exist, exitting", file=sys.stderr + ) + sys.exit(2) + except json.JSONDecodeError: + print(f"Malformed input file {args.input}, exitting", file=sys.stderr) + sys.exit(2) db = redis.Redis(args.host, args.port, args.db, decode_responses=True) changes_confirmed: bool = args.changes_confirmed From a4d141e0bdae1560dc0f401c5c503613b867bebe Mon Sep 17 00:00:00 2001 From: JP Appel Date: Tue, 29 Oct 2024 10:16:36 -0400 Subject: [PATCH 14/14] Update documentation --- docs/scripts.md | 8 ++++++++ scripts/get_correct_set.sh | 0 2 files changed, 8 insertions(+) mode change 100644 => 100755 scripts/get_correct_set.sh diff --git a/docs/scripts.md 
b/docs/scripts.md index 9b4fd565..e809ad1e 100644 --- a/docs/scripts.md +++ b/docs/scripts.md @@ -14,6 +14,8 @@ Currently, the following scripts are provided. * correct possible errors within a counts json file generated by `get_counts.py` * `set_counts.py` * set values in a mirrulations Redis instance using json generated by `get_counts.py` +* `get_correct_set.sh` + * run `get_counts.py`, `correct_counts.py`, and `set_counts.py`, logging relevant information All of the scripts above share a common format
@@ -47,6 +49,12 @@ All of the scripts above share a common format ## Description +### `get_correct_set.sh` + +`get_correct_set.sh` gets counts using `get_counts.py` from Redis, corrects them using `correct_counts.py`, and on success sets them using `set_counts.py`. +It attempts to log to `/var/log/mirrulations_counts.log`. +By default, it expects a virtual environment with all required dependencies in `/home/cs334/mirrulations/scripts/.venv`. + ### `get_counts.py` `get_counts.py` gets counts from one of three sources: regulations.gov, a Mirrulations Redis instance, a Mirrulations dashboard via HTTP. diff --git a/scripts/get_correct_set.sh b/scripts/get_correct_set.sh old mode 100644 new mode 100755