From e73c20f316f54e6d316b24eba1e1b79b74b50994 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 16:21:31 -0400 Subject: [PATCH 01/14] Add script to download counts from regulations.gov, dashboard, and redis --- scripts/get_counts.py | 265 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100755 scripts/get_counts.py diff --git a/scripts/get_counts.py b/scripts/get_counts.py new file mode 100755 index 00000000..589101e5 --- /dev/null +++ b/scripts/get_counts.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 + +import argparse +import datetime as dt +import json +import os +import pathlib +import sys +from typing import Any, TypedDict + +import redis +import requests + +REGULATIONS_BASE_URL = "https://api.regulations.gov/v4/" + + +class EntityCount(TypedDict): + downloaded: int + jobs: int + total: int + last_timestamp: dt.datetime + + +class Output(TypedDict): + creation_timestamp: dt.datetime + dockets: EntityCount + documents: EntityCount + comments: EntityCount + + +class OutputEncoder(json.JSONEncoder): + def default(self, o: Any) -> Any: + if isinstance(o, dt.datetime): + return o.strftime("%Y-%m-%d %H:%M:%S") + return super().default(o) + + +def _download_regulation_count( + url: str, headers: dict[str, str], params: dict[str, str] +) -> int: + response = requests.get( + url, + headers=headers, + params=params, + ) + response.raise_for_status() + return response.json()["meta"]["totalElemnts"] + + +def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Output: + """Get counts from regulations.gov given a last_timestamp + + Exactly 6 Regulations.gov API calls are made during this function + """ + output: Output = { + "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "dockets": { + "downloaded": -1, + "jobs": 0, + "total": -1, + "last_timestamp": last_timestamp, + }, + "documents": { + "downloaded": -1, + "jobs": 0, + "total": -1, + "last_timestamp": last_timestamp, + }, + "comments": { + 
"downloaded": -1, + "jobs": 0, + "total": -1, + "last_timestamp": last_timestamp, + }, + } + + headers = {"X-Api-Key": api_key} + # NOTE: we set pagesize to be 5 since we only care about the metadata + downloaded_filter = { + "filter[lastModifiedDate][le]": last_timestamp.strftime("%Y-%m-%d %H:%M:%S"), + "page[size]": 5, + } + + for entity_type in ("dockets", "documents", "comments"): + downloaded = _download_regulation_count( + REGULATIONS_BASE_URL + entity_type, headers, downloaded_filter + ) + total = _download_regulation_count( + REGULATIONS_BASE_URL + entity_type, headers, {"page[size]": "5"} + ) + output[entity_type]["downloaded"] = downloaded + output[entity_type]["total"] = total + + return output + + +def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Output: + """Get the counts of a running mirrulations instance via it's dashboard""" + response = requests.get(dashboard_url + "/data") + response.raise_for_status() + + content = response.json() + + output: Output = { + "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "dockets": { + "downloaded": content["num_dockets_done"], + "jobs": content["num_jobs_dockets_queued"], + "total": content["regulations_total_dockets"], + "last_timestamp": last_timestamp, + }, + "documents": { + "downloaded": content["num_documents_done"], + "jobs": content["num_jobs_documents_queued"], + "total": content["regulations_total_documents"], + "last_timestamp": last_timestamp, + }, + "comments": { + "downloaded": content["num_comments_done"], + "jobs": content["num_jobs_comments_queued"], + "total": content["regulations_total_comments"], + "last_timestamp": last_timestamp, + }, + } + + return output + + +def get_redis(db: redis.Redis) -> Output: + """Get the counts of a running mirrulations instance via a Redis connection""" + + output: Output = { + "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "dockets": { + "downloaded": int(db.get("num_dockets_done")), + "jobs": 
int(db.get("num_jobs_dockets_waiting")), + "total": int(db.get("regulations_total_dockets")), + "last_timestamp": dt.datetime.strptime( + str(db.get("dockets_last_timestamp")), "%Y-%m-%d %H:%M:%S" + ), + }, + "documents": { + "downloaded": int(db.get("num_documents_done")), + "jobs": int(db.get("num_jobs_documents_waiting")), + "total": int(db.get("regulations_total_documents")), + "last_timestamp": dt.datetime.strptime( + str(db.get("documents_last_timestamp")), "%Y-%m-%d %H:%M:%S" + ), + }, + "comments": { + "downloaded": int(db.get("num_comments_done")), + "jobs": int(db.get("num_jobs_comments_waiting")), + "total": int(db.get("regulations_total_comments")), + "last_timestamp": dt.datetime.strptime( + str(db.get("comments_last_timestamp")), "%Y-%m-%d %H:%M:%S" + ), + }, + } + + return output + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Get Docket, Document, and Comment counts from multiple sources" + ) + parser.add_argument( + "-o", + "--output", + metavar="PATH", + type=str, + default="-", + help="file to output to, use '-' for stdout (default '%(default)s')", + ) + subparsers = parser.add_subparsers( + dest="source", required=True, help="The source to get counts from" + ) + + regulations = subparsers.add_parser( + "regulations", help="download counts from regulations.gov" + ) + regulations.add_argument( + "-a", + "--api-key", + help="Regulations.gov api key, defaults to value of `API_KEY` environment variable", + default=os.getenv("API_KEY"), + type=str, + ) + regulations.add_argument( + "-t", + "--last-timestamp", + metavar="TIMESTAMP", + type=dt.datetime.fromisoformat, + default=dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + help="last timestamp that is assumed to have been downloaded in ISO 8601 format 'YYYY-MM-DDTHH:mm:ssZ' (default '%(default)s')", + ) + + dashboard = subparsers.add_parser( + "dashboard", help="get counts from a mirrulations dashboard" + ) + dashboard.add_argument( + "-u", + 
"--url", + metavar="DASHBOARD_URL", + default="http://localhost", + help="dashboard url (default '%(default)s')", + ) + dashboard.add_argument( + "last_timestamp", + type=dt.datetime.fromisoformat, + default=dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + help="last timestamp that is assumed to have been downloaded in ISO 8601 format 'YYYY-MM-DDTHH:mm:ss' (default '%(default)s')", + ) + + redis_args = subparsers.add_parser("redis", help="get counts from redis") + redis_args.add_argument( + "--hostname", + metavar="HOSTNAME", + default="localhost", + help="redis server hostname (default '%(default)s')", + ) + redis_args.add_argument( + "-p", + "--port", + metavar="PORT", + type=int, + default=6379, + help="port for redis server (default '%(default)s')", + ) + redis_args.add_argument( + "-n", + "--db", + metavar="DB_NUMBER", + type=int, + default=0, + help="redis database number (default '%(default)s')", + ) + + args = parser.parse_args() + print(args) + + path: str = args.output + output_file = sys.stdout if path == "-" else open(pathlib.Path(path), "w") + + source = args.source + if source == "regulations": + api_key = args.api_key + if api_key is None or api_key == "": + print("No api key found, exitting", file=sys.stderr) + sys.exit(1) + output = get_regulation(api_key, args.last_timestamp) + elif source == "dashboard": + output = get_dashboard(args.url, args.last_timestamp) + elif source == "redis": + db = redis.Redis(host=args.hostname, port=args.port, db=args.db) + output = get_redis(db) + else: + print("Unrecognized source, exitting", file=sys.stderr) + sys.exit(1) + + if path == "-": + json.dump(output, sys.stdout, cls=OutputEncoder) + else: + with open(pathlib.Path(path), "w") as fp: + json.dump(output, fp, cls=OutputEncoder) From 7c9c49702f2990fbae2233eb91c52597958847f4 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 16:37:00 -0400 Subject: [PATCH 02/14] Move count types to seperate file --- scripts/counts.py | 24 
++++++++++++++++++++++++ scripts/get_counts.py | 34 +++++----------------------------- 2 files changed, 29 insertions(+), 29 deletions(-) create mode 100644 scripts/counts.py diff --git a/scripts/counts.py b/scripts/counts.py new file mode 100644 index 00000000..3ec162c1 --- /dev/null +++ b/scripts/counts.py @@ -0,0 +1,24 @@ +import json +import datetime as dt +from typing import Any, TypedDict + + +class EntityCount(TypedDict): + downloaded: int + jobs: int + total: int + last_timestamp: dt.datetime + + +class Output(TypedDict): + creation_timestamp: dt.datetime + dockets: EntityCount + documents: EntityCount + comments: EntityCount + + +class OutputEncoder(json.JSONEncoder): + def default(self, o: Any) -> Any: + if isinstance(o, dt.datetime): + return o.strftime("%Y-%m-%d %H:%M:%S") + return super().default(o) diff --git a/scripts/get_counts.py b/scripts/get_counts.py index 589101e5..4335a233 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -6,7 +6,7 @@ import os import pathlib import sys -from typing import Any, TypedDict +from counts import Output, OutputEncoder import redis import requests @@ -14,27 +14,6 @@ REGULATIONS_BASE_URL = "https://api.regulations.gov/v4/" -class EntityCount(TypedDict): - downloaded: int - jobs: int - total: int - last_timestamp: dt.datetime - - -class Output(TypedDict): - creation_timestamp: dt.datetime - dockets: EntityCount - documents: EntityCount - comments: EntityCount - - -class OutputEncoder(json.JSONEncoder): - def default(self, o: Any) -> Any: - if isinstance(o, dt.datetime): - return o.strftime("%Y-%m-%d %H:%M:%S") - return super().default(o) - - def _download_regulation_count( url: str, headers: dict[str, str], params: dict[str, str] ) -> int: @@ -162,7 +141,8 @@ def get_redis(db: redis.Redis) -> Output: if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Get Docket, Document, and Comment counts from multiple sources" + "Get Counts", + description="Get Docket, Document, and Comment 
counts from multiple sources", ) parser.add_argument( "-o", @@ -237,10 +217,6 @@ def get_redis(db: redis.Redis) -> Output: ) args = parser.parse_args() - print(args) - - path: str = args.output - output_file = sys.stdout if path == "-" else open(pathlib.Path(path), "w") source = args.source if source == "regulations": @@ -258,8 +234,8 @@ def get_redis(db: redis.Redis) -> Output: print("Unrecognized source, exitting", file=sys.stderr) sys.exit(1) - if path == "-": + if args.output == "-": json.dump(output, sys.stdout, cls=OutputEncoder) else: - with open(pathlib.Path(path), "w") as fp: + with open(pathlib.Path(args.output), "w") as fp: json.dump(output, fp, cls=OutputEncoder) From 9357d7257145b9de592fddfe36196038be4316a2 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 17:20:50 -0400 Subject: [PATCH 03/14] Rename counts classes and fix typo --- scripts/counts.py | 17 +++++++++++++++-- scripts/get_counts.py | 20 ++++++++++---------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/scripts/counts.py b/scripts/counts.py index 3ec162c1..2423fb58 100644 --- a/scripts/counts.py +++ b/scripts/counts.py @@ -10,15 +10,28 @@ class EntityCount(TypedDict): last_timestamp: dt.datetime -class Output(TypedDict): +class Counts(TypedDict): creation_timestamp: dt.datetime dockets: EntityCount documents: EntityCount comments: EntityCount -class OutputEncoder(json.JSONEncoder): +class CountsEncoder(json.JSONEncoder): def default(self, o: Any) -> Any: if isinstance(o, dt.datetime): return o.strftime("%Y-%m-%d %H:%M:%S") return super().default(o) + + +class CountsDecoder(json.JSONDecoder): + def __init__(self, *args, **kwargs): + super().__init__(object_hook=self.object_hook, *args, **kwargs) + + def object_hook(self, obj: Any) -> Any: + for key, value in obj.items(): + try: + obj[key] = dt.datetime.strptime(value, "%Y-%m-%d %H:%M:%S") + except (ValueError, TypeError): + pass + return obj diff --git a/scripts/get_counts.py b/scripts/get_counts.py index 
4335a233..aacbd1d1 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -6,7 +6,7 @@ import os import pathlib import sys -from counts import Output, OutputEncoder +from counts import Counts, CountsEncoder import redis import requests @@ -23,15 +23,15 @@ def _download_regulation_count( params=params, ) response.raise_for_status() - return response.json()["meta"]["totalElemnts"] + return response.json()["meta"]["totalElements"] -def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Output: +def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Counts: """Get counts from regulations.gov given a last_timestamp Exactly 6 Regulations.gov API calls are made during this function """ - output: Output = { + output: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), "dockets": { "downloaded": -1, @@ -73,14 +73,14 @@ def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Output: return output -def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Output: +def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Counts: """Get the counts of a running mirrulations instance via it's dashboard""" response = requests.get(dashboard_url + "/data") response.raise_for_status() content = response.json() - output: Output = { + output: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), "dockets": { "downloaded": content["num_dockets_done"], @@ -105,10 +105,10 @@ def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Output: return output -def get_redis(db: redis.Redis) -> Output: +def get_redis(db: redis.Redis) -> Counts: """Get the counts of a running mirrulations instance via a Redis connection""" - output: Output = { + output: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), "dockets": { "downloaded": int(db.get("num_dockets_done")), @@ -235,7 +235,7 @@ def get_redis(db: redis.Redis) -> Output: sys.exit(1) if args.output == "-": - json.dump(output, 
sys.stdout, cls=OutputEncoder) + json.dump(output, sys.stdout, cls=CountsEncoder) else: with open(pathlib.Path(args.output), "w") as fp: - json.dump(output, fp, cls=OutputEncoder) + json.dump(output, fp, cls=CountsEncoder) From b25ab8bc59e3a0a18a4714f9099582a727fd21c4 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 17:34:16 -0400 Subject: [PATCH 04/14] Add script to correct counts obtained from --- scripts/correct_counts.py | 107 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100755 scripts/correct_counts.py diff --git a/scripts/correct_counts.py b/scripts/correct_counts.py new file mode 100755 index 00000000..74f2e5fa --- /dev/null +++ b/scripts/correct_counts.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +from copy import deepcopy +import json +import pathlib +import sys +from counts import Counts, CountsEncoder, CountsDecoder + +import argparse + + +class JobsInQueueException(Exception): + pass + + +def strategy_cap(recieved: Counts, ignore_queue: bool) -> Counts: + filtered = deepcopy(recieved) + for entity_type in ("dockets", "documents", "comments"): + total_ = filtered[entity_type]["total"] + downloaded = filtered[entity_type]["downloaded"] + if filtered[entity_type]["jobs"] > 0 and not ignore_queue: + raise JobsInQueueException( + f'{entity_type} has {filtered[entity_type]["jobs"]} in queue' + ) + filtered[entity_type]["downloaded"] = min(total_, downloaded) + + return filtered + + +def strategy_diff(recieved: Counts, ignore_queue: bool) -> Counts: + filtered = deepcopy(recieved) + for entity_type in ("dockets", "documents", "comments"): + total_ = filtered[entity_type]["total"] + downloaded = filtered[entity_type]["downloaded"] + jobs = filtered[entity_type]["jobs"] + if jobs > 0 and not ignore_queue: + raise JobsInQueueException( + f'{entity_type} has {filtered[entity_type]["jobs"]} in queue' + ) + filtered[entity_type]["downloaded"] = min(total_ - jobs, downloaded) + + return filtered + + +if __name__ 
== "__main__": + parser = argparse.ArgumentParser( + "Correct Counts", + description="Correct counts in json format by either capping downloaded with `total` or capping with `total - jobs`", + ) + parser.add_argument( + "-o", + "--output", + metavar="OUTPUT_PATH", + type=str, + default="-", + help="file to output to, use '-' for stdout (default '%(default)s')", + ) + parser.add_argument( + "-i", + "--input", + metavar="INPUT_PATH", + type=str, + default="-", + help="file to read from, use '-' for stdin (default '%(default)s')", + ) + parser.add_argument( + "-s", + "--strategy", + type=str, + default="cap_with_total", + choices=("cap_with_total", "diff_total_with_jobs"), + help="the correction strategy to use (default '%(default)s')", + ) + parser.add_argument( + "--ignore-queue", + action="store_true", + help="continue even if there are queued jobs", + ) + + args = parser.parse_args() + + if args.input == "-": + input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) + else: + with open(pathlib.Path(args.input), "r") as fp: + input_counts = json.load(fp, cls=CountsDecoder) + + try: + if args.strategy == "cap_with_total": + modified_counts = strategy_cap(input_counts, args.ignore_queue) + elif args.strategy == "diff_total_with_jobs": + modified_counts = strategy_diff(input_counts, args.ignore_queue) + else: + print(f"Unrecognized strategy {args.strategy}, exitting", file=sys.stderr) + sys.exit(1) + except JobsInQueueException as e: + print( + f"Found jobs in queue: {e}\nUse `--ignore-queue` to continue", + file=sys.stderr, + ) + sys.exit(2) + + if args.output == "-": + json.dump(modified_counts, sys.stdout, cls=CountsEncoder) + else: + with open(pathlib.Path(args.output), "w") as fp: + json.dump(modified_counts, fp, cls=CountsEncoder) From 691a79ff8bfe3a46cdd6c97933a973e31d0ee7e6 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 19:40:13 -0400 Subject: [PATCH 05/14] Add script to set redis values from json --- scripts/set_counts.py | 146 
++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100755 scripts/set_counts.py diff --git a/scripts/set_counts.py b/scripts/set_counts.py new file mode 100755 index 00000000..d26ae619 --- /dev/null +++ b/scripts/set_counts.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +import argparse +import json +import pathlib +import redis +import sys + +from counts import Counts, CountsDecoder + +ANSI_RESET = "\033[0m" +ANSI_BOLD = "\033[1m" +ANSI_BLINK = "\033[5m" +ANSI_BLINK_OFF = "\033[25m" +ANSI_FG_RED = "\033[31m" + + +def _get_vals(db: redis.Redis, entity_type: str) -> dict[str, int | str]: + done_raw: str | None = db.get(f"num_{entity_type}_done") + if done_raw is not None: + done = int(done_raw) + else: + done = "None" + + total_raw: str | None = db.get(f"regulations_total_{entity_type}") + if total_raw is not None: + total = int(total_raw) + else: + total = "None" + + timestamp: str = db.get(f"{entity_type}_last_timestamp") or "None" + + return {"done": done, "timestamp": timestamp, "total": total} + + +def _print_changes(info: str, original: str, new: str) -> None: + if original != new: + print( + info, + ANSI_FG_RED + ANSI_BOLD + original, + f"{ANSI_BLINK}--->{ANSI_BLINK_OFF}", + new + ANSI_RESET, + ) + else: + print(info, original, "--->", new) + + +def show_changes(db: redis.Redis, counts: Counts) -> None: + for entity_type in ("dockets", "documents", "comments"): + vals = _get_vals(db, entity_type) + _print_changes( + f"num_{entity_type}_done:\n ", + str(vals["done"]), + str(counts[entity_type]["downloaded"]), + ) + _print_changes( + f"regulations_total_{entity_type}:\n ", + str(vals["total"]), + str(counts[entity_type]["total"]), + ) + _print_changes( + f"{entity_type}_last_timestamp:\n ", + str(vals["timestamp"]), + counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), + ) + print() + + +def set_values(db: redis.Redis, counts: Counts): + for entity_type in ("dockets", "documents", "comments"): + 
db.set(f"num_{entity_type}_done", counts[entity_type]["downloaded"]) + db.set(f"regulations_total_{entity_type}", counts[entity_type]["total"]) + db.set( + f"{entity_type}_last_timestamp", + counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + "Set Counts", description="Set counts in Redis database from json" + ) + parser.add_argument( + "-i", + "--input", + metavar="INPUT_PATH", + type=str, + default="-", + help="file to read from, use '-' for stdin (default '%(default)s')", + ) + parser.add_argument( + "-y", + "--yes", + dest="changes_confirmed", + action="store_true", + help="Do not check for confirmation when setting values", + ) + parser.add_argument( + "--host", + metavar="HOSTNAME", + default="localhost", + help="redis server hostname (default '%(default)s')", + ) + parser.add_argument( + "-p", + "--port", + metavar="PORT", + type=int, + default=6379, + help="port for redis server (default '%(default)s')", + ) + parser.add_argument( + "-n", + "--db", + metavar="DB_NUMBER", + type=int, + default=0, + help="redis database number (default '%(default)s')", + ) + + args = parser.parse_args() + + if args.input == "-": + input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) + else: + with open(pathlib.Path(args.input), "r") as fp: + input_counts = json.load(fp, cls=CountsDecoder) + + db = redis.Redis(args.host, args.port, args.db, decode_responses=True) + changes_confirmed: bool = args.changes_confirmed + + if changes_confirmed: + set_values(db, input_counts) + else: + show_changes(db, input_counts) + response = ( + input("Are you sure you want to make the above changes [y/n]: ") + .strip() + .lower() + ) + changes_confirmed = response == "y" or response == "yes" + if changes_confirmed: + set_values(db, input_counts) + else: + print("No values set, exitting") + sys.exit() From 67d4c5a36ce0288f0f56569082d777d03cf3e041 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 
16 Oct 2024 20:07:02 -0400 Subject: [PATCH 06/14] Refacor get_counts script to handle missing redis keys --- scripts/get_counts.py | 68 ++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/scripts/get_counts.py b/scripts/get_counts.py index aacbd1d1..7fb02162 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -14,6 +14,10 @@ REGULATIONS_BASE_URL = "https://api.regulations.gov/v4/" +class MissingRedisKeyException(Exception): + pass + + def _download_regulation_count( url: str, headers: dict[str, str], params: dict[str, str] ) -> int: @@ -80,7 +84,7 @@ def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Counts: content = response.json() - output: Counts = { + counts: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), "dockets": { "downloaded": content["num_dockets_done"], @@ -102,41 +106,39 @@ def get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Counts: }, } - return output + return counts + + +def _get_key_or_raise(db: redis.Redis, key: str) -> str: + value: str | None = db.get(key) + if value is None: + raise MissingRedisKeyException(f"missing redis key: {key}") + + return value def get_redis(db: redis.Redis) -> Counts: """Get the counts of a running mirrulations instance via a Redis connection""" - output: Counts = { + counts: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), - "dockets": { - "downloaded": int(db.get("num_dockets_done")), - "jobs": int(db.get("num_jobs_dockets_waiting")), - "total": int(db.get("regulations_total_dockets")), - "last_timestamp": dt.datetime.strptime( - str(db.get("dockets_last_timestamp")), "%Y-%m-%d %H:%M:%S" - ), - }, - "documents": { - "downloaded": int(db.get("num_documents_done")), - "jobs": int(db.get("num_jobs_documents_waiting")), - "total": int(db.get("regulations_total_documents")), - "last_timestamp": dt.datetime.strptime( - str(db.get("documents_last_timestamp")), "%Y-%m-%d %H:%M:%S" - 
), - }, - "comments": { - "downloaded": int(db.get("num_comments_done")), - "jobs": int(db.get("num_jobs_comments_waiting")), - "total": int(db.get("regulations_total_comments")), - "last_timestamp": dt.datetime.strptime( - str(db.get("comments_last_timestamp")), "%Y-%m-%d %H:%M:%S" - ), - }, } - return output + for entity_type in ("dockets", "documents", "comments"): + # Getting any of these values can raise an exception + downloaded = _get_key_or_raise(db, f"num_{entity_type}_done") + jobs = _get_key_or_raise(db, f"num_jobs_{entity_type}_waiting") + total = _get_key_or_raise(db, f"regulations_total_{entity_type}") + last_timestamp = _get_key_or_raise(db, f"{entity_type}_last_timestamp") + + counts[entity_type] = { + "downloaded": int(downloaded), + "jobs": int(jobs), + "total": int(total), + "last_timestamp": dt.datetime.strptime(last_timestamp, "%Y-%m-%d %H:%M:%S"), + } + + return counts if __name__ == "__main__": @@ -228,8 +230,14 @@ def get_redis(db: redis.Redis) -> Counts: elif source == "dashboard": output = get_dashboard(args.url, args.last_timestamp) elif source == "redis": - db = redis.Redis(host=args.hostname, port=args.port, db=args.db) - output = get_redis(db) + db = redis.Redis( + host=args.hostname, port=args.port, db=args.db, decode_responses=True + ) + try: + output = get_redis(db) + except MissingRedisKeyException as e: + print(f"Missing a redis key, exitting\n{e}", file=sys.stderr) + sys.exit(1) else: print("Unrecognized source, exitting", file=sys.stderr) sys.exit(1) From d8f68b15c0122b109037011233accee005dae04d Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 20:12:41 -0400 Subject: [PATCH 07/14] Add requirements.txt for scripts --- scripts/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 scripts/requirements.txt diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 00000000..28bd01f9 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,2 @@ +requests +redis From 
a4bd129b2957427b80d4f8e9587da94ccad0d605 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 21:05:28 -0400 Subject: [PATCH 08/14] Add documentation for scripts --- docs/scripts.md | 104 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 docs/scripts.md diff --git a/docs/scripts.md b/docs/scripts.md new file mode 100644 index 00000000..9b4fd565 --- /dev/null +++ b/docs/scripts.md @@ -0,0 +1,104 @@ +# Script Documentation + +## Summary + +Some tasks are small enough that the project architecture should not change, but the large enough that they should not be performed by hand. +Files in the `scripts` directory exist to fill this space. + +Currently, the following scripts are provided. + +* `get_counts.py` + * get docket, document, and comment counts from regulations.gov, a mirrulations dashboard, or a mirrulations Redis instance as json + * when using regulations.gov a timestamp can be given to make all dockets, documents, and comments before the timestamp count as if they were downloaded +* `correct_counts.py` + * correct possible errors within a counts json file generated by `get_counts.py` +* `set_counts.py` + * set values in a mirrulations Redis instance using json generated by `get_counts.py` + +All of the scripts above share a common format +
+get_counts.py common format + +```json +{ + "creation_timestamp": "2024-10-16 15:00:00", + "dockets": { + "downloaded": 253807, + "jobs": 0, + "total": 253807, + "last_timestamp": "2024-10-13 04:04:18" + }, + "documents": { + "downloaded": 1843774, + "jobs": 0, + "total": 1843774, + "last_timestamp": "2024-10-13 04:04:18" + }, + "comments": { + "downloaded": 22240501, + "jobs": 10, + "total": 22240511, + "last_timestamp": "2024-10-13 04:04:18" + } +} +``` + +
+ +## Description + +### `get_counts.py` + +`get_counts.py` gets counts from one of three sources: regulations.gov, a Mirrulations Redis instance, a Mirrulations dashboard via HTTP. + +When reading from regulations.gov a UTC timestamp can be specified to mock having downloaded all dockets, documents, and comments from before that timestamp. + +When reading from a dashboard a UTC timestamp must be specified since the dashboard API does not provide one. + +### `correct_counts.py` + +`correct_counts.py` corrects counts from `get_counts.py` using one of two strategies: set downloaded counts for a type to the minimum of `downloaded` and `total` for that type, or set downloaded counts to the minimum of `total -jobs` and `downloaded`. +By default any queued jobs will cause the script to exit and output nothing, this behavior can be changed with the `--ignore-queue` flag. + +### `set_counts.py` + +`set_counts.py` sets values from `get_counts.py` in a Redis instance. +By default the script will prompt for user input before changing any values. +This behavior can be changed using the `--yes` flag, which should be used **WITH GREAT CARE, ESPECIALLY IN PRODUCTION!!!**. + +## Setup + +First a virtual environment should be created to download dependencies to. + +```bash +cd scripts +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +Make sure when you are in the correct environment when running scripts. 
+ +## Examples + +### Cap Docket, Document, and Comment downloaded counts by the counts from Regulations.gov + +```bash +./get_counts.py redis | ./correct_counts.py | ./set_counts.py -y +``` + +### Set Docket, Document, Comment downloaded counts while jobs are in the queue + +```bash +./get_counts.py dashboard | ./correct_counts.py --ignore-queue --strategy diff_total_with_jobs | ./set_counts.py -y +``` + +### Download Counts for a Certain Time from Regulations.gov + +```bash +./get_counts.py --api-key $API_KEY -o aug_6_2022.json -t 2024-08-06T06:20:50Z + +EXPORT API_KEY= +./get_counts.py regulations -o oct_01_2024.json --last-timestamp 2024-10-01T15:30:10Z +./set_counts.py -i oct_01_2024.json +``` From 5a2ef93e46aec712d634a69b95ca2a4a5ab2c590 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 28 Oct 2024 10:46:13 -0400 Subject: [PATCH 09/14] Add queue size field to shared output --- scripts/counts.py | 1 + scripts/get_counts.py | 5 +++++ scripts/job_queue.py | 40 ++++++++++++++++++++++++++++++++++++++++ scripts/requirements.txt | 1 + 4 files changed, 47 insertions(+) create mode 100644 scripts/job_queue.py diff --git a/scripts/counts.py b/scripts/counts.py index 2423fb58..28b066ca 100644 --- a/scripts/counts.py +++ b/scripts/counts.py @@ -12,6 +12,7 @@ class EntityCount(TypedDict): class Counts(TypedDict): creation_timestamp: dt.datetime + queue_size: int dockets: EntityCount documents: EntityCount comments: EntityCount diff --git a/scripts/get_counts.py b/scripts/get_counts.py index 7fb02162..4df89929 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -7,6 +7,7 @@ import pathlib import sys from counts import Counts, CountsEncoder +from job_queue import RabbitMQ import redis import requests @@ -37,6 +38,7 @@ def get_regulation(api_key: str, last_timestamp: dt.datetime) -> Counts: """ output: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "queue_size": 0, "dockets": { "downloaded": -1, "jobs": 0, @@ -86,6 +88,7 @@ def 
get_dashboard(dashboard_url: str, last_timestamp: dt.datetime) -> Counts: counts: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), + "queue_size": content["num_jobs_waiting"], "dockets": { "downloaded": content["num_dockets_done"], "jobs": content["num_jobs_dockets_queued"], @@ -123,6 +126,8 @@ def get_redis(db: redis.Redis) -> Counts: counts: Counts = { "creation_timestamp": dt.datetime.now(dt.timezone.utc), } + queue = RabbitMQ("jobs_waiting_queue") + counts["queue_size"] = queue for entity_type in ("dockets", "documents", "comments"): # Getting any of these values can raise an exception diff --git a/scripts/job_queue.py b/scripts/job_queue.py new file mode 100644 index 00000000..6b932978 --- /dev/null +++ b/scripts/job_queue.py @@ -0,0 +1,40 @@ +# pylint: disable=too-many-arguments +import sys +import pika + + +class RabbitMQ: + """ + Encapsulate calls to RabbitMQ in one place + """ + + def __init__(self, queue_name): + """ + Create a new RabbitMQ object + @param queue_name: the name of the queue to use + """ + self.queue_name = queue_name + self.connection = None + self.channel = None + + def _ensure_channel(self): + if self.connection is None or not self.connection.is_open: + connection_parameter = pika.ConnectionParameters("rabbitmq") + self.connection = pika.BlockingConnection(connection_parameter) + self.channel = self.connection.channel() + self.channel.queue_declare(self.queue_name, durable=True) + + def size(self): + """ + Get the number of jobs in the queue. 
+ Can't be sure Channel is active between ensure_channel() + and queue_declare() which is the reasoning for implementation of try + except + @return: a non-negative integer + """ + self._ensure_channel() + try: + queue = self.channel.queue_declare(self.queue_name, durable=True) + return queue.method.message_count + except pika.exceptions.StreamLostError: + print("FAILURE: RabbitMQ Channel Connection Lost", file=sys.stderr) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 28bd01f9..5e8dc051 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,2 +1,3 @@ requests redis +pika From 2cd7490e037c05e35dba08fe431815609da76b9a Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 28 Oct 2024 10:47:39 -0400 Subject: [PATCH 10/14] Change cap strategy to use shared queue size --- scripts/correct_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/correct_counts.py b/scripts/correct_counts.py index 74f2e5fa..870ba38c 100755 --- a/scripts/correct_counts.py +++ b/scripts/correct_counts.py @@ -18,7 +18,7 @@ def strategy_cap(recieved: Counts, ignore_queue: bool) -> Counts: for entity_type in ("dockets", "documents", "comments"): total_ = filtered[entity_type]["total"] downloaded = filtered[entity_type]["downloaded"] - if filtered[entity_type]["jobs"] > 0 and not ignore_queue: + if filtered["queue_size"] != 0 and not ignore_queue: raise JobsInQueueException( f'{entity_type} has {filtered[entity_type]["jobs"]} in queue' ) From 058414c5e71fa374b6f34020c84dbd695e31f6db Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 28 Oct 2024 10:51:37 -0400 Subject: [PATCH 11/14] Add better error logging --- scripts/set_counts.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/scripts/set_counts.py b/scripts/set_counts.py index d26ae619..a5480360 100755 --- a/scripts/set_counts.py +++ b/scripts/set_counts.py @@ -68,12 +68,17 @@ def show_changes(db: redis.Redis, counts: Counts) -> None: def 
set_values(db: redis.Redis, counts: Counts): for entity_type in ("dockets", "documents", "comments"): - db.set(f"num_{entity_type}_done", counts[entity_type]["downloaded"]) - db.set(f"regulations_total_{entity_type}", counts[entity_type]["total"]) - db.set( - f"{entity_type}_last_timestamp", - counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), - ) + try: + db.set(f"num_{entity_type}_done", counts[entity_type]["downloaded"]) + db.set(f"regulations_total_{entity_type}", counts[entity_type]["total"]) + db.set( + f"{entity_type}_last_timestamp", + counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), + ) + except Exception as e: + print(f"Error occurred while setting values for {entity_type}, exitting", file=sys.stderr) + print(e) + return if __name__ == "__main__": From 9a3441cff1be1eb552caeb3b2dc7e58ee87914ed Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 28 Oct 2024 11:06:19 -0400 Subject: [PATCH 12/14] Add script to get, correct, and set counts in redis --- scripts/get_correct_set.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 scripts/get_correct_set.sh diff --git a/scripts/get_correct_set.sh b/scripts/get_correct_set.sh new file mode 100644 index 00000000..6fbca092 --- /dev/null +++ b/scripts/get_correct_set.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +WORK_DIR="/home/cs334/mirrulations/scripts/" +LOG_FILE=/var/log/mirrulations_counts.log +START_TIME=$(date -u -Iseconds) +echo "$START_TIME: Running" > $LOG_FILE +cd $WORK_DIR + +PYTHON=".venv/bin/python3" + +$PYTHON get_counts.py redis -o "/tmp/mirrulations_$START_TIME.json" 2>> $LOG_FILE && + $PYTHON correct_counts.py -i "/tmp/mirrulations_$START_TIME.json" -o "/tmp/mirrulations_${START_TIME}_corrected.json" 2>> $LOG_FILE && + $PYTHON set_counts.py -y -i "/tmp/mirrulations_${START_TIME}_corrected.json" 2>> $LOG_FILE + +rm "/tmp/mirrulations_${START_TIME}_corrected.json" "/tmp/mirrulations_$START_TIME.json" From 88092cbac7d488865a59b5837d2a45a9d39b82ab Mon Sep 17
00:00:00 2001 From: JP Appel Date: Tue, 29 Oct 2024 09:52:15 -0400 Subject: [PATCH 13/14] Improve logging --- scripts/correct_counts.py | 25 ++++++++++++++++--------- scripts/get_counts.py | 2 +- scripts/job_queue.py | 5 +++-- scripts/set_counts.py | 25 +++++++++++++++++++------ 4 files changed, 39 insertions(+), 18 deletions(-) diff --git a/scripts/correct_counts.py b/scripts/correct_counts.py index 870ba38c..e276e994 100755 --- a/scripts/correct_counts.py +++ b/scripts/correct_counts.py @@ -4,6 +4,7 @@ import json import pathlib import sys +from json import JSONDecodeError from counts import Counts, CountsEncoder, CountsDecoder import argparse @@ -15,13 +16,11 @@ class JobsInQueueException(Exception): def strategy_cap(recieved: Counts, ignore_queue: bool) -> Counts: filtered = deepcopy(recieved) + if filtered["queue_size"] != 0 and not ignore_queue: + raise JobsInQueueException(f'Found jobs in job queue: {filtered["queue_size"]}') for entity_type in ("dockets", "documents", "comments"): total_ = filtered[entity_type]["total"] downloaded = filtered[entity_type]["downloaded"] - if filtered["queue_size"] != 0 and not ignore_queue: - raise JobsInQueueException( - f'{entity_type} has {filtered[entity_type]["jobs"]} in queue' - ) filtered[entity_type]["downloaded"] = min(total_, downloaded) return filtered @@ -79,11 +78,19 @@ def strategy_diff(recieved: Counts, ignore_queue: bool) -> Counts: args = parser.parse_args() - if args.input == "-": - input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) - else: - with open(pathlib.Path(args.input), "r") as fp: - input_counts = json.load(fp, cls=CountsDecoder) + try: + if args.input == "-": + input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) + else: + try: + with open(pathlib.Path(args.input), "r") as fp: + input_counts = json.load(fp, cls=CountsDecoder) + except FileNotFoundError: + print(f"Missing file {args.input}, exitting", file=sys.stderr) + sys.exit(2) + except JSONDecodeError: + print(f"Malformed 
input file {args.input}, exitting", file=sys.stderr) + sys.exit(2) try: if args.strategy == "cap_with_total": diff --git a/scripts/get_counts.py b/scripts/get_counts.py index 4df89929..05f2a93a 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -127,7 +127,7 @@ def get_redis(db: redis.Redis) -> Counts: "creation_timestamp": dt.datetime.now(dt.timezone.utc), } queue = RabbitMQ("jobs_waiting_queue") - counts["queue_size"] = queue + counts["queue_size"] = queue.size() for entity_type in ("dockets", "documents", "comments"): # Getting any of these values can raise an exception diff --git a/scripts/job_queue.py b/scripts/job_queue.py index 6b932978..5658210f 100644 --- a/scripts/job_queue.py +++ b/scripts/job_queue.py @@ -19,12 +19,12 @@ def __init__(self, queue_name): def _ensure_channel(self): if self.connection is None or not self.connection.is_open: - connection_parameter = pika.ConnectionParameters("rabbitmq") + connection_parameter = pika.ConnectionParameters("localhost") self.connection = pika.BlockingConnection(connection_parameter) self.channel = self.connection.channel() self.channel.queue_declare(self.queue_name, durable=True) - def size(self): + def size(self) -> int: """ Get the number of jobs in the queue. 
Can't be sure Channel is active between ensure_channel() @@ -38,3 +38,4 @@ def size(self): return queue.method.message_count except pika.exceptions.StreamLostError: print("FAILURE: RabbitMQ Channel Connection Lost", file=sys.stderr) + return 0 diff --git a/scripts/set_counts.py b/scripts/set_counts.py index a5480360..8e2803cd 100755 --- a/scripts/set_counts.py +++ b/scripts/set_counts.py @@ -76,7 +76,10 @@ def set_values(db: redis.Redis, counts: Counts): counts[entity_type]["last_timestamp"].strftime("%Y-%m-%d %H:%M:%S"), ) except Exception as e: - print(f"Error occurred while setting values for {entity_type}, exitting", file=sys.stderr) + print( + f"Error occurred while setting values for {entity_type}, exitting", + file=sys.stderr, + ) print(e) return @@ -125,11 +128,21 @@ def set_values(db: redis.Redis, counts: Counts): args = parser.parse_args() - if args.input == "-": - input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) - else: - with open(pathlib.Path(args.input), "r") as fp: - input_counts = json.load(fp, cls=CountsDecoder) + try: + if args.input == "-": + input_counts: Counts = json.load(sys.stdin, cls=CountsDecoder) + else: + try: + with open(pathlib.Path(args.input), "r") as fp: + input_counts = json.load(fp, cls=CountsDecoder) + except FileNotFoundError: + print( + f"Input file {args.input} does not exist, exitting", file=sys.stderr + ) + sys.exit(2) + except json.JSONDecodeError: + print(f"Malformed input file {args.input}, exitting", file=sys.stderr) + sys.exit(2) db = redis.Redis(args.host, args.port, args.db, decode_responses=True) changes_confirmed: bool = args.changes_confirmed From a4d141e0bdae1560dc0f401c5c503613b867bebe Mon Sep 17 00:00:00 2001 From: JP Appel Date: Tue, 29 Oct 2024 10:16:36 -0400 Subject: [PATCH 14/14] Update documentation --- docs/scripts.md | 8 ++++++++ scripts/get_correct_set.sh | 0 2 files changed, 8 insertions(+) mode change 100644 => 100755 scripts/get_correct_set.sh diff --git a/docs/scripts.md 
b/docs/scripts.md index 9b4fd565..e809ad1e 100644 --- a/docs/scripts.md +++ b/docs/scripts.md @@ -14,6 +14,8 @@ Currently, the following scripts are provided. * correct possible errors within a counts json file generated by `get_counts.py` * `set_counts.py` * set values in a mirrulations Redis instance using json generated by `get_counts.py` +* `get_correct_set.sh` + * run `get_counts.py`, `correct_counts.py`, and `set_counts.py`, logging relevant information All of the scripts above share a common format
@@ -47,6 +49,12 @@ All of the scripts above share a common format ## Description +### `get_correct_set.sh` + +`get_correct_set.sh` gets counts using `get_counts.py` from Redis, corrects them using `correct_counts.py`, and on success sets them using `set_counts.py`. +It attempts to log to `/var/log/mirrulations_counts.log`. +By default, it expects a virtual environment with all required dependencies in `/home/cs334/mirrulations/scripts/.venv`. + ### `get_counts.py` `get_counts.py` gets counts from one of three sources: regulations.gov, a Mirrulations Redis instance, a Mirrulations dashboard via HTTP. diff --git a/scripts/get_correct_set.sh b/scripts/get_correct_set.sh old mode 100644 new mode 100755