From a37d1d5555ad812da5327f8e93e0c6e47c2eb779 Mon Sep 17 00:00:00 2001 From: OnToNothing <126170980+OnToNothing@users.noreply.github.com> Date: Fri, 4 Oct 2024 08:17:09 -0400 Subject: [PATCH 1/6] Update readme client and database doc to reflect system changes --- README.md | 2 +- docs/client.md | 15 +++++++++++ docs/database.md | 68 ++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 76 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 8002c6c2..fb6ef101 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ If you are interested in becoming a developer, see `docs/developers.md`. To run Mirrulations, you need Python 3.9 or greater ([MacOSX](https://docs.python-guide.org/starting/install3/osx/) or [Windows](https://docs.python-guide.org/starting/install3/win/)) on your machine to run this, as well as [redis](https://redis.io/) if you are running a server -You will also need a valid API key from Regulations.gov to participate. To apply for a key, you must simply [contact the Regulations Help Desk](regulations@erulemakinghelpdesk.com) and provide your name, email address, organization, and intended use of the API. If you are not with any organizations, just say so in your message. They will email you with a key once they've verified you and activated the key. +You will also need a valid API key from Regulations.gov to participate. To apply for a key, you must simply complete the API key request form (https://open.gsa.gov/api/regulationsgov/) and provide your name, email address, organization, and intended use of the API. After review the key will be sent by email. To download the actual project, you will need to go to our [GitHub page](https://github.com/MoravianUniversity/mirrulations) and [clone](https://help.github.com/articles/cloning-a-repository/) the project to your computer. 
diff --git a/docs/client.md b/docs/client.md index 5f49ef2c..86760228 100644 --- a/docs/client.md +++ b/docs/client.md @@ -10,3 +10,18 @@ The goal is that the client will request and complete work in order to download data from [regulations.gov](https://www.regulations.gov/). +## Attributes +api_key: Used to authenticate requests made to the API. +client_id: An ID included in the client.env file. +path_generator: An instance of PathGenerator that returns a path for saving job results. +saver: An instance of Saver that handles saving files either to disk or Amazon S3. +redis: A connection to the Redis server for managing job states. +job_queue: A queue from which the client pulls jobs. +cache: An instance of JobStatistics for caching job statistics. + +## Workflow +Initialization: The Client is initialized with a Redis server and a job queue. +Fetching Jobs: The client attempts to fetch a job from the job queue using _get_job_from_job_queue. +Performing Jobs: Depending on the job type, the client performs the job by calling an API endpoint to request a JSON object. +Saving Results: The client saves the job results and any included attachments using the Saver class. +Updating Redis: The client updates the Redis server with job states. diff --git a/docs/database.md b/docs/database.md index 607ddbf5..81fc6186 100644 --- a/docs/database.md +++ b/docs/database.md @@ -2,8 +2,39 @@ ## Database Format -We use [Redis](https://redis.io/) to store jobs as well as key values that must -be remembered. 
+We use [Redis](https://redis.io/) to store jobs as well as key values + +## Database Structure + +The Redis database is structured with the following keys: + +regulations_total_comments +num_dockets_done +num_documents_done +num_attachments_done +num_jobs_num_jobs_documents_queue_queue +num_pdf_attachment_done +num_jobs_num_jobs_dockets_queue_queue +last_job_id +jobs_in_progress +num_pdf_attachments_done +num_jobs_documents_waiting +num_jobs_comments_waiting +dockets_last_timestamp +num_jobs_num_jobs_comments_queue_queue +invalid_jobs +regulations_total_dockets +client_jobs +last_timestamp +total_num_client_ids +num_extractions_done +regulations_total_documents +mirrulations_bucket_size +num_comments_done +num_jobs_attachments_waiting +documents_last_timestamp +num_jobs_dockets_waiting +comments_last_timestamp ## Job Management @@ -11,14 +42,19 @@ The REDIS database has three "queues", with the names: `jobs_waiting_queue`, `jobs_in_progress`, and `jobs_done`. -`jobs_waiting_queue` is a list, while 'jobs_in_progress' and 'jobs_done' are hashes. -Each stores jobs for clients to process. +The keys serve the following functions: -Keys will be integers, the job ids of the jobs. -These keys will be mapped to integers, the values to be processed. +jobs_waiting_queue: A list holding JSON strings representing each job. -Additionally, the database has an integer value storing the number of clients: -`total_num_client_ids`. +jobs_in_progress: A hash storing jobs currently being processed. + +jobs_done: A hash storing completed jobs. + +The keys client_jobs and total_num_client_ids are used for storing client information. + +client_jobs: A hash mapping job IDs to client IDs. + +total_num_client_ids: An integer value storing the number of clients. ## Redis Format ## `jobs_waiting_queue` @@ -58,3 +94,19 @@ unique ids for each job. The 'last_client_id' variable is used by the work server to ensure that it generates unique client ids. 
+ +## Job Statistics Keys + +DOCKETS_DONE: Tracks the number of completed dockets. + +DOCUMENTS_DONE: Tracks the number of completed documents. + +COMMENTS_DONE: Tracks the number of completed comments. + +ATTACHMENTS_DONE: Tracks the number of completed attachments. + +PDF_ATTACHMENTS_DONE: Tracks the number of completed PDF attachments. + +EXTRACTIONS_DONE: Tracks the number of completed extractions. + +MIRRULATION_BUCKET_SIZE: Stores the size of the mirrulations bucket. \ No newline at end of file From fa24f218d86ba09f99a7aa1490ffe35a0d523ec4 Mon Sep 17 00:00:00 2001 From: OnToNothing <126170980+OnToNothing@users.noreply.github.com> Date: Fri, 4 Oct 2024 15:34:41 -0400 Subject: [PATCH 2/6] Refactor database.md to remove unused variables and clarify job management --- docs/database.md | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/docs/database.md b/docs/database.md index 81fc6186..406545b3 100644 --- a/docs/database.md +++ b/docs/database.md @@ -12,30 +12,24 @@ regulations_total_comments num_dockets_done num_documents_done num_attachments_done -num_jobs_num_jobs_documents_queue_queue -num_pdf_attachment_done -num_jobs_num_jobs_dockets_queue_queue last_job_id jobs_in_progress num_pdf_attachments_done num_jobs_documents_waiting num_jobs_comments_waiting dockets_last_timestamp -num_jobs_num_jobs_comments_queue_queue invalid_jobs regulations_total_dockets client_jobs -last_timestamp -total_num_client_ids num_extractions_done regulations_total_documents mirrulations_bucket_size num_comments_done -num_jobs_attachments_waiting documents_last_timestamp num_jobs_dockets_waiting comments_last_timestamp + ## Job Management The REDIS database has three "queues", with the names: @@ -90,10 +84,6 @@ timestamp seen when querying regulations.gov. The `last_job_id` variable is used by the work generator to ensure it generates unique ids for each job. 
-## Client IDs - -The 'last_client_id' variable is used by the work server to ensure that it -generates unique client ids. ## Job Statistics Keys From 9ca44d369896de218eb866e084b2addd2d45dba9 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Sun, 13 Oct 2024 03:51:21 -0400 Subject: [PATCH 3/6] Add script to get correct current data counts as json `get_counts.py` outputs the current mirrulations download counts along with the official regulations.gov counts. If supplied with start or end timestamps, the regulations.gov data will be for a specific timeframe. --- scripts/get_counts.py | 214 +++++++++++++++++++++++++++++++++++++++ scripts/requirements.txt | 3 + 2 files changed, 217 insertions(+) create mode 100755 scripts/get_counts.py create mode 100644 scripts/requirements.txt diff --git a/scripts/get_counts.py b/scripts/get_counts.py new file mode 100755 index 00000000..403400d2 --- /dev/null +++ b/scripts/get_counts.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 + +import argparse +import datetime as dt +import json +import os +import pathlib +import sys +from typing import Any, TypedDict + +import redis +import requests + +REGULATIONS_BASE_URL = "https://api.regulations.gov/v4/" + + +class Counts(TypedDict): + dockets: int + documents: int + comments: int + + +class EntityCount(TypedDict): + downloaded: int + total: int + + +class Output(TypedDict): + start_timestamp: dt.datetime + stop_timestamp: dt.datetime + dockets: EntityCount + documents: EntityCount + comments: EntityCount + + +class OutputEncoder(json.JSONEncoder): + def default(self, o: Any) -> Any: + if isinstance(o, dt.datetime): + return o.strftime("%Y-%m-%d %H:%M:%S") + return super().default(o) + + +def get_regulation_count(api_key: str, start: dt.datetime, end: dt.datetime) -> Counts: + headers = {"X-Api-Key": api_key} + counts: Counts = {"dockets": -1, "documents": -1, "comments": -1} + + params = { + "fitler[lastModifiedDate][ge]": start.strftime("%Y-%m-%d %H:%M:%S"), + "filter[lastModifiedDate][le]": 
end.strftime("%Y-%m-%d %H:%M:%S"), + } + + for type_ in counts: + response = requests.get( + REGULATIONS_BASE_URL + type_, headers=headers, params=params + ) + response.raise_for_status() + counts[type_] = response.json()["meta"]["totalElements"] + + return counts + + +def get_true_prod_count(dashboard_url: str) -> Counts: + """Dumbly get the counts of a running mirrulations instance""" + response = requests.get(dashboard_url + "/data") + response.raise_for_status() + + stats = response.json() + counts = Counts( + dockets=stats["num_dockets_done"], + documents=stats["num_documents_done"], + comments=stats["num_comments_done"], + ) + + return counts + + +def clamp_counts(counts: Counts, max_counts: Counts) -> Counts: + clamped = Counts(dockets=0, documents=0, comments=0) + for key in counts: + if max_counts[key] < counts[key]: + clamped[key] = max(min(counts[key], max_counts[key]), 0) + else: + clamped[key] = counts[key] + return clamped + + +def get_accurate_prod_count(db: redis.Redis, max_counts: Counts) -> Counts: + """Get the counts of a running mirrulations instance, ignoring duplicated downloads + + Args: + db: a redis database connection + strict: true if the resulting counts are allowed to be larger + than the official Regulations.gov counts + """ + counts = Counts( + dockets=int(db.get("num_dockets_done")), + documents=int(db.get("num_documents_done")), + comments=int(db.get("num_comments_done")), + ) + jobs_waiting = { + "dockets": int(db.get("num_jobs_dockets_waiting")), + "documents": int(db.get("num_jobs_documents_waiting")), + "comments": int(db.get("num_jobs_comments_waiting")), + } + + if any(jobs_waiting.values()): + for k in counts: + counts[k] = min(max_counts[k] - jobs_waiting[k], counts[k]) + + return clamp_counts(counts, max_counts) + + +def make_output( + start: dt.datetime, end: dt.datetime, downloaded: Counts, total: Counts +) -> Output: + output: Output = { + "start_timestamp": start, + "stop_timestamp": end, + "dockets": { + "downloaded": 
downloaded["dockets"], + "total": total["dockets"], + }, + "documents": { + "downloaded": downloaded["documents"], + "total": total["documents"], + }, + "comments": { + "downloaded": downloaded["comments"], + "total": total["comments"], + }, + } + + return output + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Get Correct Mirrulation Counts as a json document", + epilog="Can be used in conjunction with update_counts.py", + ) + parser.add_argument( + "-f", + "--from", + metavar="START_DATETIME", + dest="start", + type=dt.datetime.fromisoformat, + default=dt.datetime(1776, 7, 4).isoformat(timespec="seconds"), + help="start time (inclusive) for counts in ISO 8601 format 'YYYY-MM-DDTHH:mm:ss' (default '%(default)s')", + ) + parser.add_argument( + "-t", + "--to", + metavar="END_DATETIME", + dest="end", + type=dt.datetime.fromisoformat, + default=dt.datetime.now().isoformat(timespec="seconds"), + help="end time (exclusive) for counts in ISO 8601 format 'YYYY-MM-DDTHH:mm:ss' (default '%(default)s')", + ) + parser.add_argument( + "-o", + "--output", + metavar="PATH", + help="file to output to, use '-' for stdout (default '%(default)s')", + type=str, + default="-", + ) + parser.add_argument( + "-c", + "--correct", + help="Get corrected counts download counts", + action="store_true", + ) + parser.add_argument( + "-a", + "--api-key", + help="Regulations.gov api key, defaults to value of `API_KEY` environment variable", + default=os.getenv("API_KEY"), + type=str, + ) + parser.add_argument( + "--dashboard", + metavar="URL", + help="URL of dashboard to use, mutually exclusive with '-c', (default '%(default)s')", + default="http://localhost", + ) + + args = parser.parse_args() + + start: dt.datetime = args.start + end: dt.datetime = args.end + out_path: str = args.output + correct: bool = args.correct + dashboard_url: str = args.dashboard + + if args.api_key is None: + print("No api key found, exitting", file=sys.stderr) + sys.exit(1) + + 
api_key: str = args.api_key + + regulations = Counts(dockets=253807, documents=1843774, comments=22240511) + # regulations = get_regulation_count(api_key, start, end) + if correct: + mirrulations = get_accurate_prod_count(redis.Redis(), regulations) + else: + mirrulations = get_true_prod_count(dashboard_url) + + output = make_output(start, end, mirrulations, regulations) + + if out_path == "-": + json.dump(output, sys.stdout, cls=OutputEncoder) + else: + with open(pathlib.Path(out_path), "w") as fp: + json.dump(output, fp, cls=OutputEncoder) diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 00000000..fe413504 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,3 @@ +requests +docker +redis From f8e909d4d5a911ac03884685f7cd8579dbb07b3b Mon Sep 17 00:00:00 2001 From: JP Appel Date: Sun, 13 Oct 2024 03:58:34 -0400 Subject: [PATCH 4/6] Fix test data being used instead of regulations.gov --- scripts/get_counts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/get_counts.py b/scripts/get_counts.py index 403400d2..cd7ed6f7 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -198,8 +198,7 @@ def make_output( api_key: str = args.api_key - regulations = Counts(dockets=253807, documents=1843774, comments=22240511) - # regulations = get_regulation_count(api_key, start, end) + regulations = get_regulation_count(api_key, start, end) if correct: mirrulations = get_accurate_prod_count(redis.Redis(), regulations) else: From e93d6f619580feee11d6ff4db7d7de8458078c4e Mon Sep 17 00:00:00 2001 From: OnToNothing <126170980+OnToNothing@users.noreply.github.com> Date: Mon, 14 Oct 2024 00:35:03 -0400 Subject: [PATCH 5/6] Add update_counts.py script to work in tandem with get_counts.py --- scripts/update_counts.py | 87 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 scripts/update_counts.py diff --git a/scripts/update_counts.py b/scripts/update_counts.py new 
file mode 100644 index 00000000..b867a2f3 --- /dev/null +++ b/scripts/update_counts.py @@ -0,0 +1,87 @@ +import argparse +import json +import redis +import sys + +def list_current_values(r): + keys = [ + "num_dockets_done", + "num_documents_done", + "num_comments_done", + "dockets_last_timestamp", + "documents_last_timestamp", + "comments_last_timestamp" + ] + print("Current Redis Key Values:") + print("-" * 50) + for key in keys: + value = r.get(key) + value = value.decode('utf-8') if value else 'None' + print(f"{key}: {value}") + print("-" * 50) + +def update_values(r, data): + # Extract values from JSON data + num_dockets_done = data.get("dockets", {}).get("downloaded", 0) + num_documents_done = data.get("documents", {}).get("downloaded", 0) + num_comments_done = data.get("comments", {}).get("downloaded", 0) + stop_timestamp = data.get("stop_timestamp", "") + + # Mapping from Redis keys to values + key_value_pairs = { + "num_dockets_done": num_dockets_done, + "num_documents_done": num_documents_done, + "num_comments_done": num_comments_done, + "dockets_last_timestamp": stop_timestamp, + "documents_last_timestamp": stop_timestamp, + "comments_last_timestamp": stop_timestamp + } + + # Display current and new values + print("Current and New Values:") + print("-" * 50) + for key, new_value in key_value_pairs.items(): + current_value = r.get(key) + current_value = current_value.decode('utf-8') if current_value else 'None' + print(f"Key: {key}") + print(f"Current Value: {current_value}") + print(f"New Value: {new_value}") + print("-" * 50) + + # Confirm before updating + confirm = input("Do you want to update these keys with the new values? 
(yes/no): ").strip().lower() + if confirm == 'yes': + for key, new_value in key_value_pairs.items(): + r.set(key, new_value) + print("Keys have been updated successfully.") + else: + print("No changes have been made.") + +def main(): + parser = argparse.ArgumentParser(description="Redis Key Updater") + parser.add_argument('-i', '--input', help='Path to the JSON file') + args = parser.parse_args() + + # Connect to Redis + try: + r = redis.Redis(host='localhost', port=6379, db=0) + r.ping() + except redis.exceptions.ConnectionError as e: + print(f"Error connecting to Redis: {e}") + sys.exit(1) + + if args.input: + # Load JSON data from the provided file + try: + with open(args.input, 'r') as f: + data = json.load(f) + except Exception as e: + print(f"Error reading JSON file: {e}") + sys.exit(1) + update_values(r, data) + else: + # List current values + list_current_values(r) + +if __name__ == "__main__": + main() From da9856c7fa5ce0afa40605758bfb62702a992f03 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 14 Oct 2024 11:12:00 -0400 Subject: [PATCH 6/6] Fix typo and add reading from stdin to update_counts --- scripts/get_counts.py | 21 +++++++++++++++------ scripts/update_counts.py | 22 +++++++++++++++------- 2 files changed, 30 insertions(+), 13 deletions(-) mode change 100644 => 100755 scripts/update_counts.py diff --git a/scripts/get_counts.py b/scripts/get_counts.py index cd7ed6f7..980841a4 100755 --- a/scripts/get_counts.py +++ b/scripts/get_counts.py @@ -45,7 +45,7 @@ def get_regulation_count(api_key: str, start: dt.datetime, end: dt.datetime) -> counts: Counts = {"dockets": -1, "documents": -1, "comments": -1} params = { - "fitler[lastModifiedDate][ge]": start.strftime("%Y-%m-%d %H:%M:%S"), + "filter[lastModifiedDate][ge]": start.strftime("%Y-%m-%d %H:%M:%S"), "filter[lastModifiedDate][le]": end.strftime("%Y-%m-%d %H:%M:%S"), } @@ -84,13 +84,16 @@ def clamp_counts(counts: Counts, max_counts: Counts) -> Counts: return clamped -def get_accurate_prod_count(db: 
redis.Redis, max_counts: Counts) -> Counts: +def get_accurate_prod_count( + db: redis.Redis, max_counts: Counts, ignore_queue: bool = False +) -> Counts: """Get the counts of a running mirrulations instance, ignoring duplicated downloads Args: db: a redis database connection strict: true if the resulting counts are allowed to be larger than the official Regulations.gov counts + ignore_queue: continue even if jobs are in the queue """ counts = Counts( dockets=int(db.get("num_dockets_done")), @@ -104,6 +107,9 @@ def get_accurate_prod_count(db: redis.Redis, max_counts: Counts) -> Counts: } if any(jobs_waiting.values()): + if not ignore_queue: + print("Jobs in queue, exitting", file=sys.stderr) + sys.exit(1) for k in counts: counts[k] = min(max_counts[k] - jobs_waiting[k], counts[k]) @@ -170,6 +176,9 @@ def make_output( help="Get corrected counts download counts", action="store_true", ) + parser.add_argument( + "--ignore-queue", help="Continue if jobs are in the job queue", action="store_true" + ) parser.add_argument( "-a", "--api-key", @@ -191,16 +200,16 @@ def make_output( out_path: str = args.output correct: bool = args.correct dashboard_url: str = args.dashboard + ignore_queue: bool = args.ignore_queue - if args.api_key is None: + api_key = args.api_key + if api_key is None or api_key == "": print("No api key found, exitting", file=sys.stderr) sys.exit(1) - api_key: str = args.api_key - regulations = get_regulation_count(api_key, start, end) if correct: - mirrulations = get_accurate_prod_count(redis.Redis(), regulations) + mirrulations = get_accurate_prod_count(redis.Redis(), regulations, ignore_queue) else: mirrulations = get_true_prod_count(dashboard_url) diff --git a/scripts/update_counts.py b/scripts/update_counts.py old mode 100644 new mode 100755 index b867a2f3..35e875c7 --- a/scripts/update_counts.py +++ b/scripts/update_counts.py @@ -1,4 +1,6 @@ +#!/usr/bin/env python3 import argparse +from pathlib import Path import json import redis import sys @@ -20,7 
+22,7 @@ def list_current_values(r): print(f"{key}: {value}") print("-" * 50) -def update_values(r, data): +def update_values(r, data, confirmed): # Extract values from JSON data num_dockets_done = data.get("dockets", {}).get("downloaded", 0) num_documents_done = data.get("documents", {}).get("downloaded", 0) @@ -49,8 +51,10 @@ def update_values(r, data): print("-" * 50) # Confirm before updating - confirm = input("Do you want to update these keys with the new values? (yes/no): ").strip().lower() - if confirm == 'yes': + if not confirmed: + confirm = input("Do you want to update these keys with the new values? (yes/no): ").strip().lower() + confirmed = confirm == 'yes' + if confirmed: for key, new_value in key_value_pairs.items(): r.set(key, new_value) print("Keys have been updated successfully.") @@ -59,7 +63,8 @@ def update_values(r, data): def main(): parser = argparse.ArgumentParser(description="Redis Key Updater") - parser.add_argument('-i', '--input', help='Path to the JSON file') + parser.add_argument('-i', '--input', help='Path to the JSON file', default='-') + parser.add_argument('-y', '--yes', help="Automagically accepts all prompts", action="store_true") args = parser.parse_args() # Connect to Redis @@ -73,12 +78,15 @@ def main(): if args.input: # Load JSON data from the provided file try: - with open(args.input, 'r') as f: - data = json.load(f) + if args.input == '-': + data = json.load(sys.stdin) + else: + with open(Path(args.input), 'r') as f: + data = json.load(f) except Exception as e: print(f"Error reading JSON file: {e}") sys.exit(1) - update_values(r, data) + update_values(r, data, args.yes) else: # List current values list_current_values(r)