diff --git a/README.md b/README.md
index 8002c6c2..fb6ef101 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ If you are interested in becoming a developer, see `docs/developers.md`.
 
 To run Mirrulations, you need Python 3.9 or greater ([MacOSX](https://docs.python-guide.org/starting/install3/osx/) or [Windows](https://docs.python-guide.org/starting/install3/win/)) on your machine to run this, as well as [redis](https://redis.io/) if you are running a server
 
-You will also need a valid API key from Regulations.gov to participate. To apply for a key, you must simply [contact the Regulations Help Desk](regulations@erulemakinghelpdesk.com) and provide your name, email address, organization, and intended use of the API. If you are not with any organizations, just say so in your message. They will email you with a key once they've verified you and activated the key.
+You will also need a valid API key from Regulations.gov to participate. To apply for a key, complete the [API key request form](https://open.gsa.gov/api/regulationsgov/) and provide your name, email address, organization, and intended use of the API. After review, the key will be sent to you by email.
 
 To download the actual project, you will need to go to our [GitHub page](https://github.com/MoravianUniversity/mirrulations) and [clone](https://help.github.com/articles/cloning-a-repository/) the project to your computer.
diff --git a/docs/client.md b/docs/client.md
index 5f49ef2c..86760228 100644
--- a/docs/client.md
+++ b/docs/client.md
@@ -10,3 +10,18 @@
 The goal is that the client will request and complete work in order to download
 data from [regulations.gov](https://www.regulations.gov/).
 
+## Attributes
+
+* `api_key`: Used to authenticate requests made to the API.
+* `client_id`: An ID included in the `client.env` file.
+* `path_generator`: An instance of `PathGenerator` that returns a path for saving job results.
+* `saver`: An instance of `Saver` that handles saving files either to disk or to Amazon S3.
+* `redis`: A connection to the Redis server for managing job states.
+* `job_queue`: A queue from which the client pulls jobs.
+* `cache`: An instance of `JobStatistics` for caching job statistics.
+
+## Workflow
+
+1. Initialization: The `Client` is initialized with a Redis server and a job queue.
+2. Fetching jobs: The client attempts to fetch a job from the job queue using `_get_job_from_job_queue`.
+3. Performing jobs: Depending on the job type, the client performs the job by calling an API endpoint to request a JSON object.
+4. Saving results: The client saves the job results and any included attachments using the `Saver` class.
+5. Updating Redis: The client updates the Redis server with job states, as sketched below.
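+
+The steps above can be pictured with a small, self-contained sketch. This is
+illustrative only: apart from the queue and hash names and the `X-Api-Key`
+header used elsewhere in this project, the job fields and file layout here are
+hypothetical and do not reflect the actual `Client` implementation.
+
+```python
+# Minimal stand-in for the Client workflow described above (illustrative only).
+import json
+import pathlib
+
+import redis
+import requests
+
+r = redis.Redis()
+api_key = "YOUR_API_KEY"
+
+while True:
+    raw = r.rpop("jobs_waiting_queue")        # 2. fetch the next waiting job
+    if raw is None:
+        break                                 # queue drained
+    job = json.loads(raw)                     # jobs are stored as JSON strings
+    # 3. perform the job by requesting a JSON object from the API
+    # (the "url" and "job_id" fields are hypothetical)
+    response = requests.get(job["url"], headers={"X-Api-Key": api_key})
+    response.raise_for_status()
+    # 4. stand-in for the Saver class: write the result to disk
+    pathlib.Path(f"{job['job_id']}.json").write_text(response.text)
+    r.hset("jobs_done", job["job_id"], raw)   # 5. update job state in Redis
+```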
+We use [Redis](https://redis.io/) to store jobs as well as key values that must be remembered.
+
+## Database Structure
+
+The Redis database is structured with the following keys:
+
+* `regulations_total_comments`
+* `num_dockets_done`
+* `num_documents_done`
+* `num_attachments_done`
+* `last_job_id`
+* `jobs_in_progress`
+* `num_pdf_attachments_done`
+* `num_jobs_documents_waiting`
+* `num_jobs_comments_waiting`
+* `dockets_last_timestamp`
+* `invalid_jobs`
+* `regulations_total_dockets`
+* `client_jobs`
+* `num_extractions_done`
+* `regulations_total_documents`
+* `mirrulations_bucket_size`
+* `num_comments_done`
+* `documents_last_timestamp`
+* `num_jobs_dockets_waiting`
+* `comments_last_timestamp`
+
 ## Job Management
 
@@ -11,14 +36,19 @@
 The REDIS database has three "queues", with the names:
 `jobs_waiting_queue`, `jobs_in_progress`, and `jobs_done`.
 
-`jobs_waiting_queue` is a list, while 'jobs_in_progress' and 'jobs_done' are hashes.
-Each stores jobs for clients to process.
+The keys serve the following functions:
+
+* `jobs_waiting_queue`: A list holding JSON strings representing each job.
+
+* `jobs_in_progress`: A hash storing jobs currently being processed.
 
-Keys will be integers, the job ids of the jobs.
-These keys will be mapped to integers, the values to be processed.
+* `jobs_done`: A hash storing completed jobs.
 
-Additionally, the database has an integer value storing the number of clients:
-`total_num_client_ids`.
+The keys `client_jobs` and `total_num_client_ids` are used for storing client information:
+
+* `client_jobs`: A hash mapping job IDs to client IDs.
+
+* `total_num_client_ids`: An integer value storing the number of clients.
 
 ## Redis Format
 
 ## `jobs_waiting_queue`
@@ -54,7 +84,19 @@
 timestamp seen when querying regulations.gov.
 
 The `last_job_id` variable is used by the work generator to ensure it generates
 unique ids for each job.
 
-## Client IDs
-The 'last_client_id' variable is used by the work server to ensure that it
-generates unique client ids.
+## Job Statistics Keys
+
+* `DOCKETS_DONE`: Tracks the number of completed dockets.
+
+* `DOCUMENTS_DONE`: Tracks the number of completed documents.
+
+* `COMMENTS_DONE`: Tracks the number of completed comments.
+
+* `ATTACHMENTS_DONE`: Tracks the number of completed attachments.
+
+* `PDF_ATTACHMENTS_DONE`: Tracks the number of completed PDF attachments.
+
+* `EXTRACTIONS_DONE`: Tracks the number of completed extractions.
+
+* `MIRRULATION_BUCKET_SIZE`: Stores the size of the mirrulations bucket.
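+
+As an illustrative sketch of how these keys interact (the key and queue names
+are the real ones listed above, but the job's JSON fields are hypothetical):
+
+```python
+import json
+
+import redis
+
+r = redis.Redis(decode_responses=True)
+
+# Work generator side: mint a unique id with last_job_id, then enqueue a job
+# (jobs_waiting_queue is a list of JSON strings).
+job_id = int(r.get("last_job_id") or 0) + 1
+r.set("last_job_id", job_id)
+r.lpush("jobs_waiting_queue", json.dumps({"job_id": job_id, "job_type": "dockets"}))
+
+# Bookkeeping hashes: record which client took the job, then mark it in progress.
+r.hset("client_jobs", job_id, 1)              # job handed to client 1
+r.hset("jobs_in_progress", job_id, job_id)
+
+# Dashboard-style reads of the statistics keys.
+print(r.get("num_dockets_done"), r.get("num_jobs_dockets_waiting"))
+```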
\ No newline at end of file
diff --git a/scripts/get_counts.py b/scripts/get_counts.py
new file mode 100755
index 00000000..980841a4
--- /dev/null
+++ b/scripts/get_counts.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+
+import argparse
+import datetime as dt
+import json
+import os
+import pathlib
+import sys
+from typing import Any, TypedDict
+
+import redis
+import requests
+
+REGULATIONS_BASE_URL = "https://api.regulations.gov/v4/"
+
+
+class Counts(TypedDict):
+    dockets: int
+    documents: int
+    comments: int
+
+
+class EntityCount(TypedDict):
+    downloaded: int
+    total: int
+
+
+class Output(TypedDict):
+    start_timestamp: dt.datetime
+    stop_timestamp: dt.datetime
+    dockets: EntityCount
+    documents: EntityCount
+    comments: EntityCount
+
+
+class OutputEncoder(json.JSONEncoder):
+    def default(self, o: Any) -> Any:
+        if isinstance(o, dt.datetime):
+            return o.strftime("%Y-%m-%d %H:%M:%S")
+        return super().default(o)
+
+
+def get_regulation_count(api_key: str, start: dt.datetime, end: dt.datetime) -> Counts:
+    headers = {"X-Api-Key": api_key}
+    counts: Counts = {"dockets": -1, "documents": -1, "comments": -1}
+
+    params = {
+        "filter[lastModifiedDate][ge]": start.strftime("%Y-%m-%d %H:%M:%S"),
+        "filter[lastModifiedDate][le]": end.strftime("%Y-%m-%d %H:%M:%S"),
+    }
+
+    for type_ in counts:
+        response = requests.get(
+            REGULATIONS_BASE_URL + type_, headers=headers, params=params
+        )
+        response.raise_for_status()
+        counts[type_] = response.json()["meta"]["totalElements"]
+
+    return counts
+
+
+def get_true_prod_count(dashboard_url: str) -> Counts:
+    """Get the raw counts reported by a running mirrulations instance's dashboard"""
+    response = requests.get(dashboard_url + "/data")
+    response.raise_for_status()
+
+    stats = response.json()
+    counts = Counts(
+        dockets=stats["num_dockets_done"],
+        documents=stats["num_documents_done"],
+        comments=stats["num_comments_done"],
+    )
+
+    return counts
+
+
+def clamp_counts(counts: Counts, max_counts: Counts) -> Counts:
+    """Clamp each count into the range [0, max_counts[key]]"""
+    clamped = Counts(dockets=0, documents=0, comments=0)
+    for key in counts:
+        clamped[key] = max(min(counts[key], max_counts[key]), 0)
+    return clamped
+
+
+def get_accurate_prod_count(
+    db: redis.Redis, max_counts: Counts, ignore_queue: bool = False
+) -> Counts:
+    """Get the counts of a running mirrulations instance, ignoring duplicated downloads
+
+    Args:
+        db: a redis database connection
+        max_counts: the official Regulations.gov counts, used as an upper
+            bound on the resulting counts
+        ignore_queue: continue even if jobs are in the queue
+    """
+    counts = Counts(
+        dockets=int(db.get("num_dockets_done")),
+        documents=int(db.get("num_documents_done")),
+        comments=int(db.get("num_comments_done")),
+    )
+    jobs_waiting = {
+        "dockets": int(db.get("num_jobs_dockets_waiting")),
+        "documents": int(db.get("num_jobs_documents_waiting")),
+        "comments": int(db.get("num_jobs_comments_waiting")),
+    }
+
+    if any(jobs_waiting.values()):
+        if not ignore_queue:
+            print("Jobs in queue, exiting", file=sys.stderr)
+            sys.exit(1)
+        for k in counts:
+            counts[k] = min(max_counts[k] - jobs_waiting[k], counts[k])
+
+    return clamp_counts(counts, max_counts)
+
+
+def make_output(
+    start: dt.datetime, end: dt.datetime, downloaded: Counts, total: Counts
+) -> Output:
+    output: Output = {
+        "start_timestamp": start,
+        "stop_timestamp": end,
+        "dockets": {
+            "downloaded": downloaded["dockets"],
+            "total": total["dockets"],
+        },
+        "documents": {
+            "downloaded": downloaded["documents"],
+            "total": total["documents"],
+        },
+        "comments": {
+            "downloaded": downloaded["comments"],
+            "total": total["comments"],
+        },
+    }
+
+    return output
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Get corrected Mirrulations counts as a JSON document",
+        epilog="Can be used in conjunction with update_counts.py",
+    )
+    parser.add_argument(
+        "-f",
+        "--from",
+        metavar="START_DATETIME",
+        dest="start",
+        type=dt.datetime.fromisoformat,
+        default=dt.datetime(1776, 7, 4).isoformat(timespec="seconds"),
+        help="start time (inclusive) for counts in ISO 8601 format 'YYYY-MM-DDTHH:mm:ss' (default '%(default)s')",
+    )
+    parser.add_argument(
+        "-t",
+        "--to",
+        metavar="END_DATETIME",
+        dest="end",
+        type=dt.datetime.fromisoformat,
+        default=dt.datetime.now().isoformat(timespec="seconds"),
+        help="end time (exclusive) for counts in ISO 8601 format 'YYYY-MM-DDTHH:mm:ss' (default '%(default)s')",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        metavar="PATH",
+        help="file to output to, use '-' for stdout (default '%(default)s')",
+        type=str,
+        default="-",
+    )
+    parser.add_argument(
+        "-c",
+        "--correct",
+        help="get corrected download counts",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--ignore-queue",
+        help="continue if jobs are in the job queue",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-a",
+        "--api-key",
+        help="Regulations.gov API key, defaults to value of `API_KEY` environment variable",
+        default=os.getenv("API_KEY"),
+        type=str,
+    )
+    parser.add_argument(
+        "--dashboard",
+        metavar="URL",
+        help="URL of dashboard to use, mutually exclusive with '-c' (default '%(default)s')",
+        default="http://localhost",
+    )
+
+    args = parser.parse_args()
+
+    start: dt.datetime = args.start
+    end: dt.datetime = args.end
+    out_path: str = args.output
+    correct: bool = args.correct
+    dashboard_url: str = args.dashboard
+    ignore_queue: bool = args.ignore_queue
+
+    api_key = args.api_key
+    if api_key is None or api_key == "":
+        print("No API key found, exiting", file=sys.stderr)
+        sys.exit(1)
+
+    regulations = get_regulation_count(api_key, start, end)
+    if correct:
+        mirrulations = get_accurate_prod_count(redis.Redis(), regulations, ignore_queue)
+    else:
+        mirrulations = get_true_prod_count(dashboard_url)
+
+    output = make_output(start, end, mirrulations, regulations)
+
+    if out_path == "-":
+        json.dump(output, sys.stdout, cls=OutputEncoder)
+    else:
+        with open(pathlib.Path(out_path), "w") as fp:
+            json.dump(output, fp, cls=OutputEncoder)
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
new file mode 100644
index 00000000..fe413504
--- /dev/null
+++ b/scripts/requirements.txt
@@ -0,0 +1,3 @@
+requests
+docker
+redis
diff --git a/scripts/update_counts.py b/scripts/update_counts.py
new file mode 100755
index 00000000..35e875c7
--- /dev/null
+++ b/scripts/update_counts.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import sys
+from pathlib import Path
+
+import redis
+
+
+def list_current_values(r):
+    keys = [
+        "num_dockets_done",
+        "num_documents_done",
+        "num_comments_done",
+        "dockets_last_timestamp",
+        "documents_last_timestamp",
+        "comments_last_timestamp",
+    ]
+    print("Current Redis Key Values:")
+    print("-" * 50)
+    for key in keys:
+        value = r.get(key)
+        value = value.decode('utf-8') if value else 'None'
+        print(f"{key}: {value}")
+    print("-" * 50)
+
+
+def update_values(r, data, confirmed):
+    # Extract values from JSON data
+    num_dockets_done = data.get("dockets", {}).get("downloaded", 0)
+    num_documents_done = data.get("documents", {}).get("downloaded", 0)
+    num_comments_done = data.get("comments", {}).get("downloaded", 0)
+    stop_timestamp = data.get("stop_timestamp", "")
+
+    # Mapping from Redis keys to values
+    key_value_pairs = {
+        "num_dockets_done": num_dockets_done,
+        "num_documents_done": num_documents_done,
+        "num_comments_done": num_comments_done,
+        "dockets_last_timestamp": stop_timestamp,
+        "documents_last_timestamp": stop_timestamp,
+        "comments_last_timestamp": stop_timestamp,
+    }
+
+    # Display current and new values
+    print("Current and New Values:")
+    print("-" * 50)
+    for key, new_value in key_value_pairs.items():
+        current_value = r.get(key)
+        current_value = current_value.decode('utf-8') if current_value else 'None'
+        print(f"Key: {key}")
+        print(f"Current Value: {current_value}")
+        print(f"New Value: {new_value}")
+        print("-" * 50)
+
+    # Confirm before updating
+    if not confirmed:
+        confirm = input("Do you want to update these keys with the new values? (yes/no): ").strip().lower()
+        confirmed = confirm == 'yes'
+    if confirmed:
+        for key, new_value in key_value_pairs.items():
+            r.set(key, new_value)
+        print("Keys have been updated successfully.")
+    else:
+        print("No changes have been made.")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Redis Key Updater")
+    parser.add_argument(
+        '-i', '--input',
+        help="path to the JSON file, '-' for stdin; if omitted, list current values",
+        default=None,
+    )
+    parser.add_argument('-y', '--yes', help="automatically accept all prompts", action="store_true")
+    args = parser.parse_args()
+
+    # Connect to Redis
+    try:
+        r = redis.Redis(host='localhost', port=6379, db=0)
+        r.ping()
+    except redis.exceptions.ConnectionError as e:
+        print(f"Error connecting to Redis: {e}")
+        sys.exit(1)
+
+    if args.input:
+        # Load JSON data from the provided file (or stdin)
+        try:
+            if args.input == '-':
+                data = json.load(sys.stdin)
+            else:
+                with open(Path(args.input), 'r') as f:
+                    data = json.load(f)
+        except Exception as e:
+            print(f"Error reading JSON file: {e}")
+            sys.exit(1)
+        update_values(r, data, args.yes)
+    else:
+        # No input file given: just list the current values
+        list_current_values(r)
+
+
+if __name__ == "__main__":
+    main()
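+
+# Example (hypothetical invocation, assuming a local Redis and an API_KEY
+# environment variable): pipe corrected counts from get_counts.py straight
+# into this script to bring the Redis keys back in line:
+#
+#   scripts/get_counts.py --correct --ignore-queue | scripts/update_counts.py -i - -y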