From a4bd129b2957427b80d4f8e9587da94ccad0d605 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Wed, 16 Oct 2024 21:05:28 -0400 Subject: [PATCH] Add documentation for scripts --- docs/scripts.md | 104 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 docs/scripts.md diff --git a/docs/scripts.md b/docs/scripts.md new file mode 100644 index 00000000..9b4fd565 --- /dev/null +++ b/docs/scripts.md @@ -0,0 +1,104 @@ +# Script Documentation + +## Summary + +Some tasks are small enough that the project architecture should not change, but large enough that they should not be performed by hand. +Files in the `scripts` directory exist to fill this space. + +Currently, the following scripts are provided. + +* `get_counts.py` + * get docket, document, and comment counts from regulations.gov, a mirrulations dashboard, or a mirrulations Redis instance as json + * when using regulations.gov a timestamp can be given to make all dockets, documents, and comments before the timestamp count as if they were downloaded +* `correct_counts.py` + * correct possible errors within a counts json file generated by `get_counts.py` +* `set_counts.py` + * set values in a mirrulations Redis instance using json generated by `get_counts.py` + +All of the scripts above share a common format: +
+get_counts.py common format + +```json +{ + "creation_timestamp": "2024-10-16 15:00:00", + "dockets": { + "downloaded": 253807, + "jobs": 0, + "total": 253807, + "last_timestamp": "2024-10-13 04:04:18" + }, + "documents": { + "downloaded": 1843774, + "jobs": 0, + "total": 1843774, + "last_timestamp": "2024-10-13 04:04:18" + }, + "comments": { + "downloaded": 22240501, + "jobs": 10, + "total": 22240511, + "last_timestamp": "2024-10-13 04:04:18" + } +} +``` + +
+ +## Description + +### `get_counts.py` + +`get_counts.py` gets counts from one of three sources: regulations.gov, a Mirrulations Redis instance, or a Mirrulations dashboard via HTTP. + +When reading from regulations.gov a UTC timestamp can be specified to mock having downloaded all dockets, documents, and comments from before that timestamp. + +When reading from a dashboard a UTC timestamp must be specified since the dashboard API does not provide one. + +### `correct_counts.py` + +`correct_counts.py` corrects counts from `get_counts.py` using one of two strategies: set downloaded counts for a type to the minimum of `downloaded` and `total` for that type, or set downloaded counts to the minimum of `total - jobs` and `downloaded`. +By default any queued jobs will cause the script to exit and output nothing; this behavior can be changed with the `--ignore-queue` flag. + +### `set_counts.py` + +`set_counts.py` sets values from `get_counts.py` in a Redis instance. +By default the script will prompt for user input before changing any values. +This behavior can be changed using the `--yes` flag, which should be used **WITH GREAT CARE, ESPECIALLY IN PRODUCTION!!!**. + +## Setup + +First a virtual environment should be created to download dependencies to. + +```bash +cd scripts +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +Make sure you are in the correct environment when running scripts. 
+ +## Examples + +### Cap Docket, Document, and Comment downloaded counts by the counts from Regulations.gov + +```bash +./get_counts.py redis | ./correct_counts.py | ./set_counts.py -y +``` + +### Set Docket, Document, and Comment downloaded counts while jobs are in the queue + +```bash +./get_counts.py dashboard | ./correct_counts.py --ignore-queue --strategy diff_total_with_jobs | ./set_counts.py -y +``` + +### Download Counts for a Certain Time from Regulations.gov + +```bash +./get_counts.py --api-key $API_KEY -o aug_6_2022.json -t 2024-08-06T06:20:50Z + +export API_KEY= +./get_counts.py regulations -o oct_01_2024.json --last-timestamp 2024-10-01T15:30:10Z +./set_counts.py -i oct_01_2024.json +```