From b31d9a8986c95c61cab585d2ce99c9a0da5da534 Mon Sep 17 00:00:00 2001 From: Jack Wagner Date: Wed, 19 Apr 2023 15:39:43 -0400 Subject: [PATCH 01/10] Initial metadata extraction implementation --- mirrulations-client/src/mirrclient/client.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mirrulations-client/src/mirrclient/client.py b/mirrulations-client/src/mirrclient/client.py index 3174f612..ee23418e 100644 --- a/mirrulations-client/src/mirrclient/client.py +++ b/mirrulations-client/src/mirrclient/client.py @@ -230,6 +230,7 @@ def _download_all_attachments_from_comment(self, comment_json): ''' path_list = self.path_generator.get_attachment_json_paths(comment_json) + self._make_extraction_meta(path_list) counter = 0 comment_id_str = f"Comment - {comment_json['data']['id']}" print(f"Found {len(path_list)} attachment(s) for {comment_id_str}") @@ -247,6 +248,23 @@ def _download_all_attachments_from_comment(self, comment_json): self.cache.increase_jobs_done('attachment', url.endswith('.pdf')) + def _make_extraction_meta(self, attachment_paths): + meta_save_path = PathGenerator.make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0] + meta = { + "extraction_status": {} + } + for path in attachment_paths: + file_name = path.rsplit("/", 1)[1] + meta["extraction_status"][file_name] = "Not Attempted" + # Use Saver to save meta json to Disk and S3 + # Possible file names: extraction-metadata.json ?? + + # write meta to meta_save_path + + # Future: Loop over again and add to extraction queue + + + def _download_single_attachment(self, url, path): ''' Downloads a single attachment for a comment and From 84f44321830384f24b1bc60a2143ef44c49378ff Mon Sep 17 00:00:00 2001 From: Jack Wagner Date: Thu, 20 Apr 2023 09:02:18 -0400 Subject: [PATCH 02/10] Client will initialize the metadata when all the attachments for a comment are found * This change uses the save_json method in the savers * Do note: The DiskSaver and S3 saver were using data with the full data dict before * I changed the Savers and the client to have the parameter for saving json be data["results"] instead of the full data dictionary * This is a better way of implementing the save_json methods since what is passed in is the json we want to save rather than needing to find the key we want in the savers themselves. --- mirrulations-client/src/mirrclient/client.py | 12 +++++- .../src/mirrclient/disk_saver.py | 1 - .../src/mirrclient/s3_saver.py | 2 +- mirrulations-client/tests/test_client.py | 43 +++++++++++++++++++ mirrulations-client/tests/test_disk_saver.py | 2 +- mirrulations-client/tests/test_s3_saver.py | 7 +-- mirrulations-client/tests/test_saver.py | 10 +++-- 7 files changed, 65 insertions(+), 12 deletions(-) diff --git a/mirrulations-client/src/mirrclient/client.py b/mirrulations-client/src/mirrclient/client.py index ee23418e..8cee6386 100644 --- a/mirrulations-client/src/mirrclient/client.py +++ b/mirrulations-client/src/mirrclient/client.py @@ -189,7 +189,7 @@ def _put_results(self, data): the results from a performed job """ dir_, filename = data['directory'].rsplit('/', 1) - self.saver.save_json(f'/data{dir_}/{filename}', data) + self.saver.save_json(f'/data{dir_}/{filename}', data["results"]) def _perform_job(self, job_url): """ @@ -231,6 +231,7 @@ def _download_all_attachments_from_comment(self, comment_json): path_list = self.path_generator.get_attachment_json_paths(comment_json) self._make_extraction_meta(path_list) + counter = 0 comment_id_str = f"Comment - {comment_json['data']['id']}" print(f"Found {len(path_list)} attachment(s) for {comment_id_str}") @@ -249,13 +250,20 @@ def _download_all_attachments_from_comment(self, comment_json): url.endswith('.pdf')) def _make_extraction_meta(self, attachment_paths): - meta_save_path = PathGenerator.make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0] + if len(attachment_paths) ==0: + return + meta_save_dir = PathGenerator.make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0] meta = { "extraction_status": {} } for path in attachment_paths: file_name = path.rsplit("/", 1)[1] meta["extraction_status"][file_name] = "Not Attempted" + meta_save_path = f"{meta_save_dir}/extraction-metadata.json" + self.saver.save_json(meta_save_path, meta) + return meta_save_path, meta + # self.saver.save_json(meta_save_path, meta) + # Use Saver to save meta json to Disk and S3 # Possible file names: extraction-metadata.json ?? diff --git a/mirrulations-client/src/mirrclient/disk_saver.py b/mirrulations-client/src/mirrclient/disk_saver.py index 080963c9..7620363b 100644 --- a/mirrulations-client/src/mirrclient/disk_saver.py +++ b/mirrulations-client/src/mirrclient/disk_saver.py @@ -25,7 +25,6 @@ def save_json(self, path, data): """ _dir = path.rsplit('/', 1)[0] self.make_path(_dir) - data = data['results'] if os.path.exists(path) is False: self.save_to_disk(path, data) else: diff --git a/mirrulations-client/src/mirrclient/s3_saver.py b/mirrulations-client/src/mirrclient/s3_saver.py index f0543933..8a36a961 100644 --- a/mirrulations-client/src/mirrclient/s3_saver.py +++ b/mirrulations-client/src/mirrclient/s3_saver.py @@ -85,7 +85,7 @@ def save_json(self, path, data): response = self.s3_client.put_object( Bucket=self.bucket_name, Key=path, - Body=json.dumps(data["results"]) + Body=json.dumps(data) ) print(f"Wrote json to S3: {path}") return response diff --git a/mirrulations-client/tests/test_client.py b/mirrulations-client/tests/test_client.py index d22429af..aeeb33b5 100644 --- a/mirrulations-client/tests/test_client.py +++ b/mirrulations-client/tests/test_client.py @@ -347,6 +347,12 @@ def test_client_downloads_attachment_results(mocker, capsys): return_value=None) mocker.patch('mirrclient.disk_saver.DiskSaver.save_binary', return_value=None) + mocker.patch('mirrclient.disk_saver.DiskSaver.save_to_disk', + return_value=None) + mocker.patch('mirrclient.s3_saver.S3Saver.save_binary', + return_value=None) + mocker.patch('mirrclient.s3_saver.S3Saver.save_json', + return_value=None) mock_redis = ReadyRedis() client = Client(mock_redis, MockJobQueue()) client.api_key = 1234 @@ -441,6 +447,12 @@ def test_two_attachments_in_comment(mocker): return_value=None) mocker.patch('mirrclient.disk_saver.DiskSaver.save_binary', return_value=None) + mocker.patch('mirrclient.disk_saver.DiskSaver.save_to_disk', + return_value=None) + mocker.patch('mirrclient.s3_saver.S3Saver.save_binary', + return_value=None) + mocker.patch('mirrclient.s3_saver.S3Saver.save_json', + return_value=None) client = Client(ReadyRedis(), MockJobQueue()) client.api_key = 1234 @@ -513,3 +525,34 @@ def test_client_handles_api_timeout(): client.job_operation() assert mock_redis.get('invalid_jobs') == [1, 'http://regulations.gov/job'] + + +def test_make_extraction_meta(mocker): + mocker.patch('mirrclient.disk_saver.DiskSaver.make_path', + return_value=None) + mocker.patch('mirrclient.saver.Saver.save_json', + return_value=None) + mocker.patch('mirrclient.disk_saver.DiskSaver.save_to_disk', + return_value=None) + test_attachment_paths = [ + "testagency/docketid/comments_attachments/test1.pdf", + "testagency/docketid/comments_attachments/test2.pdf", + "testagency/docketid/comments_attachments/test3.pdf", + "testagency/docketid/comments_attachments/test4.pdf" + ] + mock_redis = ReadyRedis() + client = Client(mock_redis, MockJobQueue()) + client.api_key = 1234 + expected_meta = { + "extraction_status":{ + "test1.pdf":"Not Attempted", + "test2.pdf":"Not Attempted", + "test3.pdf":"Not Attempted", + "test4.pdf":"Not Attempted" + } + } + expected_meta_save_path = 'testagency/docketid/comments_extracted_text/pdfminer/extraction-metadata.json' + actual_meta_path, actual_meta = client._make_extraction_meta(test_attachment_paths) + assert expected_meta_save_path == actual_meta_path + assert expected_meta == actual_meta + diff --git a/mirrulations-client/tests/test_disk_saver.py b/mirrulations-client/tests/test_disk_saver.py index dc78c9dd..625834b2 100644 --- a/mirrulations-client/tests/test_disk_saver.py +++ b/mirrulations-client/tests/test_disk_saver.py @@ -58,7 +58,7 @@ def test_save_json(): saver.save_json(path, data) mock_dir.assert_called_once_with('/USTR') mocked_file.assert_called_once_with(path, 'x', encoding='utf8') - mocked_file().write.assert_called_once_with(dumps(data['results'])) + mocked_file().write.assert_called_once_with(dumps(data)) def test_save_binary(): diff --git a/mirrulations-client/tests/test_s3_saver.py b/mirrulations-client/tests/test_s3_saver.py index 1fc2c6ad..e9a2e3aa 100644 --- a/mirrulations-client/tests/test_s3_saver.py +++ b/mirrulations-client/tests/test_s3_saver.py @@ -1,5 +1,6 @@ import os import boto3 +import json from moto import mock_s3 from pytest import fixture from mirrclient.s3_saver import S3Saver @@ -63,14 +64,14 @@ def test_put_text_to_bucket(): conn = create_mock_mirrulations_bucket() s3_bucket = S3Saver(bucket_name="test-mirrulations1") test_data = { - "results": 'test' + "results": "test" } test_path = "data/test.json" response = s3_bucket.save_json(test_path, test_data) body = conn.Object("test-mirrulations1", "data/test.json").get()["Body"].read()\ - .decode("utf-8").strip('/"') - assert body == test_data["results"] + .decode("utf-8").strip("/'") + assert json.loads(body) == test_data assert response["ResponseMetadata"]['HTTPStatusCode'] == 200 diff --git a/mirrulations-client/tests/test_saver.py b/mirrulations-client/tests/test_saver.py index 0f2e0e97..fb739590 100644 --- a/mirrulations-client/tests/test_saver.py +++ b/mirrulations-client/tests/test_saver.py @@ -4,6 +4,7 @@ from pytest import fixture from moto import mock_s3 import boto3 +import json from mirrclient.saver import Saver from mirrclient.s3_saver import S3Saver from mirrclient.disk_saver import DiskSaver @@ -27,7 +28,7 @@ def test_saving_to_disk(): mocked_file.assert_called_once_with(test_path, 'x', encoding='utf8') mocked_file().write.assert_called_once_with( - dumps(test_data['results'])) + dumps(test_data)) @mock_s3 @@ -44,7 +45,7 @@ def test_saving_to_s3(): body = conn.Object("test-mirrulations1", "data/test.json").get()["Body"].read()\ .decode("utf-8").strip('/"') - assert body == test_data["results"] + assert json.loads(body) == test_data @mock_s3 @@ -64,11 +65,12 @@ def test_saver_saves_text_to_multiple_places(): mocked_file.assert_called_once_with(test_path, 'x', encoding='utf8') mocked_file().write.assert_called_once_with( - dumps(test_data['results'])) + dumps(test_data)) body = conn.Object("test-mirrulations1", "/USTR/file.json").get()["Body"].read()\ .decode("utf-8").strip('/"') - assert body == test_data["results"] + print(body) + assert json.loads(body) == test_data @mock_s3 From 75e198e2ccd51603135282d639ad2a3ec2c9c22f Mon Sep 17 00:00:00 2001 From: Jack Wagner Date: Thu, 20 Apr 2023 09:18:57 -0400 Subject: [PATCH 03/10] Static Fixes --- mirrulations-client/src/mirrclient/client.py | 12 ++++---- mirrulations-client/tests/test_client.py | 30 ++++++++++---------- mirrulations-client/tests/test_s3_saver.py | 2 +- mirrulations-client/tests/test_saver.py | 7 ++--- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/mirrulations-client/src/mirrclient/client.py b/mirrulations-client/src/mirrclient/client.py index 8cee6386..1b72c5da 100644 --- a/mirrulations-client/src/mirrclient/client.py +++ b/mirrulations-client/src/mirrclient/client.py @@ -250,9 +250,10 @@ def _download_all_attachments_from_comment(self, comment_json): url.endswith('.pdf')) def _make_extraction_meta(self, attachment_paths): - if len(attachment_paths) ==0: - return - meta_save_dir = PathGenerator.make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0] + if len(attachment_paths) == 0: + return False + meta_save_dir = PathGenerator.\ + make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0] meta = { "extraction_status": {} } @@ -262,17 +263,14 @@ def _make_extraction_meta(self, attachment_paths): meta_save_path = f"{meta_save_dir}/extraction-metadata.json" self.saver.save_json(meta_save_path, meta) return meta_save_path, meta - # self.saver.save_json(meta_save_path, meta) # Use Saver to save meta json to Disk and S3 - # Possible file names: extraction-metadata.json ?? + # Possible file names: extraction-metadata.json ?? # write meta to meta_save_path # Future: Loop over again and add to extraction queue - - def _download_single_attachment(self, url, path): ''' Downloads a single attachment for a comment and diff --git a/mirrulations-client/tests/test_client.py b/mirrulations-client/tests/test_client.py index aeeb33b5..10500414 100644 --- a/mirrulations-client/tests/test_client.py +++ b/mirrulations-client/tests/test_client.py @@ -390,10 +390,8 @@ def test_client_downloads_attachment_results(mocker, capsys): client.job_operation() job_stat_results = client.cache.get_jobs_done() assert job_stat_results['num_comments_done'] == 1 - assert job_stat_results['num_attachments_done'] == 1 assert job_stat_results['num_pdf_attachments_done'] == 1 - captured = capsys.readouterr() print_data = [ 'Processing job from RabbitMQ.\n', 'Attempting to get job\n', @@ -406,7 +404,7 @@ def test_client_downloads_attachment_results(mocker, capsys): 'Found 1 attachment(s) for Comment - FDA-2016-D-2335-1566\n', 'Downloaded 1/1 attachment(s) for Comment - FDA-2016-D-2335-1566\n' ] - assert captured.out == "".join(print_data) + assert capsys.readouterr().out == "".join(print_data) @responses.activate @@ -535,24 +533,26 @@ def test_make_extraction_meta(mocker): mocker.patch('mirrclient.disk_saver.DiskSaver.save_to_disk', return_value=None) test_attachment_paths = [ - "testagency/docketid/comments_attachments/test1.pdf", - "testagency/docketid/comments_attachments/test2.pdf", - "testagency/docketid/comments_attachments/test3.pdf", + "testagency/docketid/comments_attachments/test1.pdf", + "testagency/docketid/comments_attachments/test2.pdf", + "testagency/docketid/comments_attachments/test3.pdf", "testagency/docketid/comments_attachments/test4.pdf" ] mock_redis = ReadyRedis() client = Client(mock_redis, MockJobQueue()) client.api_key = 1234 expected_meta = { - "extraction_status":{ - "test1.pdf":"Not Attempted", - "test2.pdf":"Not Attempted", - "test3.pdf":"Not Attempted", - "test4.pdf":"Not Attempted" + "extraction_status": { + "test1.pdf": "Not Attempted", + "test2.pdf": "Not Attempted", + "test3.pdf": "Not Attempted", + "test4.pdf": "Not Attempted" } } - expected_meta_save_path = 'testagency/docketid/comments_extracted_text/pdfminer/extraction-metadata.json' - actual_meta_path, actual_meta = client._make_extraction_meta(test_attachment_paths) + expected_meta_save_path = \ + 'testagency/docketid/comments_extracted_text' +\ + '/pdfminer/extraction-metadata.json' + actual_meta_path, actual_meta = \ + client._make_extraction_meta(test_attachment_paths) assert expected_meta_save_path == actual_meta_path - assert expected_meta == actual_meta - + assert expected_meta == actual_meta diff --git a/mirrulations-client/tests/test_s3_saver.py b/mirrulations-client/tests/test_s3_saver.py index e9a2e3aa..acc635fa 100644 --- a/mirrulations-client/tests/test_s3_saver.py +++ b/mirrulations-client/tests/test_s3_saver.py @@ -1,6 +1,6 @@ import os -import boto3 import json +import boto3 from moto import mock_s3 from pytest import fixture from mirrclient.s3_saver import S3Saver diff --git a/mirrulations-client/tests/test_saver.py b/mirrulations-client/tests/test_saver.py index fb739590..4af104a5 100644 --- a/mirrulations-client/tests/test_saver.py +++ b/mirrulations-client/tests/test_saver.py @@ -1,10 +1,9 @@ -from json import dumps +from json import dumps, loads from unittest.mock import patch, mock_open import os from pytest import fixture from moto import mock_s3 import boto3 -import json from mirrclient.saver import Saver from mirrclient.s3_saver import S3Saver from mirrclient.disk_saver import DiskSaver @@ -45,7 +44,7 @@ def test_saving_to_s3(): body = conn.Object("test-mirrulations1", "data/test.json").get()["Body"].read()\ .decode("utf-8").strip('/"') - assert json.loads(body) == test_data + assert loads(body) == test_data @mock_s3 @@ -70,7 +69,7 @@ def test_saver_saves_text_to_multiple_places(): "/USTR/file.json").get()["Body"].read()\ .decode("utf-8").strip('/"') print(body) - assert json.loads(body) == test_data + assert loads(body) == test_data @mock_s3 From a2c9bba3ee76f4a8a831ac52fe6349f991ae293c Mon Sep 17 00:00:00 2001 From: Jack Wagner Date: Thu, 20 Apr 2023 22:47:29 -0400 Subject: [PATCH 04/10] Add save_meta function to disk_saver --- .../src/mirrclient/disk_saver.py | 15 +++++ mirrulations-client/tests/test_disk_saver.py | 58 +++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/mirrulations-client/src/mirrclient/disk_saver.py b/mirrulations-client/src/mirrclient/disk_saver.py index 7620363b..80587e73 100644 --- a/mirrulations-client/src/mirrclient/disk_saver.py +++ b/mirrulations-client/src/mirrclient/disk_saver.py @@ -61,3 +61,18 @@ def is_duplicate(self, existing, new): def check_for_duplicates(self, path, data, i): if self.is_duplicate(self.open_json_file(path), data) is False: self.save_duplicate_json(path, data, i) + + def save_meta(self, path, meta): + _dir = path.rsplit('/', 1)[0] + self.make_path(_dir) + if os.path.exists(path): + with open(path, "r", encoding="utf-8") as file: + previous_meta = load(file) + os.remove(path) + for key in previous_meta["extraction_status"]: + meta['extraction_status'][key] = "Not Attempted" + print("extraction-metadata.json file exists. Updating this file") + with open(path, "w", encoding="utf-8") as file: + # First comment will trigger this + file.write(dumps(meta)) + print(f'Wrote Extraction Metadata to Disk: {path}') diff --git a/mirrulations-client/tests/test_disk_saver.py b/mirrulations-client/tests/test_disk_saver.py index 625834b2..3d91e586 100644 --- a/mirrulations-client/tests/test_disk_saver.py +++ b/mirrulations-client/tests/test_disk_saver.py @@ -154,3 +154,61 @@ def test_check_for_duplicates(capsys): print_data = '' captured = capsys.readouterr() assert captured.out == print_data + + +def test_save_meta(): + saver = DiskSaver() + test_meta_path = 'pdfminer/extraction-metadata.json' + test_meta = { + "extraction_status": { + "test_1.pdf": "Not Attempted", + "test_2.pdf": "Not Attempted", + } + } + with patch('mirrclient.disk_saver.open', mock_open()) as mocked_file: + with patch('os.makedirs') as mock_dir: + saver.save_meta(test_meta_path, test_meta) + mock_dir.assert_called_once_with('pdfminer') + mocked_file.assert_called_once_with(test_meta_path, + 'w', encoding='utf-8') + mocked_file().write.assert_called_once_with(dumps(test_meta)) + + +def test_save_meta_where_meta_exists_already(mocker): + saver = DiskSaver() + test_meta_path = 'pdfminer/extraction-metadata.json' + test_meta = { + "extraction_status": { + "test_1.pdf": "Not Attempted", + "test_2.pdf": "Not Attempted", + } + } + new_meta = { + "extraction_status": { + "test_3.pdf": "Not Attempted", + } + } + + combined_meta = { + "extraction_status": { + "test_3.pdf": "Not Attempted", + "test_1.pdf": "Not Attempted", + "test_2.pdf": "Not Attempted" + } + } + with patch('mirrclient.disk_saver.open', mock_open()) as mocked_file: + with patch('os.makedirs') as mock_dir: + saver.save_meta(test_meta_path, test_meta) + mock_dir.assert_called_once_with('pdfminer') + mocked_file.assert_called_once_with(test_meta_path, + 'w', encoding='utf-8') + mocked_file().write.assert_called_once_with(dumps(test_meta)) + + with patch('mirrclient.disk_saver.open', + mock_open(read_data=dumps(test_meta))) as mocked_file: + with patch('os.makedirs') as mock_dir: + mocker.patch('os.path.exists', return_value=True) + mocker.patch('json.load', return_value=test_meta) + mocker.patch('os.remove') + saver.save_meta(test_meta_path, new_meta) + mocked_file().write.assert_called_once_with(dumps(combined_meta)) From 1f444b19dd879734cac73ede3172575de0cf591f Mon Sep 17 00:00:00 2001 From: Jack Wagner Date: Thu, 20 Apr 2023 22:48:09 -0400 Subject: [PATCH 05/10] Client uses new save_meta function in the DiskSaver --- mirrulations-client/src/mirrclient/client.py | 36 ++++++++++++++------ mirrulations-client/tests/test_client.py | 20 ++++++----- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/mirrulations-client/src/mirrclient/client.py b/mirrulations-client/src/mirrclient/client.py index 1b72c5da..543fd34a 100644 --- a/mirrulations-client/src/mirrclient/client.py +++ b/mirrulations-client/src/mirrclient/client.py @@ -230,11 +230,13 @@ def _download_all_attachments_from_comment(self, comment_json): ''' path_list = self.path_generator.get_attachment_json_paths(comment_json) - self._make_extraction_meta(path_list) - counter = 0 comment_id_str = f"Comment - {comment_json['data']['id']}" print(f"Found {len(path_list)} attachment(s) for {comment_id_str}") + + if len(path_list) > 0: + self._make_extraction_meta(path_list) + for included in comment_json["included"]: if (included["attributes"]["fileFormats"] and included["attributes"]["fileFormats"] @@ -250,8 +252,24 @@ def _download_all_attachments_from_comment(self, comment_json): url.endswith('.pdf')) def _make_extraction_meta(self, attachment_paths): - if len(attachment_paths) == 0: - return False + """ + This method creates the initial meta data json for + attachments for a given comment + The metadata is a json with all of the extraction_statuses + initialized at "Not Attempted" + + Ex: + { + extraction_status: + { + "path_to_attachment" : "Not Attempted" + } + } + extraction-metadata.json + Will be saved in the comments_extracted/pdfminer/ + directory for now. + + """ meta_save_dir = PathGenerator.\ make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0] meta = { @@ -260,15 +278,13 @@ def _make_extraction_meta(self, attachment_paths): for path in attachment_paths: file_name = path.rsplit("/", 1)[1] meta["extraction_status"][file_name] = "Not Attempted" - meta_save_path = f"{meta_save_dir}/extraction-metadata.json" - self.saver.save_json(meta_save_path, meta) + meta_save_path = f"/data{meta_save_dir}/extraction-metadata.json" + + DiskSaver().save_meta(f"{meta_save_path}", meta) return meta_save_path, meta # Use Saver to save meta json to Disk and S3 - # Possible file names: extraction-metadata.json ?? - - # write meta to meta_save_path - + # Possible file names: extraction-metadata.json # Future: Loop over again and add to extraction queue def _download_single_attachment(self, url, path): diff --git a/mirrulations-client/tests/test_client.py b/mirrulations-client/tests/test_client.py index 10500414..8504cf0e 100644 --- a/mirrulations-client/tests/test_client.py +++ b/mirrulations-client/tests/test_client.py @@ -351,7 +351,7 @@ def test_client_downloads_attachment_results(mocker, capsys): return_value=None) mocker.patch('mirrclient.s3_saver.S3Saver.save_binary', return_value=None) - mocker.patch('mirrclient.s3_saver.S3Saver.save_json', + mocker.patch('mirrclient.disk_saver.DiskSaver.save_meta', return_value=None) mock_redis = ReadyRedis() client = Client(mock_redis, MockJobQueue()) @@ -451,6 +451,8 @@ def test_two_attachments_in_comment(mocker): return_value=None) mocker.patch('mirrclient.s3_saver.S3Saver.save_json', return_value=None) + mocker.patch('mirrclient.disk_saver.DiskSaver.save_meta', + return_value=None) client = Client(ReadyRedis(), MockJobQueue()) client.api_key = 1234 @@ -528,15 +530,15 @@ def test_client_handles_api_timeout(): def test_make_extraction_meta(mocker): mocker.patch('mirrclient.disk_saver.DiskSaver.make_path', return_value=None) - mocker.patch('mirrclient.saver.Saver.save_json', - return_value=None) - mocker.patch('mirrclient.disk_saver.DiskSaver.save_to_disk', + mocker.patch('mirrclient.disk_saver.DiskSaver.save_meta', return_value=None) + # mocker.patch('mirrclient.disk_saver.DiskSaver.open', + # return_value=None) test_attachment_paths = [ - "testagency/docketid/comments_attachments/test1.pdf", - "testagency/docketid/comments_attachments/test2.pdf", - "testagency/docketid/comments_attachments/test3.pdf", - "testagency/docketid/comments_attachments/test4.pdf" + "/testagency/docketid/comments_attachments/test1.pdf", + "/testagency/docketid/comments_attachments/test2.pdf", + "/testagency/docketid/comments_attachments/test3.pdf", + "/testagency/docketid/comments_attachments/test4.pdf" ] mock_redis = ReadyRedis() client = Client(mock_redis, MockJobQueue()) @@ -550,7 +552,7 @@ def test_make_extraction_meta(mocker): } } expected_meta_save_path = \ - 'testagency/docketid/comments_extracted_text' +\ + '/data/testagency/docketid/comments_extracted_text' +\ '/pdfminer/extraction-metadata.json' actual_meta_path, actual_meta = \ client._make_extraction_meta(test_attachment_paths) From 10d6a41da439a797099d24444a3373af19bf4968 Mon Sep 17 00:00:00 2001 From: Nikolas Kovacs <78886981+nikovacs@users.noreply.github.com> Date: Fri, 21 Apr 2023 11:45:51 -0400 Subject: [PATCH 06/10] os.remove is not neccessary. Writing new meta will overwrite --- mirrulations-client/src/mirrclient/disk_saver.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mirrulations-client/src/mirrclient/disk_saver.py b/mirrulations-client/src/mirrclient/disk_saver.py index 80587e73..733217a3 100644 --- a/mirrulations-client/src/mirrclient/disk_saver.py +++ b/mirrulations-client/src/mirrclient/disk_saver.py @@ -68,7 +68,6 @@ def save_meta(self, path, meta): if os.path.exists(path): with open(path, "r", encoding="utf-8") as file: previous_meta = load(file) - os.remove(path) for key in previous_meta["extraction_status"]: meta['extraction_status'][key] = "Not Attempted" print("extraction-metadata.json file exists. Updating this file") From 4e2faa956d60af47a00c3743aa0c59a77b889173 Mon Sep 17 00:00:00 2001 From: Nikolas Kovacs <78886981+nikovacs@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:06:53 -0400 Subject: [PATCH 07/10] Add save_meta method. Add static update meta method. --- mirrulations-client/src/mirrclient/saver.py | 39 +++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mirrulations-client/src/mirrclient/saver.py b/mirrulations-client/src/mirrclient/saver.py index ac29f58e..f812355c 100644 --- a/mirrulations-client/src/mirrclient/saver.py +++ b/mirrulations-client/src/mirrclient/saver.py @@ -1,3 +1,6 @@ +from json import load +import os + class Saver: """ A class which encapsulates the saving for the Client @@ -49,3 +52,39 @@ def save_binary(self, path, binary): """ for saver in self.savers: saver.save_binary(path, binary) + + def save_meta(self, path, meta): + """ + Iterates over the instance variable savers list + and calls the corresponding subclass save_binary() method. + + Parameters + ---------- + path : str + A string denoting where the metadata file should be saved to. + + meta: dict + The metadata (json) to be saved + """ + for saver in self.savers: + saver.save_meta(path, meta) + + @staticmethod + def update_meta(path, meta): + """ + If an existing metadata file exists, + the new meta is updated with the previous + meta's extraction status. + Parameters + ---------- + path : str + The path to the metadata file + meta : dict + The new metadata to be written/combined + """ + if os.path.exists(path): + with open(path, "r", encoding="utf-8") as file: + previous_meta = load(file) + for key in previous_meta["extraction_status"]: + meta['extraction_status'][key] = "Not Attempted" + print("extraction-metadata.json file exists. Updating this file") From cba49a7041b783a42dd3e702863acb60ba8715fd Mon Sep 17 00:00:00 2001 From: Nikolas Kovacs <78886981+nikovacs@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:08:52 -0400 Subject: [PATCH 08/10] refactor save_meta to use Saver's update_meta method --- mirrulations-client/src/mirrclient/disk_saver.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mirrulations-client/src/mirrclient/disk_saver.py b/mirrulations-client/src/mirrclient/disk_saver.py index 733217a3..712d0aaa 100644 --- a/mirrulations-client/src/mirrclient/disk_saver.py +++ b/mirrulations-client/src/mirrclient/disk_saver.py @@ -1,5 +1,6 @@ import os from json import dumps, load +from mirrclient.saver import Saver class DiskSaver(): @@ -65,13 +66,10 @@ def check_for_duplicates(self, path, data, i): def save_meta(self, path, meta): _dir = path.rsplit('/', 1)[0] self.make_path(_dir) - if os.path.exists(path): - with open(path, "r", encoding="utf-8") as file: - previous_meta = load(file) - for key in previous_meta["extraction_status"]: - meta['extraction_status'][key] = "Not Attempted" - print("extraction-metadata.json file exists. Updating this file") + Saver.update_meta(path, meta) with open(path, "w", encoding="utf-8") as file: # First comment will trigger this file.write(dumps(meta)) print(f'Wrote Extraction Metadata to Disk: {path}') + + From 3aee40b95f2986acbf82a2a67f99c7e1e7f3c6f1 Mon Sep 17 00:00:00 2001 From: Nikolas Kovacs <78886981+nikovacs@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:09:04 -0400 Subject: [PATCH 09/10] add save_meta to s3_saver --- .../src/mirrclient/s3_saver.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mirrulations-client/src/mirrclient/s3_saver.py b/mirrulations-client/src/mirrclient/s3_saver.py index 8a36a961..10e4c6dc 100644 --- a/mirrulations-client/src/mirrclient/s3_saver.py +++ b/mirrulations-client/src/mirrclient/s3_saver.py @@ -1,6 +1,7 @@ import os import json from dotenv import load_dotenv +from mirrclient.saver import Saver import boto3 @@ -114,3 +115,24 @@ def save_binary(self, path, binary): Body=binary) print(f"Wrote binary to S3: {path}") return response + + def save_meta(self, path, meta): + """ + Saves metadata (json) file to Amazon S3 bucket + Bucket Structure: /AGENCYID/path/to/item + + Parameters + ------- + path : str + Where to save the data to in the S3 bucket + + meta : dict + The json contents representing the + metadata that will be written + """ + Saver.update_meta(path, meta) + self.save_json(path, meta) + + + + From 3d5229fa905383774fcea6f312e8e30e80ac28b9 Mon Sep 17 00:00:00 2001 From: Nikolas Kovacs <78886981+nikovacs@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:56:02 -0400 Subject: [PATCH 10/10] linter fixes --- mirrulations-client/src/mirrclient/disk_saver.py | 2 -- mirrulations-client/src/mirrclient/s3_saver.py | 6 +----- mirrulations-client/src/mirrclient/saver.py | 3 ++- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/mirrulations-client/src/mirrclient/disk_saver.py b/mirrulations-client/src/mirrclient/disk_saver.py index 712d0aaa..c9d9bf07 100644 --- a/mirrulations-client/src/mirrclient/disk_saver.py +++ b/mirrulations-client/src/mirrclient/disk_saver.py @@ -71,5 +71,3 @@ def save_meta(self, path, meta): # First comment will trigger this file.write(dumps(meta)) print(f'Wrote Extraction Metadata to Disk: {path}') - - diff --git a/mirrulations-client/src/mirrclient/s3_saver.py b/mirrulations-client/src/mirrclient/s3_saver.py index 10e4c6dc..bf35753f 100644 --- a/mirrulations-client/src/mirrclient/s3_saver.py +++ b/mirrulations-client/src/mirrclient/s3_saver.py @@ -115,7 +115,7 @@ def save_binary(self, path, binary): Body=binary) print(f"Wrote binary to S3: {path}") return response - + def save_meta(self, path, meta): """ Saves metadata (json) file to Amazon S3 bucket @@ -132,7 +132,3 @@ def save_meta(self, path, meta): """ Saver.update_meta(path, meta) self.save_json(path, meta) - - - - diff --git a/mirrulations-client/src/mirrclient/saver.py b/mirrulations-client/src/mirrclient/saver.py index f812355c..deec4be4 100644 --- a/mirrulations-client/src/mirrclient/saver.py +++ b/mirrulations-client/src/mirrclient/saver.py @@ -1,6 +1,7 @@ from json import load import os + class Saver: """ A class which encapsulates the saving for the Client @@ -74,7 +75,7 @@ def update_meta(path, meta): """ If an existing metadata file exists, the new meta is updated with the previous - meta's extraction status. + meta's extraction status. Parameters ---------- path : str