Initial metadata extraction implementation #159

Draft
wants to merge 10 commits into main
42 changes: 41 additions & 1 deletion mirrulations-client/src/mirrclient/client.py
@@ -189,7 +189,7 @@ def _put_results(self, data):
the results from a performed job
"""
dir_, filename = data['directory'].rsplit('/', 1)
self.saver.save_json(f'/data{dir_}/{filename}', data)
self.saver.save_json(f'/data{dir_}/{filename}', data["results"])

def _perform_job(self, job_url):
"""
@@ -233,6 +233,10 @@ def _download_all_attachments_from_comment(self, comment_json):
counter = 0
comment_id_str = f"Comment - {comment_json['data']['id']}"
print(f"Found {len(path_list)} attachment(s) for {comment_id_str}")

if len(path_list) > 0:
self._make_extraction_meta(path_list)

for included in comment_json["included"]:
if (included["attributes"]["fileFormats"] and
included["attributes"]["fileFormats"]
@@ -247,6 +251,42 @@ def _download_all_attachments_from_comment(self, comment_json):
self.cache.increase_jobs_done('attachment',
url.endswith('.pdf'))

def _make_extraction_meta(self, attachment_paths):
"""
Creates the initial metadata JSON for the attachments of a
given comment. Every extraction status is initialized to
"Not Attempted".

Ex:
{
"extraction_status":
{
"path_to_attachment": "Not Attempted"
}
}

The resulting extraction-metadata.json is saved in the
comments_extracted_text/pdfminer/ directory for now.

"""
meta_save_dir = PathGenerator.\
make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0]
meta = {
"extraction_status": {}
}
for path in attachment_paths:
file_name = path.rsplit("/", 1)[1]
meta["extraction_status"][file_name] = "Not Attempted"
meta_save_path = f"/data{meta_save_dir}/extraction-metadata.json"

DiskSaver().save_meta(f"{meta_save_path}", meta)
return meta_save_path, meta
Contributor
This method does not have to return anything. Otherwise good.
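
A minimal sketch of the method with the return value dropped, per the comment above (same logic as in the diff; the existing unit test would then assert on the mocked DiskSaver.save_meta call instead of the returned tuple):

    def _make_extraction_meta(self, attachment_paths):
        # Directory is derived from the first attachment's save path
        meta_save_dir = PathGenerator.\
            make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0]
        meta = {"extraction_status": {}}
        for path in attachment_paths:
            file_name = path.rsplit("/", 1)[1]
            meta["extraction_status"][file_name] = "Not Attempted"
        meta_save_path = f"/data{meta_save_dir}/extraction-metadata.json"
        DiskSaver().save_meta(meta_save_path, meta)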


# Use Saver to save meta json to Disk and S3
# Possible file names: extraction-metadata.json
# Future: Loop over again and add to extraction queue

def _download_single_attachment(self, url, path):
'''
Downloads a single attachment for a comment and
11 changes: 10 additions & 1 deletion mirrulations-client/src/mirrclient/disk_saver.py
@@ -1,5 +1,6 @@
import os
from json import dumps, load
from mirrclient.saver import Saver


class DiskSaver():
@@ -25,7 +26,6 @@ def save_json(self, path, data):
"""
_dir = path.rsplit('/', 1)[0]
self.make_path(_dir)
data = data['results']
if os.path.exists(path) is False:
self.save_to_disk(path, data)
else:
@@ -62,3 +62,12 @@ def is_duplicate(self, existing, new):
def check_for_duplicates(self, path, data, i):
if self.is_duplicate(self.open_json_file(path), data) is False:
self.save_duplicate_json(path, data, i)

def save_meta(self, path, meta):
_dir = path.rsplit('/', 1)[0]
self.make_path(_dir)
Saver.update_meta(path, meta)
with open(path, "w", encoding="utf-8") as file:
# Overwrite the file; any existing entries were already merged by update_meta
file.write(dumps(meta))
print(f'Wrote Extraction Metadata to Disk: {path}')
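
For illustration, a hedged usage sketch of the new DiskSaver.save_meta; the docket path and file name below are made up:

    from mirrclient.disk_saver import DiskSaver

    meta = {
        "extraction_status": {
            "attachment_1.pdf": "Not Attempted"
        }
    }
    # Creates the directory if needed, merges any existing
    # extraction-metadata.json via Saver.update_meta, then rewrites the file.
    DiskSaver().save_meta(
        "/data/USTR/USTR-2015-0010/comments_extracted_text/pdfminer/"
        "extraction-metadata.json",
        meta)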
20 changes: 19 additions & 1 deletion mirrulations-client/src/mirrclient/s3_saver.py
@@ -1,6 +1,7 @@
import os
import json
from dotenv import load_dotenv
from mirrclient.saver import Saver
import boto3


@@ -85,7 +86,7 @@ def save_json(self, path, data):
response = self.s3_client.put_object(
Bucket=self.bucket_name,
Key=path,
Body=json.dumps(data["results"])
Body=json.dumps(data)
)
print(f"Wrote json to S3: {path}")
return response
@@ -114,3 +115,20 @@ def save_binary(self, path, binary):
Body=binary)
print(f"Wrote binary to S3: {path}")
return response

def save_meta(self, path, meta):
"""
Saves metadata (json) file to Amazon S3 bucket
Bucket Structure: /AGENCYID/path/to/item

Parameters
----------
path : str
Where to save the data to in the S3 bucket

meta : dict
The json contents representing the
metadata that will be written
"""
Saver.update_meta(path, meta)
self.save_json(path, meta)
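
A small sketch of how S3Saver.save_meta would be invoked; the bucket name mirrors the existing tests and the key is illustrative:

    from mirrclient.s3_saver import S3Saver

    s3 = S3Saver(bucket_name="test-mirrulations1")
    meta = {"extraction_status": {"attachment_1.pdf": "Not Attempted"}}
    # update_meta only merges when the path exists on local disk,
    # so for a pure S3 key it is usually a no-op before save_json runs.
    s3.save_meta("USTR/USTR-2015-0010/comments_extracted_text/pdfminer/"
                 "extraction-metadata.json", meta)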
40 changes: 40 additions & 0 deletions mirrulations-client/src/mirrclient/saver.py
@@ -1,3 +1,7 @@
from json import load
import os


class Saver:
"""
A class which encapsulates the saving for the Client
@@ -49,3 +53,39 @@ def save_binary(self, path, binary):
"""
for saver in self.savers:
saver.save_binary(path, binary)

def save_meta(self, path, meta):
"""
Iterates over the instance variable savers list
and calls the corresponding subclass save_meta() method.

Parameters
----------
path : str
A string denoting where the metadata file should be saved to.

meta: dict
The metadata (json) to be saved
"""
for saver in self.savers:
saver.save_meta(path, meta)

@staticmethod
def update_meta(path, meta):
"""
If a metadata file already exists at the given path,
the new meta is merged with the entries from the
previous meta's extraction status.

Parameters
----------
path : str
The path to the metadata file
meta : dict
The new metadata to be written/combined
"""
if os.path.exists(path):
with open(path, "r", encoding="utf-8") as file:
previous_meta = load(file)
for key in previous_meta["extraction_status"]:
meta['extraction_status'][key] = \
previous_meta['extraction_status'][key]
print("extraction-metadata.json file exists. Updating this file")
51 changes: 48 additions & 3 deletions mirrulations-client/tests/test_client.py
@@ -347,6 +347,12 @@ def test_client_downloads_attachment_results(mocker, capsys):
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_binary',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_to_disk',
return_value=None)
mocker.patch('mirrclient.s3_saver.S3Saver.save_binary',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_meta',
return_value=None)
mock_redis = ReadyRedis()
client = Client(mock_redis, MockJobQueue())
client.api_key = 1234
@@ -384,10 +390,8 @@ def test_client_downloads_attachment_results(mocker, capsys):
client.job_operation()
job_stat_results = client.cache.get_jobs_done()
assert job_stat_results['num_comments_done'] == 1
assert job_stat_results['num_attachments_done'] == 1
assert job_stat_results['num_pdf_attachments_done'] == 1

captured = capsys.readouterr()
print_data = [
'Processing job from RabbitMQ.\n',
'Attempting to get job\n',
@@ -400,7 +404,7 @@ def test_client_downloads_attachment_results(mocker, capsys):
'Found 1 attachment(s) for Comment - FDA-2016-D-2335-1566\n',
'Downloaded 1/1 attachment(s) for Comment - FDA-2016-D-2335-1566\n'
]
assert captured.out == "".join(print_data)
assert capsys.readouterr().out == "".join(print_data)


@responses.activate
@@ -441,6 +445,14 @@ def test_two_attachments_in_comment(mocker):
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_binary',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_to_disk',
return_value=None)
mocker.patch('mirrclient.s3_saver.S3Saver.save_binary',
return_value=None)
mocker.patch('mirrclient.s3_saver.S3Saver.save_json',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_meta',
return_value=None)
client = Client(ReadyRedis(), MockJobQueue())
client.api_key = 1234

@@ -513,3 +525,36 @@ def test_client_handles_api_timeout():
client.job_operation()

assert mock_redis.get('invalid_jobs') == [1, 'http://regulations.gov/job']


def test_make_extraction_meta(mocker):
mocker.patch('mirrclient.disk_saver.DiskSaver.make_path',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_meta',
return_value=None)
test_attachment_paths = [
"/testagency/docketid/comments_attachments/test1.pdf",
"/testagency/docketid/comments_attachments/test2.pdf",
"/testagency/docketid/comments_attachments/test3.pdf",
"/testagency/docketid/comments_attachments/test4.pdf"
]
mock_redis = ReadyRedis()
client = Client(mock_redis, MockJobQueue())
client.api_key = 1234
expected_meta = {
"extraction_status": {
"test1.pdf": "Not Attempted",
"test2.pdf": "Not Attempted",
"test3.pdf": "Not Attempted",
"test4.pdf": "Not Attempted"
}
}
expected_meta_save_path = \
'/data/testagency/docketid/comments_extracted_text' +\
'/pdfminer/extraction-metadata.json'
actual_meta_path, actual_meta = \
client._make_extraction_meta(test_attachment_paths)
assert expected_meta_save_path == actual_meta_path
assert expected_meta == actual_meta
60 changes: 59 additions & 1 deletion mirrulations-client/tests/test_disk_saver.py
@@ -58,7 +58,7 @@ def test_save_json():
saver.save_json(path, data)
mock_dir.assert_called_once_with('/USTR')
mocked_file.assert_called_once_with(path, 'x', encoding='utf8')
mocked_file().write.assert_called_once_with(dumps(data['results']))
mocked_file().write.assert_called_once_with(dumps(data))


def test_save_binary():
@@ -154,3 +154,61 @@ def test_check_for_duplicates(capsys):
print_data = ''
captured = capsys.readouterr()
assert captured.out == print_data


def test_save_meta():
saver = DiskSaver()
test_meta_path = 'pdfminer/extraction-metadata.json'
test_meta = {
"extraction_status": {
"test_1.pdf": "Not Attempted",
"test_2.pdf": "Not Attempted",
}
}
with patch('mirrclient.disk_saver.open', mock_open()) as mocked_file:
with patch('os.makedirs') as mock_dir:
saver.save_meta(test_meta_path, test_meta)
mock_dir.assert_called_once_with('pdfminer')
mocked_file.assert_called_once_with(test_meta_path,
'w', encoding='utf-8')
mocked_file().write.assert_called_once_with(dumps(test_meta))


def test_save_meta_where_meta_exists_already(mocker):
saver = DiskSaver()
test_meta_path = 'pdfminer/extraction-metadata.json'
test_meta = {
"extraction_status": {
"test_1.pdf": "Not Attempted",
"test_2.pdf": "Not Attempted",
}
}
new_meta = {
"extraction_status": {
"test_3.pdf": "Not Attempted",
}
}

combined_meta = {
"extraction_status": {
"test_3.pdf": "Not Attempted",
"test_1.pdf": "Not Attempted",
"test_2.pdf": "Not Attempted"
}
}
with patch('mirrclient.disk_saver.open', mock_open()) as mocked_file:
with patch('os.makedirs') as mock_dir:
saver.save_meta(test_meta_path, test_meta)
mock_dir.assert_called_once_with('pdfminer')
mocked_file.assert_called_once_with(test_meta_path,
'w', encoding='utf-8')
mocked_file().write.assert_called_once_with(dumps(test_meta))

with patch('mirrclient.disk_saver.open',
mock_open(read_data=dumps(test_meta))) as mocked_file:
with patch('os.makedirs') as mock_dir:
mocker.patch('os.path.exists', return_value=True)
mocker.patch('json.load', return_value=test_meta)
mocker.patch('os.remove')
saver.save_meta(test_meta_path, new_meta)
mocked_file().write.assert_called_once_with(dumps(combined_meta))
7 changes: 4 additions & 3 deletions mirrulations-client/tests/test_s3_saver.py
@@ -1,4 +1,5 @@
import os
import json
import boto3
from moto import mock_s3
from pytest import fixture
@@ -63,14 +64,14 @@ def test_put_text_to_bucket():
conn = create_mock_mirrulations_bucket()
s3_bucket = S3Saver(bucket_name="test-mirrulations1")
test_data = {
"results": 'test'
"results": "test"
}
test_path = "data/test.json"
response = s3_bucket.save_json(test_path, test_data)
body = conn.Object("test-mirrulations1",
"data/test.json").get()["Body"].read()\
.decode("utf-8").strip('/"')
assert body == test_data["results"]
.decode("utf-8").strip("/'")
assert json.loads(body) == test_data
assert response["ResponseMetadata"]['HTTPStatusCode'] == 200


11 changes: 6 additions & 5 deletions mirrulations-client/tests/test_saver.py
@@ -1,4 +1,4 @@
from json import dumps
from json import dumps, loads
from unittest.mock import patch, mock_open
import os
from pytest import fixture
@@ -27,7 +27,7 @@ def test_saving_to_disk():
mocked_file.assert_called_once_with(test_path, 'x',
encoding='utf8')
mocked_file().write.assert_called_once_with(
dumps(test_data['results']))
dumps(test_data))


@mock_s3
@@ -44,7 +44,7 @@ def test_saving_to_s3():
body = conn.Object("test-mirrulations1",
"data/test.json").get()["Body"].read()\
.decode("utf-8").strip('/"')
assert body == test_data["results"]
assert loads(body) == test_data


@mock_s3
@@ -64,11 +64,12 @@ def test_saver_saves_text_to_multiple_places():
mocked_file.assert_called_once_with(test_path, 'x',
encoding='utf8')
mocked_file().write.assert_called_once_with(
dumps(test_data['results']))
dumps(test_data))
body = conn.Object("test-mirrulations1",
"/USTR/file.json").get()["Body"].read()\
.decode("utf-8").strip('/"')
assert body == test_data["results"]
assert loads(body) == test_data


@mock_s3