Initial metadata extraction implementation #159

Draft
wants to merge 10 commits into main
42 changes: 41 additions & 1 deletion mirrulations-client/src/mirrclient/client.py
@@ -189,7 +189,7 @@ def _put_results(self, data):
the results from a performed job
"""
dir_, filename = data['directory'].rsplit('/', 1)
self.saver.save_json(f'/data{dir_}/{filename}', data)
self.saver.save_json(f'/data{dir_}/{filename}', data["results"])

def _perform_job(self, job_url):
"""
@@ -233,6 +233,10 @@ def _download_all_attachments_from_comment(self, comment_json):
counter = 0
comment_id_str = f"Comment - {comment_json['data']['id']}"
print(f"Found {len(path_list)} attachment(s) for {comment_id_str}")

if len(path_list) > 0:
self._make_extraction_meta(path_list)

for included in comment_json["included"]:
if (included["attributes"]["fileFormats"] and
included["attributes"]["fileFormats"]
@@ -247,6 +251,42 @@ def _download_all_attachments_from_comment(self, comment_json):
self.cache.increase_jobs_done('attachment',
url.endswith('.pdf'))

def _make_extraction_meta(self, attachment_paths):
"""
Creates the initial metadata JSON for the attachments of a
given comment. Every extraction status is initialized to
"Not Attempted".

Ex:
{
"extraction_status":
{
"path_to_attachment": "Not Attempted"
}
}

The resulting extraction-metadata.json is saved in the
comments_extracted_text/pdfminer/ directory for now.

"""
meta_save_dir = PathGenerator.\
make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0]
meta = {
"extraction_status": {}
}
for path in attachment_paths:
file_name = path.rsplit("/", 1)[1]
meta["extraction_status"][file_name] = "Not Attempted"
meta_save_path = f"/data{meta_save_dir}/extraction-metadata.json"

DiskSaver().save_meta(f"{meta_save_path}", meta)
return meta_save_path, meta
Contributor
This method does not have to return anything. Otherwise good.
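
A minimal sketch of the method with the return value dropped, per the comment above (same logic as in the diff; the existing unit test would then assert on the mocked DiskSaver.save_meta call instead of the returned tuple):

    def _make_extraction_meta(self, attachment_paths):
        # Directory is derived from the first attachment's save path
        meta_save_dir = PathGenerator.\
            make_attachment_save_path(attachment_paths[0]).rsplit("/", 1)[0]
        meta = {"extraction_status": {}}
        for path in attachment_paths:
            file_name = path.rsplit("/", 1)[1]
            meta["extraction_status"][file_name] = "Not Attempted"
        meta_save_path = f"/data{meta_save_dir}/extraction-metadata.json"
        DiskSaver().save_meta(meta_save_path, meta)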


# Use Saver to save meta json to Disk and S3
# Possible file names: extraction-metadata.json
# Future: Loop over again and add to extraction queue

def _download_single_attachment(self, url, path):
'''
Downloads a single attachment for a comment and
11 changes: 10 additions & 1 deletion mirrulations-client/src/mirrclient/disk_saver.py
@@ -1,5 +1,6 @@
import os
from json import dumps, load
from mirrclient.saver import Saver


class DiskSaver():
@@ -25,7 +26,6 @@ def save_json(self, path, data):
"""
_dir = path.rsplit('/', 1)[0]
self.make_path(_dir)
data = data['results']
if os.path.exists(path) is False:
self.save_to_disk(path, data)
else:
@@ -62,3 +62,12 @@ def is_duplicate(self, existing, new):
def check_for_duplicates(self, path, data, i):
if self.is_duplicate(self.open_json_file(path), data) is False:
self.save_duplicate_json(path, data, i)

def save_meta(self, path, meta):
_dir = path.rsplit('/', 1)[0]
self.make_path(_dir)
Saver.update_meta(path, meta)
with open(path, "w", encoding="utf-8") as file:
# Overwrite the file; any existing entries were already merged by update_meta
file.write(dumps(meta))
print(f'Wrote Extraction Metadata to Disk: {path}')
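
For illustration, a hedged usage sketch of the new DiskSaver.save_meta; the docket path and file name below are made up:

    from mirrclient.disk_saver import DiskSaver

    meta = {
        "extraction_status": {
            "attachment_1.pdf": "Not Attempted"
        }
    }
    # Creates the directory if needed, merges any existing
    # extraction-metadata.json via Saver.update_meta, then rewrites the file.
    DiskSaver().save_meta(
        "/data/USTR/USTR-2015-0010/comments_extracted_text/pdfminer/"
        "extraction-metadata.json",
        meta)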
20 changes: 19 additions & 1 deletion mirrulations-client/src/mirrclient/s3_saver.py
@@ -1,6 +1,7 @@
import os
import json
from dotenv import load_dotenv
from mirrclient.saver import Saver
import boto3


@@ -85,7 +86,7 @@ def save_json(self, path, data):
response = self.s3_client.put_object(
Bucket=self.bucket_name,
Key=path,
Body=json.dumps(data["results"])
Body=json.dumps(data)
)
print(f"Wrote json to S3: {path}")
return response
@@ -114,3 +115,20 @@ def save_binary(self, path, binary):
Body=binary)
print(f"Wrote binary to S3: {path}")
return response

def save_meta(self, path, meta):
"""
Saves metadata (json) file to Amazon S3 bucket
Bucket Structure: /AGENCYID/path/to/item

Parameters
----------
path : str
Where to save the data to in the S3 bucket

meta : dict
The json contents representing the
metadata that will be written
"""
Saver.update_meta(path, meta)
self.save_json(path, meta)
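
A small sketch of how S3Saver.save_meta would be invoked; the bucket name mirrors the existing tests and the key is illustrative:

    from mirrclient.s3_saver import S3Saver

    s3 = S3Saver(bucket_name="test-mirrulations1")
    meta = {"extraction_status": {"attachment_1.pdf": "Not Attempted"}}
    # update_meta only merges when the path exists on local disk,
    # so for a pure S3 key it is usually a no-op before save_json runs.
    s3.save_meta("USTR/USTR-2015-0010/comments_extracted_text/pdfminer/"
                 "extraction-metadata.json", meta)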
40 changes: 40 additions & 0 deletions mirrulations-client/src/mirrclient/saver.py
@@ -1,3 +1,7 @@
from json import load
import os


class Saver:
"""
A class which encapsulates the saving for the Client
@@ -49,3 +53,39 @@ def save_binary(self, path, binary):
"""
for saver in self.savers:
saver.save_binary(path, binary)

def save_meta(self, path, meta):
"""
Iterates over the instance variable savers list
and calls the corresponding subclass save_meta() method.

Parameters
----------
path : str
A string denoting where the metadata file should be saved to.

meta: dict
The metadata (json) to be saved
"""
for saver in self.savers:
saver.save_meta(path, meta)

@staticmethod
def update_meta(path, meta):
"""
If a metadata file already exists at the given path,
the new meta is merged with the entries from the
previous meta's extraction status.

Parameters
----------
path : str
The path to the metadata file
meta : dict
The new metadata to be written/combined
"""
if os.path.exists(path):
with open(path, "r", encoding="utf-8") as file:
previous_meta = load(file)
for key in previous_meta["extraction_status"]:
meta['extraction_status'][key] = \
previous_meta['extraction_status'][key]
print("extraction-metadata.json file exists. Updating this file")
51 changes: 48 additions & 3 deletions mirrulations-client/tests/test_client.py
@@ -347,6 +347,12 @@ def test_client_downloads_attachment_results(mocker, capsys):
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_binary',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_to_disk',
return_value=None)
mocker.patch('mirrclient.s3_saver.S3Saver.save_binary',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_meta',
return_value=None)
mock_redis = ReadyRedis()
client = Client(mock_redis, MockJobQueue())
client.api_key = 1234
@@ -384,10 +390,8 @@ def test_client_downloads_attachment_results(mocker, capsys):
client.job_operation()
job_stat_results = client.cache.get_jobs_done()
assert job_stat_results['num_comments_done'] == 1
assert job_stat_results['num_attachments_done'] == 1
assert job_stat_results['num_pdf_attachments_done'] == 1

captured = capsys.readouterr()
print_data = [
'Processing job from RabbitMQ.\n',
'Attempting to get job\n',
@@ -400,7 +404,7 @@ def test_client_downloads_attachment_results(mocker, capsys):
'Found 1 attachment(s) for Comment - FDA-2016-D-2335-1566\n',
'Downloaded 1/1 attachment(s) for Comment - FDA-2016-D-2335-1566\n'
]
assert captured.out == "".join(print_data)
assert capsys.readouterr().out == "".join(print_data)


@responses.activate
@@ -441,6 +445,14 @@ def test_two_attachments_in_comment(mocker):
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_binary',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_to_disk',
return_value=None)
mocker.patch('mirrclient.s3_saver.S3Saver.save_binary',
return_value=None)
mocker.patch('mirrclient.s3_saver.S3Saver.save_json',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_meta',
return_value=None)
client = Client(ReadyRedis(), MockJobQueue())
client.api_key = 1234

@@ -513,3 +525,36 @@ def test_client_handles_api_timeout():
client.job_operation()

assert mock_redis.get('invalid_jobs') == [1, 'http://regulations.gov/job']


def test_make_extraction_meta(mocker):
mocker.patch('mirrclient.disk_saver.DiskSaver.make_path',
return_value=None)
mocker.patch('mirrclient.disk_saver.DiskSaver.save_meta',
return_value=None)
test_attachment_paths = [
"/testagency/docketid/comments_attachments/test1.pdf",
"/testagency/docketid/comments_attachments/test2.pdf",
"/testagency/docketid/comments_attachments/test3.pdf",
"/testagency/docketid/comments_attachments/test4.pdf"
]
mock_redis = ReadyRedis()
client = Client(mock_redis, MockJobQueue())
client.api_key = 1234
expected_meta = {
"extraction_status": {
"test1.pdf": "Not Attempted",
"test2.pdf": "Not Attempted",
"test3.pdf": "Not Attempted",
"test4.pdf": "Not Attempted"
}
}
expected_meta_save_path = \
'/data/testagency/docketid/comments_extracted_text' +\
'/pdfminer/extraction-metadata.json'
actual_meta_path, actual_meta = \
client._make_extraction_meta(test_attachment_paths)
assert expected_meta_save_path == actual_meta_path
assert expected_meta == actual_meta
60 changes: 59 additions & 1 deletion mirrulations-client/tests/test_disk_saver.py
@@ -58,7 +58,7 @@ def test_save_json():
saver.save_json(path, data)
mock_dir.assert_called_once_with('/USTR')
mocked_file.assert_called_once_with(path, 'x', encoding='utf8')
mocked_file().write.assert_called_once_with(dumps(data['results']))
mocked_file().write.assert_called_once_with(dumps(data))


def test_save_binary():
@@ -154,3 +154,61 @@ def test_check_for_duplicates(capsys):
print_data = ''
captured = capsys.readouterr()
assert captured.out == print_data


def test_save_meta():
saver = DiskSaver()
test_meta_path = 'pdfminer/extraction-metadata.json'
test_meta = {
"extraction_status": {
"test_1.pdf": "Not Attempted",
"test_2.pdf": "Not Attempted",
}
}
with patch('mirrclient.disk_saver.open', mock_open()) as mocked_file:
with patch('os.makedirs') as mock_dir:
saver.save_meta(test_meta_path, test_meta)
mock_dir.assert_called_once_with('pdfminer')
mocked_file.assert_called_once_with(test_meta_path,
'w', encoding='utf-8')
mocked_file().write.assert_called_once_with(dumps(test_meta))


def test_save_meta_where_meta_exists_already(mocker):
saver = DiskSaver()
test_meta_path = 'pdfminer/extraction-metadata.json'
test_meta = {
"extraction_status": {
"test_1.pdf": "Not Attempted",
"test_2.pdf": "Not Attempted",
}
}
new_meta = {
"extraction_status": {
"test_3.pdf": "Not Attempted",
}
}

combined_meta = {
"extraction_status": {
"test_3.pdf": "Not Attempted",
"test_1.pdf": "Not Attempted",
"test_2.pdf": "Not Attempted"
}
}
with patch('mirrclient.disk_saver.open', mock_open()) as mocked_file:
with patch('os.makedirs') as mock_dir:
saver.save_meta(test_meta_path, test_meta)
mock_dir.assert_called_once_with('pdfminer')
mocked_file.assert_called_once_with(test_meta_path,
'w', encoding='utf-8')
mocked_file().write.assert_called_once_with(dumps(test_meta))

with patch('mirrclient.disk_saver.open',
mock_open(read_data=dumps(test_meta))) as mocked_file:
with patch('os.makedirs') as mock_dir:
mocker.patch('os.path.exists', return_value=True)
mocker.patch('json.load', return_value=test_meta)
mocker.patch('os.remove')
saver.save_meta(test_meta_path, new_meta)
mocked_file().write.assert_called_once_with(dumps(combined_meta))
7 changes: 4 additions & 3 deletions mirrulations-client/tests/test_s3_saver.py
@@ -1,4 +1,5 @@
import os
import json
import boto3
from moto import mock_s3
from pytest import fixture
@@ -63,14 +64,14 @@ def test_put_text_to_bucket():
conn = create_mock_mirrulations_bucket()
s3_bucket = S3Saver(bucket_name="test-mirrulations1")
test_data = {
"results": 'test'
"results": "test"
}
test_path = "data/test.json"
response = s3_bucket.save_json(test_path, test_data)
body = conn.Object("test-mirrulations1",
"data/test.json").get()["Body"].read()\
.decode("utf-8").strip('/"')
assert body == test_data["results"]
.decode("utf-8").strip("/'")
assert json.loads(body) == test_data
assert response["ResponseMetadata"]['HTTPStatusCode'] == 200


11 changes: 6 additions & 5 deletions mirrulations-client/tests/test_saver.py
@@ -1,4 +1,4 @@
from json import dumps
from json import dumps, loads
from unittest.mock import patch, mock_open
import os
from pytest import fixture
@@ -27,7 +27,7 @@ def test_saving_to_disk():
mocked_file.assert_called_once_with(test_path, 'x',
encoding='utf8')
mocked_file().write.assert_called_once_with(
dumps(test_data['results']))
dumps(test_data))


@mock_s3
@@ -44,7 +44,7 @@ def test_saving_to_s3():
body = conn.Object("test-mirrulations1",
"data/test.json").get()["Body"].read()\
.decode("utf-8").strip('/"')
assert body == test_data["results"]
assert loads(body) == test_data


@mock_s3
@@ -64,11 +64,12 @@ def test_saver_saves_text_to_multiple_places():
mocked_file.assert_called_once_with(test_path, 'x',
encoding='utf8')
mocked_file().write.assert_called_once_with(
dumps(test_data['results']))
dumps(test_data))
body = conn.Object("test-mirrulations1",
"/USTR/file.json").get()["Body"].read()\
.decode("utf-8").strip('/"')
assert body == test_data["results"]
assert loads(body) == test_data


@mock_s3