diff --git a/Makefile b/Makefile index c0551aaa..58da0118 100644 --- a/Makefile +++ b/Makefile @@ -53,7 +53,7 @@ make-migrations: @docker compose run --rm web python manage.py makemigrations migrate: - @docker compose run --rm web python manage.py migrate + @docker compose run --rm web python manage.py migrate $(migration_args) collectstatic: @docker compose run --rm web python manage.py collectstatic --noinput diff --git a/app/general/admin.py b/app/general/admin.py index ba878a8f..bbdbb8be 100644 --- a/app/general/admin.py +++ b/app/general/admin.py @@ -13,12 +13,12 @@ class DocumentForm(ModelForm): class Meta: model = Document fields = "__all__" # noqa: DJ007 + # Hide if the user doesn't have the permission - if they do, they get a DocumentFormWithFulltext instead + widgets = {"document_data": HiddenInput()} def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.fields["document_data"].widget = HiddenInput() - # If the instance has a mime_type, the field should be disabled if not self.instance.mime_type: self.fields["mime_type"].widget = HiddenInput() @@ -30,7 +30,14 @@ def clean(self): url = cleaned_data.get("url", "") uploaded_file = cleaned_data.get("uploaded_file", "") - if uploaded_file: + # We don't unconditionally re-extract PDF text, as the fulltext (`document_data` field) can be edited manually + # We only want to re-extract the PDF text if the file has changed _and_ the fulltext has not changed. This is to + # support the use-case of a user editing both the PDF and the fulltext at the same time. It would be confusing if + # the PDF just overrode the text that they explicitly pasted into that field on the same form page! + override_existing_fulltext = ( + "uploaded_file" in self.changed_data and "document_data" not in self.changed_data + ) + if uploaded_file and override_existing_fulltext: file_type = magic.from_buffer(uploaded_file.read(), mime=True) if file_type != "application/pdf": self.add_error("uploaded_file", _("Only PDF files are allowed.")) @@ -39,8 +46,15 @@ def clean(self): limit = 10 * 1024 * 1024 if uploaded_file.size and uploaded_file.size > limit: self.add_error("uploaded_file", _("File size must not exceed 10MB.")) - if not self.has_error("uploaded_file"): - # Don't parse if validation above failed + if len(self.errors) == 0: + # Don't parse if validation above failed, or if there are any other errors + # We want to delay doing the PDF -> text conversion until there are no other errors with the form, + # because it is quite costly. This is compounded by the fact that Django has included a hard-coded + # `novalidate` attribute on admin forms for editing (for at least 8 years + # https://code.djangoproject.com/ticket/26982). Therefore, the fast clientside validation simply does + # not run, which means we need to optimise the server-side validation as much as we can to get a good + # experience. + try: cleaned_data["document_data"] = pdf_to_text(uploaded_file) except GetTextError: @@ -57,6 +71,12 @@ def clean(self): return cleaned_data +class DocumentFormWithFulltext(DocumentForm): + class Meta: + model = Document + fields = "__all__" # noqa: DJ007 + + class DocumentAdmin(SimpleHistoryAdmin): ordering = ["title"] list_display = ["title", "license", "document_type", "available"] @@ -65,6 +85,12 @@ class DocumentAdmin(SimpleHistoryAdmin): form = DocumentForm history_list_display = ["title", "license", "document_type", "available"] + def get_form(self, request, *args, **kwargs): + # Show the fulltext field if the user has the requisite permission + if request.user.has_perm("general.can_edit_fulltext"): + kwargs["form"] = DocumentFormWithFulltext + return super(DocumentAdmin, self).get_form(request, *args, **kwargs) + class SubjectAdmin(SimpleHistoryAdmin): ordering = ["name"] diff --git a/app/general/migrations/0014_alter_document_options.py b/app/general/migrations/0014_alter_document_options.py new file mode 100644 index 00000000..cb69acca --- /dev/null +++ b/app/general/migrations/0014_alter_document_options.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.8 on 2024-11-05 09:30 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('general', '0013_rename_documentfile_document_and_more'), + ] + + operations = [ + migrations.AlterModelOptions( + name='document', + options={'permissions': [('can_edit_fulltext', 'Can edit document fulltext (used for search)')], 'verbose_name': 'Document', 'verbose_name_plural': 'Documents'}, + ), + ] diff --git a/app/general/migrations/0015_alter_document_document_data.py b/app/general/migrations/0015_alter_document_document_data.py new file mode 100644 index 00000000..590fdb2e --- /dev/null +++ b/app/general/migrations/0015_alter_document_document_data.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.8 on 2024-11-07 09:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('general', '0014_alter_document_options'), + ] + + operations = [ + migrations.AlterField( + model_name='document', + name='document_data', + field=models.TextField(blank=True, help_text="The searchable text extracted from the document where possible, but it can also be edited.", verbose_name='Searchable content'), + ), + ] diff --git a/app/general/models.py b/app/general/models.py index 0e56d660..43fe0be5 100644 --- a/app/general/models.py +++ b/app/general/models.py @@ -135,7 +135,13 @@ class Document(models.Model): document_type = models.CharField( max_length=200, choices=document_type_choices, verbose_name=_("document category") ) - document_data = models.TextField(blank=True, verbose_name=_("document data")) + document_data = models.TextField( + blank=True, + verbose_name=_("Searchable content"), + help_text=_( + "The searchable text extracted from the document where possible, but it can also be edited." + ), + ) institution = models.ForeignKey( "Institution", on_delete=models.CASCADE, verbose_name=_("institution") ) @@ -167,5 +173,7 @@ class Meta: GinIndex(fields=["search_vector"]), ] + permissions = [("can_edit_fulltext", "Can edit document fulltext (used for search)")] + def __str__(self): return self.title diff --git a/app/general/tests/test_document_admin.py b/app/general/tests/test_document_admin.py index 11d14b0a..3b31e3d4 100644 --- a/app/general/tests/test_document_admin.py +++ b/app/general/tests/test_document_admin.py @@ -2,12 +2,14 @@ import unittest from django.core.files.uploadedfile import SimpleUploadedFile +from django.test import TestCase +from django.urls import reverse -from general.admin import DocumentForm -from general.models import Institution +from general.admin import DocumentForm, DocumentFormWithFulltext +from general.models import Document, Institution -class TestDocumentForm(unittest.TestCase): +class TestDocumentForm(TestCase): def __init__(self, methodName: str = "runTest"): super().__init__(methodName) self.form = None @@ -20,6 +22,12 @@ def setUp(self): pdf_file = f.read() self.file_mock = SimpleUploadedFile("test.pdf", pdf_file, content_type="application/pdf") + self.test_institution = Institution.objects.create( + name="Test Institution for Document tests" + ) + + def tearDown(self): + self.test_institution.delete() def test_clean_without_url_and_file(self): tests_form = { @@ -27,7 +35,7 @@ def test_clean_without_url_and_file(self): "license": "MIT", "document_type": "Glossary", "mime_type": "pdf", - "institution": Institution.objects.create(name="Test Institution"), + "institution": self.test_institution, "url": "", "uploaded_file": "", "description": "Test description", @@ -46,7 +54,7 @@ def test_clean_without_file(self): "license": "(c)", "document_type": "Glossary", "mime_type": "pdf", - "institution": Institution.objects.create(name="Test Institution 2"), + "institution": self.test_institution, "url": "www.example.com", "uploaded_file": "", "document_data": "", @@ -63,7 +71,7 @@ def test_clean_without_url(self): "license": "CC0", "document_type": "Glossary", "mime_type": "pdf", - "institution": Institution.objects.create(name="Test Institution 3"), + "institution": self.test_institution, "url": "", "uploaded_file": self.file_mock, "document_data": "", @@ -81,7 +89,7 @@ def test_clean_with_large_file(self): "license": "MIT", "document_type": "Glossary", "mime_type": "pdf", - "institution": Institution.objects.create(name="Test Institution 4"), + "institution": self.test_institution, "url": "", "uploaded_file": self.file_mock, "description": "Test description", @@ -92,6 +100,94 @@ def test_clean_with_large_file(self): self.assertIn("uploaded_file", form.errors) self.assertEqual(form.errors["uploaded_file"], ["File size must not exceed 10MB."]) + def test_edited_pdf_takes_priority_over_unedited_fulltext(self): + tests_form = { + "title": "Test", + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": "", + "description": "Test description", + } + + # Test both kinds of forms - both can have edited PDFs and unedited fulltext + for FormType in [DocumentForm, DocumentFormWithFulltext]: + form = FormType(tests_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + self.assertNotEqual(form.cleaned_data["document_data"], "") + + def test_edited_fulltext_takes_priority_over_edited_pdf(self): + custom_data = "testingstringthatisnotinsidelorem.pdf" + tests_form = { + "title": "Test", + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": custom_data, + "description": "Test description", + } + + # We only test the DocumentFormWithFulltext because this is the only one that can have edited fulltext + form = DocumentFormWithFulltext(tests_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + self.assertEqual(form.cleaned_data["document_data"], custom_data) + + def test_edited_fulltext_reflects_in_database_and_search(self): + title = "Test document with edited fulltext" + custom_data = "testingstringthatisnotinsidelorem.pdf" + + original_form = { + "title": title, + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": "", + "description": "Test description", + } + + # Upload the form with the PDF fulltext extracted + form = DocumentForm(original_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + doc = form.save() + orig_fulltext = doc.document_data + self.assertNotEqual(orig_fulltext, custom_data) + + # Check we can search by PDF content + response = self.client.get(reverse("search"), {"search": orig_fulltext}) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.context["page_obj"][0]["heading"], title) + + # Now, upload a copy with edited fulltext + edit_form = {**original_form, "document_data": custom_data, "id": doc.id} + form = DocumentFormWithFulltext( + edit_form, files={"uploaded_file": self.file_mock}, instance=doc + ) + self.assertTrue(form.is_valid()) + doc = form.save() + self.assertEqual(doc.document_data, custom_data) + + # Check that: 1. we only have one document with this title, and 2. it has the correct fulltext + self.assertEqual(Document.objects.get(title=title).document_data, custom_data) + + # Check we can search by the new content too + response = self.client.get(reverse("search"), {"search": custom_data}) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.context["page_obj"][0]["heading"], title) + + # Check that we CANNOT search by the old content anymore + response = self.client.get(reverse("search"), {"search": orig_fulltext}) + self.assertEqual(response.status_code, 200) + self.assertEqual(len(response.context["page_obj"]), 0) + if __name__ == "__main__": unittest.main()