Merge pull request #139 from SADiLaR/feat/edit-fulltext-field

Allow users to edit documents' fulltext fields (used in search)
SADiLaR · Nov 7, 2024 · 4ec3284 · 4ec3284
2 parents 7717c62 + 2c32403
commit 4ec3284
Show file tree

Hide file tree

Showing 6 changed files with 179 additions and 14 deletions.
diff --git a/Makefile b/Makefile
@@ -53,7 +53,7 @@ make-migrations:
 	@docker compose run --rm web python manage.py makemigrations
 
 migrate:
-	@docker compose run --rm web python manage.py migrate
+	@docker compose run --rm web python manage.py migrate $(migration_args)
 
 collectstatic:
 	@docker compose run --rm web python manage.py collectstatic --noinput

diff --git a/app/general/admin.py b/app/general/admin.py
@@ -13,12 +13,12 @@ class DocumentForm(ModelForm):
     class Meta:
         model = Document
         fields = "__all__"  # noqa: DJ007
+        # Hide if the user doesn't have the permission - if they do, they get a DocumentFormWithFulltext instead
+        widgets = {"document_data": HiddenInput()}
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        self.fields["document_data"].widget = HiddenInput()
-
         # If the instance has a mime_type, the field should be disabled
         if not self.instance.mime_type:
             self.fields["mime_type"].widget = HiddenInput()
@@ -30,7 +30,14 @@ def clean(self):
         url = cleaned_data.get("url", "")
         uploaded_file = cleaned_data.get("uploaded_file", "")
 
-        if uploaded_file:
+        # We don't unconditionally re-extract PDF text, as the fulltext (`document_data` field) can be edited manually
+        # We only want to re-extract the PDF text if the file has changed _and_ the fulltext has not changed. This is to
+        # support the use-case of a user editing both the PDF and the fulltext at the same time. It would be confusing if
+        # the PDF just overrode the text that they explicitly pasted into that field on the same form page!
+        override_existing_fulltext = (
+            "uploaded_file" in self.changed_data and "document_data" not in self.changed_data
+        )
+        if uploaded_file and override_existing_fulltext:
             file_type = magic.from_buffer(uploaded_file.read(), mime=True)
             if file_type != "application/pdf":
                 self.add_error("uploaded_file", _("Only PDF files are allowed."))
@@ -39,8 +46,15 @@ def clean(self):
             limit = 10 * 1024 * 1024
             if uploaded_file.size and uploaded_file.size > limit:
                 self.add_error("uploaded_file", _("File size must not exceed 10MB."))
-            if not self.has_error("uploaded_file"):
-                # Don't parse if validation above failed
+            if len(self.errors) == 0:
+                # Don't parse if validation above failed, or if there are any other errors
+                # We want to delay doing the PDF -> text conversion until there are no other errors with the form,
+                # because it is quite costly. This is compounded by the fact that Django has included a hard-coded
+                # `novalidate` attribute on admin forms for editing (for at least 8 years
+                # https://code.djangoproject.com/ticket/26982). Therefore, the fast clientside validation simply does
+                # not run, which means we need to optimise the server-side validation as much as we can to get a good
+                # experience.
+
                 try:
                     cleaned_data["document_data"] = pdf_to_text(uploaded_file)
                 except GetTextError:
@@ -57,6 +71,12 @@ def clean(self):
         return cleaned_data
 
 
+class DocumentFormWithFulltext(DocumentForm):
+    class Meta:
+        model = Document
+        fields = "__all__"  # noqa: DJ007
+
+
 class DocumentAdmin(SimpleHistoryAdmin):
     ordering = ["title"]
     list_display = ["title", "license", "document_type", "available"]
@@ -65,6 +85,12 @@ class DocumentAdmin(SimpleHistoryAdmin):
     form = DocumentForm
     history_list_display = ["title", "license", "document_type", "available"]
 
+    def get_form(self, request, *args, **kwargs):
+        # Show the fulltext field if the user has the requisite permission
+        if request.user.has_perm("general.can_edit_fulltext"):
+            kwargs["form"] = DocumentFormWithFulltext
+        return super(DocumentAdmin, self).get_form(request, *args, **kwargs)
+
 
 class SubjectAdmin(SimpleHistoryAdmin):
     ordering = ["name"]

diff --git a/app/general/migrations/0014_alter_document_options.py b/app/general/migrations/0014_alter_document_options.py
@@ -0,0 +1,17 @@
+# Generated by Django 5.0.8 on 2024-11-05 09:30
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('general', '0013_rename_documentfile_document_and_more'),
+    ]
+
+    operations = [
+        migrations.AlterModelOptions(
+            name='document',
+            options={'permissions': [('can_edit_fulltext', 'Can edit document fulltext (used for search)')], 'verbose_name': 'Document', 'verbose_name_plural': 'Documents'},
+        ),
+    ]
diff --git a/app/general/migrations/0015_alter_document_document_data.py b/app/general/migrations/0015_alter_document_document_data.py
@@ -0,0 +1,18 @@
+# Generated by Django 5.0.8 on 2024-11-07 09:08
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('general', '0014_alter_document_options'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='document',
+            name='document_data',
+            field=models.TextField(blank=True, help_text="The searchable text extracted from the document where possible, but it can also be edited.", verbose_name='Searchable content'),
+        ),
+    ]
diff --git a/app/general/models.py b/app/general/models.py
@@ -135,7 +135,13 @@ class Document(models.Model):
     document_type = models.CharField(
         max_length=200, choices=document_type_choices, verbose_name=_("document category")
     )
-    document_data = models.TextField(blank=True, verbose_name=_("document data"))
+    document_data = models.TextField(
+        blank=True,
+        verbose_name=_("Searchable content"),
+        help_text=_(
+            "The searchable text extracted from the document where possible, but it can also be edited."
+        ),
+    )
     institution = models.ForeignKey(
         "Institution", on_delete=models.CASCADE, verbose_name=_("institution")
     )
@@ -167,5 +173,7 @@ class Meta:
             GinIndex(fields=["search_vector"]),
         ]
 
+        permissions = [("can_edit_fulltext", "Can edit document fulltext (used for search)")]
+
     def __str__(self):
         return self.title
diff --git a/app/general/tests/test_document_admin.py b/app/general/tests/test_document_admin.py
@@ -2,12 +2,14 @@
 import unittest
 
 from django.core.files.uploadedfile import SimpleUploadedFile
+from django.test import TestCase
+from django.urls import reverse
 
-from general.admin import DocumentForm
-from general.models import Institution
+from general.admin import DocumentForm, DocumentFormWithFulltext
+from general.models import Document, Institution
 
 
-class TestDocumentForm(unittest.TestCase):
+class TestDocumentForm(TestCase):
     def __init__(self, methodName: str = "runTest"):
         super().__init__(methodName)
         self.form = None
@@ -20,14 +22,20 @@ def setUp(self):
             pdf_file = f.read()
 
         self.file_mock = SimpleUploadedFile("test.pdf", pdf_file, content_type="application/pdf")
+        self.test_institution = Institution.objects.create(
+            name="Test Institution for Document tests"
+        )
+
+    def tearDown(self):
+        self.test_institution.delete()
 
     def test_clean_without_url_and_file(self):
         tests_form = {
             "title": "Test",
             "license": "MIT",
             "document_type": "Glossary",
             "mime_type": "pdf",
-            "institution": Institution.objects.create(name="Test Institution"),
+            "institution": self.test_institution,
             "url": "",
             "uploaded_file": "",
             "description": "Test description",
@@ -46,7 +54,7 @@ def test_clean_without_file(self):
             "license": "(c)",
             "document_type": "Glossary",
             "mime_type": "pdf",
-            "institution": Institution.objects.create(name="Test Institution 2"),
+            "institution": self.test_institution,
             "url": "www.example.com",
             "uploaded_file": "",
             "document_data": "",
@@ -63,7 +71,7 @@ def test_clean_without_url(self):
             "license": "CC0",
             "document_type": "Glossary",
             "mime_type": "pdf",
-            "institution": Institution.objects.create(name="Test Institution 3"),
+            "institution": self.test_institution,
             "url": "",
             "uploaded_file": self.file_mock,
             "document_data": "",
@@ -81,7 +89,7 @@ def test_clean_with_large_file(self):
             "license": "MIT",
             "document_type": "Glossary",
             "mime_type": "pdf",
-            "institution": Institution.objects.create(name="Test Institution 4"),
+            "institution": self.test_institution,
             "url": "",
             "uploaded_file": self.file_mock,
             "description": "Test description",
@@ -92,6 +100,94 @@ def test_clean_with_large_file(self):
         self.assertIn("uploaded_file", form.errors)
         self.assertEqual(form.errors["uploaded_file"], ["File size must not exceed 10MB."])
 
+    def test_edited_pdf_takes_priority_over_unedited_fulltext(self):
+        tests_form = {
+            "title": "Test",
+            "license": "CC0",
+            "document_type": "Glossary",
+            "mime_type": "pdf",
+            "institution": self.test_institution,
+            "url": "",
+            "uploaded_file": self.file_mock,
+            "document_data": "",
+            "description": "Test description",
+        }
+
+        # Test both kinds of forms - both can have edited PDFs and unedited fulltext
+        for FormType in [DocumentForm, DocumentFormWithFulltext]:
+            form = FormType(tests_form, files={"uploaded_file": self.file_mock})
+            self.assertTrue(form.is_valid())
+            self.assertNotEqual(form.cleaned_data["document_data"], "")
+
+    def test_edited_fulltext_takes_priority_over_edited_pdf(self):
+        custom_data = "testingstringthatisnotinsidelorem.pdf"
+        tests_form = {
+            "title": "Test",
+            "license": "CC0",
+            "document_type": "Glossary",
+            "mime_type": "pdf",
+            "institution": self.test_institution,
+            "url": "",
+            "uploaded_file": self.file_mock,
+            "document_data": custom_data,
+            "description": "Test description",
+        }
+
+        # We only test the DocumentFormWithFulltext because this is the only one that can have edited fulltext
+        form = DocumentFormWithFulltext(tests_form, files={"uploaded_file": self.file_mock})
+        self.assertTrue(form.is_valid())
+        self.assertEqual(form.cleaned_data["document_data"], custom_data)
+
+    def test_edited_fulltext_reflects_in_database_and_search(self):
+        title = "Test document with edited fulltext"
+        custom_data = "testingstringthatisnotinsidelorem.pdf"
+
+        original_form = {
+            "title": title,
+            "license": "CC0",
+            "document_type": "Glossary",
+            "mime_type": "pdf",
+            "institution": self.test_institution,
+            "url": "",
+            "uploaded_file": self.file_mock,
+            "document_data": "",
+            "description": "Test description",
+        }
+
+        # Upload the form with the PDF fulltext extracted
+        form = DocumentForm(original_form, files={"uploaded_file": self.file_mock})
+        self.assertTrue(form.is_valid())
+        doc = form.save()
+        orig_fulltext = doc.document_data
+        self.assertNotEqual(orig_fulltext, custom_data)
+
+        # Check we can search by PDF content
+        response = self.client.get(reverse("search"), {"search": orig_fulltext})
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.context["page_obj"][0]["heading"], title)
+
+        # Now, upload a copy with edited fulltext
+        edit_form = {**original_form, "document_data": custom_data, "id": doc.id}
+        form = DocumentFormWithFulltext(
+            edit_form, files={"uploaded_file": self.file_mock}, instance=doc
+        )
+        self.assertTrue(form.is_valid())
+        doc = form.save()
+        self.assertEqual(doc.document_data, custom_data)
+
+        # Check that: 1. we only have one document with this title, and 2. it has the correct fulltext
+        self.assertEqual(Document.objects.get(title=title).document_data, custom_data)
+
+        # Check we can search by the new content too
+        response = self.client.get(reverse("search"), {"search": custom_data})
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.context["page_obj"][0]["heading"], title)
+
+        # Check that we CANNOT search by the old content anymore
+        response = self.client.get(reverse("search"), {"search": orig_fulltext})
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(len(response.context["page_obj"]), 0)
+
 
 if __name__ == "__main__":
     unittest.main()