diff --git a/readthedocs/core/tasks.py b/readthedocs/core/tasks.py
index b318bdcfb7b..0fe62486b06 100644
--- a/readthedocs/core/tasks.py
+++ b/readthedocs/core/tasks.py
@@ -8,9 +8,14 @@
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.core.mail import EmailMultiAlternatives
+from django.db.models import Q
 
 from readthedocs.builds.utils import memcache_lock
 from readthedocs.core.history import set_change_reason
+from readthedocs.core.utils.db import delete_in_batches
+from readthedocs.core.utils.db import raw_delete_in_batches
+from readthedocs.projects.models import AddonsConfig
+from readthedocs.projects.models import ImportedFile
 from readthedocs.worker import app
@@ -110,3 +115,26 @@ def delete_object(self, model_name: str, pk: int, user_id: int | None = None):
         task_log.info("Object deleted.")
     else:
         task_log.info("Object does not exist.")
+
+
+@app.task(queue="web")
+def delete_outdated_imported_files(limit):
+    """
+    Delete all imported files that are no longer needed.
+
+    We only need to keep track of the top-level ``404.html`` and all ``index.html`` files.
+    """
+    query = ImportedFile.objects.exclude(Q(path="404.html") | Q(name="index.html"))
+    raw_delete_in_batches(query, limit=limit)
+
+
+@app.task(queue="web")
+def delete_orphaned_addons_configs(limit):
+    """
+    Delete orphaned AddonsConfig objects.
+
+    These were created by mistake, and are not linked to any project.
+    """
+    # NOTE: we can't raw delete because AddonsConfig is related to other models (AddonSearchFilter).
+    query = AddonsConfig.objects.filter(project=None)
+    delete_in_batches(query, limit=limit)
diff --git a/readthedocs/core/tests/test_db_utils.py b/readthedocs/core/tests/test_db_utils.py
index 5e9691029d8..0d88d2bdfe7 100644
--- a/readthedocs/core/tests/test_db_utils.py
+++ b/readthedocs/core/tests/test_db_utils.py
@@ -2,7 +2,7 @@
 from django_dynamic_fixture import get
 
 from readthedocs.builds.models import Version
-from readthedocs.core.utils.db import delete_in_batches
+from readthedocs.core.utils.db import delete_in_batches, raw_delete_in_batches
 from readthedocs.projects.models import Project
@@ -127,3 +127,181 @@ def test_versions_deletion_in_batches(self):
 
         # The project should still exist
         assert Project.objects.filter(slug="version-test").count() == 1
+
+    def test_delete_with_limit_smaller_than_total(self):
+        """Test deleting with a limit smaller than total queryset count."""
+        # Create 20 projects
+        projects = [get(Project, slug=f"limit-project-{i}") for i in range(20)]
+
+        # Delete only 10 projects with a limit
+        queryset = Project.objects.filter(slug__startswith="limit-project-")
+        total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=3, limit=10)
+
+        # Should delete exactly 10 projects and their related objects
+        assert deleted_counter["projects.Project"] == 10
+        # Should still have 10 projects remaining
+        assert Project.objects.filter(slug__startswith="limit-project-").count() == 10
+
+    def test_delete_with_limit_larger_than_total(self):
+        """Test deleting with a limit larger than total queryset count."""
+        # Create 5 projects
+        projects = [get(Project, slug=f"over-limit-{i}") for i in range(5)]
+
+        # Set limit larger than actual count
+        queryset = Project.objects.filter(slug__startswith="over-limit-")
+        total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=2, limit=100)
+
+        # Should delete all 5 projects
+        assert deleted_counter["projects.Project"] == 5
+        assert Project.objects.filter(slug__startswith="over-limit-").count() == 0
+
+    def test_delete_with_limit_equal_to_batch_size(self):
+        """Test deleting with a limit equal to the batch size."""
+        # Create 10 projects
+        projects = [get(Project, slug=f"equal-limit-{i}") for i in range(10)]
+
+        # Set limit equal to batch_size
+        queryset = Project.objects.filter(slug__startswith="equal-limit-")
+        total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=5, limit=5)
+
+        # Should delete exactly 5 projects
+        assert deleted_counter["projects.Project"] == 5
+        assert Project.objects.filter(slug__startswith="equal-limit-").count() == 5
+
+    def test_delete_with_limit_one(self):
+        """Test deleting with limit=1 (edge case)."""
+        # Create 5 projects
+        projects = [get(Project, slug=f"one-limit-{i}") for i in range(5)]
+
+        # Set limit to 1
+        queryset = Project.objects.filter(slug__startswith="one-limit-")
+        total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=2, limit=1)
+
+        # Should delete exactly 1 project and its related objects
+        assert deleted_counter["projects.Project"] == 1
+        assert Project.objects.filter(slug__startswith="one-limit-").count() == 4
+
+
+class TestRawDeleteInBatches(TestCase):
+    """Tests for the raw_delete_in_batches utility function."""
+
+    def test_empty_queryset(self):
+        """Test raw deleting an empty queryset."""
+        queryset = Version.objects.none()
+        # Should not raise any errors
+        raw_delete_in_batches(queryset, batch_size=10)
+
+    def test_queryset_smaller_than_batch_size(self):
+        """Test that a queryset smaller than batch_size uses a single raw delete."""
+        # Create a project with 3 versions
+        project = get(Project, slug="raw-small-project")
+        versions = [get(Version, project=project, slug=f"raw-v{i}") for i in range(3)]
+
+        initial_count = Version.objects.filter(slug__startswith="raw-v").count()
+        assert initial_count == 3
+
+        # Delete with a batch_size larger than the number of objects
+        queryset = Version.objects.filter(slug__startswith="raw-v")
+        raw_delete_in_batches(queryset, batch_size=10)
+
+        # Verify versions were deleted
+        assert Version.objects.filter(slug__startswith="raw-v").count() == 0
+
+    def test_queryset_equal_to_batch_size(self):
+        """Test that a queryset equal to batch_size uses a single raw delete."""
+        # Create a project with exactly 5 versions
+        project = get(Project, slug="raw-equal-project")
+        versions = [get(Version, project=project, slug=f"raw-eq-{i}") for i in range(5)]
+
+        # Delete with a batch_size equal to the number of objects
+        queryset = Version.objects.filter(slug__startswith="raw-eq-")
+        raw_delete_in_batches(queryset, batch_size=5)
+
+        assert Version.objects.filter(slug__startswith="raw-eq-").count() == 0
+
+    def test_queryset_larger_than_batch_size(self):
+        """Test that a queryset larger than batch_size is raw deleted in batches."""
+        # Create a project with 10 versions
+        project = get(Project, slug="raw-large-project")
+        versions = [get(Version, project=project, slug=f"raw-large-{i}") for i in range(10)]
+
+        # Delete with a batch_size smaller than the number of objects
+        queryset = Version.objects.filter(slug__startswith="raw-large-")
+        raw_delete_in_batches(queryset, batch_size=3)
+
+        assert Version.objects.filter(slug__startswith="raw-large-").count() == 0
+
+    def test_multiple_batches(self):
+        """Test raw deletion works correctly across multiple batches."""
+        # Create a project with 25 versions
+        project = get(Project, slug="raw-batch-project")
+        versions = [get(Version, project=project, slug=f"raw-batch-{i}") for i in range(25)]
+
+        # Delete with batch_size=7, requiring 4 batches (7, 7, 7, 4)
+        queryset = Version.objects.filter(slug__startswith="raw-batch-")
+        raw_delete_in_batches(queryset, batch_size=7)
+
+        assert Version.objects.filter(slug__startswith="raw-batch-").count() == 0
+
+    def test_batch_size_one(self):
+        """Test raw deletion with batch_size=1 (edge case)."""
+        # Create a project with 3 versions
+        project = get(Project, slug="raw-single-project")
+        versions = [get(Version, project=project, slug=f"raw-single-{i}") for i in range(3)]
+
+        queryset = Version.objects.filter(slug__startswith="raw-single-")
+        raw_delete_in_batches(queryset, batch_size=1)
+
+        assert Version.objects.filter(slug__startswith="raw-single-").count() == 0
+
+    def test_raw_delete_with_limit_smaller_than_total(self):
+        """Test raw deleting with a limit smaller than total queryset count."""
+        # Create a project with 20 versions
+        project = get(Project, slug="raw-limit-project")
+        versions = [get(Version, project=project, slug=f"raw-limit-{i}") for i in range(20)]
+
+        # Delete only 10 versions with a limit
+        queryset = Version.objects.filter(slug__startswith="raw-limit-")
+        raw_delete_in_batches(queryset, batch_size=3, limit=10)
+
+        # Should have 10 versions remaining
+        assert Version.objects.filter(slug__startswith="raw-limit-").count() == 10
+
+    def test_raw_delete_with_limit_larger_than_total(self):
+        """Test raw deleting with a limit larger than total queryset count."""
+        # Create a project with 5 versions
+        project = get(Project, slug="raw-over-limit")
+        versions = [get(Version, project=project, slug=f"raw-over-{i}") for i in range(5)]
+
+        # Set limit larger than actual count
+        queryset = Version.objects.filter(slug__startswith="raw-over-")
+        raw_delete_in_batches(queryset, batch_size=2, limit=100)
+
+        # Should delete all 5 versions
+        assert Version.objects.filter(slug__startswith="raw-over-").count() == 0
+
+    def test_raw_delete_with_limit_equal_to_batch_size(self):
+        """Test raw deleting with a limit equal to the batch size."""
+        # Create a project with 10 versions
+        project = get(Project, slug="raw-equal-limit")
+        versions = [get(Version, project=project, slug=f"raw-eq-lim-{i}") for i in range(10)]
+
+        # Set limit equal to batch_size
+        queryset = Version.objects.filter(slug__startswith="raw-eq-lim-")
+        raw_delete_in_batches(queryset, batch_size=5, limit=5)
+
+        # Should have 5 versions remaining
+        assert Version.objects.filter(slug__startswith="raw-eq-lim-").count() == 5
+
+    def test_raw_delete_with_limit_one(self):
+        """Test raw deleting with limit=1 (edge case)."""
+        # Create a project with 5 versions
+        project = get(Project, slug="raw-one-limit")
+        versions = [get(Version, project=project, slug=f"raw-one-{i}") for i in range(5)]
+
+        # Set limit to 1
+        queryset = Version.objects.filter(slug__startswith="raw-one-")
+        raw_delete_in_batches(queryset, batch_size=2, limit=1)
+
+        # Should have 4 versions remaining
+        assert Version.objects.filter(slug__startswith="raw-one-").count() == 4
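Aside for reviewers: the batch boundaries the tests above assert (e.g. 25 records with batch_size=7 deleted as 7, 7, 7, 4) come straight from `itertools.batched`, which both helpers use to chunk the PK list. A minimal standalone sketch of that behavior, no Django required:

    from itertools import batched  # stdlib, Python 3.12+

    # batched() yields tuples of at most n items; the final tuple holds
    # whatever remains, which is why 25 PKs split as 7/7/7/4.
    sizes = [len(batch) for batch in batched(range(25), 7)]
    assert sizes == [7, 7, 7, 4]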
diff --git a/readthedocs/core/utils/db.py b/readthedocs/core/utils/db.py
index 9822c9083c3..640539d1fc2 100644
--- a/readthedocs/core/utils/db.py
+++ b/readthedocs/core/utils/db.py
@@ -2,7 +2,7 @@
 
 from itertools import batched
 
 
-def delete_in_batches(queryset, batch_size=50) -> tuple[int, dict]:
+def delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> tuple[int, dict]:
     """
     Delete a queryset in batches to avoid long transactions or big queries.
@@ -15,25 +15,84 @@ def delete_in_batches(queryset, batch_size=50) -> tuple[int, dict]:
     Similar to Django's ``QuerySet.delete()``, it returns a tuple with the
     number of deleted objects and a dictionary with the number of deletions
     per model type.
 
+    This is also useful when you don't want to overload the DB by deleting
+    a large number of records at once, since ``.delete()`` doesn't support
+    limits/offsets.
+
     :param queryset: Django queryset to delete
     :param batch_size: Number of records to delete per batch
+    :param limit: Maximum number of records to delete from the queryset
     """
-    # Don't use batch deletion if the number of records
-    # is smaller or equal to the batch size.
-    count = queryset.count()
-    if count == 0:
+    if limit is not None and limit <= 0:
         return 0, {}
-    if count <= batch_size:
-        return queryset.delete()
+
+    if not limit:
+        # Don't use batch deletion if the number of records
+        # is smaller or equal to the batch size.
+        # Only do this check if no limit is set,
+        # since with a limit we always need to get a batch.
+        count = queryset.count()
+        if count == 0:
+            return 0, {}
+        if count <= batch_size:
+            return queryset.delete()
 
     model = queryset.model
     total_deleted = 0
     deleted_counter = Counter()
     # We can't use a limit or offset with .delete,
-    # so we first extract the IDs and perform the deletion in anothr query.
+    # so we first extract the IDs and perform the deletion in another query.
     all_pks = queryset.values_list("pk", flat=True)
+    if limit:
+        all_pks = all_pks[:limit]
     for batch in batched(all_pks, batch_size):
         total_deleted_batch, deleted_counter_batch = model.objects.filter(pk__in=batch).delete()
         total_deleted += total_deleted_batch
         deleted_counter.update(deleted_counter_batch)
     return total_deleted, dict(deleted_counter)
+
+
+def raw_delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> int:
+    """
+    Raw delete a queryset in batches to avoid long transactions or big queries.
+
+    Similar to ``delete_in_batches``, but uses ``_raw_delete``.
+    This is useful when you don't want to overload the DB by deleting
+    a large number of records at once, since ``_raw_delete()`` doesn't
+    support limits/offsets.
+
+    .. warning::
+
+       Since this uses ``_raw_delete``, it won't trigger any
+       signals, and won't cascade delete related objects.
+       Use it only if you are sure there are no related objects
+       that need to be deleted/updated.
+
+    :return: Number of deleted records
+    """
+    if limit is not None and limit <= 0:
+        return 0
+
+    if not limit:
+        # Don't use batch deletion if the number of records
+        # is smaller or equal to the batch size.
+        # Only do this check if no limit is set,
+        # since with a limit we always need to get a batch.
+        count = queryset.count()
+        if count == 0:
+            return count
+        if count <= batch_size:
+            queryset._raw_delete(queryset.db)
+            return count
+
+    total_deleted = 0
+    model = queryset.model
+    # We can't use a limit or offset with ._raw_delete,
+    # so we first extract the IDs and perform the deletion in another query.
+    all_pks = queryset.values_list("pk", flat=True)
+    if limit:
+        all_pks = all_pks[:limit]
+    for batch in batched(all_pks, batch_size):
+        qs = model.objects.filter(pk__in=batch)
+        qs._raw_delete(qs.db)
+        total_deleted += len(batch)
+    return total_deleted
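A hedged usage sketch of the two helpers above, mirroring the task bodies added in readthedocs/core/tasks.py (the querysets are illustrative; `raw_delete_in_batches` skips signals and cascades, so it only suits models with no dependent rows):

    from django.db.models import Q

    from readthedocs.core.utils.db import delete_in_batches, raw_delete_in_batches
    from readthedocs.projects.models import AddonsConfig, ImportedFile

    # Cascading delete: honors related objects, capped at 5_000 rows per
    # call, 50 rows per DELETE query by default.
    total, per_model = delete_in_batches(
        AddonsConfig.objects.filter(project=None),
        limit=5_000,
    )

    # Raw delete: one DELETE per batch, no signals, no cascade deletion.
    deleted = raw_delete_in_batches(
        ImportedFile.objects.exclude(Q(path="404.html") | Q(name="index.html")),
        limit=10_000,
    )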
diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py
index 4040ffe7bc5..ee6bb717cec 100644
--- a/readthedocs/settings/base.py
+++ b/readthedocs/settings/base.py
@@ -699,8 +699,15 @@ def BUILD_MEMORY_LIMIT(self):
         },
         "every-day-delete-old-buildata-models": {
            "task": "readthedocs.telemetry.tasks.delete_old_build_data",
-           "schedule": crontab(minute=0, hour=2),
+           # NOTE: we are running this task every hour for now,
+           # since we have lots of objects to delete, and we are limiting
+           # the number of deletions per task run.
+           # TODO: go back to once a day (unlimited) after we delete the backlog of objects.
+           # It should take around one week to delete all the old objects on community;
+           # commercial doesn't have this problem.
+           "schedule": crontab(minute=0, hour="*"),
            "options": {"queue": "web"},
+           "kwargs": {"limit": 10_000},
         },
         "every-day-delete-old-buildconfig-models": {
            "task": "readthedocs.builds.tasks.remove_orphan_build_config",
@@ -754,6 +761,23 @@ def BUILD_MEMORY_LIMIT(self):
            "schedule": crontab(minute=0, hour=4),
            "options": {"queue": "web"},
         },
+        # TODO: remove this task once all outdated imported files have been deleted.
+        # It should take around 36 days to delete all the old imported files on community,
+        # and 6 days on commercial.
+        "every-hour-delete-imported-files": {
+           "task": "readthedocs.core.tasks.delete_outdated_imported_files",
+           "schedule": crontab(minute=15, hour="*"),
+           "options": {"queue": "web"},
+           "kwargs": {"limit": 10_000},
+        },
+        # TODO: remove this task once all orphaned AddonsConfig objects have been deleted.
+        # It should take around 6 days on community; commercial doesn't have this problem.
+        "every-hour-delete-orphaned-addons-configs": {
+           "task": "readthedocs.core.tasks.delete_orphaned_addons_configs",
+           "schedule": crontab(minute=30, hour="*"),
+           "options": {"queue": "web"},
+           "kwargs": {"limit": 5_000},
+        },
     }
 
     # Sentry
diff --git a/readthedocs/telemetry/tasks.py b/readthedocs/telemetry/tasks.py
index 9100be0042d..de584294529 100644
--- a/readthedocs/telemetry/tasks.py
+++ b/readthedocs/telemetry/tasks.py
@@ -1,10 +1,10 @@
 """Tasks related to telemetry."""
 
 from django.conf import settings
-from django.db import connections
 from django.utils import timezone
 
 from readthedocs.builds.models import Build
+from readthedocs.core.utils.db import raw_delete_in_batches
 from readthedocs.telemetry.models import BuildData
 from readthedocs.worker import app
@@ -23,26 +23,24 @@ def save_build_data(build_id, data):
 
 
 @app.task(queue="web")
-def delete_old_build_data():
+def delete_old_build_data(limit=None):
     """
     Delete BuildData models older than ``RTD_TELEMETRY_DATA_RETENTION_DAYS``.
 
     This is intended to run from a periodic task daily.
 
     NOTE: the logic of this task could be improved to keep longer data we care
-    more (eg. active projects )and remove data we don't (eg. builds from spam projects)
+    more (eg. active projects) and remove data we don't (eg. builds from spam projects).
     """
     retention_days = settings.RTD_TELEMETRY_DATA_RETENTION_DAYS
     days_ago = timezone.now().date() - timezone.timedelta(days=retention_days)
-    # NOTE: we are using raw SQL here to avoid Django doing a SELECT first to
-    # send `pre_` and `post_` delete signals
-    # See https://docs.djangoproject.com/en/4.2/ref/models/querysets/#delete
-    with connections["telemetry"].cursor() as cursor:
-        cursor.execute(
-            # "SELECT COUNT(*) FROM telemetry_builddata WHERE created BETWEEN %s AND %s",
-            "DELETE FROM telemetry_builddata WHERE created BETWEEN %s AND %s",
-            [
-                days_ago - timezone.timedelta(days=90),
-                days_ago,
-            ],
-        )
+
+    # NOTE: We use _raw_delete to avoid Django fetching all objects
+    # before the deletion. `pre_delete` and `post_delete` signals
+    # won't be sent; this is fine, as we don't have any special logic
+    # for the BuildData model, and it doesn't have related objects.
+    query = BuildData.objects.filter(created__lt=days_ago)
+    if limit:
+        raw_delete_in_batches(query, limit=limit)
+    else:
+        query._raw_delete(query.db)
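Scheduling note: each beat entry's `kwargs` dict is passed through to the task at every tick, so the hourly entries above are roughly equivalent to this manual dispatch (a sketch of what Celery beat does, not code in this change):

    from readthedocs.core.tasks import (
        delete_orphaned_addons_configs,
        delete_outdated_imported_files,
    )
    from readthedocs.telemetry.tasks import delete_old_build_data

    # Celery beat forwards the entry's kwargs and routes to the "web" queue:
    delete_old_build_data.apply_async(kwargs={"limit": 10_000}, queue="web")
    delete_outdated_imported_files.apply_async(kwargs={"limit": 10_000}, queue="web")
    delete_orphaned_addons_configs.apply_async(kwargs={"limit": 5_000}, queue="web")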