28 changes: 28 additions & 0 deletions readthedocs/core/tasks.py
@@ -8,9 +8,14 @@
from django.conf import settings
from django.contrib.auth.models import User
from django.core.mail import EmailMultiAlternatives
from django.db.models import Q

from readthedocs.builds.utils import memcache_lock
from readthedocs.core.history import set_change_reason
from readthedocs.core.utils.db import delete_in_batches
from readthedocs.core.utils.db import raw_delete_in_batches
from readthedocs.projects.models import AddonsConfig
from readthedocs.projects.models import ImportedFile
from readthedocs.worker import app


@@ -110,3 +115,26 @@ def delete_object(self, model_name: str, pk: int, user_id: int | None = None):
task_log.info("Object deleted.")
else:
task_log.info("Object does not exist.")


@app.task(queue="web")
def delete_outdated_imported_files(limit):
"""
Delete all imported files that are no longer needed.

We only need to keep track of the top-level ``404.html`` and all ``index.html`` files.
Review comment (Member): I understand that we are not inserting new ImportedFiles that don't meet these conditions, right?

"""
query = ImportedFile.objects.exclude(Q(path="404.html") | Q(name="index.html"))
raw_delete_in_batches(query, limit=limit)


@app.task(queue="web")
def delete_orphaned_addons_configs(limit):
"""
Delete all AddonsConfig objects.

These were created by mistake, and are not linked to any project.
"""
# NOTE: we can't raw delete because AddonsConfig is related to other models (AddonSearchFilter).
query = AddonsConfig.objects.filter(project=None)
delete_in_batches(query, limit=limit)
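
Note (not part of the PR): the two tasks above are plain Celery tasks bound to the "web" queue, so they can also be run ad hoc. A minimal sketch of doing that from a Django shell follows; the limit values are arbitrary examples, not values prescribed by this change.

# Illustrative only, not part of this PR. Assumes a Django shell with the
# Celery app configured; the limits below are example values.
from readthedocs.core.tasks import (
    delete_orphaned_addons_configs,
    delete_outdated_imported_files,
)

# A Celery task is still a callable: this runs the body synchronously in the
# current process, which is handy for a quick sanity check.
delete_outdated_imported_files(limit=1_000)

# Or enqueue it, the way Celery beat does; the decorator's queue="web"
# provides the default routing.
delete_orphaned_addons_configs.apply_async(kwargs={"limit": 5_000})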
180 changes: 179 additions & 1 deletion readthedocs/core/tests/test_db_utils.py
@@ -2,7 +2,7 @@
from django_dynamic_fixture import get

from readthedocs.builds.models import Version
from readthedocs.core.utils.db import delete_in_batches
from readthedocs.core.utils.db import delete_in_batches, raw_delete_in_batches
from readthedocs.projects.models import Project


@@ -127,3 +127,181 @@ def test_versions_deletion_in_batches(self):

# The project should still exist
assert Project.objects.filter(slug="version-test").count() == 1

def test_delete_with_limit_smaller_than_total(self):
"""Test deleting with a limit smaller than total queryset count."""
# Create 20 projects
projects = [get(Project, slug=f"limit-project-{i}") for i in range(20)]
Review comment (Copilot AI, Jan 30, 2026): Variable projects is not used. Suggested change: replace the list comprehension with a plain loop, e.g. `for i in range(20): get(Project, slug=f"limit-project-{i}")`.

# Delete only 10 projects with a limit
queryset = Project.objects.filter(slug__startswith="limit-project-")
total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=3, limit=10)

# Should delete exactly 10 projects and their related objects
assert deleted_counter["projects.Project"] == 10
# Should still have 10 projects remaining
assert Project.objects.filter(slug__startswith="limit-project-").count() == 10

def test_delete_with_limit_larger_than_total(self):
"""Test deleting with a limit larger than total queryset count."""
# Create 5 projects
projects = [get(Project, slug=f"over-limit-{i}") for i in range(5)]
Review comment (Copilot AI, Jan 30, 2026): Variable projects is not used.
Reply (Member Author): Copilot, YOU wrote these tests >:x

# Set limit larger than actual count
queryset = Project.objects.filter(slug__startswith="over-limit-")
total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=2, limit=100)

# Should delete all 5 projects
assert deleted_counter["projects.Project"] == 5
assert Project.objects.filter(slug__startswith="over-limit-").count() == 0

def test_delete_with_limit_equal_to_batch_size(self):
"""Test deleting with a limit equal to the batch size."""
# Create 10 projects
projects = [get(Project, slug=f"equal-limit-{i}") for i in range(10)]
Review comment (Copilot AI, Jan 30, 2026): Variable projects is not used.

# Set limit equal to batch_size
queryset = Project.objects.filter(slug__startswith="equal-limit-")
total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=5, limit=5)

# Should delete exactly 5 projects
assert deleted_counter["projects.Project"] == 5
assert Project.objects.filter(slug__startswith="equal-limit-").count() == 5

def test_delete_with_limit_one(self):
"""Test deleting with limit=1 (edge case)."""
# Create 5 projects
projects = [get(Project, slug=f"one-limit-{i}") for i in range(5)]
Review comment (Copilot AI, Jan 30, 2026): Variable projects is not used.

# Set limit to 1
queryset = Project.objects.filter(slug__startswith="one-limit-")
total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=2, limit=1)

# Should delete exactly 1 project and its related objects
assert deleted_counter["projects.Project"] == 1
assert Project.objects.filter(slug__startswith="one-limit-").count() == 4


class TestRawDeleteInBatches(TestCase):
"""Tests for the raw_delete_in_batches utility function."""

def test_empty_queryset(self):
"""Test raw deleting an empty queryset."""
queryset = Version.objects.none()
# Should not raise any errors
raw_delete_in_batches(queryset, batch_size=10)

def test_queryset_smaller_than_batch_size(self):
"""Test that a queryset smaller than batch_size uses regular raw delete."""
# Create a project with 3 versions
project = get(Project, slug="raw-small-project")
versions = [get(Version, project=project, slug=f"raw-v{i}") for i in range(3)]
Review comment (Copilot AI, Jan 30, 2026): Variable versions is not used.

initial_count = Version.objects.filter(slug__startswith="raw-v").count()
assert initial_count == 3

# Delete with a batch_size larger than the number of objects
queryset = Version.objects.filter(slug__startswith="raw-v")
raw_delete_in_batches(queryset, batch_size=10)

# Verify versions were deleted
assert Version.objects.filter(slug__startswith="raw-v").count() == 0

def test_queryset_equal_to_batch_size(self):
"""Test that a queryset equal to batch_size uses regular raw delete."""
# Create a project with exactly 5 versions
project = get(Project, slug="raw-equal-project")
versions = [get(Version, project=project, slug=f"raw-eq-{i}") for i in range(5)]
Review comment (Copilot AI, Jan 30, 2026): Variable versions is not used.

# Delete with a batch_size equal to the number of objects
queryset = Version.objects.filter(slug__startswith="raw-eq-")
raw_delete_in_batches(queryset, batch_size=5)

assert Version.objects.filter(slug__startswith="raw-eq-").count() == 0

def test_queryset_larger_than_batch_size(self):
"""Test that a queryset larger than batch_size is raw deleted in batches."""
# Create a project with 10 versions
project = get(Project, slug="raw-large-project")
versions = [get(Version, project=project, slug=f"raw-large-{i}") for i in range(10)]
Review comment (Copilot AI, Jan 30, 2026): Variable versions is not used.

# Delete with a batch_size smaller than the number of objects
queryset = Version.objects.filter(slug__startswith="raw-large-")
raw_delete_in_batches(queryset, batch_size=3)

assert Version.objects.filter(slug__startswith="raw-large-").count() == 0

def test_multiple_batches(self):
"""Test raw deletion works correctly across multiple batches."""
# Create a project with 25 versions
project = get(Project, slug="raw-batch-project")
versions = [get(Version, project=project, slug=f"raw-batch-{i}") for i in range(25)]
Review comment (Copilot AI, Jan 30, 2026): Variable versions is not used.

# Delete with batch_size=7, requiring 4 batches (7, 7, 7, 4)
queryset = Version.objects.filter(slug__startswith="raw-batch-")
raw_delete_in_batches(queryset, batch_size=7)

assert Version.objects.filter(slug__startswith="raw-batch-").count() == 0

def test_batch_size_one(self):
"""Test raw deletion with batch_size=1 (edge case)."""
# Create a project with 3 versions
project = get(Project, slug="raw-single-project")
versions = [get(Version, project=project, slug=f"raw-single-{i}") for i in range(3)]
Review comment (Copilot AI, Jan 30, 2026): Variable versions is not used.

queryset = Version.objects.filter(slug__startswith="raw-single-")
raw_delete_in_batches(queryset, batch_size=1)

assert Version.objects.filter(slug__startswith="raw-single-").count() == 0

def test_raw_delete_with_limit_smaller_than_total(self):
"""Test raw deleting with a limit smaller than total queryset count."""
# Create a project with 20 versions
project = get(Project, slug="raw-limit-project")
versions = [get(Version, project=project, slug=f"raw-limit-{i}") for i in range(20)]
Review comment (Copilot AI, Jan 30, 2026): Variable versions is not used. Suggested change: replace the list comprehension with a plain loop, e.g. `for i in range(20): get(Version, project=project, slug=f"raw-limit-{i}")`.

# Delete only 10 versions with a limit
queryset = Version.objects.filter(slug__startswith="raw-limit-")
raw_delete_in_batches(queryset, batch_size=3, limit=10)

# Should have 10 versions remaining
assert Version.objects.filter(slug__startswith="raw-limit-").count() == 10

def test_raw_delete_with_limit_larger_than_total(self):
"""Test raw deleting with a limit larger than total queryset count."""
# Create a project with 5 versions
project = get(Project, slug="raw-over-limit")
versions = [get(Version, project=project, slug=f"raw-over-{i}") for i in range(5)]
Review comment (Copilot AI, Jan 30, 2026): Variable versions is not used.

# Set limit larger than actual count
queryset = Version.objects.filter(slug__startswith="raw-over-")
raw_delete_in_batches(queryset, batch_size=2, limit=100)

# Should delete all 5 versions
assert Version.objects.filter(slug__startswith="raw-over-").count() == 0

def test_raw_delete_with_limit_equal_to_batch_size(self):
"""Test raw deleting with a limit equal to the batch size."""
# Create a project with 10 versions
project = get(Project, slug="raw-equal-limit")
versions = [get(Version, project=project, slug=f"raw-eq-lim-{i}") for i in range(10)]
Review comment (Copilot AI, Jan 30, 2026): Variable versions is not used. Suggested change: replace the list comprehension with a plain loop, e.g. `for i in range(10): get(Version, project=project, slug=f"raw-eq-lim-{i}")`.

# Set limit equal to batch_size
queryset = Version.objects.filter(slug__startswith="raw-eq-lim-")
raw_delete_in_batches(queryset, batch_size=5, limit=5)

# Should have 5 versions remaining
assert Version.objects.filter(slug__startswith="raw-eq-lim-").count() == 5

def test_raw_delete_with_limit_one(self):
"""Test raw deleting with limit=1 (edge case)."""
# Create a project with 5 versions
project = get(Project, slug="raw-one-limit")
versions = [get(Version, project=project, slug=f"raw-one-{i}") for i in range(5)]
Review comment (Copilot AI, Jan 30, 2026): Variable versions is not used.

# Set limit to 1
queryset = Version.objects.filter(slug__startswith="raw-one-")
raw_delete_in_batches(queryset, batch_size=2, limit=1)

# Should have 4 versions remaining
assert Version.objects.filter(slug__startswith="raw-one-").count() == 4
75 changes: 67 additions & 8 deletions readthedocs/core/utils/db.py
@@ -2,7 +2,7 @@
from itertools import batched


def delete_in_batches(queryset, batch_size=50) -> tuple[int, dict]:
def delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> tuple[int, dict]:
"""
Delete a queryset in batches to avoid long transactions or big queries.

@@ -15,25 +15,84 @@ def delete_in_batches(queryset, batch_size=50) -> tuple[int, dict]:
Similar to Django's ``QuerySet.delete()``, it returns a tuple with the number of
deleted objects and a dictionary with the number of deletions per model type.

This is also useful when you don't want to overload the DB by deleting
a large number of records at once, since .delete() doesn't support limits/offsets.

:param queryset: Django queryset to delete
:param batch_size: Number of records to delete per batch
:param limit: Maximum number of records to delete from the queryset
"""
# Don't use batch deletion if the number of records
# is smaller or equal to the batch size.
count = queryset.count()
if count == 0:
if limit is not None and limit <= 0:
return 0, {}
if count <= batch_size:
return queryset.delete()

if not limit:
# Don't use batch deletion if the number of records
# is smaller or equal to the batch size.
# Only do this check if no limit is set,
# since with a limit we always need to get a batch.
count = queryset.count()
if count == 0:
return 0, {}
if count <= batch_size:
return queryset.delete()

model = queryset.model
total_deleted = 0
deleted_counter = Counter()
# We can't use a limit or offset with .delete,
# so we first extract the IDs and perform the deletion in anothr query.
# so we first extract the IDs and perform the deletion in another query.
all_pks = queryset.values_list("pk", flat=True)
if limit:
all_pks = all_pks[:limit]
for batch in batched(all_pks, batch_size):
total_deleted_batch, deleted_counter_batch = model.objects.filter(pk__in=batch).delete()
total_deleted += total_deleted_batch
deleted_counter.update(deleted_counter_batch)
return total_deleted, dict(deleted_counter)


def raw_delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> int:
"""
Raw delete a queryset in batches to avoid long transactions or big queries.

Similar to ``delete_in_batches``, but uses ``_raw_delete``.
This is useful when you don't want to overload the DB by deleting
a large number of records at once, since _raw_delete() doesn't support
limits/offsets.

.. warning::

Since this uses ``_raw_delete``, it won't trigger any
signals, and won't cascade delete related objects.
Use it only if you are sure there are no related objects
that need to be deleted/updated.

:return: Number of deleted records
"""
if limit is not None and limit <= 0:
return 0

if not limit:
# Don't use batch deletion if the number of records
# is smaller or equal to the batch size.
# Only do this check if no limit is set,
# since with a limit we always need to get a batch.
count = queryset.count()
if count == 0:
return count
if count <= batch_size:
queryset._raw_delete(queryset.db)
return count

total_deleted = 0
model = queryset.model
# We can't use a limit or offset with .raw_delete,
# so we first extract the IDs and perform the deletion in another query.
all_pks = queryset.values_list("pk", flat=True)
if limit:
all_pks = all_pks[:limit]
for batch in batched(all_pks, batch_size):
qs = model.objects.filter(pk__in=batch)
qs._raw_delete(qs.db)
total_deleted += len(batch)
return total_deleted
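
Note (not part of the diff): taken together with the tasks in readthedocs/core/tasks.py, the two helpers split along one line — whether the rows have related objects. A rough usage sketch is below; the batch sizes and limits are just example values, and the querysets mirror the ones used in this PR. (Both helpers rely on itertools.batched, which requires Python 3.12.)

# Illustrative sketch, not part of the diff: choosing between the two helpers.
from django.db.models import Q

from readthedocs.core.utils.db import delete_in_batches, raw_delete_in_batches
from readthedocs.projects.models import AddonsConfig, ImportedFile

# AddonsConfig is related to other models (AddonSearchFilter), so use the
# ORM-backed helper: it cascades, fires signals, and returns per-model counts.
total_deleted, per_model = delete_in_batches(
    AddonsConfig.objects.filter(project=None),
    batch_size=50,
    limit=5_000,
)

# ImportedFile is deleted with _raw_delete in this PR: no signals, no cascade,
# just one DELETE per batch of primary keys. Returns a plain count.
deleted = raw_delete_in_batches(
    ImportedFile.objects.exclude(Q(path="404.html") | Q(name="index.html")),
    batch_size=50,
    limit=10_000,
)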
26 changes: 25 additions & 1 deletion readthedocs/settings/base.py
@@ -699,8 +699,15 @@ def BUILD_MEMORY_LIMIT(self):
},
"every-day-delete-old-buildata-models": {
"task": "readthedocs.telemetry.tasks.delete_old_build_data",
"schedule": crontab(minute=0, hour=2),
# NOTE: we are running this task every hour for now,
# since we have lots of objects to delete, and we are limiting
# the number of deletions per task run.
# TODO: go back to once a day (unlimited) after we delete the backlog of objects.
# It should take around ~1 week to delete all the old objects on community,
# commercial doesn't have this problem.
"schedule": crontab(minute=0, hour="*"),
"options": {"queue": "web"},
"kwargs": {"limit": 10_000},
},
"every-day-delete-old-buildconfig-models": {
"task": "readthedocs.builds.tasks.remove_orphan_build_config",
Expand Down Expand Up @@ -754,6 +761,23 @@ def BUILD_MEMORY_LIMIT(self):
"schedule": crontab(minute=0, hour=4),
"options": {"queue": "web"},
},
# TODO: delete this task when all imported files pending deletion are done.
# It shuold take around 36 days to delete all the old imported files on community,
Review comment (Copilot AI, Jan 30, 2026): Typo in the comment: "shuold" should be "should".
# and 6 days in commercial.
"every-hour-delete-imported-files": {
"task": "readthedocs.core.tasks.delete_outdated_imported_files",
"schedule": crontab(minute=15, hour="*"),
"options": {"queue": "web"},
"kwargs": {"limit": 10_000},
},
# TODO: delete this task when all orphan AddonConfig objects are deleted.
# It should take around 6 days on community, commercial doesn't have this problem.
"every-hour-delete-orphaned-addons-configs": {
"task": "readthedocs.core.tasks.delete_orphaned_addons_configs",
"schedule": crontab(minute=30, hour="*"),
"options": {"queue": "web"},
"kwargs": {"limit": 5_000},
},
}

# Sentry
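
Note (not part of the diff): for reference, Celery's crontab follows standard cron semantics, so the schedule changes above read as follows.

# Quick reference for the crontab expressions used in this diff (illustrative).
from celery.schedules import crontab

crontab(minute=0, hour=2)     # previous build-data cleanup: once a day, at 02:00
crontab(minute=0, hour="*")   # new build-data cleanup: every hour, on the hour
crontab(minute=15, hour="*")  # imported-files cleanup: hourly, at :15
crontab(minute=30, hour="*")  # orphaned AddonsConfig cleanup: hourly, at :30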