Skip to content

Commit

Permalink
feat: purge useless data files (#90)
Browse files Browse the repository at this point in the history
  • Loading branch information
xyb authored Nov 7, 2023
1 parent 74c269b commit 4d9f0b0
Show file tree
Hide file tree
Showing 9 changed files with 296 additions and 16 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,17 @@ $ curl -X POST localhost:8000/task/${task_id}/restart_downloading/
```
This simply restarts the download process for samples and full files, but skips the steps of saving and retrieving the file list.

### purge files of deleted leecher tasks

After the service has been running for a long time, a large number of files belonging to deleted leecher tasks will accumulate. To delete the files you no longer need, call the purge API:
```sh
$ curl -X POST localhost:8000/task/purge/
```
By default, purged files are moved to the trash folder `baidupcsleecher_trash`, and you have to delete them from there manually. If you want the files to be deleted permanently instead, set the parameter `move_to_trash=false`:
```sh
$ curl -X POST -d "move_to_trash=false" localhost:8000/task/purge/
```

## simple ui
You can also directly use the browser to access the simple web interface that comes with the service, submit download tasks, and view the task list.

Expand Down
2 changes: 1 addition & 1 deletion baidupcsleecher/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
INSTALLED_APPS.append("django_browser_reload")

# baidupcsleecher settings
DATA_DIR = Path(getenv("DATA_DIR", "/tmp")).resolve()
DATA_DIR = Path(getenv("DATA_DIR", "/tmp/baidupcsleecher")).resolve()
REMOTE_LEECHER_DIR = str(Path(getenv("REMOTE_LEECHER_DIR", "/leecher")).resolve())
RUNNER_SLEEP_SECONDS = int(getenv("RUNNER_SLEEP_SECONDS", "5"))
SAMPLE_SIZE = int(getenv("SAMPLE_SIZE", "10240"))
Expand Down
18 changes: 11 additions & 7 deletions task/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,30 +98,34 @@ class Meta:
def __repr__(self):
    """Return a debug string with the task id, shared id and total file count."""
    return f"<Task id={self.id}, {self.shared_id} with {self.total_files} files>"

def __str__(self):
def __str__(self) -> str:
    """Return the same text as ``repr(self)``."""
    return repr(self)

@property
def path(self):
def path(self) -> str:
    """Return the task's directory name: ``<shared_id>.<shared_password>``."""
    share, password = self.shared_id, self.shared_password
    return f"{share}.{password}"

@property
def sample_path(self):
def sample_path(self) -> str:
    """Return the directory name used for sample files: ``<path>.sample``."""
    base = self.path
    return f"{base}.sample"

@property
def data_path(self):
def data_path(self) -> Path:
    """Return the local directory of this task's full files under ``DATA_DIR``."""
    data_root = settings.DATA_DIR
    return data_root / self.path

def ensure_data_path(self):
@property
def sample_data_path(self) -> Path:
    """Return the local directory of this task's sample files under ``DATA_DIR``."""
    data_root = settings.DATA_DIR
    return data_root / self.sample_path

def ensure_data_path(self) -> None:
    """Create this task's data directory (and missing parents) if needed.

    Calling it again when the directory already exists is a no-op.
    """
    # The previous call passed ``exists_ok=True`` to ``makedirs``; that keyword
    # does not exist (it is ``exist_ok``), so a TypeError was raised whenever
    # the directory was actually missing.  ``Path.mkdir`` with ``exist_ok``
    # also makes a separate existence check unnecessary.
    self.data_path.mkdir(parents=True, exist_ok=True)

@property
def remote_path(self):
def remote_path(self) -> str:
    """Return this task's directory under ``REMOTE_LEECHER_DIR`` as a string."""
    remote_root = Path(settings.REMOTE_LEECHER_DIR)
    return str(remote_root / self.path)

def set_files(self, files):
def set_files(self, files) -> None:
remote_base_dir = str(Path(settings.REMOTE_LEECHER_DIR) / self.path)
file_list = []
for file in files:
Expand Down
48 changes: 48 additions & 0 deletions task/purge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import shutil
from pathlib import Path
from typing import List

from django.conf import settings

from .models import Task


def purge(move_to_dir: Path = None) -> None:
    """Remove data directories that no longer belong to any task.

    Every directory directly under ``settings.DATA_DIR`` whose name has two
    or three dot-separated parts is treated as a task directory.  Those that
    are neither the ``data_path`` nor the ``sample_data_path`` of an existing
    ``Task`` are removed.

    Args:
        move_to_dir: If given, useless directories are moved into this
            directory instead of being deleted.  If omitted (or falsy), they
            are deleted recursively.
    """
    keep_dirs = set()
    for task in Task.objects.all():
        keep_dirs.add(task.data_path)
        keep_dirs.add(task.sample_data_path)

    exist_dirs = set()
    root = settings.DATA_DIR
    if root.exists():
        for name in os.listdir(root):
            candidate = root / name
            if candidate.is_dir() and len(name.split(".")) in (2, 3):
                exist_dirs.add(candidate)

    useless = exist_dirs - keep_dirs
    print(f"{len(useless)} directories to be deleted.")

    for path in sorted(useless):
        if not path.exists():
            print(f"{path} is not exists, skip deletion.")
            continue
        if move_to_dir:
            print(f"start move {path} to trash dir {move_to_dir} ...")
            to_dir = move_to_dir / path.name
            to_dir.parent.mkdir(parents=True, exist_ok=True)
            # A directory of the same name may already sit in the trash from
            # an earlier purge; pick a free name rather than failing or
            # nesting the new directory inside the old one.
            suffix = 0
            while to_dir.exists():
                suffix += 1
                to_dir = move_to_dir / f"{path.name}.{suffix}"
            # shutil.move falls back to copy + delete when the trash lives on
            # a different filesystem, where a plain os.rename would fail.
            shutil.move(str(path), str(to_dir))
            print(f"{path} moved to trash dir.")
        else:
            print(f"start delete {path} ...")
            shutil.rmtree(path)
            print(f"{path} deleted.")


def remove_tasks(keep_task_ids: List[int] = None) -> List[int]:
    """Delete every task whose id is not in ``keep_task_ids``.

    Args:
        keep_task_ids: Ids of the tasks to keep.  When omitted or empty,
            nothing is deleted.

    Returns:
        The ids of all tasks that exist after the operation, as a plain list.
    """
    # ``None`` replaces the former mutable ``[]`` default.
    if keep_task_ids:
        Task.objects.exclude(id__in=keep_task_ids).delete()
    # Materialise the lazy queryset so the result matches the annotation.
    return list(Task.objects.all().values_list("id", flat=True))
4 changes: 4 additions & 0 deletions task/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,9 @@ class FullDownloadNowSerializer(serializers.Serializer):
full_download_now = serializers.BooleanField()


class PurgeSerializer(serializers.Serializer):
    """Input accepted by a purge request."""

    # True (the default): purged files are moved to the trash folder.
    # False: purged files are deleted completely.
    move_to_trash = serializers.BooleanField(default=True)


class OperationSerializer(serializers.Serializer):
    """Serializer that declares no fields of its own."""

    pass
20 changes: 20 additions & 0 deletions task/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from pathlib import Path

import pytest
from django.conf import settings

from ..utils import walk_files


@pytest.fixture(autouse=True)
def data_dir_setup(tmp_path: Path):
    """Point ``settings.DATA_DIR`` at a per-test temporary directory.

    After the test, the previous ``DATA_DIR`` value is restored and the files
    left in the temporary directory are printed to help debugging.
    """
    previous_data_dir = settings.DATA_DIR
    settings.DATA_DIR = tmp_path
    try:
        yield
    finally:
        # Restore the global setting so the override does not outlive the test.
        settings.DATA_DIR = previous_data_dir

    print(f"files in tmp_path {tmp_path}:")
    total = 0
    for path in walk_files(tmp_path):
        print(f" {path}")
        total += 1
    print(f"total: {total} files")
71 changes: 71 additions & 0 deletions task/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,36 @@
from pathlib import Path
from unittest.mock import MagicMock
from unittest.mock import Mock
from unittest.mock import patch

from django.conf import settings
from django.urls import reverse
from rest_framework import status
from rest_framework.test import APITestCase

from ..models import Task
from ..serializers import TaskSerializer
from ..utils import list_files


def touch_file(path: Path):
    """Create an empty file at ``path``, creating parent directories as needed.

    An existing file at ``path`` is truncated to zero length.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # write_text opens and closes the file itself; the former
    # ``open(path, "w").write("")`` left the handle unclosed.
    path.write_text("")


def touch_task_files(task: Task):
    """Create empty full and sample files for every file entry of ``task``."""
    file_entries = (entry for entry in task.load_files() if entry["is_file"])
    for entry in file_entries:
        relative = entry["path"]
        for base_dir in (task.data_path, task.sample_data_path):
            touch_file(path=base_dir / relative)


class TaskViewSetTestCase(APITestCase):
def setUp(self):
self.task = Task.objects.create(
shared_link="https://pan.baidu.com/s/123abc?pwd=def",
shared_id="123abc",
shared_password="def",
)
self.remote_files = [
{
Expand Down Expand Up @@ -247,9 +264,13 @@ def test_delete_remote_files(self, mock_get_baidupcs_client):

def test_delete_local_files(self):
    """Deleting a task's local files leaves the data directory empty."""
    task_id = self.task.id
    touch_file(path=self.task.data_path / "test.txt")
    assert list_files(settings.DATA_DIR) != []

    url = reverse("task-local-files", args=[task_id])
    response = self.client.delete(url)

    assert response.json() == {str(task_id): "local files deleted"}
    assert list_files(settings.DATA_DIR) == []

@patch("task.views.get_baidupcs_client")
def test_erase(self, mock_get_baidupcs_client):
Expand All @@ -259,3 +280,53 @@ def test_erase(self, mock_get_baidupcs_client):
response = self.client.delete(reverse("task-erase", args=[id]))

assert response.json() == {str(id): "task deleted"}
assert len(Task.objects.filter(pk=id)) == 0

def test_purge(self):
    """Purging after the task is deleted moves its files into the trash folder."""
    touch_task_files(self.task)
    self.task.delete()
    files_before = sorted(list_files(settings.DATA_DIR))
    assert files_before == [
        "123abc.def.sample/张楚/孤独的人是可耻的.mp3",
        "123abc.def.sample/张楚/蚂蚁蚂蚁.mp3",
        "123abc.def/张楚/孤独的人是可耻的.mp3",
        "123abc.def/张楚/蚂蚁蚂蚁.mp3",
    ]

    purge_url = reverse("task-purge")
    response = self.client.post(purge_url)

    assert response.json() == {"done": True}
    files_after = sorted(list_files(settings.DATA_DIR))
    assert files_after == [
        "baidupcsleecher_trash/123abc.def.sample/张楚/孤独的人是可耻的.mp3",
        "baidupcsleecher_trash/123abc.def.sample/张楚/蚂蚁蚂蚁.mp3",
        "baidupcsleecher_trash/123abc.def/张楚/孤独的人是可耻的.mp3",
        "baidupcsleecher_trash/123abc.def/张楚/蚂蚁蚂蚁.mp3",
    ]

def test_purge_all(self):
    """Purging with ``move_to_trash`` false removes the files entirely."""
    touch_task_files(self.task)
    self.task.delete()
    assert list_files(settings.DATA_DIR) != []

    purge_url = reverse("task-purge")
    payload = {"move_to_trash": False}
    response = self.client.post(purge_url, data=payload, format="json")

    assert response.json() == {"done": True}
    assert list_files(settings.DATA_DIR) == []

def test_purge_nothing(self):
    """Purging while the task still exists leaves its files untouched."""
    touch_task_files(self.task)
    expected = [
        "123abc.def.sample/张楚/孤独的人是可耻的.mp3",
        "123abc.def.sample/张楚/蚂蚁蚂蚁.mp3",
        "123abc.def/张楚/孤独的人是可耻的.mp3",
        "123abc.def/张楚/蚂蚁蚂蚁.mp3",
    ]
    files = sorted(list_files(settings.DATA_DIR))
    assert files == expected

    purge_url = reverse("task-purge")
    response = self.client.post(purge_url)

    assert response.json() == {"done": True}
    assert sorted(list_files(settings.DATA_DIR)) == files
102 changes: 102 additions & 0 deletions task/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import logging
import os
import re
import traceback
from collections import deque
from http.cookies import SimpleCookie
from pathlib import Path
from typing import Iterator
from typing import List
from typing import Tuple
from urllib.parse import parse_qs
from urllib.parse import urlparse

Expand Down Expand Up @@ -112,3 +116,101 @@ def match_regex(string: str, regex: str) -> bool:
"""
pattern = re.compile(regex)
return bool(re.match(pattern, string))


def walk_dir(path: Path) -> Iterator[Tuple[Path, List[os.DirEntry]]]:
    """
    Walk a directory tree breadth-first, yielding each directory together
    with the entries found directly inside it.

    Args:
        path (Path): The path to the directory.

    Yields:
        Tuple[Path, List[os.DirEntry]]: The current directory and the list of
        its directory entries.

    Examples:
        >>> import tempfile
        >>> with tempfile.TemporaryDirectory() as temp_dir:
        ...     test_dir = Path(temp_dir) / "test_dir"
        ...     test_dir.mkdir()
        ...     file1 = test_dir / "file1.txt"
        ...     file1.touch()
        ...     sub_dir = test_dir / "sub_dir"
        ...     sub_dir.mkdir()
        ...     file2 = sub_dir / "file2.txt"
        ...     file2.touch()
        ...     entries = list(walk_dir(test_dir))
        ...     len(entries)
        2
        >>> entries[0][0] == test_dir
        True
        >>> sorted([i.name for i in entries[0][1]])
        ['file1.txt', 'sub_dir']
        >>> entries[1][0] == sub_dir
        True
        >>> sorted([i.name for i in entries[1][1]])
        ['file2.txt']
    """
    # A deque gives O(1) pops from the front of the FIFO queue.
    pending = deque([path])
    while pending:
        current = pending.popleft()
        with os.scandir(current) as scandir_it:
            entries = list(scandir_it)
        yield current, entries
        for entry in entries:
            if entry.is_dir():
                # Public "/" join instead of the private
                # Path._make_child_relpath helper.
                pending.append(current / entry.name)


def walk_files(path: Path) -> Iterator[Path]:
    """
    Yield the path of every non-directory entry found below ``path``.

    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as temp_dir:
    ...     test_dir = Path(temp_dir) / "test_dir"
    ...     test_dir.mkdir()
    ...     file1 = test_dir / "file1.txt"
    ...     file1.touch()
    ...     sub_dir = test_dir / "sub_dir"
    ...     sub_dir.mkdir()
    ...     file2 = sub_dir / "file2.txt"
    ...     file2.touch()
    ...     files = list(walk_files(test_dir))
    ...     len(files)
    2
    >>> [i.name for i in files]
    ['file1.txt', 'file2.txt']
    """
    for root, entries in walk_dir(path):
        for entry in entries:
            if not entry.is_dir():
                # Join with the entry *name*.  Joining with the DirEntry
                # itself uses entry.path, which already contains ``root`` and
                # duplicates the prefix when ``path`` is relative.
                yield root / entry.name


def list_files(root: Path, without_root=True) -> List[str]:
    """
    Return the files below ``root`` as strings, relative to ``root`` unless
    ``without_root`` is false.

    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as temp_dir:
    ...     test_dir = Path(temp_dir) / "test_dir"
    ...     test_dir.mkdir()
    ...     file1 = test_dir / "file1.txt"
    ...     file1.touch()
    ...     sub_dir = test_dir / "sub_dir"
    ...     sub_dir.mkdir()
    ...     file2 = sub_dir / "file2.txt"
    ...     file2.touch()
    ...     files = list_files(test_dir)
    ...     len(files)
    2
    >>> files
    ['file1.txt', 'sub_dir/file2.txt']
    """
    if without_root:
        return [str(found.relative_to(root)) for found in walk_files(root)]
    return [str(found) for found in walk_files(root)]
Loading

0 comments on commit 4d9f0b0

Please sign in to comment.