Skip to content

Commit 94d90c0

Browse files
committed
feat: purge useless data files
1 parent 74c269b commit 94d90c0

File tree

9 files changed

+296
-16
lines changed

9 files changed

+296
-16
lines changed

README.md

+11
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,17 @@ $ curl -X POST localhost:8000/task/${task_id}/restart_downloading/
173173
```
174174
This simply restarts the download process for samples and full files, but skips the steps of saving and retrieving the file list.
175175

176+
### purge files of deleted leecher tasks
177+
178+
After a long run, files belonging to deleted leecher tasks accumulate on disk. If you no longer need them, call the purge API to delete them:
179+
```sh
180+
$ curl -X POST localhost:8000/task/purge/
181+
```
182+
By default, purged files are moved to the trash folder `baidupcsleecher_trash`, which you have to empty manually. To delete the files permanently instead, set the parameter `move_to_trash=false`:
183+
```sh
184+
$ curl -X POST -d "move_to_trash=false" localhost:8000/task/purge/
185+
```
186+
176187
## simple ui
177188
You can also directly use the browser to access the simple web interface that comes with the service, submit download tasks, and view the task list.
178189

baidupcsleecher/settings.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
INSTALLED_APPS.append("django_browser_reload")
5858

5959
# baidupcsleecher settings
60-
DATA_DIR = Path(getenv("DATA_DIR", "/tmp")).resolve()
60+
DATA_DIR = Path(getenv("DATA_DIR", "/tmp/baidupcsleecher")).resolve()
6161
REMOTE_LEECHER_DIR = str(Path(getenv("REMOTE_LEECHER_DIR", "/leecher")).resolve())
6262
RUNNER_SLEEP_SECONDS = int(getenv("RUNNER_SLEEP_SECONDS", "5"))
6363
SAMPLE_SIZE = int(getenv("SAMPLE_SIZE", "10240"))

task/models.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -98,30 +98,34 @@ class Meta:
9898
def __repr__(self):
9999
return f"<Task id={self.id}, {self.shared_id} with {self.total_files} files>"
100100

101-
def __str__(self):
101+
def __str__(self) -> str:
102102
return repr(self)
103103

104104
@property
105-
def path(self):
105+
def path(self) -> str:
106106
return f"{self.shared_id}.{self.shared_password}"
107107

108108
@property
109-
def sample_path(self):
109+
def sample_path(self) -> str:
110110
return f"{self.path}.sample"
111111

112112
@property
113-
def data_path(self):
113+
def data_path(self) -> Path:
114114
return settings.DATA_DIR / self.path
115115

116-
def ensure_data_path(self):
116+
@property
117+
def sample_data_path(self) -> Path:
118+
return settings.DATA_DIR / self.sample_path
119+
120+
def ensure_data_path(self) -> None:
    """Create this task's local data directory if it does not exist yet.

    Missing parent directories are created as well, and calling this on an
    already existing directory is a no-op.
    """
    # The previous call passed ``exists_ok`` (misspelled ``exist_ok``), which
    # is not a valid keyword. ``exist_ok=True`` also makes a separate
    # ``exists()`` pre-check unnecessary and avoids the check-then-create race.
    self.data_path.mkdir(parents=True, exist_ok=True)
119123

120124
@property
121-
def remote_path(self):
125+
def remote_path(self) -> str:
122126
return str(Path(settings.REMOTE_LEECHER_DIR) / self.path)
123127

124-
def set_files(self, files):
128+
def set_files(self, files) -> None:
125129
remote_base_dir = str(Path(settings.REMOTE_LEECHER_DIR) / self.path)
126130
file_list = []
127131
for file in files:

task/purge.py

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import os
2+
import shutil
3+
from pathlib import Path
4+
from typing import List
5+
6+
from django.conf import settings
7+
8+
from .models import Task
9+
10+
11+
def _unique_destination(parent: Path, name: str) -> Path:
    """Return ``parent / name``, or a numbered sibling if that path is taken."""
    candidate = parent / name
    suffix = 1
    while candidate.exists():
        candidate = parent / f"{name}.{suffix}"
        suffix += 1
    return candidate


def purge(move_to_dir: Path = None) -> None:
    """
    Remove data directories under ``settings.DATA_DIR`` that no task owns.

    A directory is a purge candidate when it sits directly under
    ``settings.DATA_DIR`` and its name splits on "." into two or three parts
    (the shape produced by ``Task.path`` and ``Task.sample_path``).
    Candidates that are the ``data_path`` or ``sample_data_path`` of an
    existing task are kept; all others are removed.

    Args:
        move_to_dir: Directory to move purged directories into. When it is
            falsy (the default ``None``), purged directories are deleted
            permanently instead.
    """
    keep_dirs = set()
    for task in Task.objects.all():
        keep_dirs.update((task.data_path, task.sample_data_path))

    root = settings.DATA_DIR
    exist_dirs = set()
    if root.is_dir():
        for name in os.listdir(root):
            candidate = root / name
            if candidate.is_dir() and len(name.split(".")) in (2, 3):
                exist_dirs.add(candidate)

    trash_dir = Path(move_to_dir) if move_to_dir else None

    useless = exist_dirs - keep_dirs
    if trash_dir is not None:
        # Never treat the trash directory itself as something to purge, even
        # if its name happens to match the candidate pattern.
        useless.discard(trash_dir)
    print(f"{len(useless)} directories to be deleted.")

    for directory in sorted(useless):
        if not directory.exists():
            print(f"{directory} is not exists, skip deletion.")
            continue
        if trash_dir is not None:
            print(f"start move {directory} to trash dir {trash_dir} ...")
            trash_dir.mkdir(parents=True, exist_ok=True)
            # A directory with the same name may already be in the trash from
            # an earlier purge; pick a free name instead of failing.
            destination = _unique_destination(trash_dir, directory.name)
            # shutil.move falls back to copy + delete when the trash lives on
            # a different filesystem, where a plain rename would fail.
            shutil.move(str(directory), str(destination))
            print(f"{directory} moved to trash dir.")
        else:
            print(f"start delete {directory} ...")
            shutil.rmtree(directory)
            print(f"{directory} deleted.")
42+
43+
44+
def remove_tasks(keep_task_ids: List[int] = None) -> List[int]:
    """
    Delete every task whose id is not listed in ``keep_task_ids``.

    Args:
        keep_task_ids: Ids of the tasks to keep. When empty or ``None``,
            no task is deleted.

    Returns:
        The ids of the tasks that exist after the deletion.
    """
    # ``None`` replaces the former mutable ``[]`` default, which would be
    # shared between calls.
    if keep_task_ids:
        Task.objects.exclude(id__in=keep_task_ids).delete()
    # Materialise the queryset so the result really is the annotated list.
    return list(Task.objects.all().values_list("id", flat=True))

task/serializers.py

+4
Original file line numberDiff line numberDiff line change
@@ -60,5 +60,9 @@ class FullDownloadNowSerializer(serializers.Serializer):
6060
full_download_now = serializers.BooleanField()
6161

6262

63+
class PurgeSerializer(serializers.Serializer):
    """Input payload of the purge endpoint.

    ``move_to_trash`` defaults to ``True`` when the client omits it.
    """

    move_to_trash = serializers.BooleanField(default=True)
65+
66+
6367
class OperationSerializer(serializers.Serializer):
6468
pass

task/tests/conftest.py

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from pathlib import Path
2+
3+
import pytest
4+
from django.conf import settings
5+
6+
from ..utils import walk_files
7+
8+
9+
@pytest.fixture(autouse=True)
def data_dir_setup(tmp_path: Path):
    """
    Point ``settings.DATA_DIR`` at a per-test temporary directory.

    After the test, the previous ``settings.DATA_DIR`` is restored and the
    files left in the temporary directory are printed.
    """
    previous_data_dir = settings.DATA_DIR
    settings.DATA_DIR = tmp_path
    try:
        yield
    finally:
        # Restore the global setting so it does not keep pointing at a
        # temporary directory once this test is over.
        settings.DATA_DIR = previous_data_dir

    print(f"files in tmp_path {tmp_path}:")
    total = 0
    for path in walk_files(tmp_path):
        print(f" {path}")
        total += 1
    print(f"total: {total} files")

task/tests/test_api.py

+71
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,36 @@
1+
from pathlib import Path
12
from unittest.mock import MagicMock
23
from unittest.mock import Mock
34
from unittest.mock import patch
45

6+
from django.conf import settings
57
from django.urls import reverse
68
from rest_framework import status
79
from rest_framework.test import APITestCase
810

911
from ..models import Task
1012
from ..serializers import TaskSerializer
13+
from ..utils import list_files
14+
15+
16+
def touch_file(path: Path):
    """Create an empty file at ``path``, creating parent directories as needed.

    An existing file at ``path`` is truncated to zero length.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Use a context manager so the handle is closed deterministically instead
    # of being left for the garbage collector.
    with open(path, "w") as handle:
        handle.write("")
19+
20+
21+
def touch_task_files(task: Task):
    """Create empty placeholder files for every file entry of ``task``.

    Each entry from ``task.load_files()`` flagged ``is_file`` is created
    under both ``task.data_path`` and ``task.sample_data_path``.
    """
    for entry in task.load_files():
        if not entry["is_file"]:
            continue
        relative = entry["path"]
        for base_dir in (task.data_path, task.sample_data_path):
            touch_file(path=base_dir / relative)
1126

1227

1328
class TaskViewSetTestCase(APITestCase):
1429
def setUp(self):
1530
self.task = Task.objects.create(
1631
shared_link="https://pan.baidu.com/s/123abc?pwd=def",
32+
shared_id="123abc",
33+
shared_password="def",
1734
)
1835
self.remote_files = [
1936
{
@@ -247,9 +264,13 @@ def test_delete_remote_files(self, mock_get_baidupcs_client):
247264

248265
def test_delete_local_files(self):
    """Deleting a task's local files leaves the data directory empty."""
    task_id = self.task.id
    placeholder = self.task.data_path / "test.txt"
    touch_file(path=placeholder)
    assert list_files(settings.DATA_DIR) != []

    url = reverse("task-local-files", args=[task_id])
    response = self.client.delete(url)

    assert response.json() == {str(task_id): "local files deleted"}
    assert list_files(settings.DATA_DIR) == []
253274

254275
@patch("task.views.get_baidupcs_client")
255276
def test_erase(self, mock_get_baidupcs_client):
@@ -259,3 +280,53 @@ def test_erase(self, mock_get_baidupcs_client):
259280
response = self.client.delete(reverse("task-erase", args=[id]))
260281

261282
assert response.json() == {str(id): "task deleted"}
283+
assert len(Task.objects.filter(pk=id)) == 0
284+
285+
def test_purge(self):
    """Files of a deleted task are moved into the trash folder by default."""
    touch_task_files(self.task)
    self.task.delete()
    # list_files() reports entries in os.scandir order, which is arbitrary,
    # so compare sorted lists to keep the test filesystem-independent.
    assert sorted(list_files(settings.DATA_DIR)) == sorted(
        [
            "123abc.def/张楚/孤独的人是可耻的.mp3",
            "123abc.def/张楚/蚂蚁蚂蚁.mp3",
            "123abc.def.sample/张楚/孤独的人是可耻的.mp3",
            "123abc.def.sample/张楚/蚂蚁蚂蚁.mp3",
        ]
    )

    response = self.client.post(reverse("task-purge"))

    assert response.json() == {"done": True}
    assert sorted(list_files(settings.DATA_DIR)) == sorted(
        [
            "baidupcsleecher_trash/123abc.def/张楚/孤独的人是可耻的.mp3",
            "baidupcsleecher_trash/123abc.def/张楚/蚂蚁蚂蚁.mp3",
            "baidupcsleecher_trash/123abc.def.sample/张楚/孤独的人是可耻的.mp3",
            "baidupcsleecher_trash/123abc.def.sample/张楚/蚂蚁蚂蚁.mp3",
        ]
    )
304+
305+
def test_purge_all(self):
    """With ``move_to_trash`` false, purged files are removed entirely."""
    touch_task_files(self.task)
    self.task.delete()
    assert list_files(settings.DATA_DIR) != []

    url = reverse("task-purge")
    payload = {"move_to_trash": False}
    response = self.client.post(url, data=payload, format="json")

    assert response.json() == {"done": True}
    assert list_files(settings.DATA_DIR) == []
318+
319+
def test_purge_nothing(self):
    """Purging leaves the files of a still-existing task untouched."""
    touch_task_files(self.task)
    # list_files() reports entries in os.scandir order, which is arbitrary,
    # so compare sorted lists to keep the test filesystem-independent.
    files = sorted(list_files(settings.DATA_DIR))
    assert files == sorted(
        [
            "123abc.def/张楚/孤独的人是可耻的.mp3",
            "123abc.def/张楚/蚂蚁蚂蚁.mp3",
            "123abc.def.sample/张楚/孤独的人是可耻的.mp3",
            "123abc.def.sample/张楚/蚂蚁蚂蚁.mp3",
        ]
    )

    response = self.client.post(reverse("task-purge"))

    assert response.json() == {"done": True}
    assert sorted(list_files(settings.DATA_DIR)) == files

task/utils.py

+102
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import logging
2+
import os
23
import re
34
import traceback
45
from http.cookies import SimpleCookie
6+
from pathlib import Path
7+
from typing import List
8+
from typing import Tuple
59
from urllib.parse import parse_qs
610
from urllib.parse import urlparse
711

@@ -112,3 +116,101 @@ def match_regex(string: str, regex: str) -> bool:
112116
"""
113117
pattern = re.compile(regex)
114118
return bool(re.match(pattern, string))
119+
120+
121+
def walk_dir(path: Path) -> List[Tuple[Path, List[os.DirEntry]]]:
    """
    Walk a directory tree breadth-first, yielding each directory with its
    entries.

    This is a generator: it lazily yields ``(directory, entries)`` tuples,
    starting with ``path`` itself. ``entries`` holds the ``os.DirEntry``
    objects of that directory in the order ``os.scandir`` returned them.

    Args:
        path (Path): The path to the directory.

    Returns:
        List[Tuple[Path, List[os.DirEntry]]]: An iterable of tuples containing
        the current path and a list of directory entries.

    Examples:
        >>> import tempfile
        >>> with tempfile.TemporaryDirectory() as temp_dir:
        ...     test_dir = Path(temp_dir) / "test_dir"
        ...     test_dir.mkdir()
        ...     file1 = test_dir / "file1.txt"
        ...     file1.touch()
        ...     sub_dir = test_dir / "sub_dir"
        ...     sub_dir.mkdir()
        ...     file2 = sub_dir / "file2.txt"
        ...     file2.touch()
        ...     entries = list(walk_dir(test_dir))
        ...     len(entries)
        2
        >>> entries[0][0] == test_dir
        True
        >>> sorted([i.name for i in entries[0][1]])
        ['file1.txt', 'sub_dir']
        >>> entries[1][0] == sub_dir
        True
        >>> sorted([i.name for i in entries[1][1]])
        ['file2.txt']
    """
    # Imported locally so this function does not depend on a module-level
    # import that may not exist.
    from collections import deque

    # FIFO queue of directories still to visit; deque gives O(1) popleft.
    pending = deque([Path(path)])
    while pending:
        current = pending.popleft()
        with os.scandir(current) as scandir_it:
            entries = list(scandir_it)
        yield current, entries
        for entry in entries:
            if entry.is_dir():
                # Build the child with the public "/" operator rather than a
                # private pathlib helper.
                pending.append(current / entry.name)
166+
167+
168+
def walk_files(path: Path) -> List[Path]:
    """
    Yield the path of every non-directory entry found below ``path``.

    This is a generator built on ``walk_dir``; entries are produced in the
    order ``walk_dir`` reports them.

    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as temp_dir:
    ...     test_dir = Path(temp_dir) / "test_dir"
    ...     test_dir.mkdir()
    ...     file1 = test_dir / "file1.txt"
    ...     file1.touch()
    ...     sub_dir = test_dir / "sub_dir"
    ...     sub_dir.mkdir()
    ...     file2 = sub_dir / "file2.txt"
    ...     file2.touch()
    ...     files = list(walk_files(test_dir))
    ...     len(files)
    2
    >>> [i.name for i in files]
    ['file1.txt', 'file2.txt']
    """
    for root, entries in walk_dir(path):
        for entry in entries:
            if not entry.is_dir():
                # Join on the bare name: the entry's own path already
                # includes ``root``, so joining the entry itself would
                # duplicate the prefix whenever ``root`` is relative.
                yield root / entry.name
190+
191+
192+
def list_files(root: Path, without_root=True) -> List[str]:
    """
    Return the files found below ``root`` as a list of strings.

    With ``without_root`` true (the default) each path is expressed relative
    to ``root``; otherwise the path is returned as produced by
    ``walk_files``.

    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as temp_dir:
    ...     test_dir = Path(temp_dir) / "test_dir"
    ...     test_dir.mkdir()
    ...     file1 = test_dir / "file1.txt"
    ...     file1.touch()
    ...     sub_dir = test_dir / "sub_dir"
    ...     sub_dir.mkdir()
    ...     file2 = sub_dir / "file2.txt"
    ...     file2.touch()
    ...     files = list_files(test_dir)
    ...     len(files)
    2
    >>> files
    ['file1.txt', 'sub_dir/file2.txt']
    """
    found = walk_files(root)
    if without_root:
        return [str(item.relative_to(root)) for item in found]
    return [str(item) for item in found]

0 commit comments

Comments
 (0)