Skip to content

Commit

Permalink
feat: ignore paths (#87)
Browse files Browse the repository at this point in the history
* feat: ignore paths

* chore: upgrade pre-commit
  • Loading branch information
xyb authored Oct 8, 2023
1 parent d97223c commit 4dbef2e
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 10 deletions.
14 changes: 7 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
rev: v4.5.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace

- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.2.6
hooks:
- id: codespell

- repo: https://github.com/PyCQA/autoflake
rev: v1.7.7
rev: v2.2.1
hooks:
- id: autoflake
name: autoflake
Expand All @@ -21,7 +21,7 @@ repos:
files: \.py$

- repo: https://github.com/asottile/reorder_python_imports
rev: v3.9.0
rev: v3.12.0
hooks:
- id: reorder-python-imports
args:
Expand All @@ -43,20 +43,20 @@ repos:
exclude: static/.*

- repo: https://github.com/asottile/pyupgrade
rev: v3.2.0
rev: v3.15.0
hooks:
- id: pyupgrade
args:
- --py37-plus

- repo: https://github.com/asottile/add-trailing-comma
rev: v2.3.0
rev: v3.1.0
hooks:
- id: add-trailing-comma
args:
- --py36-plus

- repo: https://github.com/psf/black
rev: "22.10.0"
rev: "23.9.1"
hooks:
- id: black
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ TRANSFER_POLICY = "if_not_present"
# For PAN_BAIDU_BDUSS and PAN_BAIDU_COOKIES, please check the documentation of BaiduPCS-Py
PAN_BAIDU_BDUSS = ""
PAN_BAIDU_COOKIES = ""
# do not download a file if its path matches this regex
IGNORE_PATH_RE = ".*__MACOSX.*|.*spam.*"

## django settings
# 0: production, 1: development
Expand Down
2 changes: 2 additions & 0 deletions baidupcsleecher/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
TRANSFER_POLICY = getenv("TRANSFER_POLICY", "if_not_present")
PAN_BAIDU_BDUSS = getenv("PAN_BAIDU_BDUSS", "")
PAN_BAIDU_COOKIES = getenv("PAN_BAIDU_COOKIES", "")
# do not download files whose path matches this regex
IGNORE_PATH_RE = getenv("IGNORE_PATH_RE", ".*__MACOSX.*|.*spam.*")

REST_FRAMEWORK = {
"DEFAULT_PAGINATION_CLASS": "drf_link_header_pagination.LinkHeaderPagination",
Expand Down
4 changes: 4 additions & 0 deletions task/baidupcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from .utils import cookies2dict
from .utils import download_url
from .utils import match_regex
from .utils import unify_shared_link

logger = logging.getLogger("baibupcs")
Expand Down Expand Up @@ -101,6 +102,9 @@ def download_dir(self, remote_dir, local_dir, sample_size=0):
def download_file(self, remote_path, local_dir, file_size, sample_size=0):
local_path = Path(local_dir) / basename(remote_path)
logger.info(f" {remote_path} -> {local_path}")
if match_regex(str(remote_path), settings.IGNORE_PATH_RE):
logger.info(f" {remote_path} matched ignore paths, skipping")
return

if not local_path.parent.exists():
local_path.parent.mkdir(parents=True)
Expand Down
1 change: 0 additions & 1 deletion task/migrations/0001_initial.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


class Migration(migrations.Migration):

initial = True

dependencies = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


class Migration(migrations.Migration):

dependencies = [
("task", "0001_initial"),
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


class Migration(migrations.Migration):

dependencies = [
("task", "0010_alter_task_full_download_now"),
]
Expand Down
29 changes: 29 additions & 0 deletions task/tests/test_baidupcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def test_get_baidupcs_client(mock_cookies2dict, mock_settings, mock_BaiduPCS):
mock_settings.PAN_BAIDU_BDUSS = "test_bduss"
mock_settings.PAN_BAIDU_COOKIES = "test_cookies"
mock_cookies2dict.return_value = {"test_cookies"}
mock_settings.IGNORE_PATH_RE = ".*__MACOSX.*|.*spam.*"

get_baidupcs_client()

Expand Down Expand Up @@ -229,3 +230,31 @@ def test_download(self, mock_download, mock_list):
assert mock_download.called
assert mock_download.call_count == 1
assert mock_download.call_args.args[0].name == "text.txt"

# Checks that download_dir only downloads the file that is not under __MACOSX/.
# list_files is patched to report two plain files: "file.txt" and
# "__MACOSX/text.txt".
@patch(
"task.baidupcs.BaiduPCSClient.list_files",
return_value=[
{
"path": "file.txt",
"is_dir": False,
"is_file": True,
"size": 2,
"md5": "abcd",
},
{
"path": "__MACOSX/text.txt",
"is_dir": False,
"is_file": True,
"size": 1024,
"md5": "badbeef",
},
],
)
# download_url is replaced by a mock returning 100, so the real function is
# never called.
@patch("task.baidupcs.download_url", return_value=100)
def test_ignore_download(self, mock_download, mock_list):
# Download into a throwaway directory that is removed afterwards.
with tempfile.TemporaryDirectory() as tmpdir:
self.client.download_dir("/", tmpdir, 100)

# Exactly one download must have happened, and it must be for "file.txt".
# NOTE(review): presumably the "__MACOSX/text.txt" entry is skipped because
# it matches settings.IGNORE_PATH_RE -- confirm the setting is active here.
assert mock_download.called
assert mock_download.call_count == 1
assert mock_download.call_args.args[0].name == "file.txt"
21 changes: 21 additions & 0 deletions task/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,24 @@ def download_url(local_path, url, headers, limit=0):
if limit > 0 and total >= limit:
return total
return total


def match_regex(string: str, regex: str) -> bool:
    """
    Check if a string matches a given regular expression.

    Matching uses ``re.match`` semantics: the pattern is anchored at the
    start of ``string`` but does not have to consume all of it.

    An empty ``regex`` is treated as "no pattern configured" and never
    matches. Without this guard an empty pattern would match every string.

    Args:
        string (str): The input string.
        regex (str): The regular expression pattern.
    Returns:
        bool: True if the string matches the regular expression, False otherwise.
    Raises:
        re.error: If ``regex`` is not a valid regular expression.
    Examples:
        >>> match_regex("hello.txt", ".*txt|.*mp3")
        True
        >>> match_regex("hello.html", ".*txt|.*mp3")
        False
        >>> match_regex("hello.txt.bak", ".*txt")
        True
        >>> match_regex("hello.txt", "")
        False
    """
    if not regex:
        # An empty pattern would match everything; treat it as "match nothing".
        return False
    # The re module caches compiled patterns, so no explicit compile is needed.
    return re.match(regex, string) is not None

0 comments on commit 4dbef2e

Please sign in to comment.