From 39b0826d6715a983f76730240f26a87d467b05b9 Mon Sep 17 00:00:00 2001
From: Chris
Date: Mon, 31 Dec 2018 14:52:10 +0800
Subject: [PATCH] init

---
 .editorconfig         |  11 +++
 .gitignore            | 157 ++++++++++++++++++++++++++++++++++++++++++
 .vscode/settings.json |   3 +
 get.py                |  55 +++++++++++++++
 main.py               | 102 +++++++++++++++++++++++++++
 5 files changed, 328 insertions(+)
 create mode 100644 .editorconfig
 create mode 100644 .gitignore
 create mode 100644 .vscode/settings.json
 create mode 100644 get.py
 create mode 100644 main.py

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..29d7c6e
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,11 @@
+root = true
+
+[*]
+charset = utf-8
+# end_of_line = lf
+# insert_final_newline = true
+trim_trailing_whitespace = true
+
+[*.py]
+indent_style = space
+indent_size = 4
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a49184e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,157 @@
+### Python.gitignore
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+
+### macOS.gitignore
+
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+
+### VisualStudioCode.gitignore
+
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..70689ed
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/usr/local/opt/python/bin/python3.6"
+}
\ No newline at end of file
diff --git a/get.py b/get.py
new file mode 100644
index 0000000..5d15b9e
--- /dev/null
+++ b/get.py
@@ -0,0 +1,55 @@
+import json
+import os
+
+import requests
+
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+DIST_DIR = os.path.join(BASE_DIR, "dist")
+
+
+def get_html(kw, start):
+    """Fetch one page of search results as pretty-printed JSON text.
+
+    Returns None when the request fails, the status code is not 200, or the
+    body is not valid JSON.
+    """
+    url = "https://www.duitang.com/napi/blog/list/by_search/"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
+    }
+    try:
+        # params= lets requests URL-encode the keyword; the timeout keeps a
+        # stalled connection from blocking forever.
+        response = requests.get(
+            url, params={"kw": kw, "start": start}, headers=headers, timeout=10
+        )
+        if response.status_code == 200:
+            return json.dumps(response.json(), indent=4, ensure_ascii=False)
+    except (requests.RequestException, ValueError) as e:
+        print(e)
+    return None
+
+
+def write_into_file(result):
+    """Write the JSON text to dist/result.json next to this script."""
+    os.makedirs(DIST_DIR, exist_ok=True)
+    # Build the path from DIST_DIR so it does not depend on the working
+    # directory the script is started from.
+    with open(os.path.join(DIST_DIR, "result.json"), "w", encoding="utf-8") as f:
+        f.write(result)
+
+
+def main():
+    """Fetch one page for a fixed keyword and save it to disk."""
+    # Test values you can change, e.g. start=24.
+    kw = "taeyeon"
+    start = 0
+    result = get_html(kw, start)
+    if result is None:
+        print("No result to write")
+        return
+    write_into_file(result)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..57942da
--- /dev/null
+++ b/main.py
@@ -0,0 +1,102 @@
+import json
+import os
+from hashlib import md5
+from multiprocessing.dummy import Pool
+
+import requests
+
+# Settings: first and last result group to fetch (1-based, inclusive).
+GROUP_START = 1
+GROUP_END = 10
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+DIST_DIR = os.path.join(BASE_DIR, "dist")
+
+
+def get_html(kw, start):
+    """Fetch one page of search results and return the decoded JSON.
+
+    Returns None when the request fails, the status code is not 200, or the
+    body is not valid JSON.
+    """
+    url = "https://www.duitang.com/napi/blog/list/by_search/"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
+    }
+    try:
+        # params= lets requests URL-encode the keyword; the timeout keeps a
+        # stalled connection from blocking a worker thread forever.
+        response = requests.get(
+            url, params={"kw": kw, "start": start}, headers=headers, timeout=10
+        )
+        if response.status_code == 200:
+            return response.json()
+    except (requests.RequestException, ValueError) as e:
+        print(e)
+    return None
+
+
+def get_images(json):
+    """Yield a {"path": url} dict for every photo in a search response.
+
+    Yields nothing when the response is None or has no usable entries.
+    """
+    # The parameter name shadows the json module inside this function only;
+    # it is kept so existing callers are unaffected.
+    data = (json or {}).get("data") or {}
+    for item in data.get("object_list") or []:
+        path = (item.get("photo") or {}).get("path")
+        if path:
+            yield {"path": path}
+
+
+def write_into_file(keyword, item):
+    """Download the image at item["path"] into DIST_DIR/keyword.
+
+    The file is named after the MD5 of its content, so an image that was
+    already saved is detected and not written again.
+    """
+    target_dir = os.path.join(DIST_DIR, keyword)
+    # exist_ok avoids a crash when several worker threads create the
+    # directory at the same time.
+    os.makedirs(target_dir, exist_ok=True)
+    image_url = item.get("path")
+    if not image_url:
+        return
+    # Any URL containing "gif" is saved as .gif, everything else as .jpg.
+    extension = "gif" if "gif" in image_url else "jpg"
+    try:
+        response = requests.get(image_url, timeout=30)
+    except requests.RequestException:
+        print("Failed to save image")
+        return
+    if response.status_code != 200:
+        return
+    digest = md5(response.content).hexdigest()
+    file_path = os.path.join(target_dir, "{}.{}".format(digest, extension))
+    if os.path.exists(file_path):
+        print("Already Downloaded ", digest, ".", extension, sep="")
+        return
+    with open(file_path, "wb") as f:
+        f.write(response.content)
+
+
+def main(start):
+    """Download every image found on the result page at offset start."""
+    # Change the search keyword here.
+    kw = "泰妍"
+    page = get_html(kw, start)
+    for item in get_images(page):
+        print("正在下载: ", item["path"], sep="")
+        write_into_file(kw, item)
+
+
+if __name__ == '__main__':
+    # GROUP_END is inclusive: range() already excludes its stop value, so
+    # the stop is GROUP_END, not GROUP_END + 1.
+    groups = [x * 24 for x in range(GROUP_START - 1, GROUP_END)]
+    pool = Pool(16)
+    try:
+        pool.map(main, groups)
+    finally:
+        pool.close()
+        pool.join()
\ No newline at end of file