From 39b0826d6715a983f76730240f26a87d467b05b9 Mon Sep 17 00:00:00 2001
From: Chris <christoleavenow@gmail.com>
Date: Mon, 31 Dec 2018 14:52:10 +0800
Subject: [PATCH] init

---
 .editorconfig         |  11 +++
 .gitignore            | 157 ++++++++++++++++++++++++++++++++++++++++++
 .vscode/settings.json |   3 +
 get.py                |  51 ++++++++++++++
 main.py               |  98 ++++++++++++++++++++++++++
 5 files changed, 320 insertions(+)
 create mode 100644 .editorconfig
 create mode 100644 .gitignore
 create mode 100644 .vscode/settings.json
 create mode 100644 get.py
 create mode 100644 main.py

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..29d7c6e
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,11 @@
+root = true
+
+[*]
+charset = utf-8
+# end_of_line = lf
+# insert_final_newline = true
+trim_trailing_whitespace = true
+
+[*.py]
+indent_style = space
+indent_size = 4
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a49184e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,157 @@
+### Python.gitignore
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+
+### macOS.gitignore
+
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+
+### VisualStudioCode.gitignore
+
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..70689ed
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+  "python.pythonPath": "/usr/local/opt/python/bin/python3.6"
+}
\ No newline at end of file
diff --git a/get.py b/get.py
new file mode 100644
index 0000000..5d15b9e
--- /dev/null
+++ b/get.py
@@ -0,0 +1,51 @@
+import json
+import os
+
+import requests
+
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+DIST_DIR = os.path.join(BASE_DIR, "dist")  # output directory, created on demand by write_into_file
+
+
+def get_html(kw, start):
+    """Fetch one page of search results and return it as pretty-printed JSON text.
+
+    Returns None if the request fails, times out, or is not answered with HTTP 200.
+    """
+    url = "https://www.duitang.com/napi/blog/list/by_search/"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
+    }
+    params = {"kw": kw, "start": start}  # passed via params= so kw is URL-encoded
+    try:
+        response = requests.get(url, params=params, headers=headers, timeout=10)
+        if response.status_code == 200:
+            return json.dumps(response.json(), indent=4, ensure_ascii=False)
+    except (requests.RequestException, ValueError) as e:  # ValueError: body is not JSON
+        print(e)
+    return None
+
+
+def write_into_file(result):
+    """Write the JSON text in result to result.json inside DIST_DIR.
+
+    The path is built from DIST_DIR so it does not depend on the working directory.
+    """
+    os.makedirs(DIST_DIR, exist_ok=True)
+    with open(os.path.join(DIST_DIR, "result.json"), "w", encoding="utf-8") as f:
+        f.write(result)
+
+
+def main():
+    """Fetch one page of search results and save it to disk."""
+    # Test values that can be changed, e.g. start=24.
+    kw = "taeyeon"
+    start = 0
+    result = get_html(kw, start)
+    if result is None:
+        return  # fetch failed; writing None would raise TypeError
+    write_into_file(result)
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    main()
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..57942da
--- /dev/null
+++ b/main.py
@@ -0,0 +1,98 @@
+import json
+import os
+from hashlib import md5
+from multiprocessing.dummy import Pool
+
+import requests
+
+# Parameters
+GROUP_START = 1  # lower bound of the page range built in the __main__ block
+GROUP_END = 10  # upper bound of the page range built in the __main__ block
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+DIST_DIR = os.path.join(BASE_DIR, "dist")  # root directory for downloaded images
+
+
+def get_html(kw, start):
+    """Fetch one page of search results and return the decoded JSON.
+
+    Returns None if the request fails, times out, or is not answered with HTTP 200.
+    """
+    url = "https://www.duitang.com/napi/blog/list/by_search/"  # query goes in params= so kw is URL-encoded
+    headers = {
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
+    }
+    try:
+        response = requests.get(url, params={"kw": kw, "start": start}, headers=headers, timeout=10)
+        if response.status_code == 200:
+            return response.json()
+    except (requests.RequestException, ValueError) as e:  # ValueError: body is not JSON
+        print(e)
+    return None
+
+
+def get_images(json):
+    """Yield a {"path": url} dict for each image in a search response.
+
+    Tolerates None (a failed fetch) and skips items without a photo path.
+    """
+    # `json` shadows the module here; the name is kept for existing callers.
+    data = (json or {}).get("data") or {}
+    for item in data.get("object_list") or []:
+        # "photo" may be absent or null; avoid calling .get on None.
+        path = (item.get("photo") or {}).get("path")
+        if path:
+            yield {"path": path}
+
+
+def write_into_file(keyword, item):
+    """Download the image described by item into DIST_DIR/<keyword>/.
+
+    Args:
+        keyword: Sub-directory name under DIST_DIR for this search term.
+        item: Dict whose "path" entry holds the image URL.
+
+    Files are named after the MD5 of their content, so an image that is
+    already on disk is reported and not rewritten. Request errors are printed.
+    """
+    target_dir = os.path.join(DIST_DIR, keyword)
+    # exist_ok: several pool threads may race to create the same directory.
+    os.makedirs(target_dir, exist_ok=True)
+    image_url = item.get("path")
+    if not image_url:
+        return
+    # URLs containing "gif" are saved as .gif, everything else as .jpg.
+    extension = "gif" if "gif" in image_url else "jpg"
+    try:
+        response = requests.get(image_url, timeout=30)
+    except requests.RequestException:
+        print("Failed to save image")
+        return
+    if response.status_code != 200:
+        return
+    digest = md5(response.content).hexdigest()
+    file_path = os.path.join(target_dir, "{0}.{1}".format(digest, extension))
+    if os.path.exists(file_path):
+        print("Already Downloaded", digest, extension, sep="")
+    else:
+        with open(file_path, "wb") as f:
+            f.write(response.content)
+
+
+def main(start, kw="泰妍"):
+    """Download every image on the search-result page at offset start.
+
+    kw is the search keyword; pass another value to search for something else.
+    """
+    # A failed fetch returns None; fall back to {} so there is nothing to iterate.
+    page = get_html(kw, start)  # not named json, which would shadow the module
+    for item in get_images(page or {}):
+        print("正在下载: ", item["path"], sep="")
+        write_into_file(kw, item)
+
+
+if __name__ == '__main__':
+    offsets = [24 * page for page in range(GROUP_START - 1, GROUP_END + 1)]  # NOTE(review): GROUP_END - GROUP_START + 2 pages — confirm
+    workers = Pool(16)  # multiprocessing.dummy.Pool, i.e. 16 worker threads
+    workers.map(main, offsets)  # one call to main per result offset
+    workers.close()
+    workers.join()
\ No newline at end of file