Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
venusing1998 committed Dec 31, 2018
0 parents commit 39b0826
Show file tree
Hide file tree
Showing 5 changed files with 320 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
root = true

[*]
charset = utf-8
# end_of_line = lf
# insert_final_newline = true
trim_trailing_whitespace = true

[*.py]
indent_style = space
indent_size = 4
157 changes: 157 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
### Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/


### macOS.gitignore

# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk


### VisualStudioCode.gitignore

.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"python.pythonPath": "/usr/local/opt/python/bin/python3.6"
}
51 changes: 51 additions & 0 deletions get.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import json
import os

import requests

# Absolute path of the directory that contains this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Output directory: a "dist" folder located next to this script.
DIST_DIR = os.path.join(BASE_DIR, "dist")


def get_html(kw, start, timeout=10):
    """Fetch one page of search results as pretty-printed JSON text.

    Args:
        kw: Search keyword, sent as the ``kw`` query parameter.
        start: Result offset, sent as the ``start`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body re-serialized with ``json.dumps`` (4-space indent,
        non-ASCII characters kept as-is), or ``None`` when the request
        fails, the status code is not 200, or the body is not valid JSON.
    """
    url = "https://www.duitang.com/napi/blog/list/by_search/"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    try:
        # Passing the query through ``params`` lets requests percent-encode
        # the keyword, so values containing "&", "#" or spaces stay intact.
        response = requests.get(
            url,
            params={"kw": kw, "start": start},
            headers=headers,
            timeout=timeout,
        )
        if response.status_code != 200:
            return None
        return json.dumps(response.json(), indent=4, ensure_ascii=False)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # is what ``response.json()`` raises for a non-JSON body.
        print(e)
        return None


def write_into_file(result):
    """Write the fetched JSON text to ``result.json`` inside ``DIST_DIR``.

    Args:
        result: Text to write. ``None`` (a failed fetch) is reported and
            skipped so that an existing result file is not truncated.
    """
    if result is None:
        print("No result to write")
        return
    os.makedirs(DIST_DIR, exist_ok=True)
    # Build the path from DIST_DIR (not the current working directory) so
    # the file lands in the directory that was just created, regardless of
    # where the script is launched from.
    file_path = os.path.join(DIST_DIR, "result.json")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(result)


def main():
    """Fetch one page of search results and hand it to ``write_into_file``."""
    # Test values that may be changed, e.g. start=24.
    kw = "taeyeon"
    start = 0
    result = get_html(kw, start)
    if result is None:
        # get_html returns None when the fetch fails; there is nothing to
        # save in that case.
        print("Failed to fetch results")
        return
    write_into_file(result)


if __name__ == "__main__":
    main()
98 changes: 98 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import json
import os
from hashlib import md5
from multiprocessing.dummy import Pool

import requests

# Parameters
# GROUP_START / GROUP_END bound the page groups fetched by the script entry
# point, which converts each group index into a `start` offset in steps of 24.
GROUP_START = 1
GROUP_END = 10
# Absolute path of the directory that contains this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Root output directory ("dist" next to this script); images are stored in a
# per-keyword sub-directory below it.
DIST_DIR = os.path.join(BASE_DIR, "dist")


def get_html(kw, start, timeout=10):
    """Fetch one page of search results and return the parsed JSON.

    Args:
        kw: Search keyword, sent as the ``kw`` query parameter.
        start: Result offset, sent as the ``start`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid JSON.
    """
    url = "https://www.duitang.com/napi/blog/list/by_search/"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    try:
        # Passing the query through ``params`` lets requests percent-encode
        # the keyword, so values containing "&", "#" or spaces stay intact.
        response = requests.get(
            url,
            params={"kw": kw, "start": start},
            headers=headers,
            timeout=timeout,
        )
        if response.status_code != 200:
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # is what ``response.json()`` raises for a non-JSON body.
        print(e)
        return None


def get_images(json):
    """Yield the image path of every item in a search-result payload.

    Args:
        json: Parsed response as returned by ``get_html``. May be ``None``
            (failed fetch) or lack the ``data`` / ``object_list`` keys, in
            which case nothing is yielded.

    Yields:
        ``{"path": <image url>}`` for each item that carries a photo path.
        Items without a ``photo`` entry or without a ``path`` are skipped
        instead of raising.
    """
    # NOTE: the parameter name shadows the ``json`` module inside this
    # function; it is kept so existing keyword callers continue to work.
    data = (json or {}).get("data") or {}
    for item in data.get("object_list") or []:
        photo = (item or {}).get("photo") or {}
        path = photo.get("path")
        if path:
            yield {"path": path}


def write_into_file(keyword, item, timeout=30):
    """Download one image and store it under ``DIST_DIR/<keyword>/``.

    The file is named after the MD5 hex digest of its content, so an image
    whose bytes were already saved is detected and not written again.

    Args:
        keyword: Name of the sub-directory of ``DIST_DIR`` to save into.
        item: Mapping whose ``path`` entry holds the image URL.
        timeout: Seconds to wait for the server before giving up.
    """
    target_dir = os.path.join(DIST_DIR, keyword)
    # exist_ok avoids the check-then-create race when several pool threads
    # reach this point at the same time.
    os.makedirs(target_dir, exist_ok=True)

    image_url = item.get("path")
    if not image_url:
        return
    # Same rule as before: any URL containing "gif" is saved as .gif,
    # everything else as .jpg.
    extension = "gif" if "gif" in image_url else "jpg"

    try:
        response = requests.get(image_url, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors as well as the timeouts introduced by
        # the ``timeout`` argument.
        print("Failed to save image")
        return
    if response.status_code != 200:
        return

    content = response.content
    digest = md5(content).hexdigest()
    file_path = os.path.join(target_dir, "{0}.{1}".format(digest, extension))
    if os.path.exists(file_path):
        print("Already Downloaded", digest, extension, sep="")
        return
    with open(file_path, "wb") as f:
        f.write(content)


def main(start):
    """Download every image listed on the search-result page at ``start``.

    Args:
        start: Result offset passed to ``get_html``.
    """
    # Change the search keyword (kw) here.
    kw = "泰妍"
    # Named ``result`` rather than ``json`` to avoid shadowing the module.
    result = get_html(kw, start)
    if not result:
        # get_html returns None when the fetch fails; nothing to download.
        return
    for item in get_images(result):
        print("正在下载: ", item["path"], sep="")
        write_into_file(kw, item)


if __name__ == '__main__':
    # multiprocessing.dummy.Pool is a thread pool with the Pool API.
    pool = Pool(16)
    # Treat GROUP_START..GROUP_END as 1-based, inclusive group numbers:
    # group g maps to offset (g - 1) * 24, so the upper bound of the range
    # is GROUP_END itself (the previous GROUP_END + 1 fetched one extra
    # group beyond the configured end).
    groups = [x * 24 for x in range(GROUP_START - 1, GROUP_END)]
    pool.map(main, groups)
    pool.close()
    pool.join()

0 comments on commit 39b0826

Please sign in to comment.