Commit c790b7b (1 parent: e8abd6c): 12 changed files with 633 additions and 442 deletions.
@@ -0,0 +1,115 @@
import json
import os
import time
from hashlib import md5

import requests
import get_json_requests

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DIST_DIR = os.path.join(BASE_DIR, 'dist')

class Spider(get_json_requests.Spider):
    def __init__(self, kw, start=0):
        self.kw = kw
        self.start = start

    def test(self, response):
        # Parse the JSON search response and yield one dict per image,
        # each carrying the photo's 'path' URL.
        result = json.loads(response.text)
        data = result.get('data')
        if data:
            object_list = data.get('object_list')
            if not object_list:
                return []
            else:
                for item in object_list:
                    contents = {}
                    photo = item.get('photo')
                    if photo:
                        path = photo.get('path')
                        if path:
                            contents['path'] = path
                            yield contents

    def get_html_2(self, content):
        # Download the image behind content['path'] and return a
        # (format, response) pair, or None if the request fails.
        try:
            url = content.get('path')
            if 'gif_jpeg' in url:
                # Strip the trailing '_jpeg' (5 characters) to request the raw gif.
                response = requests.get(url[:-5])
                if response.status_code == 200:
                    return ('gif', response)
            elif 'png' in url:
                response = requests.get(url)
                if response.status_code == 200:
                    return ('png', response)
            elif 'jpg' in url or 'jpeg' in url:
                response = requests.get(url)
                if response.status_code == 200:
                    return ('jpg', response)
            else:
                print('Unknown format.')
        except requests.ConnectionError as e:
            print(e)

    def write_into_file(self, format, response):
        # Save the image under dist/<keyword>/<md5>.<format>, skipping
        # files that already exist. The gif/png/jpg branches only differed
        # by extension, so the path is built once from the format argument.
        if not os.path.exists(os.path.join(DIST_DIR, self.kw)):
            os.makedirs(os.path.join(DIST_DIR, self.kw))
        file_path = '{0}/{1}/{2}.{3}'.format(
            DIST_DIR, self.kw,
            md5(response.content).hexdigest(), format)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(response.content)
        else:
            print('Already Downloaded {0}.{1}'.format(
                md5(response.content).hexdigest(), format))


def main():
    print('Enter the keyword: ', end='')
    kw = input()
    # kw = 'taeyeon'
    start = time.time()
    counter = 0
    for i in range(0, 960, 24):
        spider = Spider(kw, start=i)
        response = spider.get_html()
        contents = spider.test(response)
        if contents:
            for content in contents:
                result = spider.get_html_2(content)
                if not result:
                    # Skip images that failed to download or had an unknown format.
                    continue
                format, response = result
                if format == 'gif':
                    print('Downloading: {0} It costs {1}s.'.format(content['path'][:-5], time.time() - start))
                else:
                    print('Downloading: {0} It costs {1}s.'.format(content['path'], time.time() - start))
                counter += 1
                spider.write_into_file(format, response)
    print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start)))


if __name__ == '__main__':
    main()
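The script above subclasses get_json_requests.Spider and calls its get_html() method, but that module is not included in this hunk. Below is a minimal sketch of the interface the script appears to rely on, assuming get_html() fetches one page of JSON search results using self.kw and self.start; the request URL and the 'kw'/'start' query parameter names are placeholders for illustration, not the module's actual code.

# Hypothetical sketch of the get_json_requests module imported above.
# The real module is not shown in this diff; the URL and parameter names
# below are assumptions, not the project's actual values.
import requests


class Spider(object):
    def get_html(self):
        # Assumed behaviour: fetch one page of JSON search results for
        # self.kw, offset by self.start (both set by the subclass), and
        # return the raw response on success.
        params = {'kw': self.kw, 'start': self.start}
        try:
            response = requests.get('https://example.com/search/', params=params)
            if response.status_code == 200:
                return response
        except requests.ConnectionError as e:
            print(e)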
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = spider_scrapy.settings

[deploy]
#url = http://localhost:6800/
project = spider_scrapy
Empty file.
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SpiderScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
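The item class above is the stub that scrapy startproject generates. If the Scrapy project were later wired to collect the same image URLs as the standalone script, it might declare a field like the sketch below; the 'path' field name is an assumption for illustration only and is not part of this commit.

# Illustrative only: a field the stub might grow if the Scrapy project
# mirrors the standalone spider's output. 'path' is an assumed name.
import scrapy


class SpiderScrapyItem(scrapy.Item):
    path = scrapy.Field()  # image URL extracted from the search results


# Usage sketch inside a spider callback:
# item = SpiderScrapyItem()
# item['path'] = photo.get('path')
# yield item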