Commit: fix

venusing1998 committed Jan 1, 2019
1 parent e8abd6c commit c790b7b
Showing 12 changed files with 633 additions and 442 deletions.
4 changes: 2 additions & 2 deletions Pipfile
@@ -5,11 +5,11 @@ verify_ssl = true

[dev-packages]
pylint = "*"
yapf = "*"
jupyter = "*"
autopep8 = "*"

[packages]
requests = "*"
scrapy = "*"

[requires]
python_version = "3.7"
579 changes: 254 additions & 325 deletions Pipfile.lock

Large diffs are not rendered by default.

45 changes: 29 additions & 16 deletions get_json.py → get_json_requests.py
@@ -1,5 +1,6 @@
 import json
 import os
+import time

 import requests

@@ -22,13 +23,13 @@ def get_html(self):
         try:
             response = requests.get(url, headers=headers)
             if response.status_code == 200:
-                return response.text
+                return response
         except requests.ConnectionError as e:
             print(e)
-            return None
+            pass

-    def test(self, text):
-        result = json.loads(text)
+    def test(self, response):
+        result = json.loads(response.text)
         data = result.get('data')
         if data:
             object_list = data.get('object_list')
@@ -37,26 +38,38 @@ def test(self, text):
         else:
             return True

-    def write_into_file(self, text):
-        result = json.dumps(
-            json.loads(text), indent=4, ensure_ascii=False)
-        if not os.path.exists(DIST_DIR):
-            os.makedirs(DIST_DIR)
+    def write_into_file(self, response):
+        result = json.dumps(json.loads(response.text), indent=4, ensure_ascii=False)
+        if not os.path.exists(
+                os.path.join(os.path.join(DIST_DIR, 'json'), self.kw)):
+            os.makedirs(os.path.join(os.path.join(DIST_DIR, 'json'), self.kw))
         with open(
-                'dist/result{0}.json'.format(int(self.start / 24) + 1),
+                'dist/json/{0}/{1}.json'.format(self.kw,
+                                                int(self.start / 24) + 1),
                 'w',
                 encoding='utf-8') as f:
             f.write(result)


 def main():
-    kw = 'correct'
-    for i in range(0, 360, 24):
+    print('Enter the keyword: ', end='')
+    kw = input()
+    # kw = 'taeyeon'
+    start = time.time()
+    counter = 0
+    for i in range(0, 3600, 24):
         spider = Spider(kw, start=i)
-        text = spider.get_html()
-        items = spider.test(text)
-        if items:
-            spider.write_into_file(text)
+        response = spider.get_html()
+        contents = spider.test(response)
+        if contents:
+            print(
+                'Downloading: {0}.json It costs {1}s'.format(
+                    str(i // 24 + 1), str(time.time() - start)),)
+            spider.write_into_file(response)
+            counter += 1
+        else:
+            break
+    print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start)))


 if __name__ == '__main__':
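A note on the loop bounds above: the listing API pages by 24 items, so start = i steps through pages and i // 24 + 1 is the page number used in the JSON filenames. The top of the file, hidden above the fold here, is where get_html builds the actual request; a rough sketch of what such a fetcher looks like, with a hypothetical SEARCH_URL standing in for whatever endpoint the hidden code really uses:

    import requests

    # Hypothetical endpoint; the real URL lives in the hidden part of get_html.
    SEARCH_URL = 'https://example.com/search'

    def get_page(kw, start):
        """Fetch one 24-item page of results; return the Response or None."""
        try:
            response = requests.get(SEARCH_URL, params={'kw': kw, 'start': start})
            if response.status_code == 200:
                return response
        except requests.ConnectionError as e:
            print(e)
        return None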
99 changes: 0 additions & 99 deletions main.py

This file was deleted.

115 changes: 115 additions & 0 deletions spider_requests.py
@@ -0,0 +1,115 @@
import json
import os
import time
from hashlib import md5

import requests

import get_json_requests

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DIST_DIR = os.path.join(BASE_DIR, 'dist')


class Spider(get_json_requests.Spider):
    def __init__(self, kw, start=0):
        self.kw = kw
        self.start = start

    def test(self, response):
        result = json.loads(response.text)
        data = result.get('data')
        if data:
            object_list = data.get('object_list')
            if not object_list:
                return []
            else:
                for item in object_list:
                    contents = {}
                    photo = item.get('photo')
                    if photo:
                        path = photo.get('path')
                        if path:
                            contents['path'] = path
                            yield contents

    def get_html_2(self, content):
        try:
            url = content.get('path')
            if 'gif_jpeg' in url:
                # Strip the trailing '_jpeg' so the original GIF is fetched.
                response = requests.get(url[:-5])
                if response.status_code == 200:
                    return ('gif', response)
            elif 'png' in url:
                response = requests.get(url)
                if response.status_code == 200:
                    return ('png', response)
            elif 'jpg' in url or 'jpeg' in url:
                response = requests.get(url)
                if response.status_code == 200:
                    return ('jpg', response)
            else:
                print('Unknown format.')
        except requests.ConnectionError as e:
            print(e)

    def write_into_file(self, format, response):
        if not os.path.exists(os.path.join(DIST_DIR, self.kw)):
            os.makedirs(os.path.join(DIST_DIR, self.kw))
        # All formats share the same logic; the extension is the format itself.
        file_path = '{0}/{1}/{2}.{3}'.format(
            DIST_DIR, self.kw,
            md5(response.content).hexdigest(), format)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(response.content)
        else:
            print('Already Downloaded {0}.{1}'.format(
                md5(response.content).hexdigest(), format))


def main():
    print('Enter the keyword: ', end='')
    kw = input()
    # kw = 'taeyeon'
    start = time.time()
    counter = 0
    for i in range(0, 960, 24):
        spider = Spider(kw, start=i)
        response = spider.get_html()
        contents = spider.test(response)
        if contents:
            for content in contents:
                result = spider.get_html_2(content)
                if not result:
                    # The download failed or the format was unknown; skip it.
                    continue
                format, response = result
                if format == 'gif':
                    print('Downloading: {0} It costs {1}s.'.format(
                        content['path'][:-5], time.time() - start))
                else:
                    print('Downloading: {0} It costs {1}s.'.format(
                        content['path'], time.time() - start))
                counter += 1
                spider.write_into_file(format, response)
    print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start)))


if __name__ == '__main__':
    main()
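write_into_file above deduplicates by content: each image is named after the MD5 of its bytes, so re-running the spider never stores the same picture twice. The same idea as a self-contained helper (a sketch, not part of this commit):

    import os
    from hashlib import md5

    def save_unique(content, directory, ext):
        """Write content to <md5 of content>.<ext> in directory, once."""
        os.makedirs(directory, exist_ok=True)
        name = '{0}.{1}'.format(md5(content).hexdigest(), ext)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            with open(path, 'wb') as f:
                f.write(content)
        return path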
11 changes: 11 additions & 0 deletions spider_scrapy/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = spider_scrapy.settings

[deploy]
#url = http://localhost:6800/
project = spider_scrapy
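The [settings] entry points at spider_scrapy.settings, a module this commit view does not display. For reference, a sketch of the minimal settings module that scrapy startproject generates (an assumption about the generated file, not the committed one):

    # spider_scrapy/spider_scrapy/settings.py (assumed contents)
    BOT_NAME = 'spider_scrapy'

    SPIDER_MODULES = ['spider_scrapy.spiders']
    NEWSPIDER_MODULE = 'spider_scrapy.spiders'

    # The startproject template enables robots.txt compliance by default.
    ROBOTSTXT_OBEY = True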
Empty file.
14 changes: 14 additions & 0 deletions spider_scrapy/spider_scrapy/items.py
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SpiderScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
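The item class is still the empty template. To carry the same data as the requests-based spider, it would presumably need a field for the image path, along these lines (an illustrative guess, not committed code):

    class SpiderScrapyItem(scrapy.Item):
        # One field per scraped attribute; the requests spider yields
        # dicts with a single 'path' key, mirrored here.
        path = scrapy.Field()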
