Commit

fix
venusing1998 committed Jan 7, 2019
1 parent 923aa22 commit 95959a4
Showing 8 changed files with 109 additions and 196 deletions.
4 changes: 1 addition & 3 deletions Pipfile
@@ -6,15 +6,13 @@ verify_ssl = true
[dev-packages]
pylint = "*"
autopep8 = "*"
pandas = "*"
lxml = "*"

[packages]
requests = "*"
scrapy = "*"
aiohttp = "*"
aiofiles = "*"
beautifulsoup4 = "*"
pillow = "*"

[requires]
python_version = "3.7"
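A plausible reading of this dependency change, inferred from the rest of the commit rather than stated in it: pillow comes in because Scrapy's ImagesPipeline (enabled below) needs Pillow to process downloaded images, while the dropped HTML-parsing and analysis packages are no longer used by the spiders.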
222 changes: 65 additions & 157 deletions Pipfile.lock

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions json_requests.py → spider_requests/json_requests.py
@@ -39,9 +39,7 @@ def test(self, response):
        data = result.get('data')
        if data:
            object_list = data.get('object_list')
            if not object_list:
                return False
            else:
            if object_list:
                return True

    def write_into_file(self, response):
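For reference, a standalone sketch of the simplified check (a hypothetical helper, not part of the commit; it assumes, as in the surrounding method, that the response text is the JSON payload returned by duitang's search API):

import json

def has_results(response_text):
    # Truthy only when the search page actually returned items; falling off
    # the end returns None, which the caller can treat the same as False.
    result = json.loads(response_text)
    data = result.get('data')
    if data:
        object_list = data.get('object_list')
        if object_list:
            return True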
10 changes: 4 additions & 6 deletions spider_requests.py → spider_requests/spider_requests.py
@@ -40,9 +40,7 @@ def test(self, response):
        data = result.get('data')
        if data:
            object_list = data.get('object_list')
            if not object_list:
                return None
            else:
            if object_list:
                for i in object_list:
                    items = {}
                    photo = i.get('photo')
@@ -114,9 +112,9 @@ def write_into_file(self, format, response):


def main():
    # print('Enter the keyword: ', end='')
    # kw = input()
    kw = 'correct'
    print('Enter the keyword: ', end='')
    kw = input()
    # kw = 'correct'
    start_time = time.time()
    counter = 0
    for start in range(0, 3600, 24):
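A short sketch of the reworked entry point under the same assumptions: the keyword now comes from stdin instead of being hard-coded, and the paginated API is walked 24 items at a time; fetch_page below is a hypothetical stand-in for the script's request, parse and write steps:

import time


def fetch_page(kw, start):
    # Hypothetical stand-in for spider_requests' request, parse and write logic.
    pass


def main():
    print('Enter the keyword: ', end='')
    kw = input()
    start_time = time.time()
    for start in range(0, 3600, 24):  # duitang pages its results in steps of 24
        fetch_page(kw, start)
    print('Finished in {0:.2f}s'.format(time.time() - start_time))


if __name__ == '__main__':
    main()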
3 changes: 1 addition & 2 deletions spider_scrapy/spider_scrapy/items.py
@@ -10,5 +10,4 @@

class SpiderScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    result = scrapy.Field()
    path = scrapy.Field()
28 changes: 23 additions & 5 deletions spider_scrapy/spider_scrapy/pipelines.py
@@ -5,11 +5,29 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from hashlib import md5

class SpiderScrapyPipeline(object):
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class SpiderScrapyPipeline:
    def __init__(self):
        self.file = open('data.json', 'w', encoding='utf-8')
        pass


class ImagePipeline(ImagesPipeline):
    def file_path(self, request, item, response=None, info=None):
        if 'gif' in item['path']:
            filename = '{0}.{1}'.format(
                md5(response.body).hexdigest(), 'gif')
        elif 'png' in item['path']:
            filename = '{0}.{1}'.format(
                md5(response.body).hexdigest(), 'png')
        elif 'jpg' in item['path'] or 'jpeg' in item['path']:
            filename = '{0}.{1}'.format(
                md5(response.body).hexdigest(), 'jpg')
        return filename

    def process_item(self, item, spider):
        self.file.write(item.get('result'))
        return item
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['path'])
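The ImagePipeline above relies on the response body and takes an item argument that older Scrapy releases do not pass to file_path(). A self-contained sketch of the same idea that sidesteps both points by hashing the request URL instead; this is an alternative sketch, not the committed code, and the **kwargs catch-all simply absorbs the item keyword that newer Scrapy versions supply:

from hashlib import md5

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImagePipelineSketch(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Hand the image URL stored on the item to Scrapy's downloader.
        yield scrapy.Request(item['path'])

    def file_path(self, request, response=None, info=None, **kwargs):
        # Name each file after an MD5 of its URL and keep the real extension;
        # the returned path is interpreted relative to IMAGES_STORE.
        url = request.url
        if url.endswith('.gif'):
            ext = 'gif'
        elif url.endswith('.png'):
            ext = 'png'
        else:
            ext = 'jpg'
        return '{0}.{1}'.format(md5(url.encode('utf-8')).hexdigest(), ext)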
3 changes: 2 additions & 1 deletion spider_scrapy/spider_scrapy/settings.py
@@ -65,8 +65,9 @@
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'spider_scrapy.pipelines.SpiderScrapyPipeline': 300,
    'spider_scrapy.pipelines.ImagePipeline': 300,
}
IMAGES_STORE = './dist'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
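One behavioral detail of ImagesPipeline worth keeping in mind (it is not spelled out in the commit): whatever the custom file_path() returns is joined onto IMAGES_STORE, so with the pipeline above images land directly in ./dist/<md5>.<ext> rather than under the default ./dist/full/<sha1>.jpg layout.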
31 changes: 12 additions & 19 deletions spider_scrapy/spider_scrapy/spiders/duitang.py
@@ -6,9 +6,6 @@
import time
from spider_scrapy.items import SpiderScrapyItem

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DIST_DIR = os.path.join(BASE_DIR, 'dist')


class DuitangSpider(scrapy.Spider):
    name = 'duitang'
@@ -17,7 +14,7 @@ class DuitangSpider(scrapy.Spider):
    # start_urls = []

    def start_requests(self):
        for start in range(0, 1200, 24):
        for start in range(0, 360, 24):
            url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={0}&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id&_type=&start={1}'.format(
                self.kw, start)

@@ -31,18 +28,14 @@ def parse(self, response):
        if data:
            object_list = data.get('object_list')
            if object_list:
                result = json.dumps(json.loads(response.text),
                                    indent=4, ensure_ascii=False)
                result_dir = os.path.join(
                    os.path.join(DIST_DIR, 'json'), self.kw)
                page = response.url.split("=")[-1]
                if not os.path.exists(result_dir):
                    os.makedirs(result_dir)
                result_path = os.path.join(
                    result_dir, '{0}.json'.format(int(page) // 24 + 1))
                item['result'] = result
                return item

            else:
                pass

                for i in object_list:
                    item = SpiderScrapyItem()
                    photo = i.get('photo')
                    if photo:
                        path = photo.get('path')
                        if path:
                            if 'gif_jpeg' in path:
                                item['path'] = path[:-5]
                            else:
                                item['path'] = path
                            yield item
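The spider reads self.kw but never sets it, so presumably the keyword is meant to be supplied as a spider argument. A minimal sketch of running it that way from a script (the file name and keyword are made up for illustration):

# run_duitang.py (hypothetical)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from spider_scrapy.spiders.duitang import DuitangSpider

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    # A keyword passed as a spider argument becomes self.kw on the spider,
    # equivalent to running `scrapy crawl duitang -a kw=correct`.
    process.crawl(DuitangSpider, kw='correct')
    process.start()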
