Skip to content

Commit

Permalink
add json_aiohttp.py
Browse files Browse the repository at this point in the history
  • Loading branch information
venusing1998 committed Jan 2, 2019
1 parent c790b7b commit 4b06273
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ trim_trailing_whitespace = true

[*.py]
indent_style = space
indent_style = space
indent_size = 4
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
"python.pythonPath": "/Users/chris/.local/share/virtualenvs/spider1-v-JY5EfN/bin/python"
"python.pythonPath": "/Users/chris/.local/share/virtualenvs/spider_duitang--pJTz49S/bin/python"
}
3 changes: 3 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ autopep8 = "*"
[packages]
requests = "*"
scrapy = "*"
aiohttp = "*"
aiofiles = "*"
beautifulsoup4 = "*"

[requires]
python_version = "3.7"
111 changes: 110 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

88 changes: 88 additions & 0 deletions json_aiohttp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import asyncio
import json
import os
import time

import aiofiles
import aiohttp

# Absolute path of the directory containing this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Output root: downloaded JSON pages are written under <DIST_DIR>/json/<keyword>/.
DIST_DIR = os.path.join(BASE_DIR, 'dist')


class Spider:
    """Fetch one page of duitang.com search results as JSON and save it.

    Each instance targets a single page of 24 results: ``kw`` is the
    search keyword and ``start`` the result offset of the page
    (page number = start // 24 + 1).
    """

    def __init__(self, kw, start=0):
        self.kw = kw        # search keyword
        self.start = start  # result offset into the search results

    async def get_html(self):
        """Fetch the raw JSON text for this page.

        Returns:
            str | None: the response body, or ``None`` on a non-200
            status or a connection error (the error is printed, not
            raised).
        """
        url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={0}&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id&_type=&start={1}'.format(
            self.kw, self.start)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        return await response.text()
        except aiohttp.ClientConnectionError as e:
            print(e)
        # Non-200 status or connection error: make the None explicit.
        return None

    async def test(self, response=None):
        """Return True if this page contains any results, falsy otherwise.

        NOTE: the ``response`` argument is ignored (kept for backward
        compatibility with existing callers); the page is fetched here.
        """
        # Bug fix: was `await Spider.get_html(self)` — call through self.
        response = await self.get_html()
        if response is None:
            return None  # fetch failed; treat as "no results" so callers stop
        result = json.loads(response)
        data = result.get('data')
        if data:
            object_list = data.get('object_list')
            if not object_list:
                return None
            return True
        return None

    async def write_into_file(self, response=None):
        """Pretty-print this page's JSON to dist/json/<kw>/<page>.json.

        The ``response`` argument is ignored (kept for backward
        compatibility); the page is fetched here. Does nothing if the
        fetch fails.
        """
        response = await self.get_html()
        if response is None:
            return  # fetch failed; nothing to write
        result = json.dumps(json.loads(response),
                            indent=4, ensure_ascii=False)
        # Bug fix: the directory was created under the absolute DIST_DIR
        # but the file was opened at a CWD-relative 'dist/json/...' path;
        # use the same absolute directory for both, and let makedirs
        # tolerate an existing directory instead of racing on exists().
        out_dir = os.path.join(DIST_DIR, 'json', self.kw)
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(
            out_dir, '{0}.json'.format(self.start // 24 + 1))
        async with aiofiles.open(out_path, 'w', encoding='utf-8') as f:
            await f.write(result)


def main():
    """Download up to 10 result pages for a fixed keyword.

    Each page is probed sequentially (stopping at the first empty page,
    since pages past the end of the results are empty), and the file
    writes are then run concurrently on a single event loop.
    """
    # kw = input('Enter the keyword: ')  # interactive alternative
    kw = 'taeyeon'
    start = time.time()
    counter = 0
    loop = asyncio.get_event_loop()
    writes = []  # write_into_file coroutines, gathered at the end
    for i in range(0, 240, 24):
        spider = Spider(kw, start=i)
        # Bug fix: the original never awaited test(), so the coroutine
        # object was always truthy and the early `break` was unreachable;
        # it also seeded the task list with an asyncio.Semaphore, which
        # asyncio.wait() rejects. Await the probe here instead.
        if not loop.run_until_complete(spider.test(None)):
            break
        print(
            'Downloading: {0}.json It costs {1}s'.format(
                str(i // 24 + 1), str(time.time() - start)),)
        writes.append(spider.write_into_file(None))
        counter += 1
    if writes:
        loop.run_until_complete(asyncio.gather(*writes))
    loop.close()
    print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start)))


if __name__ == '__main__':
main()
16 changes: 8 additions & 8 deletions get_json_requests.py → json_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ def get_html(self):
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response
return response.text
except requests.ConnectionError as e:
print(e)
pass

def test(self, response):
result = json.loads(response.text)
result = json.loads(response)
data = result.get('data')
if data:
object_list = data.get('object_list')
Expand All @@ -39,7 +39,7 @@ def test(self, response):
return True

def write_into_file(self, response):
result = json.dumps(json.loads(response.text), indent=4, ensure_ascii=False)
result = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
if not os.path.exists(
os.path.join(os.path.join(DIST_DIR, 'json'), self.kw)):
os.makedirs(os.path.join(os.path.join(DIST_DIR, 'json'), self.kw))
Expand All @@ -52,16 +52,16 @@ def write_into_file(self, response):


def main():
print('Enter the keyowrd: ', end='')
kw = input()
# kw = 'taeyeon'
# print('Enter the keyowrd: ', end='')
# kw = input()
kw = 'taeyeon'
start = time.time()
counter = 0
for i in range(0, 3600, 24):
spider = Spider(kw, start=i)
response = spider.get_html()
contents = spider.test(response)
if contents:
items = spider.test(response)
if items:
print(
'Downloading: {0}.json It costs {1}s'.format(
str(i // 24 + 1), str(time.time() - start)),)
Expand Down
28 changes: 14 additions & 14 deletions spider_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@ def test(self, response):
if not object_list:
return []
else:
for item in object_list:
contents = {}
photo = item.get('photo')
for i in object_list:
items = {}
photo = i.get('photo')
if photo:
path = photo.get('path')
if path:
contents['path'] = path
yield contents
items['path'] = path
yield items

def get_html_2(self, content):
def get_html_2(self, item):
try:
url = content.get('path')
url = item.get('path')
if 'gif_jpeg' in url:
response = requests.get(url[:-5])
if response.status_code == 200:
Expand Down Expand Up @@ -95,17 +95,17 @@ def main():
# kw = 'taeyeon'
start = time.time()
counter = 0
for i in range(0, 960, 24):
for i in range(0, 3600, 24):
spider = Spider(kw, start=i)
response = spider.get_html()
contents = spider.test(response)
if contents:
for content in contents:
format, response = spider.get_html_2(content)
items = spider.test(response)
if items:
for item in items:
format, response = spider.get_html_2(item)
if format == 'gif':
print('Downloading: {0} It costs {1}s.'.format(content['path'][:-5], time.time() - start))
print('Downloading: {0} It costs {1}s.'.format(item['path'][:-5], time.time() - start))
else:
print('Downloading: {0} It costs {1}s.'.format(content['path'], time.time() - start))
print('Downloading: {0} It costs {1}s.'.format(item['path'], time.time() - start))
counter += 1
spider.write_into_file(format, response)
print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start)))
Expand Down

0 comments on commit 4b06273

Please sign in to comment.