From 4b06273654afc0dab5b602871603c733e9a0776d Mon Sep 17 00:00:00 2001 From: chris Date: Wed, 2 Jan 2019 18:10:57 +0800 Subject: [PATCH] add json_aiohttp.py --- .editorconfig | 2 +- .vscode/settings.json | 2 +- Pipfile | 3 + Pipfile.lock | 111 ++++++++++++++++++++++- json_aiohttp.py | 88 ++++++++++++++++++ get_json_requests.py => json_requests.py | 16 ++-- spider_requests.py | 28 +++--- 7 files changed, 225 insertions(+), 25 deletions(-) create mode 100644 json_aiohttp.py rename get_json_requests.py => json_requests.py (87%) diff --git a/.editorconfig b/.editorconfig index 29d7c6e..06903a4 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,4 +8,4 @@ trim_trailing_whitespace = true [*.py] indent_style = space -indent_style = space +indent_size = 4 diff --git a/.vscode/settings.json b/.vscode/settings.json index cac1f49..5c3a966 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,3 @@ { - "python.pythonPath": "/Users/chris/.local/share/virtualenvs/spider1-v-JY5EfN/bin/python" + "python.pythonPath": "/Users/chris/.local/share/virtualenvs/spider_duitang--pJTz49S/bin/python" } \ No newline at end of file diff --git a/Pipfile b/Pipfile index be74d81..3452a58 100644 --- a/Pipfile +++ b/Pipfile @@ -10,6 +10,9 @@ autopep8 = "*" [packages] requests = "*" scrapy = "*" +aiohttp = "*" +aiofiles = "*" +beautifulsoup4 = "*" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 7aef5d7..6629604 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "13a448d78382996dae90e9e369723f00b9e3713ee27e54ce87bc210556c3e577" + "sha256": "2681f0e0c92e00db30a091ae615ca40f433df3f20cc2365a88023dace9ce1b8a" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,42 @@ ] }, "default": { + "aiofiles": { + "hashes": [ + "sha256:021ea0ba314a86027c166ecc4b4c07f2d40fc0f4b3a950d1868a0f2571c2bbee", + "sha256:1e644c2573f953664368de28d2aa4c89dfd64550429d0c27c4680ccd3aa4985d" + ], + "index": "aliyun", + "version": "==0.4.0" + }, + "aiohttp": { + "hashes": [ + "sha256:0bbaec0b171b1ea77d34bc7c49db71a15e511ef34c45065fd2c7fad8daf1483f", + "sha256:168f0ecc91200784467479765eb26a80d6d9cf0025b8a9cc5e501413812d32e7", + "sha256:3011371a48fdef061a8669b6636306b33cf2bf621e1960513c6ce70449f7cd3d", + "sha256:310c95f1da5f92e937b136e55c2013e4bccd1b53bc88780256ba8ed75699dbdb", + "sha256:359baeea2ca640e0dde31a03c3bf3d3008bcbd136c6b1768b58a3499a46a6cc2", + "sha256:5202ac2d00226f0b2990af9f3301c1ba5eebb673ae0a0acfe499eaea8a1b23ad", + "sha256:53fc0ad2e8d8f2f0c87bdc3009784de61f5dd9a4259f67301b317525eedc3ed5", + "sha256:55355947c4fe4b37d2a51b8f1d3f36f7fca541cf012031225be836d1f743c011", + "sha256:5691c630435fd6bd09a789de9ffd5a61b812445dfd515525c738a97d4f9b550a", + "sha256:6739494376c90806cbb88e7ea2c9e2c35949e6c7089507d19e8f489170a26156", + "sha256:a68232a60b8c1a822c4ac4096bfb42b4f873ac7dcef265642223690220b5af4f", + "sha256:af664f067d3c905f4f44d724e65406ed95dd2b4adfcc3d23a9203320ce497950", + "sha256:b9def7acd7c84ca86d0c3247e83180782c423d0e8a68254718fcc69e521570da", + "sha256:bb96d5e0a82f67a04cde32f970ca837fbcf7ef44124170bc5e34f26c0ed92f7d", + "sha256:c115744b2a0bf666fd8cde52a6d3e9319ffeb486009579743f5adfdcf0bf0773", + "sha256:c642901f6c53b965785e57a597229dd87910991b3e2d8aecf552da7d48cfe170", + "sha256:c9b47b2ee669b2f01824e0f3b364a8cdfab8d40df1b5987c7c2103d3e13ec9e9", + "sha256:dd07976a2f2615d4f2ed3654b24e53fe837708602c00934ce1e963690c91c933", + "sha256:e3b29248c9180fd6a30619b2714c534e3165e523a568296250337fe8952d39b8", + "sha256:ed65392135299698b0ebff4ee53ccf19d5c7c12077652a7faab05db369eb3996", + "sha256:f438eab30868997407b73814ba097b80862d6d5bc5f7f2fda384e60df769777b", + "sha256:f73d6a3e711f26be58bfa13a65a425638fa9d3f4a081eebff0eb70e42fee40a8" + ], + "index": "aliyun", + "version": "==3.5.1" + }, "asn1crypto": { "hashes": [ "sha256:2f1adbb7546ed199e3c90ef23ec95c5cf3585bac7d11fb7eb562a3fe89c64e87", @@ -23,6 +59,13 @@ ], "version": "==0.24.0" }, + "async-timeout": { + "hashes": [ + "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f", + "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3" + ], + "version": "==3.0.1" + }, "attrs": { "hashes": [ "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69", @@ -37,6 +80,15 @@ ], "version": "==0.7.0" }, + "beautifulsoup4": { + "hashes": [ + "sha256:1ed70a0e99742653953d68462378a1a8eb65dca5f7c8fa44a05a2a0b3545df67", + "sha256:6a7f5e0efc563cd1ffeefba6d528b97aa0d313c02dd126ba6c455e5fe5bd48eb", + "sha256:e394827904cc4923f443e8dd2e9968343669c8e1ad7a8d62d7541e780884acb8" + ], + "index": "aliyun", + "version": "==4.7.0" + }, "certifi": { "hashes": [ "sha256:47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", @@ -182,6 +234,40 @@ ], "version": "==4.2.5" }, + "multidict": { + "hashes": [ + "sha256:024b8129695a952ebd93373e45b5d341dbb87c17ce49637b34000093f243dd4f", + "sha256:041e9442b11409be5e4fc8b6a97e4bcead758ab1e11768d1e69160bdde18acc3", + "sha256:045b4dd0e5f6121e6f314d81759abd2c257db4634260abcfe0d3f7083c4908ef", + "sha256:047c0a04e382ef8bd74b0de01407e8d8632d7d1b4db6f2561106af812a68741b", + "sha256:068167c2d7bbeebd359665ac4fff756be5ffac9cda02375b5c5a7c4777038e73", + "sha256:148ff60e0fffa2f5fad2eb25aae7bef23d8f3b8bdaf947a65cdbe84a978092bc", + "sha256:1d1c77013a259971a72ddaa83b9f42c80a93ff12df6a4723be99d858fa30bee3", + "sha256:1d48bc124a6b7a55006d97917f695effa9725d05abe8ee78fd60d6588b8344cd", + "sha256:31dfa2fc323097f8ad7acd41aa38d7c614dd1960ac6681745b6da124093dc351", + "sha256:34f82db7f80c49f38b032c5abb605c458bac997a6c3142e0d6c130be6fb2b941", + "sha256:3d5dd8e5998fb4ace04789d1d008e2bb532de501218519d70bb672c4c5a2fc5d", + "sha256:4a6ae52bd3ee41ee0f3acf4c60ceb3f44e0e3bc52ab7da1c2b2aa6703363a3d1", + "sha256:4b02a3b2a2f01d0490dd39321c74273fed0568568ea0e7ea23e02bd1fb10a10b", + "sha256:4b843f8e1dd6a3195679d9838eb4670222e8b8d01bc36c9894d6c3538316fa0a", + "sha256:5de53a28f40ef3c4fd57aeab6b590c2c663de87a5af76136ced519923d3efbb3", + "sha256:61b2b33ede821b94fa99ce0b09c9ece049c7067a33b279f343adfe35108a4ea7", + "sha256:6a3a9b0f45fd75dc05d8e93dc21b18fc1670135ec9544d1ad4acbcf6b86781d0", + "sha256:76ad8e4c69dadbb31bad17c16baee61c0d1a4a73bed2590b741b2e1a46d3edd0", + "sha256:7ba19b777dc00194d1b473180d4ca89a054dd18de27d0ee2e42a103ec9b7d014", + "sha256:7c1b7eab7a49aa96f3db1f716f0113a8a2e93c7375dd3d5d21c4941f1405c9c5", + "sha256:7fc0eee3046041387cbace9314926aa48b681202f8897f8bff3809967a049036", + "sha256:8ccd1c5fff1aa1427100ce188557fc31f1e0a383ad8ec42c559aabd4ff08802d", + "sha256:8e08dd76de80539d613654915a2f5196dbccc67448df291e69a88712ea21e24a", + "sha256:c18498c50c59263841862ea0501da9f2b3659c00db54abfbf823a80787fde8ce", + "sha256:c49db89d602c24928e68c0d510f4fcf8989d77defd01c973d6cbe27e684833b1", + "sha256:ce20044d0317649ddbb4e54dab3c1bcc7483c78c27d3f58ab3d0c7e6bc60d26a", + "sha256:d1071414dd06ca2eafa90c85a079169bfeb0e5f57fd0b45d44c092546fcd6fd9", + "sha256:d3be11ac43ab1a3e979dac80843b42226d5d3cccd3986f2e03152720a4297cd7", + "sha256:db603a1c235d110c860d5f39988ebc8218ee028f07a7cbc056ba6424372ca31b" + ], + "version": "==4.5.2" + }, "parsel": { "hashes": [ "sha256:493a9214acbdcb4487a084d95344c25e85e90426a67311ea0425dc5df8dc24b9", @@ -267,6 +353,13 @@ ], "version": "==1.12.0" }, + "soupsieve": { + "hashes": [ + "sha256:29bf6f6d4a641eb0a172c1a95bbc4466009cceaceb747a6e425efd7d0b5fbea2", + "sha256:7a89864492e82b92eac0097f966f30f5f96597ba1af3c3b5385b41bc92cebe55" + ], + "version": "==1.6" + }, "twisted": { "hashes": [ "sha256:294be2c6bf84ae776df2fc98e7af7d6537e1c5e60a46d33c3ce2a197677da395" @@ -287,6 +380,22 @@ ], "version": "==1.19.0" }, + "yarl": { + "hashes": [ + "sha256:024ecdc12bc02b321bc66b41327f930d1c2c543fa9a561b39861da9388ba7aa9", + "sha256:2f3010703295fbe1aec51023740871e64bb9664c789cba5a6bdf404e93f7568f", + "sha256:3890ab952d508523ef4881457c4099056546593fa05e93da84c7250516e632eb", + "sha256:3e2724eb9af5dc41648e5bb304fcf4891adc33258c6e14e2a7414ea32541e320", + "sha256:5badb97dd0abf26623a9982cd448ff12cb39b8e4c94032ccdedf22ce01a64842", + "sha256:73f447d11b530d860ca1e6b582f947688286ad16ca42256413083d13f260b7a0", + "sha256:7ab825726f2940c16d92aaec7d204cfc34ac26c0040da727cf8ba87255a33829", + "sha256:b25de84a8c20540531526dfbb0e2d2b648c13fd5dd126728c496d7c3fea33310", + "sha256:c6e341f5a6562af74ba55205dbd56d248daf1b5748ec48a0200ba227bb9e33f4", + "sha256:c9bb7c249c4432cd47e75af3864bc02d26c9594f49c82e2a28624417f0ae63b8", + "sha256:e060906c0c585565c718d1c3841747b61c5439af2211e185f6739a9412dfbde1" + ], + "version": "==1.3.0" + }, "zope.interface": { "hashes": [ "sha256:086707e0f413ff8800d9c4bc26e174f7ee4c9c8b0302fbad68d083071822316c", diff --git a/json_aiohttp.py b/json_aiohttp.py new file mode 100644 index 0000000..c7fd04d --- /dev/null +++ b/json_aiohttp.py @@ -0,0 +1,88 @@ +import asyncio +import json +import os +import time + +import aiofiles +import aiohttp + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +DIST_DIR = os.path.join(BASE_DIR, 'dist') + + +class Spider: + def __init__(self, kw, start=0): + self.kw = kw + self.start = start + + async def get_html(self): + url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={0}&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id&_type=&start={1}'.format( + self.kw, self.start) + headers = { + 'User-Agent': + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' + } + try: + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=headers) as response: + if response.status == 200: + return await response.text() + except aiohttp.ClientConnectionError as e: + print(e) + pass + + async def test(self, response): + response = await Spider.get_html(self) + result = json.loads(response) + data = result.get('data') + if data: + object_list = data.get('object_list') + if not object_list: + return None + else: + return True + + async def write_into_file(self, response): + response = await Spider.get_html(self) + result = json.dumps(json.loads(response), + indent=4, ensure_ascii=False) + if not os.path.exists( + os.path.join(os.path.join(DIST_DIR, 'json'), self.kw)): + os.makedirs(os.path.join(os.path.join(DIST_DIR, 'json'), self.kw)) + async with aiofiles.open( + 'dist/json/{0}/{1}.json'.format(self.kw, + int(self.start / 24) + 1), + 'w', + encoding='utf-8') as f: + await f.write(result) + + +def main(): + # print('Enter the keyowrd: ', end='') + # kw = input() + kw = 'taeyeon' + start = time.time() + counter = 0 + tasks = [asyncio.Semaphore(500)] + loop = asyncio.get_event_loop() + for i in range(0, 240, 24): + spider = Spider(kw, start=i) + response = spider.get_html() + tasks.append(response) + items = spider.test(response) + tasks.append(items) + if items: + print( + 'Downloading: {0}.json It costs {1}s'.format( + str(i // 24 + 1), str(time.time() - start)),) + tasks.append(spider.write_into_file(response)) + counter += 1 + else: + break + loop.run_until_complete(asyncio.wait(tasks)) + loop.close() + print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start))) + + +if __name__ == '__main__': + main() diff --git a/get_json_requests.py b/json_requests.py similarity index 87% rename from get_json_requests.py rename to json_requests.py index 452221c..629fb86 100644 --- a/get_json_requests.py +++ b/json_requests.py @@ -23,13 +23,13 @@ def get_html(self): try: response = requests.get(url, headers=headers) if response.status_code == 200: - return response + return response.text except requests.ConnectionError as e: print(e) pass def test(self, response): - result = json.loads(response.text) + result = json.loads(response) data = result.get('data') if data: object_list = data.get('object_list') @@ -39,7 +39,7 @@ def test(self, response): return True def write_into_file(self, response): - result = json.dumps(json.loads(response.text), indent=4, ensure_ascii=False) + result = json.dumps(json.loads(response), indent=4, ensure_ascii=False) if not os.path.exists( os.path.join(os.path.join(DIST_DIR, 'json'), self.kw)): os.makedirs(os.path.join(os.path.join(DIST_DIR, 'json'), self.kw)) @@ -52,16 +52,16 @@ def write_into_file(self, response): def main(): - print('Enter the keyowrd: ', end='') - kw = input() - # kw = 'taeyeon' + # print('Enter the keyowrd: ', end='') + # kw = input() + kw = 'taeyeon' start = time.time() counter = 0 for i in range(0, 3600, 24): spider = Spider(kw, start=i) response = spider.get_html() - contents = spider.test(response) - if contents: + items = spider.test(response) + if items: print( 'Downloading: {0}.json It costs {1}s'.format( str(i // 24 + 1), str(time.time() - start)),) diff --git a/spider_requests.py b/spider_requests.py index 1201aff..e09a470 100644 --- a/spider_requests.py +++ b/spider_requests.py @@ -23,18 +23,18 @@ def test(self, response): if not object_list: return [] else: - for item in object_list: - contents = {} - photo = item.get('photo') + for i in object_list: + items = {} + photo = i.get('photo') if photo: path = photo.get('path') if path: - contents['path'] = path - yield contents + items['path'] = path + yield items - def get_html_2(self, content): + def get_html_2(self, item): try: - url = content.get('path') + url = item.get('path') if 'gif_jpeg' in url: response = requests.get(url[:-5]) if response.status_code == 200: @@ -95,17 +95,17 @@ def main(): # kw = 'taeyeon' start = time.time() counter = 0 - for i in range(0, 960, 24): + for i in range(0, 3600, 24): spider = Spider(kw, start=i) response = spider.get_html() - contents = spider.test(response) - if contents: - for content in contents: - format, response = spider.get_html_2(content) + items = spider.test(response) + if items: + for item in items: + format, response = spider.get_html_2(item) if format == 'gif': - print('Downloading: {0} It costs {1}s.'.format(content['path'][:-5], time.time() - start)) + print('Downloading: {0} It costs {1}s.'.format(item['path'][:-5], time.time() - start)) else: - print('Downloading: {0} It costs {1}s.'.format(content['path'], time.time() - start)) + print('Downloading: {0} It costs {1}s.'.format(item['path'], time.time() - start)) counter += 1 spider.write_into_file(format, response) print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start)))