Skip to content

Commit

Permalink
add json_aiohttp.py
Browse files Browse the repository at this point in the history
  • Loading branch information
venusing1998 committed Jan 2, 2019
1 parent c790b7b commit 4b06273
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ trim_trailing_whitespace = true

[*.py]
indent_style = space
indent_style = space
indent_size = 4
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
"python.pythonPath": "/Users/chris/.local/share/virtualenvs/spider1-v-JY5EfN/bin/python"
"python.pythonPath": "/Users/chris/.local/share/virtualenvs/spider_duitang--pJTz49S/bin/python"
}
3 changes: 3 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ autopep8 = "*"
[packages]
requests = "*"
scrapy = "*"
aiohttp = "*"
aiofiles = "*"
beautifulsoup4 = "*"

[requires]
python_version = "3.7"
111 changes: 110 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

88 changes: 88 additions & 0 deletions json_aiohttp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import asyncio
import json
import os
import time

import aiofiles
import aiohttp

# Absolute path of the directory containing this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Output root: downloaded JSON pages are written under <DIST_DIR>/json/<keyword>/.
DIST_DIR = os.path.join(BASE_DIR, 'dist')


class Spider:
    """Fetch one page of duitang.com search results as JSON and save it.

    Each instance targets a single page of 24 results: ``kw`` is the
    search keyword and ``start`` the result offset of the page
    (page number = start // 24 + 1).
    """

    def __init__(self, kw, start=0):
        self.kw = kw        # search keyword
        self.start = start  # result offset into the search results

    async def get_html(self):
        """Fetch the raw JSON text for this page.

        Returns:
            str | None: the response body, or ``None`` on a non-200
            status or a connection error (the error is printed, not
            raised).
        """
        url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={0}&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id&_type=&start={1}'.format(
            self.kw, self.start)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        return await response.text()
        except aiohttp.ClientConnectionError as e:
            print(e)
        # Non-200 status or connection error: make the None explicit.
        return None

    async def test(self, response=None):
        """Return True if this page contains any results, falsy otherwise.

        NOTE: the ``response`` argument is ignored (kept for backward
        compatibility with existing callers); the page is fetched here.
        """
        # Bug fix: was `await Spider.get_html(self)` — call through self.
        response = await self.get_html()
        if response is None:
            return None  # fetch failed; treat as "no results" so callers stop
        result = json.loads(response)
        data = result.get('data')
        if data:
            object_list = data.get('object_list')
            if not object_list:
                return None
            return True
        return None

    async def write_into_file(self, response=None):
        """Pretty-print this page's JSON to dist/json/<kw>/<page>.json.

        The ``response`` argument is ignored (kept for backward
        compatibility); the page is fetched here. Does nothing if the
        fetch fails.
        """
        response = await self.get_html()
        if response is None:
            return  # fetch failed; nothing to write
        result = json.dumps(json.loads(response),
                            indent=4, ensure_ascii=False)
        # Bug fix: the directory was created under the absolute DIST_DIR
        # but the file was opened at a CWD-relative 'dist/json/...' path;
        # use the same absolute directory for both, and let makedirs
        # tolerate an existing directory instead of racing on exists().
        out_dir = os.path.join(DIST_DIR, 'json', self.kw)
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(
            out_dir, '{0}.json'.format(self.start // 24 + 1))
        async with aiofiles.open(out_path, 'w', encoding='utf-8') as f:
            await f.write(result)


def main():
    """Download up to 10 result pages for a fixed keyword.

    Each page is probed sequentially (stopping at the first empty page,
    since pages past the end of the results are empty), and the file
    writes are then run concurrently on a single event loop.
    """
    # kw = input('Enter the keyword: ')  # interactive alternative
    kw = 'taeyeon'
    start = time.time()
    counter = 0
    loop = asyncio.get_event_loop()
    writes = []  # write_into_file coroutines, gathered at the end
    for i in range(0, 240, 24):
        spider = Spider(kw, start=i)
        # Bug fix: the original never awaited test(), so the coroutine
        # object was always truthy and the early `break` was unreachable;
        # it also seeded the task list with an asyncio.Semaphore, which
        # asyncio.wait() rejects. Await the probe here instead.
        if not loop.run_until_complete(spider.test(None)):
            break
        print(
            'Downloading: {0}.json It costs {1}s'.format(
                str(i // 24 + 1), str(time.time() - start)),)
        writes.append(spider.write_into_file(None))
        counter += 1
    if writes:
        loop.run_until_complete(asyncio.gather(*writes))
    loop.close()
    print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start)))


if __name__ == '__main__':
main()
16 changes: 8 additions & 8 deletions get_json_requests.py → json_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ def get_html(self):
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response
return response.text
except requests.ConnectionError as e:
print(e)
pass

def test(self, response):
result = json.loads(response.text)
result = json.loads(response)
data = result.get('data')
if data:
object_list = data.get('object_list')
Expand All @@ -39,7 +39,7 @@ def test(self, response):
return True

def write_into_file(self, response):
result = json.dumps(json.loads(response.text), indent=4, ensure_ascii=False)
result = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
if not os.path.exists(
os.path.join(os.path.join(DIST_DIR, 'json'), self.kw)):
os.makedirs(os.path.join(os.path.join(DIST_DIR, 'json'), self.kw))
Expand All @@ -52,16 +52,16 @@ def write_into_file(self, response):


def main():
print('Enter the keyowrd: ', end='')
kw = input()
# kw = 'taeyeon'
# print('Enter the keyowrd: ', end='')
# kw = input()
kw = 'taeyeon'
start = time.time()
counter = 0
for i in range(0, 3600, 24):
spider = Spider(kw, start=i)
response = spider.get_html()
contents = spider.test(response)
if contents:
items = spider.test(response)
if items:
print(
'Downloading: {0}.json It costs {1}s'.format(
str(i // 24 + 1), str(time.time() - start)),)
Expand Down
28 changes: 14 additions & 14 deletions spider_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@ def test(self, response):
if not object_list:
return []
else:
for item in object_list:
contents = {}
photo = item.get('photo')
for i in object_list:
items = {}
photo = i.get('photo')
if photo:
path = photo.get('path')
if path:
contents['path'] = path
yield contents
items['path'] = path
yield items

def get_html_2(self, content):
def get_html_2(self, item):
try:
url = content.get('path')
url = item.get('path')
if 'gif_jpeg' in url:
response = requests.get(url[:-5])
if response.status_code == 200:
Expand Down Expand Up @@ -95,17 +95,17 @@ def main():
# kw = 'taeyeon'
start = time.time()
counter = 0
for i in range(0, 960, 24):
for i in range(0, 3600, 24):
spider = Spider(kw, start=i)
response = spider.get_html()
contents = spider.test(response)
if contents:
for content in contents:
format, response = spider.get_html_2(content)
items = spider.test(response)
if items:
for item in items:
format, response = spider.get_html_2(item)
if format == 'gif':
print('Downloading: {0} It costs {1}s.'.format(content['path'][:-5], time.time() - start))
print('Downloading: {0} It costs {1}s.'.format(item['path'][:-5], time.time() - start))
else:
print('Downloading: {0} It costs {1}s.'.format(content['path'], time.time() - start))
print('Downloading: {0} It costs {1}s.'.format(item['path'], time.time() - start))
counter += 1
spider.write_into_file(format, response)
print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start)))
Expand Down

0 comments on commit 4b06273

Please sign in to comment.