From 0ac51895bfd8be6ed3ee7e1db025a9846fd28f75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B2=9B=E9=A3=8E?= <i@inori.co>
Date: Wed, 18 Sep 2019 23:31:16 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84=E5=BC=82=E5=B8=B8=E6=8D=95?=
 =?UTF-8?q?=E8=8E=B7=EF=BC=8C=E7=8E=B0=E5=9C=A8=E5=AE=83=E9=81=87=E5=88=B0?=
 =?UTF-8?q?=E4=BB=BB=E4=BD=95=E6=83=85=E5=86=B5=E9=83=BD=E8=83=BD=E6=AD=A3?=
 =?UTF-8?q?=E5=B8=B8=E8=BF=90=E8=A1=8C=E4=B8=94=E6=AD=A3=E7=A1=AE=E5=8E=BB?=
 =?UTF-8?q?=E9=87=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore |  1 +
 Pixiv.py   | 47 +++++++++++++++++++++++++++++++++++------------
 2 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index e01aa02..744a162 100644
--- a/.gitignore
+++ b/.gitignore
@@ -136,3 +136,4 @@ img/
 .idea/vcs.xml
 .idea/.gitignore
 test.py
+.idea/dictionaries/i.xml
diff --git a/Pixiv.py b/Pixiv.py
index 47fbb73..72ad5fc 100644
--- a/Pixiv.py
+++ b/Pixiv.py
@@ -19,6 +19,7 @@ except:
     import redis
 
 requests.packages.urllib3.disable_warnings()
+error_list = []
 
 
 class PixivSpider(object):
@@ -29,6 +30,9 @@ class PixivSpider(object):
         self.r = redis.Redis(host='localhost', port=6379, decode_responses=True)
 
     def get_list(self, pid):
+        """
+        :param pid: 插画ID
+        """
         response = requests.get(self.ajax_url.format(pid), headers=self.headers, verify=False)
         json_data = response.json()
         list_temp = json_data['body']
@@ -36,37 +40,52 @@ class PixivSpider(object):
             url_tamp = l['urls']['original']
             n = self.r.get(pid)
             if not n:
-                self.get_img(url_tamp)
+                why_not_do = self.get_img(url_tamp)
+                # 判断是否返回异常 如果有异常则取消这个页面的爬取 等待下次
+                if why_not_do == 1:
+                    return pid
             else:
                 print(f'插画ID:{pid}已存在!')
+                break
 
             # with open('pixiv.json', 'a', encoding='utf-8') as f:
             #     f.write(url_tamp + '\n')
             # 导出
 
     def get_img(self, url):
+        """
+
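+        :param url: direct URL of the original image file (not the artwork page)
+        :return: 0 if the file already exists, 1 if all 3 download attempts failed, otherwise None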
+        """
         if not os.path.isdir('./img'):
             os.makedirs('./img')
         file_name = re.findall('/\d+/\d+/\d+/\d+/\d+/\d+/(.*)', url)[0]
         if os.path.isfile(f'./img/{file_name}'):
-            print(f'{file_name}已存在!')
-            return 1
+            print(f'文件:{file_name}已存在,跳过')
+            #  单个文件存在并不能判断是否爬取过
+            return 0
         print(f'开始下载:{file_name}')
         t = 0
         while t < 3:
             try:
                 img_temp = requests.get(url, headers=self.headers, timeout=15, verify=False)
                 break
-            except requests.exceptions.ConnectTimeout:
-                print("连接超时!正在重试!")
-                t += 1
-            except requests.exceptions.ConnectionError:
+            except requests.exceptions.RequestException:
                 print('连接异常!正在重试!')
                 t += 1
+        if t == 3:
+            # 返回异常 取消此次爬取 等待下次
+            return 1
         with open(f'./img/{file_name}', 'wb') as fp:
             fp.write(img_temp.content)
 
     def get_top_url(self, num):
+        """
+
+        :param num: 页码
+        :return:
+        """
         params = {
             'mode': 'daily',
             'content': 'illust',
@@ -81,12 +100,12 @@ class PixivSpider(object):
         for url in self.data:
             illust_id = url['illust_id']
             illust_user = url['user_id']
-            yield illust_id  # 生成PID 、用户ID
+            yield illust_id  # 生成PID
             self.r.set(illust_id, illust_user)
 
     @classmethod
-    def pixiv_spider_go(cls, json_data):
-        cls.data = json_data
+    def pixiv_spider_go(cls, data):
+        cls.data = data
 
     @classmethod
     def pixiv_main(cls):
@@ -107,8 +126,12 @@ class PixivSpider(object):
         print('开始抓取...')
         for i in range(1, 11, 1):  # p站每日排行榜最多为500个
             pixiv.get_top_url(i)
-            for j in pixiv.get_top_pic():  # 接口暂时不想写了 先这样凑合一下吧
-                pixiv.get_list(j)
+            for j in pixiv.get_top_pic():
+                k = pixiv.get_list(j)  # 接口暂时不想写了 先这样凑合一下吧
+                if k:
+                    error_list.append(k)
+        for k in error_list:
+            pixiv.r.delete(k)
 
 
 if __name__ == '__main__':