From 0ac51895bfd8be6ed3ee7e1db025a9846fd28f75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B2=9B=E9=A3=8E?= <i@inori.co> Date: Wed, 18 Sep 2019 23:31:16 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84=E5=BC=82=E5=B8=B8=E6=8D=95?= =?UTF-8?q?=E8=8E=B7=EF=BC=8C=E7=8E=B0=E5=9C=A8=E5=AE=83=E9=81=87=E5=88=B0?= =?UTF-8?q?=E4=BB=BB=E4=BD=95=E6=83=85=E5=86=B5=E9=83=BD=E8=83=BD=E6=AD=A3?= =?UTF-8?q?=E5=B8=B8=E8=BF=90=E8=A1=8C=E4=B8=94=E6=AD=A3=E7=A1=AE=E5=8E=BB?= =?UTF-8?q?=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + Pixiv.py | 47 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index e01aa02..744a162 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,4 @@ img/ .idea/vcs.xml .idea/.gitignore test.py +.idea/dictionaries/i.xml diff --git a/Pixiv.py b/Pixiv.py index 47fbb73..72ad5fc 100644 --- a/Pixiv.py +++ b/Pixiv.py @@ -19,6 +19,7 @@ except: import redis requests.packages.urllib3.disable_warnings() +error_list = [] class PixivSpider(object): @@ -29,6 +30,9 @@ class PixivSpider(object): self.r = redis.Redis(host='localhost', port=6379, decode_responses=True) def get_list(self, pid): + """ + :param pid: 插画ID + """ response = requests.get(self.ajax_url.format(pid), headers=self.headers, verify=False) json_data = response.json() list_temp = json_data['body'] @@ -36,37 +40,52 @@ class PixivSpider(object): url_tamp = l['urls']['original'] n = self.r.get(pid) if not n: - self.get_img(url_tamp) + why_not_do = self.get_img(url_tamp) + # 判断是否返回异常 如果有异常则取消这个页面的爬取 等待下次 + if why_not_do == 1: + return pid else: print(f'插画ID:{pid}已存在!') + break # with open('pixiv.json', 'a', encoding='utf-8') as f: # f.write(url_tamp + '\n') # 导出 def get_img(self, url): + """ + + :param url: 作品页URL + :return: + """ if not os.path.isdir('./img'): os.makedirs('./img') file_name = re.findall('/\d+/\d+/\d+/\d+/\d+/\d+/(.*)', url)[0] if os.path.isfile(f'./img/{file_name}'): - print(f'{file_name}已存在!') - return 1 + print(f'文件:{file_name}已存在,跳过') + # 单个文件存在并不能判断是否爬取过 + return 0 print(f'开始下载:{file_name}') t = 0 while t < 3: try: img_temp = requests.get(url, headers=self.headers, timeout=15, verify=False) break - except requests.exceptions.ConnectTimeout: - print("连接超时!正在重试!") - t += 1 - except requests.exceptions.ConnectionError: + except requests.exceptions.RequestException: print('连接异常!正在重试!') t += 1 + if t == 3: + # 返回异常 取消此次爬取 等待下次 + return 1 with open(f'./img/{file_name}', 'wb') as fp: fp.write(img_temp.content) def get_top_url(self, num): + """ + + :param num: 页码 + :return: + """ params = { 'mode': 'daily', 'content': 'illust', @@ -81,12 +100,12 @@ class PixivSpider(object): for url in self.data: illust_id = url['illust_id'] illust_user = url['user_id'] - yield illust_id # 生成PID 、用户ID + yield illust_id # 生成PID self.r.set(illust_id, illust_user) @classmethod - def pixiv_spider_go(cls, json_data): - cls.data = json_data + def pixiv_spider_go(cls, data): + cls.data = data @classmethod def pixiv_main(cls): @@ -107,8 +126,12 @@ class PixivSpider(object): print('开始抓取...') for i in range(1, 11, 1): # p站每日排行榜最多为500个 pixiv.get_top_url(i) - for j in pixiv.get_top_pic(): # 接口暂时不想写了 先这样凑合一下吧 - pixiv.get_list(j) + for j in pixiv.get_top_pic(): + k = pixiv.get_list(j) # 接口暂时不想写了 先这样凑合一下吧 + if k: + error_list.append(k) + for k in error_list: + pixiv.r.delete(k) if __name__ == '__main__':