完善异常捕获,现在它遇到任何情况都能正常运行且正确去重

This commit is contained in:
岛风 2019-09-18 23:31:16 +08:00
parent 420ea4171d
commit 0ac51895bf
2 changed files with 36 additions and 12 deletions

1
.gitignore vendored
View File

@@ -136,3 +136,4 @@ img/
.idea/vcs.xml .idea/vcs.xml
.idea/.gitignore .idea/.gitignore
test.py test.py
.idea/dictionaries/i.xml

View File

@@ -19,6 +19,7 @@ except:
import redis import redis
requests.packages.urllib3.disable_warnings() requests.packages.urllib3.disable_warnings()
error_list = []
class PixivSpider(object): class PixivSpider(object):
@@ -29,6 +30,9 @@ class PixivSpider(object):
self.r = redis.Redis(host='localhost', port=6379, decode_responses=True) self.r = redis.Redis(host='localhost', port=6379, decode_responses=True)
def get_list(self, pid): def get_list(self, pid):
"""
:param pid: 插画ID
"""
response = requests.get(self.ajax_url.format(pid), headers=self.headers, verify=False) response = requests.get(self.ajax_url.format(pid), headers=self.headers, verify=False)
json_data = response.json() json_data = response.json()
list_temp = json_data['body'] list_temp = json_data['body']
@@ -36,37 +40,52 @@ class PixivSpider(object):
url_tamp = l['urls']['original'] url_tamp = l['urls']['original']
n = self.r.get(pid) n = self.r.get(pid)
if not n: if not n:
self.get_img(url_tamp) why_not_do = self.get_img(url_tamp)
# 判断是否返回异常 如果有异常则取消这个页面的爬取 等待下次
if why_not_do == 1:
return pid
else: else:
print(f'插画ID:{pid}已存在!') print(f'插画ID:{pid}已存在!')
break
# with open('pixiv.json', 'a', encoding='utf-8') as f: # with open('pixiv.json', 'a', encoding='utf-8') as f:
# f.write(url_tamp + '\n') # f.write(url_tamp + '\n')
# 导出 # 导出
def get_img(self, url): def get_img(self, url):
"""
:param url: 作品页URL
:return:
"""
if not os.path.isdir('./img'): if not os.path.isdir('./img'):
os.makedirs('./img') os.makedirs('./img')
file_name = re.findall('/\d+/\d+/\d+/\d+/\d+/\d+/(.*)', url)[0] file_name = re.findall('/\d+/\d+/\d+/\d+/\d+/\d+/(.*)', url)[0]
if os.path.isfile(f'./img/{file_name}'): if os.path.isfile(f'./img/{file_name}'):
print(f'{file_name}已存在!') print(f'文件:{file_name}已存在,跳过')
return 1 # 单个文件存在并不能判断是否爬取过
return 0
print(f'开始下载:{file_name}') print(f'开始下载:{file_name}')
t = 0 t = 0
while t < 3: while t < 3:
try: try:
img_temp = requests.get(url, headers=self.headers, timeout=15, verify=False) img_temp = requests.get(url, headers=self.headers, timeout=15, verify=False)
break break
except requests.exceptions.ConnectTimeout: except requests.exceptions.RequestException:
print("连接超时!正在重试!")
t += 1
except requests.exceptions.ConnectionError:
print('连接异常!正在重试!') print('连接异常!正在重试!')
t += 1 t += 1
if t == 3:
# 返回异常 取消此次爬取 等待下次
return 1
with open(f'./img/{file_name}', 'wb') as fp: with open(f'./img/{file_name}', 'wb') as fp:
fp.write(img_temp.content) fp.write(img_temp.content)
def get_top_url(self, num): def get_top_url(self, num):
"""
:param num: 页码
:return:
"""
params = { params = {
'mode': 'daily', 'mode': 'daily',
'content': 'illust', 'content': 'illust',
@@ -81,12 +100,12 @@ class PixivSpider(object):
for url in self.data: for url in self.data:
illust_id = url['illust_id'] illust_id = url['illust_id']
illust_user = url['user_id'] illust_user = url['user_id']
yield illust_id # 生成PID 、用户ID yield illust_id # 生成PID
self.r.set(illust_id, illust_user) self.r.set(illust_id, illust_user)
@classmethod @classmethod
def pixiv_spider_go(cls, json_data): def pixiv_spider_go(cls, data):
cls.data = json_data cls.data = data
@classmethod @classmethod
def pixiv_main(cls): def pixiv_main(cls):
@@ -107,8 +126,12 @@ class PixivSpider(object):
print('开始抓取...') print('开始抓取...')
for i in range(1, 11, 1): # p站每日排行榜最多为500个 for i in range(1, 11, 1): # p站每日排行榜最多为500个
pixiv.get_top_url(i) pixiv.get_top_url(i)
for j in pixiv.get_top_pic(): # 接口暂时不想写了 先这样凑合一下吧 for j in pixiv.get_top_pic():
pixiv.get_list(j) k = pixiv.get_list(j) # 接口暂时不想写了 先这样凑合一下吧
if k:
error_list.append(k)
for k in error_list:
pixiv.r.delete(k)
if __name__ == '__main__': if __name__ == '__main__':