"""
|
|
|
|
|
P站小爬虫 爬每日排行榜
|
|
|
|
|
环境需求:Python3.6+ / Redis
|
|
|
|
|
项目地址:https://github.com/nyaasuki/PixivSpider
|
|
|
|
|
|
|
|
|
|
"""
import re
import os
try:
    import requests
    import redis
except ImportError:
    print('Missing required packages! Trying to install them...')
    os.system(r'pip install -r requirements.txt')
    import requests
    import redis

# Suppress the InsecureRequestWarning triggered by the verify=False requests below.
requests.packages.urllib3.disable_warnings()
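
# A minimal sketch for checking that Redis is reachable before crawling
# (optional; not part of the original flow):
#
#   r = redis.Redis(host='localhost', port=6379, decode_responses=True)
#   r.ping()  # raises redis.exceptions.ConnectionError if the server is down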

error_list = []  # PIDs whose download failed; their Redis keys are removed at the end so the next run retries them


class PixivSpider(object):

    def __init__(self):
        self.ajax_url = 'https://www.pixiv.net/ajax/illust/{}/pages'  # format with the illustration ID
        self.top_url = 'https://www.pixiv.net/ranking.php'
        self.r = redis.Redis(host='localhost', port=6379, decode_responses=True)

    def get_list(self, pid):
        """Fetch every page of an illustration and download anything new.

        :param pid: illustration ID
        :return: the pid if a download failed and should be retried, otherwise None
        """
        response = requests.get(self.ajax_url.format(pid), headers=self.headers, verify=False)
        json_data = response.json()
        list_temp = json_data['body']
        for page in list_temp:
            url_temp = page['urls']['original']
            n = self.r.get(pid)
            if not n:
                why_not_do = self.get_img(url_temp)
                # If get_img reports an error, abandon this illustration and let the next run retry it.
                if why_not_do == 1:
                    return pid
            else:
                print(f'Illustration ID {pid} already exists!')
                break
        # with open('pixiv.json', 'a', encoding='utf-8') as f:
        #     f.write(url_temp + '\n')
        # export the URLs instead of downloading
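
    # For reference, the pages endpoint responds with JSON shaped roughly like
    # (a sketch of only the fields used above, not the full schema; the URL is illustrative):
    #
    #   {"error": false,
    #    "body": [{"urls": {"original": "https://i.pximg.net/img-original/img/2019/09/16/00/00/00/12345678_p0.png"}}]}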

    def get_img(self, url):
        """Download one image file into ./img.

        :param url: URL of the original image
        :return: 0 if the file already exists, 1 if the download failed, otherwise None
        """
        if not os.path.isdir('./img'):
            os.makedirs('./img')
        file_name = re.findall(r'/\d+/\d+/\d+/\d+/\d+/\d+/(.*)', url)[0]
        if os.path.isfile(f'./img/{file_name}'):
            print(f'File {file_name} already exists, skipping')
            # One existing file does not prove the whole work was already crawled.
            return 0
        print(f'Downloading: {file_name}')
        t = 0
        while t < 3:
            try:
                img_temp = requests.get(url, headers=self.headers, timeout=15, verify=False)
                break
            except requests.exceptions.RequestException:
                print('Connection error! Retrying!')
                t += 1
        if t == 3:
            # Report the failure so the caller cancels this crawl and retries next run.
            return 1
        with open(f'./img/{file_name}', 'wb') as fp:
            fp.write(img_temp.content)
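
    # The original-image URLs carry six numeric date/time path segments, e.g. (illustrative):
    #   https://i.pximg.net/img-original/img/2019/09/16/01/23/45/76543210_p0.jpg
    # so the file-name regex in get_img captures the trailing '76543210_p0.jpg'.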

    def get_top_url(self, num):
        """Fetch one page of the daily ranking and stash its contents.

        :param num: ranking page number
        :return:
        """
        params = {
            'mode': 'daily',
            'content': 'illust',
            'p': f'{num}',
            'format': 'json'
        }
        response = requests.get(self.top_url, params=params, headers=self.headers, verify=False)
        json_data = response.json()
        self.pixiv_spider_go(json_data['contents'])
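
    # ranking.php?format=json returns roughly {"contents": [{"illust_id": ..., "user_id": ..., ...}, ...]};
    # only illust_id and user_id are consumed in get_top_pic below (a sketch, not the full schema).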

    def get_top_pic(self):
        for work in self.data:
            illust_id = work['illust_id']
            illust_user = work['user_id']
            yield illust_id  # yield the PID to the caller
            self.r.set(illust_id, illust_user)  # recorded once the caller resumes the generator

    @classmethod
    def pixiv_spider_go(cls, data):
        cls.data = data

    @classmethod
    def pixiv_main(cls):
        # NOTE: this classmethod uses the module-level `pixiv` instance created under __main__.
        cookie = pixiv.r.get('cookie')
        if not cookie:
            cookie = input('Please enter a cookie: ')
            pixiv.r.set('cookie', cookie)
        cls.headers = {
            'accept': 'application/json',
            'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6',
            'dnt': '1',
            'cookie': f'{cookie}',
            'referer': 'https://www.pixiv.net/',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
        }
        print('Starting crawl...')
        for i in range(1, 11):  # the Pixiv daily ranking holds at most 500 entries (10 pages)
            pixiv.get_top_url(i)
            for j in pixiv.get_top_pic():
                k = pixiv.get_list(j)  # a cleaner interface can come later; this will do for now
                if k:
                    error_list.append(k)
        # get_top_pic recorded every yielded PID in Redis; delete the failed ones so the next run retries them.
        for k in error_list:
            pixiv.r.delete(k)


if __name__ == '__main__':
    pixiv = PixivSpider()
    pixiv.pixiv_main()
    # for id_url in pixiv.get_list():
    #     pixiv.get_img(id_url)