PixivSpider/pixiv_spider.py

"""
Pixiv爬虫 - 每日排行榜下载
环境需求Python3.8+ / Redis
"""
from typing import Generator, List, Dict, Any
import requests
from rich.console import Console
from rich.progress import (
    Progress,
    BarColumn,
    TaskProgressColumn,
    TextColumn,
    SpinnerColumn
)
from rich.live import Live
from rich.layout import Layout
from rich.panel import Panel
from rich.console import Group
from config import PIXIV_CONFIG
from redis_client import RedisClient
from pixiv_download import PixivDownloader

requests.packages.urllib3.disable_warnings()


class PixivSpider:
    """Pixiv daily ranking spider."""

    TOTAL_IMAGES = 500  # total number of images in the daily ranking

    def __init__(self, db: int = 0):
        """
        Initialize the spider.

        Args:
            db: Redis database number (0-5)
        """
        # Set up Redis
        self.redis = RedisClient()
        if not self.redis.select_db(db):
            raise ValueError(f"Invalid Redis database number: {db}")
        # Set up the UI components
        self.console = Console()
        self._setup_ui()
        # Initialize state
        self.headers = None
        self.current_ranking_data = []
        self.failed_works = []

    def _setup_ui(self) -> None:
        """Set up the Rich UI components."""
        # Create the layout: log panel on top, progress bar below
        self.layout = Layout()
        self.layout.split(
            Layout(name="PixivSpider", ratio=8),
            Layout(name="progress", ratio=2)
        )
        # Create the progress bar
        self.progress = Progress(
            TextColumn("[bold blue]{task.description}"),
            BarColumn(bar_width=40),
            TaskProgressColumn(),
            TextColumn("{task.fields[speed]}"),
            console=Console(stderr=True),
            expand=True
        )
        # Set up the log panel and the overall progress task
        self.log_messages = []
        self.main_task_id = self.progress.add_task(
            "[cyan]Overall progress",
            total=self.TOTAL_IMAGES,
            speed=""
        )

    def _update_log(self, message: str) -> None:
        """Update the log panel."""
        self.log_messages.append(message)
        # Keep only the most recent messages so the panel does not overflow
        if len(self.log_messages) > 18:
            self.log_messages.pop(0)
        log_group = Group(*self.log_messages)
        self.layout["PixivSpider"].update(
            Panel(
                log_group,
                title="PixivSpider",
                title_align="left",
                border_style="cyan",
                padding=(0, 1)
            )
        )

    def _setup_session(self) -> None:
        """Set up the request headers, including the Pixiv cookie."""
        cookie = self.redis.get_cookie()
        if not cookie:
            cookie = input('Please enter a cookie: ')
            self.redis.set_cookie(cookie)
        self.headers = PIXIV_CONFIG.headers.copy()
        self.headers['cookie'] = cookie

    def get_ranking_page(self, page: int) -> None:
        """
        Fetch a single page of ranking data.

        Args:
            page: page number (1-10)
        """
        params = {
            'mode': 'daily',
            'content': 'illust',
            'p': str(page),
            'format': 'json'
        }
        response = requests.get(
            PIXIV_CONFIG.top_url,
            params=params,
            headers=self.headers,
            verify=False
        )
        # Surface HTTP errors so run() can log this as a failed page
        response.raise_for_status()
        data = response.json()
        self.current_ranking_data = data['contents']

    def process_ranking_data(self) -> Generator[str, None, None]:
        """
        Process the current ranking data.

        Yields:
            str: work ID
        """
        for item in self.current_ranking_data:
            work_id = str(item['illust_id'])
            user_id = str(item['user_id'])
            # Remember which user each work belongs to
            self.redis.store_user_id(work_id, user_id)
            yield work_id

    def run(self) -> None:
        """Run the spider."""
        self._setup_session()
        downloader = PixivDownloader(self.headers, self.progress)
        # Live takes the console as a keyword-only argument
        with Live(self.layout, console=self.console, refresh_per_second=10):
            self.layout["progress"].update(self.progress)
            self._update_log('[cyan]Starting crawl...[/cyan]')
            # Process the ranking pages (10 pages x 50 works = 500 images)
            for page in range(1, 11):
                try:
                    self.get_ranking_page(page)
                    for work_id in self.process_ranking_data():
                        if not downloader.download_work(work_id):
                            self.failed_works.append(work_id)
                        self.progress.update(self.main_task_id, advance=1)
                except requests.RequestException as e:
                    self._update_log(f'[red]Error while fetching ranking page {page}: {e}[/red]')
                    continue
            # Remove the Redis records of works that failed to download
            for work_id in self.failed_works:
                self.redis.client.delete(work_id)
            self._update_log('[green]Spider run finished[/green]')
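

# Usage sketch (an assumption, not confirmed by the original file): the spider is
# presumably started from an entry-point script elsewhere in the repository, but
# running this module directly could look like the following. db=0 matches the
# default Redis database number documented in __init__.
if __name__ == '__main__':
    spider = PixivSpider(db=0)
    spider.run()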