This commit is contained in:
岛风 2024-12-22 05:34:21 +08:00
parent 99a9de90aa
commit a6bf93ebbc
2 changed files with 136 additions and 62 deletions

View File

@ -4,6 +4,7 @@ import re
from typing import Optional, Union
import requests
from rich.progress import Progress
from rich.console import Console
from config import PIXIV_CONFIG
from redis_client import RedisClient
@ -11,19 +12,23 @@ from redis_client import RedisClient
class PixivDownloader:
"""处理Pixiv图片下载"""
# NOTE(review): two signatures appear because this text is a rendered diff with
# its markers stripped; the body below assigns self.spider, so it pairs with the
# signature that takes `spider`.
def __init__(self, headers: dict, progress: Progress):
def __init__(self, spider, headers: dict, progress: Progress):
"""
Initialize the downloader.
Args:
spider: PixivSpider instance, used for log updates
headers: request headers that carry the cookie
progress: Rich progress bar instance
"""
self.spider = spider
self.headers = headers
self.progress = progress
self.redis = RedisClient()
# Used to track download state
self.work_status = {} # Records the download state of each work
def download_image(self, url: str) -> bool:
def download_image(self, url: str, work_id: str = None) -> bool:
"""
下载单张图片
@ -41,8 +46,13 @@ class PixivDownloader:
illust_id, page_num, extension = match.groups()
file_name = f"{illust_id}_p{page_num}.{extension}"
# 检查是否已下载
if self.redis.is_image_downloaded(illust_id, page_num):
# 检查文件是否已存在
file_path = f'./img/{file_name}'
if os.path.exists(file_path):
self.spider._update_log(f"[green]{file_name} 已存在![/green]")
# 确保Redis状态同步
if not self.redis.is_image_downloaded(illust_id, page_num):
self.redis.mark_image_downloaded(illust_id, page_num)
return True
# 确保下载目录存在
@ -63,8 +73,9 @@ class PixivDownloader:
with open(f'./img/{file_name}', 'wb') as fp:
fp.write(response.content)
# 更新Redis记录
# 更新Redis记录并显示下载成功信息
self.redis.mark_image_downloaded(illust_id, page_num)
self.spider._update_log(f"[bold white]{file_name} 已下载![/bold white]")
# 更新总页数
total_pages = self.redis.get_total_pages(illust_id)
@ -81,9 +92,12 @@ class PixivDownloader:
return True
except requests.RequestException:
except requests.RequestException as e:
if attempt == 2: # 最后一次尝试失败
error_msg = f"[red]下载失败(PID:{work_id}): {str(e)}[/red]"
self.spider._update_log(error_msg)
return False
self.spider._update_log(f"[yellow]重试下载(PID:{work_id}): 第{attempt + 1}次[/yellow]")
continue
return False
@ -100,6 +114,9 @@ class PixivDownloader:
"""
# 跳过已完成的作品
if self.redis.is_work_complete(work_id):
if work_id not in self.work_status:
self.spider._update_log(f"[green]作品(PID:{work_id})已完成下载[/green]")
self.work_status[work_id] = "complete"
return True
try:
@ -118,6 +135,7 @@ class PixivDownloader:
if not images:
return False
try:
# 下载每张图片
if len(images) > 1:
# 多图作品
@ -127,13 +145,17 @@ class PixivDownloader:
)
success = True
for image in images:
for idx, image in enumerate(images):
if 'urls' not in image or 'original' not in image['urls']:
self.spider._update_log(f"[red]图片{idx + 1}URL获取失败(PID:{work_id})[/red]")
success = False
continue
if not self.download_image(image['urls']['original']):
if not self.download_image(image['urls']['original'], work_id):
self.spider._update_log(f"[red]图片{idx + 1}下载失败(PID:{work_id})[/red]")
success = False
else:
self.spider._update_log(f"[green]图片{idx + 1}/{len(images)}下载完成(PID:{work_id})[/green]")
self.progress.update(subtask_id, advance=1)
@ -143,8 +165,14 @@ class PixivDownloader:
else:
# 单图作品
if 'urls' not in images[0] or 'original' not in images[0]['urls']:
self.spider._update_log(f"[red]URL获取失败(PID:{work_id})[/red]")
return False
return self.download_image(images[0]['urls']['original'])
return self.download_image(images[0]['urls']['original'], work_id)
except (requests.RequestException, KeyError, ValueError):
except Exception as e:
self.spider._update_log(f"[red]作品处理出错(PID:{work_id}): {str(e)}[/red]")
return False
except (requests.RequestException, KeyError, ValueError) as e:
self.spider._update_log(f"[red]作品信息获取失败(PID:{work_id}): {str(e)}[/red]")
return False

View File

@ -4,6 +4,7 @@ Pixiv爬虫 - 每日排行榜下载
"""
from typing import Generator, List, Dict, Any
import requests
import time
from rich.console import Console
from rich.progress import (
Progress,
@ -54,51 +55,77 @@ class PixivSpider:
# 创建布局
self.layout = Layout()
self.layout.split(
Layout(name="PixivSpider", ratio=8),
Layout(name="progress", ratio=2)
Layout(name="header", size=3),
Layout(name="main", size=None),
Layout(name="progress", size=3)
)
# 创建进度条
# 设置标题
self.layout["header"].update(
Panel("PixivSpider", style="bold magenta", border_style="bright_blue")
)
# 创建进度条 - 固定在底部
self.progress = Progress(
SpinnerColumn(),
TextColumn("[bold blue]{task.description}"),
BarColumn(bar_width=40),
TaskProgressColumn(),
TextColumn("{task.fields[speed]}"),
console=Console(stderr=True),
expand=True
BarColumn(),
TextColumn("{task.percentage:>3.0f}%"),
TextColumn("[bold green]{task.fields[speed]}"),
console=self.console,
expand=True,
transient=False # 保持进度条显示
)
# 设置日志面板
# 设置日志面板 - 滚动显示在进度条上方
self.log_messages = []
self.layout["main"].update(
Panel(
Group(*self.log_messages),
title="下载状态",
border_style="green"
)
)
# 设置进度条任务
self.main_task_id = self.progress.add_task(
"[cyan]总体进度",
"总体进度",
total=self.TOTAL_IMAGES,
speed=""
speed="0.00 t / 秒"
)
# NOTE(review): this span interleaves two versions of the method body (rendered
# diff, indentation stripped), so the if/try nesting cannot be recovered from
# this view — confirm against the real file before relying on it.
def _update_log(self, message: str) -> None:
"""Update the log display."""
# Check whether this is a duplicate of the most recent message
if not self.log_messages or message != self.log_messages[-1]:
self.log_messages.append(message)
if len(self.log_messages) > 18:
self.log_messages.pop(0)
log_group = Group(*self.log_messages)
self.layout["PixivSpider"].update(
if len(self.log_messages) > 100:
self.log_messages = self.log_messages[-100:]
try:
# Build the message view, keeping the order correct
messages = self.log_messages[-30:] # Show only the latest 30 entries
# Update the log panel
self.layout["main"].update(
Panel(
log_group,
title="PixivSpider",
title_align="left",
border_style="cyan",
Group(*messages),
title="下载状态",
subtitle=f"显示最新 {len(messages)}/{len(self.log_messages)} 条消息",
border_style="green",
padding=(0, 1)
)
)
except Exception as e:
# Keep UI update errors from affecting the main program
print(f"界面更新出错: {e}")
def _setup_session(self) -> None:
"""设置请求会话"""
cookie = self.redis.get_cookie()
if not cookie:
cookie = input('请输入一个cookie')
cookie = input('请输入Pixiv Cookie ')
self.redis.set_cookie(cookie)
self.headers = PIXIV_CONFIG.headers.copy()
self.headers['cookie'] = cookie
@ -141,20 +168,39 @@ class PixivSpider:
def run(self) -> None:
"""运行爬虫"""
self._setup_session()
downloader = PixivDownloader(self.headers, self.progress)
downloader = PixivDownloader(self, self.headers, self.progress)
with Live(self.layout, self.console, refresh_per_second=10):
with Live(self.layout, refresh_per_second=20, auto_refresh=True, console=self.console):
self.layout["progress"].update(self.progress)
self._update_log('[cyan]开始抓取...[/cyan]')
# 处理排行榜页面
# 保存开始时间用于计算速度
start_time = time.time()
completed_works = 0
for page in range(1, 11):
try:
self.get_ranking_page(page)
for work_id in self.process_ranking_data():
if not downloader.download_work(work_id):
self.failed_works.append(work_id)
self.progress.update(self.main_task_id, advance=1)
else:
completed_works += 1
# 计算实际速度(作品/秒)
elapsed_time = max(1, int(time.time() - start_time))
speed = completed_works / elapsed_time
# 更新进度和速度
self.progress.update(
self.main_task_id,
completed=completed_works, # 使用绝对值而不是增量
refresh=True, # 强制刷新显示
)
# 单独设置速度字段
self.progress.tasks[self.main_task_id].fields["speed"] = f"{speed:.2f} t / 秒"
# 更新UI显示
self.layout["progress"].update(self.progress)
except requests.RequestException as e:
self._update_log(f'[red]获取排行榜第{page}页时发生错误:{str(e)}[/red]')