Compare commits

..

4 Commits

Author SHA1 Message Date
df1f74a6bf fix or get bug 2024-12-22 18:27:15 +08:00
a604050d6c fix ui 2024-12-22 17:45:19 +08:00
a6bf93ebbc new ui 2024-12-22 05:34:21 +08:00
99a9de90aa update 2024-12-20 19:50:47 +08:00
3 changed files with 144 additions and 71 deletions

View File

@@ -4,6 +4,7 @@ import re
from typing import Optional, Union
import requests
from rich.progress import Progress
from rich.console import Console
from config import PIXIV_CONFIG
from redis_client import RedisClient
@@ -11,19 +12,23 @@ from redis_client import RedisClient
class PixivDownloader:
"""处理Pixiv图片下载"""
def __init__(self, headers: dict, progress: Progress):
def __init__(self, spider, headers: dict, progress: Progress):
"""
初始化下载器
参数:
spider: PixivSpider实例用于日志更新
headers: 带cookie的请求头
progress: Rich进度条实例
"""
self.spider = spider
self.headers = headers
self.progress = progress
self.redis = RedisClient()
# 用于追踪下载状态
self.work_status = {} # 记录每个作品的下载状态
def download_image(self, url: str) -> bool:
def download_image(self, url: str, work_id: str = None) -> bool:
"""
下载单张图片
@@ -41,8 +46,13 @@ class PixivDownloader:
illust_id, page_num, extension = match.groups()
file_name = f"{illust_id}_p{page_num}.{extension}"
# 检查是否已下载
if self.redis.is_image_downloaded(illust_id, page_num):
# 检查文件是否已存在
file_path = f'./img/{file_name}'
if os.path.exists(file_path):
self.spider._update_log(f"[green]{file_name} 已存在![/green]")
# 确保Redis状态同步
if not self.redis.is_image_downloaded(illust_id, page_num):
self.redis.mark_image_downloaded(illust_id, page_num)
return True
# 确保下载目录存在
@@ -63,8 +73,9 @@ class PixivDownloader:
with open(f'./img/{file_name}', 'wb') as fp:
fp.write(response.content)
# 更新Redis记录
# 更新Redis记录并显示下载成功信息
self.redis.mark_image_downloaded(illust_id, page_num)
self.spider._update_log(f"[bold white]{file_name} 已下载![/bold white]")
# 更新总页数
total_pages = self.redis.get_total_pages(illust_id)
@@ -81,9 +92,12 @@ class PixivDownloader:
return True
except requests.RequestException:
except requests.RequestException as e:
if attempt == 2: # 最后一次尝试失败
error_msg = f"[red]下载失败(PID:{work_id}): {str(e)}[/red]"
self.spider._update_log(error_msg)
return False
self.spider._update_log(f"[yellow]重试下载(PID:{work_id}): 第{attempt + 1}次[/yellow]")
continue
return False
@@ -100,6 +114,9 @@ class PixivDownloader:
"""
# 跳过已完成的作品
if self.redis.is_work_complete(work_id):
if work_id not in self.work_status:
self.spider._update_log(f"[green]作品(PID:{work_id})已完成下载[/green]")
self.work_status[work_id] = "complete"
return True
try:
@@ -118,33 +135,44 @@ class PixivDownloader:
if not images:
return False
# 下载每张图片
if len(images) > 1:
# 多图作品
subtask_id = self.progress.add_task(
f"[yellow]PID:{work_id}",
total=len(images)
)
try:
# 下载每张图片
if len(images) > 1:
# 多图作品
subtask_id = self.progress.add_task(
f"[yellow]PID:{work_id}",
total=len(images)
)
success = True
for image in images:
if 'urls' not in image or 'original' not in image['urls']:
success = False
continue
success = True
for idx, image in enumerate(images):
if 'urls' not in image or 'original' not in image['urls']:
self.spider._update_log(f"[red]图片{idx + 1}URL获取失败(PID:{work_id})[/red]")
success = False
continue
if not self.download_image(image['urls']['original']):
success = False
if not self.download_image(image['urls']['original'], work_id):
self.spider._update_log(f"[red]图片{idx + 1}下载失败(PID:{work_id})[/red]")
success = False
else:
self.spider._update_log(f"[green]图片{idx + 1}/{len(images)}下载完成(PID:{work_id})[/green]")
self.progress.update(subtask_id, advance=1)
self.progress.update(subtask_id, advance=1)
self.progress.remove_task(subtask_id)
return success
self.progress.remove_task(subtask_id)
return success
else:
# 单图作品
if 'urls' not in images[0] or 'original' not in images[0]['urls']:
return False
return self.download_image(images[0]['urls']['original'])
else:
# 单图作品
if 'urls' not in images[0] or 'original' not in images[0]['urls']:
self.spider._update_log(f"[red]URL获取失败(PID:{work_id})[/red]")
return False
return self.download_image(images[0]['urls']['original'], work_id)
except (requests.RequestException, KeyError, ValueError):
except Exception as e:
self.spider._update_log(f"[red]作品处理出错(PID:{work_id}): {str(e)}[/red]")
return False
except (requests.RequestException, KeyError, ValueError) as e:
self.spider._update_log(f"[red]作品信息获取失败(PID:{work_id}): {str(e)}[/red]")
return False

View File

@@ -4,6 +4,7 @@ Pixiv爬虫 - 每日排行榜下载
"""
from typing import Generator, List, Dict, Any
import requests
import time
from rich.console import Console
from rich.progress import (
Progress,
@@ -48,57 +49,74 @@ class PixivSpider:
self.headers = None
self.current_ranking_data = []
self.failed_works = []
self.log_messages = []
def _setup_ui(self) -> None:
"""设置Rich界面组件"""
# 创建布局
self.layout = Layout()
self.layout.split(
Layout(name="PixivSpider", ratio=8),
Layout(name="progress", ratio=2)
Layout(name="header", size=3),
Layout(name="main", size=None),
Layout(name="progress", size=3)
)
# 创建进度条
# 设置标题
self.layout["header"].update(
Panel("PixivSpider", style="bold magenta", border_style="bright_blue")
)
# 创建进度条 - 固定在底部
self.progress = Progress(
SpinnerColumn(),
TextColumn("[bold blue]{task.description}"),
BarColumn(bar_width=40),
TaskProgressColumn(),
TextColumn("{task.fields[speed]}"),
console=Console(stderr=True),
BarColumn(),
TextColumn("{task.percentage:>3.0f}%"),
console=self.console,
expand=True,
transient=False
)
# 设置进度条任务
self.main_task_id = self.progress.add_task(
"总体进度",
total=self.TOTAL_IMAGES
)
def _update_log(self, message: str, speed: float = 0.0) -> None:
"""更新日志信息"""
if not self.log_messages or message != self.log_messages[-1]:
self.log_messages.insert(0, message)
if len(self.log_messages) > 100:
self.log_messages = self.log_messages[:100]
messages = self.log_messages[:10]
# 清空控制台
self.console.clear()
# 重新渲染布局
self.console.print(self.layout)
# 更新日志面板
log_content = "\n".join(messages)
log_panel = Panel(
log_content,
title="下载状态",
subtitle=f"显示最新 {len(messages)}/{len(self.log_messages)} 条消息, 速度: {speed:.2f} t/s",
border_style="green",
padding=(1, 2),
expand=True
)
self.layout["main"].update(log_panel)
# 设置日志面板
self.log_messages = []
self.main_task_id = self.progress.add_task(
"[cyan]总体进度",
total=self.TOTAL_IMAGES,
speed=""
)
def _update_log(self, message: str) -> None:
"""更新日志显示"""
self.log_messages.append(message)
if len(self.log_messages) > 18:
self.log_messages.pop(0)
log_group = Group(*self.log_messages)
self.layout["PixivSpider"].update(
Panel(
log_group,
title="PixivSpider",
title_align="left",
border_style="cyan",
padding=(0, 1)
)
)
def _setup_session(self) -> None:
"""设置请求会话"""
cookie = self.redis.get_cookie()
if not cookie:
cookie = input('请输入一个cookie')
cookie = input('请输入Pixiv Cookie ')
self.redis.set_cookie(cookie)
self.headers = PIXIV_CONFIG.headers.copy()
self.headers['cookie'] = cookie
@@ -141,20 +159,47 @@ class PixivSpider:
def run(self) -> None:
"""运行爬虫"""
self._setup_session()
downloader = PixivDownloader(self.headers, self.progress)
downloader = PixivDownloader(self, self.headers, self.progress)
with Live(self.layout, self.console, refresh_per_second=10):
with Live(self.layout, refresh_per_second=20, auto_refresh=True, console=self.console):
self.layout["progress"].update(self.progress)
self._update_log('[cyan]开始抓取...[/cyan]')
# 处理排行榜页面
# 保存开始时间用于计算速度
start_time = time.time()
last_update_time = start_time
completed_works = 0
for page in range(1, 11):
try:
self.get_ranking_page(page)
for work_id in self.process_ranking_data():
if not downloader.download_work(work_id):
self.failed_works.append(work_id)
self.progress.update(self.main_task_id, advance=1)
else:
# 计算实际速度(作品/秒)
current_time = time.time()
elapsed_time = current_time - start_time
# 每秒更新一次速度
# 计算实际速度(作品/秒)
current_time = time.time()
elapsed_time = current_time - start_time
completed_works += 1
# 每次下载图片后更新速度
if elapsed_time > 0: # 避免除以零错误
speed = completed_works / elapsed_time
self._update_log(f"[cyan]已爬取[/cyan] {completed_works} [cyan]个页面![/cyan]")
# 更新进度
self.progress.update(
self.main_task_id,
completed=completed_works,
)
# 更新UI显示
self.layout["progress"].update(self.progress)
except requests.RequestException as e:
self._update_log(f'[red]获取排行榜第{page}页时发生错误:{str(e)}[/red]')
@@ -162,6 +207,6 @@ class PixivSpider:
# 清理失败作品的记录
for work_id in self.failed_works:
self.redis.client.delete(work_id)
self.redis.client().delete(work_id)
self._update_log('[green]爬虫运行完成[/green]')

View File

@@ -1,4 +1,4 @@
redis==5.2.1
requests==2.32.3
rich==13.7.1
rich==13.9.4
urllib3<2.0.0 # 确保与requests兼容