Compare commits
4 Commits
master
...
dev-ui-fix
Author | SHA1 | Date | |
---|---|---|---|
df1f74a6bf | |||
a604050d6c | |||
a6bf93ebbc | |||
99a9de90aa |
@ -4,6 +4,7 @@ import re
|
||||
from typing import Optional, Union
|
||||
import requests
|
||||
from rich.progress import Progress
|
||||
from rich.console import Console
|
||||
|
||||
from config import PIXIV_CONFIG
|
||||
from redis_client import RedisClient
|
||||
@ -11,19 +12,23 @@ from redis_client import RedisClient
|
||||
class PixivDownloader:
|
||||
"""处理Pixiv图片下载"""
|
||||
|
||||
def __init__(self, headers: dict, progress: Progress):
|
||||
def __init__(self, spider, headers: dict, progress: Progress):
|
||||
"""
|
||||
初始化下载器
|
||||
|
||||
参数:
|
||||
spider: PixivSpider实例,用于日志更新
|
||||
headers: 带cookie的请求头
|
||||
progress: Rich进度条实例
|
||||
"""
|
||||
self.spider = spider
|
||||
self.headers = headers
|
||||
self.progress = progress
|
||||
self.redis = RedisClient()
|
||||
# 用于追踪下载状态
|
||||
self.work_status = {} # 记录每个作品的下载状态
|
||||
|
||||
def download_image(self, url: str) -> bool:
|
||||
def download_image(self, url: str, work_id: str = None) -> bool:
|
||||
"""
|
||||
下载单张图片
|
||||
|
||||
@ -41,8 +46,13 @@ class PixivDownloader:
|
||||
illust_id, page_num, extension = match.groups()
|
||||
file_name = f"{illust_id}_p{page_num}.{extension}"
|
||||
|
||||
# 检查是否已下载
|
||||
if self.redis.is_image_downloaded(illust_id, page_num):
|
||||
# 检查文件是否已存在
|
||||
file_path = f'./img/{file_name}'
|
||||
if os.path.exists(file_path):
|
||||
self.spider._update_log(f"[green]{file_name} 已存在![/green]")
|
||||
# 确保Redis状态同步
|
||||
if not self.redis.is_image_downloaded(illust_id, page_num):
|
||||
self.redis.mark_image_downloaded(illust_id, page_num)
|
||||
return True
|
||||
|
||||
# 确保下载目录存在
|
||||
@ -63,8 +73,9 @@ class PixivDownloader:
|
||||
with open(f'./img/{file_name}', 'wb') as fp:
|
||||
fp.write(response.content)
|
||||
|
||||
# 更新Redis记录
|
||||
# 更新Redis记录并显示下载成功信息
|
||||
self.redis.mark_image_downloaded(illust_id, page_num)
|
||||
self.spider._update_log(f"[bold white]{file_name} 已下载![/bold white]")
|
||||
|
||||
# 更新总页数
|
||||
total_pages = self.redis.get_total_pages(illust_id)
|
||||
@ -81,9 +92,12 @@ class PixivDownloader:
|
||||
|
||||
return True
|
||||
|
||||
except requests.RequestException:
|
||||
except requests.RequestException as e:
|
||||
if attempt == 2: # 最后一次尝试失败
|
||||
error_msg = f"[red]下载失败(PID:{work_id}): {str(e)}[/red]"
|
||||
self.spider._update_log(error_msg)
|
||||
return False
|
||||
self.spider._update_log(f"[yellow]重试下载(PID:{work_id}): 第{attempt + 1}次[/yellow]")
|
||||
continue
|
||||
|
||||
return False
|
||||
@ -100,6 +114,9 @@ class PixivDownloader:
|
||||
"""
|
||||
# 跳过已完成的作品
|
||||
if self.redis.is_work_complete(work_id):
|
||||
if work_id not in self.work_status:
|
||||
self.spider._update_log(f"[green]作品(PID:{work_id})已完成下载[/green]")
|
||||
self.work_status[work_id] = "complete"
|
||||
return True
|
||||
|
||||
try:
|
||||
@ -118,33 +135,44 @@ class PixivDownloader:
|
||||
if not images:
|
||||
return False
|
||||
|
||||
# 下载每张图片
|
||||
if len(images) > 1:
|
||||
# 多图作品
|
||||
subtask_id = self.progress.add_task(
|
||||
f"[yellow]PID:{work_id}",
|
||||
total=len(images)
|
||||
)
|
||||
|
||||
success = True
|
||||
for image in images:
|
||||
if 'urls' not in image or 'original' not in image['urls']:
|
||||
success = False
|
||||
continue
|
||||
|
||||
if not self.download_image(image['urls']['original']):
|
||||
success = False
|
||||
|
||||
self.progress.update(subtask_id, advance=1)
|
||||
try:
|
||||
# 下载每张图片
|
||||
if len(images) > 1:
|
||||
# 多图作品
|
||||
subtask_id = self.progress.add_task(
|
||||
f"[yellow]PID:{work_id}",
|
||||
total=len(images)
|
||||
)
|
||||
|
||||
self.progress.remove_task(subtask_id)
|
||||
return success
|
||||
success = True
|
||||
for idx, image in enumerate(images):
|
||||
if 'urls' not in image or 'original' not in image['urls']:
|
||||
self.spider._update_log(f"[red]图片{idx + 1}URL获取失败(PID:{work_id})[/red]")
|
||||
success = False
|
||||
continue
|
||||
|
||||
if not self.download_image(image['urls']['original'], work_id):
|
||||
self.spider._update_log(f"[red]图片{idx + 1}下载失败(PID:{work_id})[/red]")
|
||||
success = False
|
||||
else:
|
||||
self.spider._update_log(f"[green]图片{idx + 1}/{len(images)}下载完成(PID:{work_id})[/green]")
|
||||
|
||||
self.progress.update(subtask_id, advance=1)
|
||||
|
||||
self.progress.remove_task(subtask_id)
|
||||
return success
|
||||
|
||||
else:
|
||||
# 单图作品
|
||||
if 'urls' not in images[0] or 'original' not in images[0]['urls']:
|
||||
self.spider._update_log(f"[red]URL获取失败(PID:{work_id})[/red]")
|
||||
return False
|
||||
return self.download_image(images[0]['urls']['original'], work_id)
|
||||
|
||||
except Exception as e:
|
||||
self.spider._update_log(f"[red]作品处理出错(PID:{work_id}): {str(e)}[/red]")
|
||||
return False
|
||||
|
||||
else:
|
||||
# 单图作品
|
||||
if 'urls' not in images[0] or 'original' not in images[0]['urls']:
|
||||
return False
|
||||
return self.download_image(images[0]['urls']['original'])
|
||||
|
||||
except (requests.RequestException, KeyError, ValueError):
|
||||
except (requests.RequestException, KeyError, ValueError) as e:
|
||||
self.spider._update_log(f"[red]作品信息获取失败(PID:{work_id}): {str(e)}[/red]")
|
||||
return False
|
||||
|
119
pixiv_spider.py
119
pixiv_spider.py
@ -4,6 +4,7 @@ Pixiv爬虫 - 每日排行榜下载
|
||||
"""
|
||||
from typing import Generator, List, Dict, Any
|
||||
import requests
|
||||
import time
|
||||
from rich.console import Console
|
||||
from rich.progress import (
|
||||
Progress,
|
||||
@ -48,57 +49,74 @@ class PixivSpider:
|
||||
self.headers = None
|
||||
self.current_ranking_data = []
|
||||
self.failed_works = []
|
||||
self.log_messages = []
|
||||
|
||||
def _setup_ui(self) -> None:
|
||||
"""设置Rich界面组件"""
|
||||
# 创建布局
|
||||
self.layout = Layout()
|
||||
self.layout.split(
|
||||
Layout(name="PixivSpider", ratio=8),
|
||||
Layout(name="progress", ratio=2)
|
||||
Layout(name="header", size=3),
|
||||
Layout(name="main", size=None),
|
||||
Layout(name="progress", size=3)
|
||||
)
|
||||
|
||||
# 创建进度条
|
||||
# 设置标题
|
||||
self.layout["header"].update(
|
||||
Panel("PixivSpider", style="bold magenta", border_style="bright_blue")
|
||||
)
|
||||
|
||||
# 创建进度条 - 固定在底部
|
||||
self.progress = Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[bold blue]{task.description}"),
|
||||
BarColumn(bar_width=40),
|
||||
TaskProgressColumn(),
|
||||
TextColumn("{task.fields[speed]}"),
|
||||
console=Console(stderr=True),
|
||||
BarColumn(),
|
||||
TextColumn("{task.percentage:>3.0f}%"),
|
||||
console=self.console,
|
||||
expand=True,
|
||||
transient=False
|
||||
)
|
||||
|
||||
# 设置进度条任务
|
||||
self.main_task_id = self.progress.add_task(
|
||||
"总体进度",
|
||||
total=self.TOTAL_IMAGES
|
||||
)
|
||||
|
||||
def _update_log(self, message: str, speed: float = 0.0) -> None:
|
||||
"""更新日志信息"""
|
||||
if not self.log_messages or message != self.log_messages[-1]:
|
||||
self.log_messages.insert(0, message)
|
||||
if len(self.log_messages) > 100:
|
||||
self.log_messages = self.log_messages[:100]
|
||||
|
||||
messages = self.log_messages[:10]
|
||||
|
||||
# 清空控制台
|
||||
self.console.clear()
|
||||
|
||||
# 重新渲染布局
|
||||
self.console.print(self.layout)
|
||||
|
||||
# 更新日志面板
|
||||
log_content = "\n".join(messages)
|
||||
log_panel = Panel(
|
||||
log_content,
|
||||
title="下载状态",
|
||||
subtitle=f"显示最新 {len(messages)}/{len(self.log_messages)} 条消息, 速度: {speed:.2f} t/s",
|
||||
border_style="green",
|
||||
padding=(1, 2),
|
||||
expand=True
|
||||
)
|
||||
|
||||
# 设置日志面板
|
||||
self.log_messages = []
|
||||
self.main_task_id = self.progress.add_task(
|
||||
"[cyan]总体进度",
|
||||
total=self.TOTAL_IMAGES,
|
||||
speed=""
|
||||
)
|
||||
|
||||
def _update_log(self, message: str) -> None:
|
||||
"""更新日志显示"""
|
||||
self.log_messages.append(message)
|
||||
if len(self.log_messages) > 18:
|
||||
self.log_messages.pop(0)
|
||||
log_group = Group(*self.log_messages)
|
||||
self.layout["PixivSpider"].update(
|
||||
Panel(
|
||||
log_group,
|
||||
title="PixivSpider",
|
||||
title_align="left",
|
||||
border_style="cyan",
|
||||
padding=(0, 1)
|
||||
)
|
||||
)
|
||||
self.layout["main"].update(log_panel)
|
||||
|
||||
|
||||
def _setup_session(self) -> None:
|
||||
"""设置请求会话"""
|
||||
cookie = self.redis.get_cookie()
|
||||
if not cookie:
|
||||
cookie = input('请输入一个cookie:')
|
||||
cookie = input('请输入Pixiv Cookie :')
|
||||
self.redis.set_cookie(cookie)
|
||||
|
||||
self.headers = PIXIV_CONFIG.headers.copy()
|
||||
self.headers['cookie'] = cookie
|
||||
|
||||
@ -141,20 +159,47 @@ class PixivSpider:
|
||||
def run(self) -> None:
|
||||
"""运行爬虫"""
|
||||
self._setup_session()
|
||||
downloader = PixivDownloader(self.headers, self.progress)
|
||||
downloader = PixivDownloader(self, self.headers, self.progress)
|
||||
|
||||
with Live(self.layout, self.console, refresh_per_second=10):
|
||||
with Live(self.layout, refresh_per_second=20, auto_refresh=True, console=self.console):
|
||||
self.layout["progress"].update(self.progress)
|
||||
self._update_log('[cyan]开始抓取...[/cyan]')
|
||||
|
||||
# 处理排行榜页面
|
||||
# 保存开始时间用于计算速度
|
||||
start_time = time.time()
|
||||
last_update_time = start_time
|
||||
completed_works = 0
|
||||
|
||||
for page in range(1, 11):
|
||||
try:
|
||||
self.get_ranking_page(page)
|
||||
for work_id in self.process_ranking_data():
|
||||
if not downloader.download_work(work_id):
|
||||
self.failed_works.append(work_id)
|
||||
self.progress.update(self.main_task_id, advance=1)
|
||||
else:
|
||||
# 计算实际速度(作品/秒)
|
||||
current_time = time.time()
|
||||
elapsed_time = current_time - start_time
|
||||
|
||||
# 每秒更新一次速度
|
||||
|
||||
# 计算实际速度(作品/秒)
|
||||
current_time = time.time()
|
||||
elapsed_time = current_time - start_time
|
||||
completed_works += 1
|
||||
# 每次下载图片后更新速度
|
||||
if elapsed_time > 0: # 避免除以零错误
|
||||
speed = completed_works / elapsed_time
|
||||
self._update_log(f"[cyan]已爬取[/cyan] {completed_works} [cyan]个页面![/cyan]")
|
||||
# 更新进度
|
||||
self.progress.update(
|
||||
self.main_task_id,
|
||||
completed=completed_works,
|
||||
|
||||
)
|
||||
# 更新UI显示
|
||||
self.layout["progress"].update(self.progress)
|
||||
|
||||
except requests.RequestException as e:
|
||||
self._update_log(f'[red]获取排行榜第{page}页时发生错误:{str(e)}[/red]')
|
||||
@ -162,6 +207,6 @@ class PixivSpider:
|
||||
|
||||
# 清理失败作品的记录
|
||||
for work_id in self.failed_works:
|
||||
self.redis.client.delete(work_id)
|
||||
self.redis.client().delete(work_id)
|
||||
|
||||
self._update_log('[green]爬虫运行完成[/green]')
|
||||
|
@ -1,4 +1,4 @@
|
||||
redis==5.2.1
|
||||
requests==2.32.3
|
||||
rich==13.7.1
|
||||
rich==13.9.4
|
||||
urllib3<2.0.0 # 确保与requests兼容
|
||||
|
Loading…
Reference in New Issue
Block a user