Compare commits

...

4 Commits

Author SHA1 Message Date
df1f74a6bf fix or get bug 2024-12-22 18:27:15 +08:00
a604050d6c fix ui 2024-12-22 17:45:19 +08:00
a6bf93ebbc new ui 2024-12-22 05:34:21 +08:00
99a9de90aa update 2024-12-20 19:50:47 +08:00
3 changed files with 144 additions and 71 deletions

View File

@ -4,6 +4,7 @@ import re
from typing import Optional, Union from typing import Optional, Union
import requests import requests
from rich.progress import Progress from rich.progress import Progress
from rich.console import Console
from config import PIXIV_CONFIG from config import PIXIV_CONFIG
from redis_client import RedisClient from redis_client import RedisClient
@ -11,19 +12,23 @@ from redis_client import RedisClient
class PixivDownloader: class PixivDownloader:
"""处理Pixiv图片下载""" """处理Pixiv图片下载"""
def __init__(self, headers: dict, progress: Progress): def __init__(self, spider, headers: dict, progress: Progress):
""" """
初始化下载器 初始化下载器
参数: 参数:
spider: PixivSpider实例用于日志更新
headers: 带cookie的请求头 headers: 带cookie的请求头
progress: Rich进度条实例 progress: Rich进度条实例
""" """
self.spider = spider
self.headers = headers self.headers = headers
self.progress = progress self.progress = progress
self.redis = RedisClient() self.redis = RedisClient()
# 用于追踪下载状态
self.work_status = {} # 记录每个作品的下载状态
def download_image(self, url: str) -> bool: def download_image(self, url: str, work_id: str = None) -> bool:
""" """
下载单张图片 下载单张图片
@ -41,8 +46,13 @@ class PixivDownloader:
illust_id, page_num, extension = match.groups() illust_id, page_num, extension = match.groups()
file_name = f"{illust_id}_p{page_num}.{extension}" file_name = f"{illust_id}_p{page_num}.{extension}"
# 检查是否已下载 # 检查文件是否已存在
if self.redis.is_image_downloaded(illust_id, page_num): file_path = f'./img/{file_name}'
if os.path.exists(file_path):
self.spider._update_log(f"[green]{file_name} 已存在![/green]")
# 确保Redis状态同步
if not self.redis.is_image_downloaded(illust_id, page_num):
self.redis.mark_image_downloaded(illust_id, page_num)
return True return True
# 确保下载目录存在 # 确保下载目录存在
@ -63,8 +73,9 @@ class PixivDownloader:
with open(f'./img/{file_name}', 'wb') as fp: with open(f'./img/{file_name}', 'wb') as fp:
fp.write(response.content) fp.write(response.content)
# 更新Redis记录 # 更新Redis记录并显示下载成功信息
self.redis.mark_image_downloaded(illust_id, page_num) self.redis.mark_image_downloaded(illust_id, page_num)
self.spider._update_log(f"[bold white]{file_name} 已下载![/bold white]")
# 更新总页数 # 更新总页数
total_pages = self.redis.get_total_pages(illust_id) total_pages = self.redis.get_total_pages(illust_id)
@ -81,9 +92,12 @@ class PixivDownloader:
return True return True
except requests.RequestException: except requests.RequestException as e:
if attempt == 2: # 最后一次尝试失败 if attempt == 2: # 最后一次尝试失败
error_msg = f"[red]下载失败(PID:{work_id}): {str(e)}[/red]"
self.spider._update_log(error_msg)
return False return False
self.spider._update_log(f"[yellow]重试下载(PID:{work_id}): 第{attempt + 1}次[/yellow]")
continue continue
return False return False
@ -100,6 +114,9 @@ class PixivDownloader:
""" """
# 跳过已完成的作品 # 跳过已完成的作品
if self.redis.is_work_complete(work_id): if self.redis.is_work_complete(work_id):
if work_id not in self.work_status:
self.spider._update_log(f"[green]作品(PID:{work_id})已完成下载[/green]")
self.work_status[work_id] = "complete"
return True return True
try: try:
@ -118,6 +135,7 @@ class PixivDownloader:
if not images: if not images:
return False return False
try:
# 下载每张图片 # 下载每张图片
if len(images) > 1: if len(images) > 1:
# 多图作品 # 多图作品
@ -127,13 +145,17 @@ class PixivDownloader:
) )
success = True success = True
for image in images: for idx, image in enumerate(images):
if 'urls' not in image or 'original' not in image['urls']: if 'urls' not in image or 'original' not in image['urls']:
self.spider._update_log(f"[red]图片{idx + 1}URL获取失败(PID:{work_id})[/red]")
success = False success = False
continue continue
if not self.download_image(image['urls']['original']): if not self.download_image(image['urls']['original'], work_id):
self.spider._update_log(f"[red]图片{idx + 1}下载失败(PID:{work_id})[/red]")
success = False success = False
else:
self.spider._update_log(f"[green]图片{idx + 1}/{len(images)}下载完成(PID:{work_id})[/green]")
self.progress.update(subtask_id, advance=1) self.progress.update(subtask_id, advance=1)
@ -143,8 +165,14 @@ class PixivDownloader:
else: else:
# 单图作品 # 单图作品
if 'urls' not in images[0] or 'original' not in images[0]['urls']: if 'urls' not in images[0] or 'original' not in images[0]['urls']:
self.spider._update_log(f"[red]URL获取失败(PID:{work_id})[/red]")
return False return False
return self.download_image(images[0]['urls']['original']) return self.download_image(images[0]['urls']['original'], work_id)
except (requests.RequestException, KeyError, ValueError): except Exception as e:
self.spider._update_log(f"[red]作品处理出错(PID:{work_id}): {str(e)}[/red]")
return False
except (requests.RequestException, KeyError, ValueError) as e:
self.spider._update_log(f"[red]作品信息获取失败(PID:{work_id}): {str(e)}[/red]")
return False return False

View File

@ -4,6 +4,7 @@ Pixiv爬虫 - 每日排行榜下载
""" """
from typing import Generator, List, Dict, Any from typing import Generator, List, Dict, Any
import requests import requests
import time
from rich.console import Console from rich.console import Console
from rich.progress import ( from rich.progress import (
Progress, Progress,
@ -48,57 +49,74 @@ class PixivSpider:
self.headers = None self.headers = None
self.current_ranking_data = [] self.current_ranking_data = []
self.failed_works = [] self.failed_works = []
self.log_messages = []
def _setup_ui(self) -> None: def _setup_ui(self) -> None:
"""设置Rich界面组件""" """设置Rich界面组件"""
# 创建布局 # 创建布局
self.layout = Layout() self.layout = Layout()
self.layout.split( self.layout.split(
Layout(name="PixivSpider", ratio=8), Layout(name="header", size=3),
Layout(name="progress", ratio=2) Layout(name="main", size=None),
Layout(name="progress", size=3)
) )
# 创建进度条 # 设置标题
self.layout["header"].update(
Panel("PixivSpider", style="bold magenta", border_style="bright_blue")
)
# 创建进度条 - 固定在底部
self.progress = Progress( self.progress = Progress(
SpinnerColumn(),
TextColumn("[bold blue]{task.description}"), TextColumn("[bold blue]{task.description}"),
BarColumn(bar_width=40), BarColumn(),
TaskProgressColumn(), TextColumn("{task.percentage:>3.0f}%"),
TextColumn("{task.fields[speed]}"), console=self.console,
console=Console(stderr=True), expand=True,
transient=False
)
# 设置进度条任务
self.main_task_id = self.progress.add_task(
"总体进度",
total=self.TOTAL_IMAGES
)
def _update_log(self, message: str, speed: float = 0.0) -> None:
"""更新日志信息"""
if not self.log_messages or message != self.log_messages[-1]:
self.log_messages.insert(0, message)
if len(self.log_messages) > 100:
self.log_messages = self.log_messages[:100]
messages = self.log_messages[:10]
# 清空控制台
self.console.clear()
# 重新渲染布局
self.console.print(self.layout)
# 更新日志面板
log_content = "\n".join(messages)
log_panel = Panel(
log_content,
title="下载状态",
subtitle=f"显示最新 {len(messages)}/{len(self.log_messages)} 条消息, 速度: {speed:.2f} t/s",
border_style="green",
padding=(1, 2),
expand=True expand=True
) )
self.layout["main"].update(log_panel)
# 设置日志面板
self.log_messages = []
self.main_task_id = self.progress.add_task(
"[cyan]总体进度",
total=self.TOTAL_IMAGES,
speed=""
)
def _update_log(self, message: str) -> None:
"""更新日志显示"""
self.log_messages.append(message)
if len(self.log_messages) > 18:
self.log_messages.pop(0)
log_group = Group(*self.log_messages)
self.layout["PixivSpider"].update(
Panel(
log_group,
title="PixivSpider",
title_align="left",
border_style="cyan",
padding=(0, 1)
)
)
def _setup_session(self) -> None: def _setup_session(self) -> None:
"""设置请求会话""" """设置请求会话"""
cookie = self.redis.get_cookie() cookie = self.redis.get_cookie()
if not cookie: if not cookie:
cookie = input('请输入一个cookie') cookie = input('请输入Pixiv Cookie ')
self.redis.set_cookie(cookie) self.redis.set_cookie(cookie)
self.headers = PIXIV_CONFIG.headers.copy() self.headers = PIXIV_CONFIG.headers.copy()
self.headers['cookie'] = cookie self.headers['cookie'] = cookie
@ -141,20 +159,47 @@ class PixivSpider:
def run(self) -> None: def run(self) -> None:
"""运行爬虫""" """运行爬虫"""
self._setup_session() self._setup_session()
downloader = PixivDownloader(self.headers, self.progress) downloader = PixivDownloader(self, self.headers, self.progress)
with Live(self.layout, self.console, refresh_per_second=10): with Live(self.layout, refresh_per_second=20, auto_refresh=True, console=self.console):
self.layout["progress"].update(self.progress) self.layout["progress"].update(self.progress)
self._update_log('[cyan]开始抓取...[/cyan]') self._update_log('[cyan]开始抓取...[/cyan]')
# 处理排行榜页面 # 处理排行榜页面
# 保存开始时间用于计算速度
start_time = time.time()
last_update_time = start_time
completed_works = 0
for page in range(1, 11): for page in range(1, 11):
try: try:
self.get_ranking_page(page) self.get_ranking_page(page)
for work_id in self.process_ranking_data(): for work_id in self.process_ranking_data():
if not downloader.download_work(work_id): if not downloader.download_work(work_id):
self.failed_works.append(work_id) self.failed_works.append(work_id)
self.progress.update(self.main_task_id, advance=1) else:
# 计算实际速度(作品/秒)
current_time = time.time()
elapsed_time = current_time - start_time
# 每秒更新一次速度
# 计算实际速度(作品/秒)
current_time = time.time()
elapsed_time = current_time - start_time
completed_works += 1
# 每次下载图片后更新速度
if elapsed_time > 0: # 避免除以零错误
speed = completed_works / elapsed_time
self._update_log(f"[cyan]已爬取[/cyan] {completed_works} [cyan]个页面![/cyan]")
# 更新进度
self.progress.update(
self.main_task_id,
completed=completed_works,
)
# 更新UI显示
self.layout["progress"].update(self.progress)
except requests.RequestException as e: except requests.RequestException as e:
self._update_log(f'[red]获取排行榜第{page}页时发生错误:{str(e)}[/red]') self._update_log(f'[red]获取排行榜第{page}页时发生错误:{str(e)}[/red]')
@ -162,6 +207,6 @@ class PixivSpider:
# 清理失败作品的记录 # 清理失败作品的记录
for work_id in self.failed_works: for work_id in self.failed_works:
self.redis.client.delete(work_id) self.redis.client().delete(work_id)
self._update_log('[green]爬虫运行完成[/green]') self._update_log('[green]爬虫运行完成[/green]')

View File

@ -1,4 +1,4 @@
redis==5.2.1 redis==5.2.1
requests==2.32.3 requests==2.32.3
rich==13.7.1 rich==13.9.4
urllib3<2.0.0 # 确保与requests兼容 urllib3<2.0.0 # 确保与requests兼容