diff --git a/pixiv_download.py b/pixiv_download.py index a5842a3..90809f3 100644 --- a/pixiv_download.py +++ b/pixiv_download.py @@ -4,6 +4,7 @@ import re from typing import Optional, Union import requests from rich.progress import Progress +from rich.console import Console from config import PIXIV_CONFIG from redis_client import RedisClient @@ -11,19 +12,23 @@ from redis_client import RedisClient class PixivDownloader: """处理Pixiv图片下载""" - def __init__(self, headers: dict, progress: Progress): + def __init__(self, spider, headers: dict, progress: Progress): """ 初始化下载器 参数: + spider: PixivSpider实例,用于日志更新 headers: 带cookie的请求头 progress: Rich进度条实例 """ + self.spider = spider self.headers = headers self.progress = progress self.redis = RedisClient() + # 用于追踪下载状态 + self.work_status = {} # 记录每个作品的下载状态 - def download_image(self, url: str) -> bool: + def download_image(self, url: str, work_id: str = None) -> bool: """ 下载单张图片 @@ -41,8 +46,13 @@ class PixivDownloader: illust_id, page_num, extension = match.groups() file_name = f"{illust_id}_p{page_num}.{extension}" - # 检查是否已下载 - if self.redis.is_image_downloaded(illust_id, page_num): + # 检查文件是否已存在 + file_path = f'./img/{file_name}' + if os.path.exists(file_path): + self.spider._update_log(f"[green]{file_name} 已存在![/green]") + # 确保Redis状态同步 + if not self.redis.is_image_downloaded(illust_id, page_num): + self.redis.mark_image_downloaded(illust_id, page_num) return True # 确保下载目录存在 @@ -63,8 +73,9 @@ class PixivDownloader: with open(f'./img/{file_name}', 'wb') as fp: fp.write(response.content) - # 更新Redis记录 + # 更新Redis记录并显示下载成功信息 self.redis.mark_image_downloaded(illust_id, page_num) + self.spider._update_log(f"[bold white]{file_name} 已下载![/bold white]") # 更新总页数 total_pages = self.redis.get_total_pages(illust_id) @@ -81,9 +92,12 @@ class PixivDownloader: return True - except requests.RequestException: + except requests.RequestException as e: if attempt == 2: # 最后一次尝试失败 + error_msg = f"[red]下载失败(PID:{work_id}): {str(e)}[/red]" + self.spider._update_log(error_msg) return False + self.spider._update_log(f"[yellow]重试下载(PID:{work_id}): 第{attempt + 1}次[/yellow]") continue return False @@ -100,6 +114,9 @@ class PixivDownloader: """ # 跳过已完成的作品 if self.redis.is_work_complete(work_id): + if work_id not in self.work_status: + self.spider._update_log(f"[green]作品(PID:{work_id})已完成下载[/green]") + self.work_status[work_id] = "complete" return True try: @@ -118,33 +135,44 @@ class PixivDownloader: if not images: return False - # 下载每张图片 - if len(images) > 1: - # 多图作品 - subtask_id = self.progress.add_task( - f"[yellow]PID:{work_id}", - total=len(images) - ) - - success = True - for image in images: - if 'urls' not in image or 'original' not in image['urls']: - success = False - continue - - if not self.download_image(image['urls']['original']): - success = False - - self.progress.update(subtask_id, advance=1) + try: + # 下载每张图片 + if len(images) > 1: + # 多图作品 + subtask_id = self.progress.add_task( + f"[yellow]PID:{work_id}", + total=len(images) + ) - self.progress.remove_task(subtask_id) - return success + success = True + for idx, image in enumerate(images): + if 'urls' not in image or 'original' not in image['urls']: + self.spider._update_log(f"[red]图片{idx + 1}URL获取失败(PID:{work_id})[/red]") + success = False + continue + + if not self.download_image(image['urls']['original'], work_id): + self.spider._update_log(f"[red]图片{idx + 1}下载失败(PID:{work_id})[/red]") + success = False + else: + self.spider._update_log(f"[green]图片{idx + 1}/{len(images)}下载完成(PID:{work_id})[/green]") + + self.progress.update(subtask_id, advance=1) + + self.progress.remove_task(subtask_id) + return success + + else: + # 单图作品 + if 'urls' not in images[0] or 'original' not in images[0]['urls']: + self.spider._update_log(f"[red]URL获取失败(PID:{work_id})[/red]") + return False + return self.download_image(images[0]['urls']['original'], work_id) + + except Exception as e: + self.spider._update_log(f"[red]作品处理出错(PID:{work_id}): {str(e)}[/red]") + return False - else: - # 单图作品 - if 'urls' not in images[0] or 'original' not in images[0]['urls']: - return False - return self.download_image(images[0]['urls']['original']) - - except (requests.RequestException, KeyError, ValueError): + except (requests.RequestException, KeyError, ValueError) as e: + self.spider._update_log(f"[red]作品信息获取失败(PID:{work_id}): {str(e)}[/red]") return False diff --git a/pixiv_spider.py b/pixiv_spider.py index 54b6107..d28c555 100644 --- a/pixiv_spider.py +++ b/pixiv_spider.py @@ -4,6 +4,7 @@ Pixiv爬虫 - 每日排行榜下载 """ from typing import Generator, List, Dict, Any import requests +import time from rich.console import Console from rich.progress import ( Progress, @@ -54,51 +55,77 @@ class PixivSpider: # 创建布局 self.layout = Layout() self.layout.split( - Layout(name="PixivSpider", ratio=8), - Layout(name="progress", ratio=2) + Layout(name="header", size=3), + Layout(name="main", size=None), + Layout(name="progress", size=3) ) - # 创建进度条 + # 设置标题 + self.layout["header"].update( + Panel("PixivSpider", style="bold magenta", border_style="bright_blue") + ) + + # 创建进度条 - 固定在底部 self.progress = Progress( + SpinnerColumn(), TextColumn("[bold blue]{task.description}"), - BarColumn(bar_width=40), - TaskProgressColumn(), - TextColumn("{task.fields[speed]}"), - console=Console(stderr=True), - expand=True + BarColumn(), + TextColumn("{task.percentage:>3.0f}%"), + TextColumn("[bold green]{task.fields[speed]}"), + console=self.console, + expand=True, + transient=False # 保持进度条显示 ) - # 设置日志面板 + # 设置日志面板 - 滚动显示在进度条上方 self.log_messages = [] + self.layout["main"].update( + Panel( + Group(*self.log_messages), + title="下载状态", + border_style="green" + ) + ) + + # 设置进度条任务 self.main_task_id = self.progress.add_task( - "[cyan]总体进度", + "总体进度", total=self.TOTAL_IMAGES, - speed="" + speed="0.00 t / 秒" ) def _update_log(self, message: str) -> None: """更新日志显示""" - self.log_messages.append(message) - if len(self.log_messages) > 18: - self.log_messages.pop(0) - log_group = Group(*self.log_messages) - self.layout["PixivSpider"].update( - Panel( - log_group, - title="PixivSpider", - title_align="left", - border_style="cyan", - padding=(0, 1) - ) - ) + # 检查是否为重复消息 + if not self.log_messages or message != self.log_messages[-1]: + self.log_messages.append(message) + if len(self.log_messages) > 100: + self.log_messages = self.log_messages[-100:] + + try: + # 构建消息显示,确保顺序正确 + messages = self.log_messages[-30:] # 只显示最新的30条 + + # 更新日志面板 + self.layout["main"].update( + Panel( + Group(*messages), + title="下载状态", + subtitle=f"显示最新 {len(messages)}/{len(self.log_messages)} 条消息", + border_style="green", + padding=(0, 1) + ) + ) + except Exception as e: + # 防止界面更新错误影响主程序 + print(f"界面更新出错: {e}") def _setup_session(self) -> None: """设置请求会话""" cookie = self.redis.get_cookie() if not cookie: - cookie = input('请输入一个cookie:') + cookie = input('请输入Pixiv Cookie :') self.redis.set_cookie(cookie) - self.headers = PIXIV_CONFIG.headers.copy() self.headers['cookie'] = cookie @@ -141,20 +168,39 @@ class PixivSpider: def run(self) -> None: """运行爬虫""" self._setup_session() - downloader = PixivDownloader(self.headers, self.progress) + downloader = PixivDownloader(self, self.headers, self.progress) - with Live(self.layout, self.console, refresh_per_second=10): + with Live(self.layout, refresh_per_second=20, auto_refresh=True, console=self.console): self.layout["progress"].update(self.progress) self._update_log('[cyan]开始抓取...[/cyan]') # 处理排行榜页面 + # 保存开始时间用于计算速度 + start_time = time.time() + completed_works = 0 + for page in range(1, 11): try: self.get_ranking_page(page) for work_id in self.process_ranking_data(): if not downloader.download_work(work_id): self.failed_works.append(work_id) - self.progress.update(self.main_task_id, advance=1) + else: + completed_works += 1 + # 计算实际速度(作品/秒) + elapsed_time = max(1, int(time.time() - start_time)) + speed = completed_works / elapsed_time + + # 更新进度和速度 + self.progress.update( + self.main_task_id, + completed=completed_works, # 使用绝对值而不是增量 + refresh=True, # 强制刷新显示 + ) + # 单独设置速度字段 + self.progress.tasks[self.main_task_id].fields["speed"] = f"{speed:.2f} t / 秒" + # 更新UI显示 + self.layout["progress"].update(self.progress) except requests.RequestException as e: self._update_log(f'[red]获取排行榜第{page}页时发生错误:{str(e)}[/red]')