Compare commits
4 commits: master...dev-ui-fix

| SHA1 |
|---|
| df1f74a6bf |
| a604050d6c |
| a6bf93ebbc |
| 99a9de90aa |
@@ -4,6 +4,7 @@ import re
 from typing import Optional, Union
 import requests
 from rich.progress import Progress
+from rich.console import Console
 
 from config import PIXIV_CONFIG
 from redis_client import RedisClient
@@ -11,19 +12,23 @@ from redis_client import RedisClient
 class PixivDownloader:
     """处理Pixiv图片下载"""
 
-    def __init__(self, headers: dict, progress: Progress):
+    def __init__(self, spider, headers: dict, progress: Progress):
         """
         初始化下载器
 
         参数:
+            spider: PixivSpider实例,用于日志更新
            headers: 带cookie的请求头
            progress: Rich进度条实例
         """
+        self.spider = spider
         self.headers = headers
         self.progress = progress
         self.redis = RedisClient()
+        # 用于追踪下载状态
+        self.work_status = {}  # 记录每个作品的下载状态
 
-    def download_image(self, url: str) -> bool:
+    def download_image(self, url: str, work_id: str = None) -> bool:
         """
         下载单张图片
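The constructor change threads a back-reference to the spider into the downloader, so each download step can surface status text through the spider's `_update_log`. A minimal sketch of that wiring, assuming only that the injected object exposes `_update_log` (the `SpiderStub` and `build_downloader` names are illustrative, not part of the diff):

```python
from rich.progress import Progress


class SpiderStub:
    """Hypothetical stand-in: anything exposing _update_log() satisfies the new parameter."""

    def __init__(self) -> None:
        self.messages: list = []

    def _update_log(self, message: str) -> None:
        # Collect status lines instead of rendering a Rich layout.
        self.messages.append(message)


def build_downloader(downloader_cls, cookie_headers: dict):
    """Wire a downloader the way run() now does; downloader_cls is the PixivDownloader class."""
    return downloader_cls(SpiderStub(), cookie_headers, Progress())
```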
@@ -41,8 +46,13 @@ class PixivDownloader:
         illust_id, page_num, extension = match.groups()
         file_name = f"{illust_id}_p{page_num}.{extension}"
 
-        # 检查是否已下载
-        if self.redis.is_image_downloaded(illust_id, page_num):
+        # 检查文件是否已存在
+        file_path = f'./img/{file_name}'
+        if os.path.exists(file_path):
+            self.spider._update_log(f"[green]{file_name} 已存在![/green]")
+            # 确保Redis状态同步
+            if not self.redis.is_image_downloaded(illust_id, page_num):
+                self.redis.mark_image_downloaded(illust_id, page_num)
             return True
 
         # 确保下载目录存在
@@ -63,8 +73,9 @@ class PixivDownloader:
                 with open(f'./img/{file_name}', 'wb') as fp:
                     fp.write(response.content)
 
-                # 更新Redis记录
+                # 更新Redis记录并显示下载成功信息
                 self.redis.mark_image_downloaded(illust_id, page_num)
+                self.spider._update_log(f"[bold white]{file_name} 已下载![/bold white]")
 
                 # 更新总页数
                 total_pages = self.redis.get_total_pages(illust_id)
@@ -81,9 +92,12 @@ class PixivDownloader:
                 return True
 
-            except requests.RequestException:
+            except requests.RequestException as e:
                 if attempt == 2:  # 最后一次尝试失败
+                    error_msg = f"[red]下载失败(PID:{work_id}): {str(e)}[/red]"
+                    self.spider._update_log(error_msg)
                     return False
+                self.spider._update_log(f"[yellow]重试下载(PID:{work_id}): 第{attempt + 1}次[/yellow]")
                 continue
 
         return False
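The retry branch now reports intermediate retries and the final failure separately. Stripped of the Pixiv specifics, this is a bounded retry loop where only the last attempt is allowed to fail loudly; a hedged sketch (function name, timeout, and messages are illustrative, not from the diff):

```python
from typing import Optional

import requests


def fetch_with_retries(url: str, headers: dict, attempts: int = 3) -> Optional[bytes]:
    """Bounded retry mirroring the diff's `attempt == 2` final-failure check."""
    for attempt in range(attempts):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.content
        except requests.RequestException as exc:
            if attempt == attempts - 1:  # last attempt: report and give up
                print(f"download failed: {exc}")
                return None
            print(f"attempt {attempt + 1} failed, retrying")
    return None
```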
@@ -100,6 +114,9 @@ class PixivDownloader:
         """
         # 跳过已完成的作品
         if self.redis.is_work_complete(work_id):
+            if work_id not in self.work_status:
+                self.spider._update_log(f"[green]作品(PID:{work_id})已完成下载[/green]")
+                self.work_status[work_id] = "complete"
             return True
 
         try:
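`work_status` exists so the "already complete" notice is only emitted the first time a finished work is encountered in a run. The idea reduces to a small memo keyed by work ID, sketched below with illustrative names (not part of the diff):

```python
class OnceLogger:
    """Remember which work IDs have already been reported, like the diff's work_status dict."""

    def __init__(self, log) -> None:
        self._log = log
        self._status: dict = {}

    def report_complete(self, work_id: str) -> None:
        if work_id not in self._status:
            self._log(f"work {work_id} is already complete")
            self._status[work_id] = "complete"
```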
@@ -118,6 +135,7 @@ class PixivDownloader:
             if not images:
                 return False
 
+            try:
                 # 下载每张图片
                 if len(images) > 1:
                     # 多图作品
@@ -127,13 +145,17 @@ class PixivDownloader:
                     )
 
                     success = True
-                    for image in images:
+                    for idx, image in enumerate(images):
                         if 'urls' not in image or 'original' not in image['urls']:
+                            self.spider._update_log(f"[red]图片{idx + 1}URL获取失败(PID:{work_id})[/red]")
                             success = False
                             continue
 
-                        if not self.download_image(image['urls']['original']):
+                        if not self.download_image(image['urls']['original'], work_id):
+                            self.spider._update_log(f"[red]图片{idx + 1}下载失败(PID:{work_id})[/red]")
                             success = False
+                        else:
+                            self.spider._update_log(f"[green]图片{idx + 1}/{len(images)}下载完成(PID:{work_id})[/green]")
 
                         self.progress.update(subtask_id, advance=1)
@@ -143,8 +165,14 @@ class PixivDownloader:
                 else:
                     # 单图作品
                     if 'urls' not in images[0] or 'original' not in images[0]['urls']:
+                        self.spider._update_log(f"[red]URL获取失败(PID:{work_id})[/red]")
                         return False
-                    return self.download_image(images[0]['urls']['original'])
+                    return self.download_image(images[0]['urls']['original'], work_id)
 
-        except (requests.RequestException, KeyError, ValueError):
+            except Exception as e:
+                self.spider._update_log(f"[red]作品处理出错(PID:{work_id}): {str(e)}[/red]")
+                return False
+
+        except (requests.RequestException, KeyError, ValueError) as e:
+            self.spider._update_log(f"[red]作品信息获取失败(PID:{work_id}): {str(e)}[/red]")
             return False
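Error handling in `download_work` is now split into two layers: the outer `except` still covers failures while fetching the work's metadata, while the new inner `try/except` catches anything that goes wrong once images are being processed, so each failure mode gets its own log line. A generic sketch of that nesting (the helper callables and messages are illustrative, not from the diff):

```python
import requests


def process_work(work_id: str, fetch_images, download_images, log) -> bool:
    """Outer except: metadata fetch problems. Inner except: problems while handling the images."""
    try:
        images = fetch_images(work_id)
        if not images:
            return False

        try:
            return download_images(images)
        except Exception as exc:
            log(f"error while processing work {work_id}: {exc}")
            return False

    except (requests.RequestException, KeyError, ValueError) as exc:
        log(f"failed to fetch info for work {work_id}: {exc}")
        return False
```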
pixiv_spider.py (117 changed lines)
@@ -4,6 +4,7 @@ Pixiv爬虫 - 每日排行榜下载
 """
 from typing import Generator, List, Dict, Any
 import requests
+import time
 from rich.console import Console
 from rich.progress import (
     Progress,
@@ -48,57 +49,74 @@ class PixivSpider:
         self.headers = None
         self.current_ranking_data = []
         self.failed_works = []
+        self.log_messages = []
 
     def _setup_ui(self) -> None:
         """设置Rich界面组件"""
         # 创建布局
         self.layout = Layout()
         self.layout.split(
-            Layout(name="PixivSpider", ratio=8),
-            Layout(name="progress", ratio=2)
+            Layout(name="header", size=3),
+            Layout(name="main", size=None),
+            Layout(name="progress", size=3)
         )
 
-        # 创建进度条
+        # 设置标题
+        self.layout["header"].update(
+            Panel("PixivSpider", style="bold magenta", border_style="bright_blue")
+        )
+
+        # 创建进度条 - 固定在底部
         self.progress = Progress(
+            SpinnerColumn(),
             TextColumn("[bold blue]{task.description}"),
-            BarColumn(bar_width=40),
-            TaskProgressColumn(),
-            TextColumn("{task.fields[speed]}"),
-            console=Console(stderr=True),
+            BarColumn(),
+            TextColumn("{task.percentage:>3.0f}%"),
+            console=self.console,
+            expand=True,
+            transient=False
+        )
+
+        # 设置进度条任务
+        self.main_task_id = self.progress.add_task(
+            "总体进度",
+            total=self.TOTAL_IMAGES
+        )
+
+    def _update_log(self, message: str, speed: float = 0.0) -> None:
+        """更新日志信息"""
+        if not self.log_messages or message != self.log_messages[-1]:
+            self.log_messages.insert(0, message)
+            if len(self.log_messages) > 100:
+                self.log_messages = self.log_messages[:100]
+
+        messages = self.log_messages[:10]
+
+        # 清空控制台
+        self.console.clear()
+
+        # 重新渲染布局
+        self.console.print(self.layout)
+
+        # 更新日志面板
+        log_content = "\n".join(messages)
+        log_panel = Panel(
+            log_content,
+            title="下载状态",
+            subtitle=f"显示最新 {len(messages)}/{len(self.log_messages)} 条消息, 速度: {speed:.2f} t/s",
+            border_style="green",
+            padding=(1, 2),
             expand=True
         )
+        self.layout["main"].update(log_panel)
 
-        # 设置日志面板
-        self.log_messages = []
-        self.main_task_id = self.progress.add_task(
-            "[cyan]总体进度",
-            total=self.TOTAL_IMAGES,
-            speed=""
-        )
-
-    def _update_log(self, message: str) -> None:
-        """更新日志显示"""
-        self.log_messages.append(message)
-        if len(self.log_messages) > 18:
-            self.log_messages.pop(0)
-        log_group = Group(*self.log_messages)
-        self.layout["PixivSpider"].update(
-            Panel(
-                log_group,
-                title="PixivSpider",
-                title_align="left",
-                border_style="cyan",
-                padding=(0, 1)
-            )
-        )
 
     def _setup_session(self) -> None:
         """设置请求会话"""
         cookie = self.redis.get_cookie()
         if not cookie:
-            cookie = input('请输入一个cookie:')
+            cookie = input('请输入Pixiv Cookie :')
             self.redis.set_cookie(cookie)
 
         self.headers = PIXIV_CONFIG.headers.copy()
         self.headers['cookie'] = cookie
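The reworked `_setup_ui` pins a three-row layout: a fixed-height header, a flexible main area that `_update_log` fills with the latest messages, and a fixed progress row at the bottom. A self-contained sketch of that arrangement under Rich's `Live` (panel names and sizes follow the diff; the loop body is a stand-in for real download events):

```python
import time

from rich.console import Console
from rich.layout import Layout
from rich.live import Live
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn

console = Console()

layout = Layout()
layout.split(
    Layout(name="header", size=3),
    Layout(name="main", size=None),
    Layout(name="progress", size=3),
)
layout["header"].update(Panel("PixivSpider", style="bold magenta", border_style="bright_blue"))

progress = Progress(
    SpinnerColumn(),
    TextColumn("[bold blue]{task.description}"),
    BarColumn(),
    TextColumn("{task.percentage:>3.0f}%"),
    console=console,
    expand=True,
)
task_id = progress.add_task("总体进度", total=5)
layout["progress"].update(progress)

with Live(layout, console=console, refresh_per_second=20):
    for i in range(5):
        # Stand-in for a real download event: refresh the log panel and advance the bar.
        layout["main"].update(Panel(f"downloaded item {i + 1}", title="下载状态", border_style="green"))
        progress.update(task_id, advance=1)
        time.sleep(0.5)
```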
@@ -141,20 +159,47 @@ class PixivSpider:
     def run(self) -> None:
         """运行爬虫"""
         self._setup_session()
-        downloader = PixivDownloader(self.headers, self.progress)
+        downloader = PixivDownloader(self, self.headers, self.progress)
 
-        with Live(self.layout, self.console, refresh_per_second=10):
+        with Live(self.layout, refresh_per_second=20, auto_refresh=True, console=self.console):
             self.layout["progress"].update(self.progress)
             self._update_log('[cyan]开始抓取...[/cyan]')
 
             # 处理排行榜页面
+            # 保存开始时间用于计算速度
+            start_time = time.time()
+            last_update_time = start_time
+            completed_works = 0
+
             for page in range(1, 11):
                 try:
                     self.get_ranking_page(page)
                     for work_id in self.process_ranking_data():
                         if not downloader.download_work(work_id):
                             self.failed_works.append(work_id)
-                        self.progress.update(self.main_task_id, advance=1)
+                        else:
+                            # 计算实际速度(作品/秒)
+                            current_time = time.time()
+                            elapsed_time = current_time - start_time
+
+                            # 每秒更新一次速度
+
+                            # 计算实际速度(作品/秒)
+                            current_time = time.time()
+                            elapsed_time = current_time - start_time
+                            completed_works += 1
+                            # 每次下载图片后更新速度
+                            if elapsed_time > 0:  # 避免除以零错误
+                                speed = completed_works / elapsed_time
+                                self._update_log(f"[cyan]已爬取[/cyan] {completed_works} [cyan]个页面![/cyan]")
+                            # 更新进度
+                            self.progress.update(
+                                self.main_task_id,
+                                completed=completed_works,
+                            )
+                            # 更新UI显示
+                            self.layout["progress"].update(self.progress)
 
                 except requests.RequestException as e:
                     self._update_log(f'[red]获取排行榜第{page}页时发生错误:{str(e)}[/red]')
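The new bookkeeping in `run` boils down to dividing completed works by elapsed wall-clock time, guarding against a zero denominator. A compact sketch of that calculation (the helper class is illustrative, not part of the diff):

```python
import time


class SpeedTracker:
    """Works-per-second since construction, mirroring the diff's speed formula."""

    def __init__(self) -> None:
        self.start_time = time.time()
        self.completed_works = 0

    def record(self) -> float:
        self.completed_works += 1
        elapsed = time.time() - self.start_time
        return self.completed_works / elapsed if elapsed > 0 else 0.0
```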
@@ -162,6 +207,6 @@ class PixivSpider:
         # 清理失败作品的记录
         for work_id in self.failed_works:
-            self.redis.client.delete(work_id)
+            self.redis.client().delete(work_id)
 
         self._update_log('[green]爬虫运行完成[/green]')
@@ -1,4 +1,4 @@
 redis==5.2.1
 requests==2.32.3
-rich==13.7.1
+rich==13.9.4
 urllib3<2.0.0 # 确保与requests兼容
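The only dependency change is the bump of rich from 13.7.1 to 13.9.4, which the reworked UI relies on. A small optional check of the pinned versions against the current environment (a convenience sketch, not part of the diff):

```python
from importlib.metadata import version

# Pins taken from the requirements hunk above.
expected = {"redis": "5.2.1", "requests": "2.32.3", "rich": "13.9.4"}
for package, pin in expected.items():
    installed = version(package)
    status = "ok" if installed == pin else f"expected {pin}"
    print(f"{package}: {installed} ({status})")
```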