Compare commits

6 commits: 39a51fda8f, cb324261e9, 02b393ae94, 869b74e359, 44a2ca7cad, 1f1acbaca9

.gitignore (vendored): 1 addition

```diff
@@ -137,3 +137,4 @@ dmypy.json
 /results.csv
 .idea/dataSources.local.xml
 .gitignore
+.vscode/launch.json
```

DoubanSpider/Spider.py (new file, 122 lines)

```python
from DoubanSpider import *
from DoubanSpider.db import Douban, engine, Recording
from sqlalchemy.orm import sessionmaker

logger = logging.getLogger("PAPA")
sleeptime = random.randint(0, 3)


class DoubanBook(object):
    def __init__(self):
        self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        self.base_url = 'https://book.douban.com/tag/{}'  # ?start={}&type=T
        self.session = sessionmaker(engine)()
        self.headers = {
            'Referer': 'https://www.baidu.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/76.0.3809.132 Safari/537.36 '
        }
        self.log = logging.basicConfig(filename='papa.log',
                                       filemode='a',
                                       format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)

    def get_url(self, tag_name):
        """

        :param tag_name: 字符串格式 TAG名称
        :return:
        """
        for num in range(0, 10000, 20):
            time.sleep(sleeptime)
            url = self.base_url.format(tag_name) + f'?start={num}&type=T'
            print(f'正在获取 TAG:<{tag_name}> 书籍信息', num)
            response = requests.get(url, headers=self.headers)
            html = response.content.decode()
            books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
            if not books_url:
                break
            for i in books_url:
                try:
                    self.session.add(Douban(tag=tag_name, url=i))
                    self.session.commit()
                except:
                    self.session.rollback()

    def get_tags(self):
        response = requests.get(self.main_url, headers=self.headers)
        html = response.content.decode()
        tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
        # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        #     executor.map(self.get_url, [i for i in tags])
        for i in tags:
            print(f'[Spider]正在获取<{i}>链接数据.....')
            time.sleep(0.5)
            self.get_url(i)

    # def get_books_url(self, urls, tag_name):
    #     response = requests.get(url, headers=self.headers)
    #     html = response.content.decode()
    #     books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
    #     self.get_data(books_url, tag_name)

    def get_data(self, row):
        """
        :param row: 数据库提取列表
        :return: 1.异常退出
        """
        time.sleep(sleeptime)
        print(f"正在解析:{row[0]}")
        response = requests.get(row[0], headers=self.headers)
        html = response.content.decode()
        try:
            name = re.findall('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)[0]
            author = re.findall('<meta name="keywords" content=".*?,(.*?),.*?', html)[0]
        except:
            logger.error(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
            return 1
        try:
            time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
        except:
            print(f'《{name}》未发现出版时间!')
            time_temp = 'N/A'
            logger.warning(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
        try:
            price = re.findall('<span class="pl">定价:</span> (.*?)<br/>.*?', html)[0]
        except:
            logger.warning(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <price>:{row[0]}")
            print(f'《{name}》未发现定价!')
            price = 'N/A'
        try:
            score = re.findall('<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', html)[0]
        except:
            logger.warning(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <score>:{row[0]}")
            print(f'《{name}》未发现评分!')
            score = 'N/A'
        try:
            intro = re.findall('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)[0]
            intro = (re.sub('\s', '', intro)).replace('<p>', '').replace('</p>', ' ')
        except:
            logger.warning(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
            print(f'《{name}》未发现简介!')
            intro = '无'
        data = [name, author, time_temp, price, score, row[1], intro]
        print(f'正在保存:{name}。')
        self.save_csv(data)
        rec = self.session.query(Recording).filter_by(id=1).scalar()
        rec.data = row[2]
        self.session.commit()

    @staticmethod
    def save_csv(data):
        """
        :param data: 数据
        :return:
        """
        with open('results.csv', 'a', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(data)
```

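Compared with the deleted top-level Spider.py (below), `get_data` now takes a single database row instead of looping over the whole table itself: it expects `row` to be a `(url, tag, id)` tuple and, after each save, writes `row[2]` into the single `Recording` row as a resume checkpoint. A minimal driver sketch under that reading (the skip condition mirrors `url_pool()` in the new main.py):

```python
# Sketch: driving the new per-row get_data() with the Recording checkpoint.
# Assumes douban.db already holds Douban rows (collected via get_tags()/get_url()).
from DoubanSpider.db import Douban, Recording
from DoubanSpider.Spider import DoubanBook

spider = DoubanBook()
checkpoint = spider.session.query(Recording.data).first()[0]  # last saved Douban.id
for row in spider.session.query(Douban.url, Douban.tag, Douban.id).all():
    if row[2] > checkpoint:   # skip rows finished in an earlier run
        spider.get_data(row)  # saves one book, then sets Recording.data = row[2]
```
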
DoubanSpider/__init__.py

```diff
@@ -6,22 +6,27 @@ import os
 try:
     from sqlalchemy import create_engine, Column, Integer, String
     from sqlalchemy.ext.declarative import declarative_base
-    import requests
-    import random
     import re
     import time
     import csv
+    import requests
+    import time
     import sqlite3
     import logging
+    import random
 except:
     print('[System]正在安装支持库...')
-    os.system('pip install SQLAlchemy')
-    os.system('pip install sqlite')
-    os.system('pip install csv')
-    os.system('pip install requests')
-    os.system('pip install logging')
-    import requests
-    import csv
-    import logging
+    os.system(r'pip install -r .\DoubanSpider\requirements.txt')
+    from sqlalchemy import create_engine, Column, Integer, String
+    from sqlalchemy.ext.declarative import declarative_base
+    import random
+    import re
+    import time
+    import csv
+    import requests
+    import time
+    import sqlite3
+    import logging
+
 finally:
     print('[System]运行库加载完毕!')
```

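The rewritten bootstrap drops the per-package `os.system('pip install ...')` calls, two of which (`csv`, `logging`) name standard-library modules that ship with CPython, in favour of one install from the pinned requirements.txt, after which the imports are retried. A more conventional sketch of the same pattern, assuming the repository layout above:

```python
# Sketch: dependency bootstrap via requirements.txt (same idea, tidier form).
import os
import subprocess
import sys

try:
    import requests
    from sqlalchemy import create_engine
except ImportError:
    # install with the same interpreter that is running this script
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r',
                           os.path.join('DoubanSpider', 'requirements.txt')])
    import requests
    from sqlalchemy import create_engine
```
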
DoubanSpider/db.py

```diff
@@ -16,6 +16,12 @@ class Douban(Base):
         return "<Douban(id='%d', tag='%s',url='%s')>" % (self.id, self.tag, self.url)
 
 
+class Recording(Base):
+    __tablename__ = 'Recording'
+    id = Column(Integer, primary_key=True)
+    data = Column(Integer, unique=True, nullable=False)
+
+
 if os.path.isfile('douban.db') is False:
     print('正在创建数据库...')
     Base.metadata.create_all()
```

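The hunk shows only the tail of db.py. For orientation, a sketch of the declarations it implies; the SQLite URL and `Douban`'s table name are assumptions, while `engine`, `Base`, and the `id`/`tag`/`url` columns are confirmed by imports and queries elsewhere in the diff:

```python
# Sketch of the top of DoubanSpider/db.py implied by the hunk (assumed details marked).
import os
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine('sqlite:///douban.db')  # assumed URL; the douban.db file name is confirmed below
Base = declarative_base(bind=engine)           # bound, so Base.metadata.create_all() needs no engine argument


class Douban(Base):
    __tablename__ = 'Douban'  # assumed table name
    id = Column(Integer, primary_key=True)
    tag = Column(String)
    url = Column(String, unique=True)
```
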
DoubanSpider/requirements.txt (new file, 8 lines)

```
certifi==2019.6.16
chardet==3.0.4
idna==2.8
requests==2.22.0
SQLAlchemy==1.3.8
sqlit==0.1.6
urllib3==1.25.8
wincertstore==0.2
```

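Note that `sqlit==0.1.6` pins a third-party package unrelated to the standard-library `sqlite3` module the code actually imports; since `sqlite3` ships with CPython and needs no pip install, that entry looks accidental.
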
README.md

````diff
@@ -4,14 +4,14 @@
 ```shell
 git clone https://github.com/nyaasuki/DoubanBookSpider.git
 cd ./DoubanBookSpider
-python3 Spider.py
+python3 main.py
 ```
 ## Windows
 After setting up the environment,
 download and unzip https://github.com/nyaasuki/DoubanBookSpider/archive/master.zip
 and run it from a cmd prompt:
 ```DOS
-python path\DoubanBookSpider\Spider.py  #path is the folder where the files are stored
+python path\DoubanBookSpider\main.py  #path is the folder where the files are stored
 ```
 
 Or
@@ -22,4 +22,5 @@ __Drag the finished script file onto the current cursor position, then press Enter to run it__
 
 
 # Resetting the program
-Just delete Spider.db in the DoubanBookSpider directory
+Just delete douban.db in the DoubanBookSpider directory
+
````

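Since both entry points open `results.csv` in append mode, a full reset presumably also means deleting `results.csv` alongside `douban.db`; the file is already listed in .gitignore above.
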
Spider.py (deleted, 134 lines)

```python
from DoubanSpider import *
from DoubanSpider.db import Douban, engine
from sqlalchemy.orm import sessionmaker


class DoubanBook(object):
    def __init__(self):
        self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        self.base_url = 'https://book.douban.com/tag/{}'  # ?start={}&type=T
        self.session = sessionmaker(engine)()
        self.headers = {
            'Referer': 'https://www.baidu.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/76.0.3809.132 Safari/537.36 '
        }
        self.log = logging.basicConfig(filename='papa.log',
                                       filemode='a',
                                       format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)

    def get_url(self, tag_name):
        for num in range(0, 10000, 20):
            time.sleep(sleeptime)
            url = self.base_url.format(tag_name) + f'?start={num}&type=T'
            print(f'正在获取 TAG:<{tag_name}> 书籍信息', num)
            response = requests.get(url, headers=self.headers)
            html = response.content.decode()
            books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
            if not books_url:
                break
            for i in books_url:
                try:
                    self.session.add(Douban(tag=tag_name, url=i))
                    self.session.commit()
                except:
                    self.session.rollback()

    def get_tags(self):
        print('[SQL]未发现TAGS数据!')
        print('[Spider]正在准备TAG数据,这需要一定时间.....')
        do_not_get_all = input('[Spider]请选择运行模式:\n1.获取所有TAG(需要大量时间)\n2.获取单一TAG\n请输入对应数字,回车确定\n')
        if do_not_get_all == '1':
            response = requests.get(self.main_url, headers=self.headers)
            html = response.content.decode()
            tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
            # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            #     executor.map(self.get_url, [i for i in tags])
            for i in tags:
                print(f'[Spider]正在获取<{i}>链接数据.....')
                time.sleep(0.5)
                self.get_url(i)
        elif do_not_get_all == '2':
            user_tag = input('请输入标签:')
            self.get_url(user_tag)
            self.main()
        else:
            print("[Spider]输入有误,请重新输入!")
            self.get_tags()
        self.get_data()

    # def get_books_url(self, urls, tag_name):
    #     response = requests.get(url, headers=self.headers)
    #     html = response.content.decode()
    #     books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
    #     self.get_data(books_url, tag_name)

    def get_data(self):
        for row in self.session.query(Douban.url, Douban.tag).all():
            time.sleep(sleeptime)
            print(f"正在解析:{row[0]}")
            response = requests.get(row[0], headers=self.headers)
            html = response.content.decode()
            try:
                name = re.findall('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)[0]
                author = re.findall('<meta name="keywords" content=".*?,(.*?),.*?', html)[0]
            except:
                logger.error(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
                continue
            try:
                time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
            except:
                print(f'《{name}》未发现出版时间!')
                time_temp = 'N/A'
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
            try:
                price = re.findall('<span class="pl">定价:</span> (.*?)<br/>.*?', html)[0]
            except:
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <price>:{row[0]}")
                print(f'《{name}》未发现定价!')
                price = 'N/A'
            try:
                score = re.findall('<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', html)[0]
            except:
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <score>:{row[0]}")
                print(f'《{name}》未发现评分!')
                score = 'N/A'
            try:
                intro = re.findall('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)[0]
                intro = (re.sub('\s', '', intro)).replace('<p>', '').replace('</p>', ' ')
            except:
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
                print(f'《{name}》未发现简介!')
                intro = '无'
            data = [name, author, time_temp, price, score, row[1], intro]
            print(f'正在保存:{name}。')
            self.save_csv(data)

    def save_csv(self, data):
        with open('results.csv', 'a', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(data)

    def main(self):
        n = self.session.query(Douban.url, Douban.tag).all()
        if not n:
            self.get_tags()
        else:
            print('[Spider]检测到现有TAG数据,开始抓取...')
            self.get_data()


if __name__ == '__main__':
    logger = logging.getLogger("PAPA")
    sleeptime = random.randint(0,3)
    with open("results.csv", "a", encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"])
    douban = DoubanBook()
    douban.main()
```

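One quirk in this deleted entry point survives unchanged in the new main.py below: `writer.writerow()` is handed a bare string, and because `csv.writer` treats any iterable as a sequence of fields, the timestamp is written one character per column. The presumably intended call wraps it in a list:

```python
# write the run timestamp as a single CSV cell rather than one character per column
writer.writerow([time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())])
```
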
main.py (new file, 71 lines)

```python
import csv
import time
from cmd import Cmd

from DoubanSpider.db import Douban, Recording
from DoubanSpider.Spider import DoubanBook


class SpiderMain(Cmd):
    intro = '豆瓣图书爬虫V2.0 ---------- 输入help获取帮助。'

    def __init__(self):
        super().__init__()
        pass

    def do_help(self, arg):
        print('[Help] start - 开始爬取任务,默认从上次结束的地方开始')
        print('[Help] tag TAG - 添加/爬取 标签下的书籍,TAG是你需要添加的标签')
        print('[Help] tag all - 爬取所有标签下的书籍')
        print('[Help] quit - 退出程序')

    def do_start(self, arg):
        for row in url_pool():
            douban.get_data(row)
        print('爬取结束!')

    def do_tag(self, arg):
        if arg == "all":
            print("[WAR]请注意,在没有代理池的情况下,此操作通常无法完成!")
            douban.get_tags()
            print('[Spider]标签下所有书籍信息爬取完成!请输入start开始抓取数据!')
        else:
            print(f"[Spider]开始获取{arg}标签下的所有书籍,这需要一定时间!")
            douban.get_url(arg)
            print('[Spider]标签下所有书籍信息爬取完成!请输入start开始抓取数据!')

    def do_quit(self, arg):
        exit()

    def main(self):
        self.cmdloop()


def url_pool():
    m = douban.session.query(Douban.url, Douban.tag).all()
    if not m:
        print('[Spider]你需要先获取tag数据!')
    else:
        for row in douban.session.query(Douban.url, Douban.tag, Douban.id).all():
            ago = douban.session.query(Recording.data).first()
            if row[2] > ago[0]:
                yield row


if __name__ == '__main__':
    with open("results.csv", "a", encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"])
    spider = SpiderMain()
    douban = DoubanBook()
    rec = douban.session.query(Recording.id).all()
    if not rec:
        douban.session.add(Recording(id=1, data=0))
        douban.session.commit()
    n = douban.session.query(Douban.url, Douban.tag).all()
    if not n:
        print('未检测到任何数据,请使用 tag 关键字获取标签数据,输入help获取帮助。')
    else:
        print('检测到现有TAG数据,输入start直接开始抓取...')
    spider.main()
```

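Taken together, a hypothetical first run of the new Cmd-based entry point might look like this (output abbreviated; '小说' stands in for any Douban tag, and `(Cmd) ` is the `cmd` module's default prompt, since `SpiderMain` never overrides `prompt`):

```shell
$ python main.py
未检测到任何数据,请使用 tag 关键字获取标签数据,输入help获取帮助。
豆瓣图书爬虫V2.0 ---------- 输入help获取帮助。
(Cmd) tag 小说
[Spider]开始获取小说标签下的所有书籍,这需要一定时间!
(Cmd) start
爬取结束!
(Cmd) quit
```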