# DoubanBookSpider/Spider.py

from DoubanSpider import *
from DoubanSpider.db import Douban, engine
from sqlalchemy.orm import sessionmaker
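
# NOTE: the star import above is assumed to re-export the modules this file
# uses directly (requests, re, time, logging, csv, random), and DoubanSpider/db.py
# is assumed to define a SQLAlchemy `engine` plus a `Douban` model with `tag`
# and `url` columns. A minimal sketch matching those assumptions (illustrative
# only, not the package's actual code):
#
#   # DoubanSpider/__init__.py
#   import requests, re, time, logging, csv, random
#
#   # DoubanSpider/db.py
#   from sqlalchemy import create_engine, Column, Integer, String
#   from sqlalchemy.ext.declarative import declarative_base
#
#   engine = create_engine('sqlite:///douban.db')
#   Base = declarative_base()
#
#   class Douban(Base):
#       __tablename__ = 'douban'
#       id = Column(Integer, primary_key=True)
#       tag = Column(String(50))
#       url = Column(String(200), unique=True)  # unique, so duplicate inserts fail
#
#   Base.metadata.create_all(engine)
#
# A unique constraint on `url` would explain the commit-per-row /
# rollback-on-error pattern in get_url() below.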


class DoubanBook(object):
    def __init__(self):
        self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        self.base_url = 'https://book.douban.com/tag/{}'  # ?start={}&type=T
        self.session = sessionmaker(engine)()
        self.headers = {
            'Referer': 'https://www.baidu.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/76.0.3809.132 Safari/537.36 '
        }
        # basicConfig() configures the root logger and returns None, so its
        # result is not kept; the module-level `logger` inherits this setup.
        logging.basicConfig(filename='papa.log',
                            filemode='a',
                            format='%(name)s - %(levelname)s - %(message)s',
                            level=logging.WARNING)

    def get_url(self, tag_name):
        # Douban paginates tag listings 20 books per page.
        for num in range(0, 10000, 20):
            time.sleep(sleeptime)
            url = self.base_url.format(tag_name) + f'?start={num}&type=T'
            print(f'Fetching books for tag <{tag_name}>, offset', num)
            response = requests.get(url, headers=self.headers)
            html = response.content.decode()
            books_url = re.findall('<a class="nbg" href="(.*?)"', html)
            if not books_url:
                # An empty page means we have paged past the last result.
                break
            for i in books_url:
                try:
                    self.session.add(Douban(tag=tag_name, url=i))
                    self.session.commit()
                except Exception:
                    # Most likely a duplicate URL; discard it and keep going.
                    self.session.rollback()

    def get_tags(self):
        print('[SQL] No tag data found.')
        print('[Spider] Preparing tag data; this can take a while...')
        do_not_get_all = input('[Spider] Choose a run mode:\n'
                               '1. Fetch every tag (takes a long time)\n'
                               '2. Fetch a single tag\n'
                               'Enter the number and press Enter\n')
        if do_not_get_all == '1':
            response = requests.get(self.main_url, headers=self.headers)
            html = response.content.decode()
            tags = re.findall('<a href="/tag/(.*?)">', html)
            # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            #     executor.map(self.get_url, [i for i in tags])
            for i in tags:
                print(f'[Spider] Fetching link data for <{i}>...')
                time.sleep(0.5)
                self.get_url(i)
        elif do_not_get_all == '2':
            user_tag = input('Enter a tag: ')
            self.get_url(user_tag)
        else:
            print('[Spider] Invalid input, please try again!')
            self.get_tags()
            return  # the recursive call has already finished the run
        self.get_data()

    # def get_books_url(self, urls, tag_name):
    #     response = requests.get(url, headers=self.headers)
    #     html = response.content.decode()
    #     books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
    #     self.get_data(books_url, tag_name)

    def get_data(self):
        for row in self.session.query(Douban.url, Douban.tag).all():
            time.sleep(sleeptime)
            print(f'Parsing: {row[0]}')
            response = requests.get(row[0], headers=self.headers)
            html = response.content.decode()
            try:
                name = re.findall('<span property="v:itemreviewed">(.*?)</span>', html)[0]
                author = re.findall('<meta name="keywords" content=".*?,(.*?),', html)[0]
            except Exception:
                logger.error(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
                continue
            try:
                # '出版年' is the literal "publication year" label on the page.
                time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>', html)[0]
            except Exception:
                print(f'"{name}": no publication date found!')
                time_temp = 'N/A'
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET <time>:{row[0]}")
            try:
                # '定价' is the literal "list price" label on the page.
                price = re.findall('<span class="pl">定价:</span> (.*?)<br/>', html)[0]
            except Exception:
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET <price>:{row[0]}")
                print(f'"{name}": no price found!')
                price = 'N/A'
            try:
                score = re.findall('<strong class="ll rating_num " property="v:average">(.*?)</strong>', html)[0]
            except Exception:
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET <score>:{row[0]}")
                print(f'"{name}": no rating found!')
                score = 'N/A'
            try:
                # '内容简介' is the literal "summary" heading on the page.
                intro = re.findall('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)[0]
                intro = re.sub(r'\s', '', intro).replace('<p>', '').replace('</p>', ' ')
            except Exception:
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET <intro>:{row[0]}")
                print(f'"{name}": no summary found!')
                intro = ''
            data = [name, author, time_temp, price, score, row[1], intro]
            print(f'Saving: {name}')
            self.save_csv(data)

    def save_csv(self, data):
        # newline='' keeps the csv module from writing blank rows on Windows.
        with open('results.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(data)

    def main(self):
        rows = self.session.query(Douban.url, Douban.tag).all()
        if not rows:
            self.get_tags()
        else:
            print('[Spider] Existing tag data detected, starting scrape...')
            self.get_data()


if __name__ == '__main__':
    logger = logging.getLogger("PAPA")
    # One delay value, drawn once at startup and reused between requests.
    sleeptime = random.randint(0, 3)
    with open('results.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # writerow() expects an iterable of fields; wrapping the timestamp in a
        # list writes it as one cell instead of one character per column.
        writer.writerow([time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())])
        writer.writerow(['Title', 'Author', 'Publication date', 'Price', 'Rating', 'Category', 'Summary'])
    douban = DoubanBook()
    douban.main()
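
# Reading the results back is straightforward with the stdlib csv module; a
# minimal sketch (the file name matches the one used by save_csv above):
#
#   import csv
#   with open('results.csv', encoding='utf-8', newline='') as f:
#       for row in csv.reader(f):
#           print(row)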