Merge branch 'Dev' into 'master'

Dev

See merge request nyaasuki/doubanbookspider!2
岛风 2019-09-10 12:58:13 +00:00
commit 44a2ca7cad
9 changed files with 223 additions and 144 deletions

1
.gitignore vendored
View File

@@ -137,3 +137,4 @@ dmypy.json
 /results.csv
 .idea/dataSources.local.xml
 .gitignore
+.vscode/launch.json

122
DoubanSpider/Spider.py Normal file
View File

@@ -0,0 +1,122 @@
from DoubanSpider import *
from DoubanSpider.db import Douban, engine, Recording
from sqlalchemy.orm import sessionmaker

logger = logging.getLogger("PAPA")
sleeptime = random.randint(0, 3)


class DoubanBook(object):
    def __init__(self):
        self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        self.base_url = 'https://book.douban.com/tag/{}'  # ?start={}&type=T
        self.session = sessionmaker(engine)()
        self.headers = {
            'Referer': 'https://www.baidu.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/76.0.3809.132 Safari/537.36 '
        }
        self.log = logging.basicConfig(filename='papa.log',
                                       filemode='a',
                                       format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)

    def get_url(self, tag_name):
        """
        :param tag_name: 字符串格式 TAG名称
        :return:
        """
        for num in range(0, 10000, 20):
            time.sleep(sleeptime)
            url = self.base_url.format(tag_name) + f'?start={num}&type=T'
            print(f'正在获取 TAG<{tag_name}> 书籍信息', num)
            response = requests.get(url, headers=self.headers)
            html = response.content.decode()
            books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
            if not books_url:
                break
            for i in books_url:
                try:
                    self.session.add(Douban(tag=tag_name, url=i))
                    self.session.commit()
                except:
                    self.session.rollback()

    def get_tags(self):
        response = requests.get(self.main_url, headers=self.headers)
        html = response.content.decode()
        tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
        # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        #     executor.map(self.get_url, [i for i in tags])
        for i in tags:
            print(f'[Spider]正在获取<{i}>链接数据.....')
            time.sleep(0.5)
            self.get_url(i)

    # def get_books_url(self, urls, tag_name):
    #     response = requests.get(url, headers=self.headers)
    #     html = response.content.decode()
    #     books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
    #     self.get_data(books_url, tag_name)

    def get_data(self, row):
        """
        :param row: 数据库提取列表
        :return: 1.异常退出
        """
        time.sleep(sleeptime)
        print(f"正在解析:{row[0]}")
        response = requests.get(row[0], headers=self.headers)
        html = response.content.decode()
        try:
            name = re.findall('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)[0]
            author = re.findall('<meta name="keywords" content=".*?,(.*?),.*?', html)[0]
        except:
            logger.error(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
            return 1
        try:
            time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
        except:
            print(f'《{name}》未发现出版时间!')
            time_temp = 'N/A'
            logger.warning(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
        try:
            price = re.findall('<span class="pl">定价:</span> (.*?)<br/>.*?', html)[0]
        except:
            logger.warning(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <price>:{row[0]}")
            print(f'《{name}》未发现定价!')
            price = 'N/A'
        try:
            score = re.findall('<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', html)[0]
        except:
            logger.warning(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <score>:{row[0]}")
            print(f'《{name}》未发现评分!')
            score = 'N/A'
        try:
            intro = re.findall('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)[0]
            intro = (re.sub('\s', '', intro)).replace('<p>', '').replace('</p>', ' ')
        except:
            logger.warning(
                f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
            print(f'《{name}》未发现简介!')
            intro = ''
        data = [name, author, time_temp, price, score, row[1], intro]
        print(f'正在保存:{name}')
        self.save_csv(data)
        rec = self.session.query(Recording).filter_by(id=1).scalar()
        rec.data = row[2]
        self.session.commit()

    @staticmethod
    def save_csv(data):
        """
        :param data: 数据
        :return:
        """
        with open('results.csv', 'a', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(data)

DoubanSpider/__init__.py
View File

@@ -6,22 +6,27 @@ import os
 try:
     from sqlalchemy import create_engine, Column, Integer, String
     from sqlalchemy.ext.declarative import declarative_base
-    import requests
+    import random
     import re
     import time
     import csv
+    import requests
+    import time
     import sqlite3
     import logging
-    import random
 except:
     print('[System]正在安装支持库...')
-    os.system('pip install SQLAlchemy')
-    os.system('pip install sqlite')
-    os.system('pip install csv')
-    os.system('pip install requests')
-    os.system('pip install logging')
-    import requests
-    import csv
-    import logging
+    os.system(r'pip install -r .\DoubanSpider\requirements.txt')
     from sqlalchemy import create_engine, Column, Integer, String
     from sqlalchemy.ext.declarative import declarative_base
+    import random
+    import re
+    import time
+    import csv
+    import requests
+    import time
+    import sqlite3
+    import logging
+finally:
+    print('[System]运行库加载完毕!')

DoubanSpider/db.py
View File

@@ -16,6 +16,12 @@ class Douban(Base):
         return "<Douban(id='%d', tag='%s',url='%s')>" % (self.id, self.tag, self.url)
 
 
+class Recording(Base):
+    __tablename__ = 'Recording'
+    id = Column(Integer, primary_key=True)
+    data = Column(Integer, unique=True, nullable=False)
+
+
 if os.path.isfile('douban.db') is False:
     print('正在创建数据库...')
     Base.metadata.create_all()

BIN
DoubanSpider/douban.db Normal file

Binary file not shown.

8
DoubanSpider/requirements.txt Normal file
View File

@@ -0,0 +1,8 @@
certifi==2019.6.16
chardet==3.0.4
idna==2.8
requests==2.22.0
SQLAlchemy==1.3.8
sqlit==0.1.6
urllib3==1.25.3
wincertstore==0.2

134
Spider.py
View File

@@ -1,134 +0,0 @@
from DoubanSpider import *
from DoubanSpider.db import Douban, engine
from sqlalchemy.orm import sessionmaker


class DoubanBook(object):
    def __init__(self):
        self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        self.base_url = 'https://book.douban.com/tag/{}'  # ?start={}&type=T
        self.session = sessionmaker(engine)()
        self.headers = {
            'Referer': 'https://www.baidu.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/76.0.3809.132 Safari/537.36 '
        }
        self.log = logging.basicConfig(filename='papa.log',
                                       filemode='a',
                                       format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)

    def get_url(self, tag_name):
        for num in range(0, 10000, 20):
            time.sleep(sleeptime)
            url = self.base_url.format(tag_name) + f'?start={num}&type=T'
            print(f'正在获取 TAG<{tag_name}> 书籍信息', num)
            response = requests.get(url, headers=self.headers)
            html = response.content.decode()
            books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
            if not books_url:
                break
            for i in books_url:
                try:
                    self.session.add(Douban(tag=tag_name, url=i))
                    self.session.commit()
                except:
                    self.session.rollback()

    def get_tags(self):
        print('[SQL]未发现TAGS数据')
        print('[Spider]正在准备TAG数据这需要一定时间.....')
        do_not_get_all = input('[Spider]请选择运行模式:\n1.获取所有TAG需要大量时间\n2.获取单一TAG\n请输入对应数字,回车确定\n')
        if do_not_get_all == '1':
            response = requests.get(self.main_url, headers=self.headers)
            html = response.content.decode()
            tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
            # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            #     executor.map(self.get_url, [i for i in tags])
            for i in tags:
                print(f'[Spider]正在获取<{i}>链接数据.....')
                time.sleep(0.5)
                self.get_url(i)
        elif do_not_get_all == '2':
            user_tag = input('请输入标签:')
            self.get_url(user_tag)
            self.main()
        else:
            print("[Spider]输入有误,请重新输入!")
            self.get_tags()
        self.get_data()

    # def get_books_url(self, urls, tag_name):
    #     response = requests.get(url, headers=self.headers)
    #     html = response.content.decode()
    #     books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
    #     self.get_data(books_url, tag_name)

    def get_data(self):
        for row in self.session.query(Douban.url, Douban.tag).all():
            time.sleep(sleeptime)
            print(f"正在解析:{row[0]}")
            response = requests.get(row[0], headers=self.headers)
            html = response.content.decode()
            try:
                name = re.findall('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)[0]
                author = re.findall('<meta name="keywords" content=".*?,(.*?),.*?', html)[0]
            except:
                logger.error(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
                continue
            try:
                time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
            except:
                print(f'《{name}》未发现出版时间!')
                time_temp = 'N/A'
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
            try:
                price = re.findall('<span class="pl">定价:</span> (.*?)<br/>.*?', html)[0]
            except:
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <price>:{row[0]}")
                print(f'《{name}》未发现定价!')
                price = 'N/A'
            try:
                score = re.findall('<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', html)[0]
            except:
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <score>:{row[0]}")
                print(f'《{name}》未发现评分!')
                score = 'N/A'
            try:
                intro = re.findall('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)[0]
                intro = (re.sub('\s', '', intro)).replace('<p>', '').replace('</p>', ' ')
            except:
                logger.warning(
                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
                print(f'《{name}》未发现简介!')
                intro = ''
            data = [name, author, time_temp, price, score, row[1], intro]
            print(f'正在保存:{name}')
            self.save_csv(data)

    def save_csv(self, data):
        with open('results.csv', 'a', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(data)

    def main(self):
        n = self.session.query(Douban.url, Douban.tag).all()
        if not n:
            self.get_tags()
        else:
            print('[Spider]检测到现有TAG数据开始抓取...')
            self.get_data()


if __name__ == '__main__':
    logger = logging.getLogger("PAPA")
    sleeptime = random.randint(0, 3)
    with open("results.csv", "a", encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"])
    douban = DoubanBook()
    douban.main()

BIN
douban.db

Binary file not shown.

71
main.py Normal file
View File

@@ -0,0 +1,71 @@
import csv
import time
from cmd import Cmd
from DoubanSpider.db import Douban, Recording
from DoubanSpider.Spider import DoubanBook


class SpiderMain(Cmd):
    intro = '豆瓣图书爬虫V2.0 ---------- 输入help获取帮助。'

    def __init__(self):
        super().__init__()
        pass

    def do_help(self, arg):
        print('[Help] start - 开始爬取任务,默认从上次结束的地方开始')
        print('[Help] tag TAG - 添加/爬取 标签下的书籍TAG是你需要添加的标签')
        print('[Help] tag all - 爬取所有标签下的书籍')
        print('[Help] quit - 退出程序')

    def do_start(self, arg):
        for row in url_pool():
            douban.get_data(row)
        print('爬取结束!')

    def do_tag(self, arg):
        if arg == "all":
            print("[WAR]请注意,在没有代理池的情况下,此操作通常无法完成!")
            douban.get_tags()
            print('[Spider]标签下所有书籍信息爬取完成请输入start开始抓取数据')
        else:
            print(f"[Spider]开始获取{arg}标签下的所有书籍,这需要一定时间!")
            douban.get_url(arg)
            print('[Spider]标签下所有书籍信息爬取完成请输入start开始抓取数据')

    def do_quit(self, arg):
        exit()

    def main(self):
        self.cmdloop()


def url_pool():
    if not n:
        print('[Spider]你需要先获取tag数据!')
    else:
        for row in douban.session.query(Douban.url, Douban.tag, Douban.id).all():
            ago = douban.session.query(Recording.data).first()
            if row[2] > ago[0]:
                yield row


if __name__ == '__main__':
    with open("results.csv", "a", encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"])
    spider = SpiderMain()
    douban = DoubanBook()
    rec = douban.session.query(Recording.id).all()
    if not rec:
        douban.session.add(Recording(id=1, data=0))
        douban.session.commit()
    n = douban.session.query(Douban.url, Douban.tag).all()
    if not n:
        print('未检测到任何数据,请使用 tag 关键字获取标签数据输入help获取帮助。')
    else:
        print('检测到现有TAG数据输入start直接开始抓取...')
    spider.main()
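
For orientation only (this sketch is not part of the merge): a rough idea of how the pieces introduced here fit together, assuming this repository's DoubanSpider package is importable. It strings together the same calls that main.py's tag and start commands make; the tag name '小说' is just an example value.

# Minimal sketch, not part of the commit; uses only classes and methods shown in the diff above.
from DoubanSpider.db import Douban, Recording
from DoubanSpider.Spider import DoubanBook

douban = DoubanBook()

# main.py creates the single Recording row on first run; its `data` column
# holds the id of the last book that was parsed, which is the resume point.
if not douban.session.query(Recording).filter_by(id=1).first():
    douban.session.add(Recording(id=1, data=0))
    douban.session.commit()

# Collect book URLs for one tag into the Douban table (what `tag 小说` does).
douban.get_url('小说')

# Parse every book whose id is past the recorded resume point (what `start`
# does via url_pool()); get_data() advances the Recording row as it goes.
last_id = douban.session.query(Recording.data).first()[0]
for row in douban.session.query(Douban.url, Douban.tag, Douban.id).all():
    if row[2] > last_id:
        douban.get_data(row)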