release 2.0

岛风 2019-09-10 20:49:20 +08:00
parent 2a5c032013
commit d20cd9b093
5 changed files with 138 additions and 98 deletions

DoubanSpider/Spider.py
View File

@@ -3,6 +3,7 @@ from DoubanSpider.db import Douban, engine, Recording
 from sqlalchemy.orm import sessionmaker
 logger = logging.getLogger("PAPA")
+sleeptime = random.randint(0, 3)
 class DoubanBook(object):
@@ -20,6 +21,11 @@ class DoubanBook(object):
         format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)
     def get_url(self, tag_name):
+        """
+        :param tag_name: the tag name, as a string
+        :return:
+        """
         for num in range(0, 10000, 20):
             time.sleep(sleeptime)
             url = self.base_url.format(tag_name) + f'?start={num}&type=T'
@@ -37,26 +43,15 @@ class DoubanBook(object):
             self.session.rollback()
     def get_tags(self):
-        print('[SQL] No TAG data found')
-        print('[Spider] Preparing TAG data, this will take a while.....')
-        do_not_get_all = input('[Spider] Choose a run mode:\n1. Fetch all TAGs (takes a long time)\n2. Fetch a single TAG\nEnter the matching number and press Enter\n')
-        if do_not_get_all == '1':
-            response = requests.get(self.main_url, headers=self.headers)
-            html = response.content.decode()
-            tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
-            # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
-            #     executor.map(self.get_url, [i for i in tags])
-            for i in tags:
-                print(f'[Spider] Fetching link data for <{i}>.....')
-                time.sleep(0.5)
-                self.get_url(i)
-        elif do_not_get_all == '2':
-            user_tag = input('Enter a tag:')
-            self.get_url(user_tag)
-        else:
-            print("[Spider] Invalid input, please try again!")
-            self.get_tags()
-        self.get_data()
+        response = requests.get(self.main_url, headers=self.headers)
+        html = response.content.decode()
+        tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
+        # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
+        #     executor.map(self.get_url, [i for i in tags])
+        for i in tags:
+            print(f'[Spider] Fetching link data for <{i}>.....')
+            time.sleep(0.5)
+            self.get_url(i)
     # def get_books_url(self, urls, tag_name):
     #     response = requests.get(url, headers=self.headers)
@@ -64,66 +59,64 @@ class DoubanBook(object):
     #         books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
     #         self.get_data(books_url, tag_name)
-    def get_data(self):
-        for row in self.session.query(Douban.url, Douban.tag, Douban.id).all():
+    def get_data(self, row):
+        """
+        :param row: a row pulled from the database
+        :return: 1 on abnormal exit
+        """
         time.sleep(sleeptime)
         print(f"Parsing: {row[0]}")
         response = requests.get(row[0], headers=self.headers)
         html = response.content.decode()
         try:
             name = re.findall('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)[0]
             author = re.findall('<meta name="keywords" content=".*?,(.*?),.*?', html)[0]
         except:
             logger.error(
                 f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
-            continue
+            return 1
         try:
             time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
         except:
             print(f'《{name}》: publication date not found!')
             time_temp = 'N/A'
             logger.warning(
                 f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
         try:
             price = re.findall('<span class="pl">定价:</span> (.*?)<br/>.*?', html)[0]
         except:
             logger.warning(
                 f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <price>:{row[0]}")
             print(f'《{name}》: price not found!')
             price = 'N/A'
         try:
             score = re.findall('<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', html)[0]
         except:
             logger.warning(
                 f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <score>:{row[0]}")
             print(f'《{name}》: rating not found!')
             score = 'N/A'
         try:
             intro = re.findall('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)[0]
             intro = (re.sub('\s', '', intro)).replace('<p>', '').replace('</p>', ' ')
         except:
             logger.warning(
                 f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
             print(f'《{name}》: summary not found!')
             intro = ''
         data = [name, author, time_temp, price, score, row[1], intro]
         print(f'Saving: {name}')
         self.save_csv(data)
+        rec = self.session.query(Recording).filter_by(id=1).scalar()
+        rec.data = row[2]
+        self.session.commit()
     @staticmethod
     def save_csv(data):
+        """
+        :param data: the row to write
+        :return:
+        """
         with open('results.csv', 'a', encoding='utf-8') as f:
             writer = csv.writer(f)
             writer.writerow(data)
-    def main(self):
-        rec = self.session.query(Recording.id).all()
-        if not rec:
-            self.session.add(Recording(id=1, data=1))
-            self.session.commit()
-        n = self.session.query(Douban.url, Douban.tag).all()
-        if not n:
-            self.get_tags()
-        else:
-            print('[Spider] Existing TAG data detected, starting crawl...')
-        self.get_data()
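
With this release get_data() no longer loops over the table itself: the caller feeds it one row at a time, and the Recording row with id=1 stores the Douban.id of the last book written to results.csv, which is what makes interrupted runs resumable. A minimal driving sketch (names taken from the diff above; it assumes the DoubanSpider package is importable and mirrors the url_pool() generator added in main.py below):

    from DoubanSpider.db import Douban, Recording
    from DoubanSpider.Spider import DoubanBook

    douban = DoubanBook()
    for row in douban.session.query(Douban.url, Douban.tag, Douban.id).all():
        # checkpoint: Recording.data holds the id of the last row finished
        ago = douban.session.query(Recording.data).first()
        if row[2] > ago[0]:           # skip rows already scraped in a previous run
            douban.get_data(row)      # scrape one book page, then advance the checkpoint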

DoubanSpider/db.py
View File

@@ -6,22 +6,27 @@ import os
 try:
     from sqlalchemy import create_engine, Column, Integer, String
     from sqlalchemy.ext.declarative import declarative_base
-    import requests
+    import random
     import re
     import time
     import csv
+    import requests
+    import time
     import sqlite3
     import logging
 except:
     print('[System] Installing required libraries...')
-    os.system('pip install SQLAlchemy')
-    os.system('pip install sqlite')
-    os.system('pip install csv')
-    os.system('pip install requests')
-    os.system('pip install logging')
+    os.system(r'pip install -r .\DoubanSpider\requirements.txt')
+    import requests
+    import csv
+    import logging
     from sqlalchemy import create_engine, Column, Integer, String
     from sqlalchemy.ext.declarative import declarative_base
+    import random
+    import re
+    import time
+    import csv
+    import requests
+    import time
+    import sqlite3
+    import logging
+finally:
+    print('[System] Runtime libraries loaded!')

BIN
DoubanSpider/douban.db Normal file

Binary file not shown.

DoubanSpider/requirements.txt
View File

@@ -0,0 +1,8 @@
+certifi==2019.6.16
+chardet==3.0.4
+idna==2.8
+requests==2.22.0
+SQLAlchemy==1.3.8
+sqlit==0.1.6
+urllib3==1.25.3
+wincertstore==0.2
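
These pins are the same set that db.py's new fallback installer targets; installing them up front, so the except branch never runs, would presumably be a one-liner:

    pip install -r DoubanSpider/requirements.txt

The path in db.py uses a Windows-style separator; the form above is the generic equivalent.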

main.py
View File

@@ -1,37 +1,71 @@
 import csv
+import logging
+import random
 import time
 from cmd import Cmd
-from DoubanSpider.db import Douban
+from DoubanSpider.db import Douban, Recording
 from DoubanSpider.Spider import DoubanBook
 class SpiderMain(Cmd):
+    intro = 'Douban Book Spider V2.0 ---------- type help for help.'
     def __init__(self):
         super().__init__()
         pass
     def do_help(self, arg):
-        pass
+        print('[Help] start - start crawling, resuming by default from where the last run stopped')
+        print('[Help] tag TAG - add/crawl books under a tag, where TAG is the tag to add')
+        print('[Help] tag all - crawl books under every tag')
+        print('[Help] quit - exit the program')
     def do_start(self, arg):
-        pass
+        for row in url_pool():
+            douban.get_data(row)
+        print('Crawl finished!')
-    def do_tag(self,arg):
-        pass
+    def do_tag(self, arg):
+        if arg == "all":
+            print("[WAR] Note: without a proxy pool, this operation usually cannot finish!")
+            douban.get_tags()
+            print('[Spider] Book links under all tags fetched; type start to begin scraping')
+        else:
+            print(f"[Spider] Fetching all books under the {arg} tag, this will take a while!")
+            douban.get_url(arg)
+            print('[Spider] Book links under the tag fetched; type start to begin scraping')
+    def do_quit(self, arg):
+        exit()
+    def main(self):
+        self.cmdloop()
 def url_pool():
-    for row in douban.session.query(Douban.url, Douban.tag).all():
-        yield row
+    if not n:
+        print('[Spider] You need to fetch tag data first!')
+    else:
+        for row in douban.session.query(Douban.url, Douban.tag, Douban.id).all():
+            ago = douban.session.query(Recording.data).first()
+            if row[2] > ago[0]:
+                yield row
 if __name__ == '__main__':
+    sleeptime = random.randint(0, 3)
     with open("results.csv", "a", encoding='utf-8') as f:
         writer = csv.writer(f)
         writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
         writer.writerow(["Title", "Author", "Publication Date", "Price", "Rating", "Category", "Summary"])
+    spider = SpiderMain()
     douban = DoubanBook()
-    douban.main()
+    rec = douban.session.query(Recording.id).all()
+    if not rec:
+        douban.session.add(Recording(id=1, data=0))
+        douban.session.commit()
+    n = douban.session.query(Douban.url, Douban.tag).all()
+    if not n:
+        print('No data detected; use the tag command to fetch tag data (type help for help).')
+    else:
+        print('Existing TAG data detected; type start to begin scraping...')
+    spider.main()
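
For context, the stdlib cmd.Cmd class maps every do_<name> method to a console command, which is all the wiring SpiderMain needs: typing start at the prompt calls do_start(''), and tag novel calls do_tag('novel'). A stripped-down sketch of the same pattern, with a hypothetical echo command that is not part of this commit:

    from cmd import Cmd

    class Demo(Cmd):
        intro = 'type help for help.'
        def do_echo(self, arg):   # "echo hi" at the prompt calls do_echo("hi")
            print(arg)
        def do_quit(self, arg):   # returning True ends cmdloop()
            return True

    if __name__ == '__main__':
        Demo().cmdloop()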