release 2.0

This commit is contained in:
岛风 2019-09-10 20:49:20 +08:00
parent 2a5c032013
commit d20cd9b093
5 changed files with 138 additions and 98 deletions

View File

@ -3,6 +3,7 @@ from DoubanSpider.db import Douban, engine, Recording
from sqlalchemy.orm import sessionmaker
logger = logging.getLogger("PAPA")
sleeptime = random.randint(0, 3)
class DoubanBook(object):
@ -20,6 +21,11 @@ class DoubanBook(object):
format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)
def get_url(self, tag_name):
"""
:param tag_name: 字符串格式 TAG名称
:return:
"""
for num in range(0, 10000, 20):
time.sleep(sleeptime)
url = self.base_url.format(tag_name) + f'?start={num}&type=T'
@ -37,10 +43,6 @@ class DoubanBook(object):
self.session.rollback()
def get_tags(self):
print('[SQL]未发现TAGS数据')
print('[Spider]正在准备TAG数据这需要一定时间.....')
do_not_get_all = input('[Spider]请选择运行模式:\n1.获取所有TAG需要大量时间\n2.获取单一TAG\n请输入对应数字,回车确定\n')
if do_not_get_all == '1':
response = requests.get(self.main_url, headers=self.headers)
html = response.content.decode()
tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
@ -50,13 +52,6 @@ class DoubanBook(object):
print(f'[Spider]正在获取<{i}>链接数据.....')
time.sleep(0.5)
self.get_url(i)
elif do_not_get_all == '2':
user_tag = input('请输入标签:')
self.get_url(user_tag)
else:
print("[Spider]输入有误,请重新输入!")
self.get_tags()
self.get_data()
# def get_books_url(self, urls, tag_name):
# response = requests.get(url, headers=self.headers)
@ -64,8 +59,11 @@ class DoubanBook(object):
# books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
# self.get_data(books_url, tag_name)
def get_data(self):
for row in self.session.query(Douban.url, Douban.tag, Douban.id).all():
def get_data(self, row):
"""
:param row: 数据库提取列表
:return: 1.异常退出
"""
time.sleep(sleeptime)
print(f"正在解析:{row[0]}")
response = requests.get(row[0], headers=self.headers)
@ -76,7 +74,7 @@ class DoubanBook(object):
except:
logger.error(
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
continue
return 1
try:
time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
except:
@ -109,21 +107,16 @@ class DoubanBook(object):
data = [name, author, time_temp, price, score, row[1], intro]
print(f'正在保存:{name}')
self.save_csv(data)
rec = self.session.query(Recording).filter_by(id=1).scalar()
rec.data = row[2]
self.session.commit()
@staticmethod
def save_csv(data):
"""
:param data: 数据
:return:
"""
with open('results.csv', 'a', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(data)
def main(self):
    """Entry point: ensure the progress row exists, then tag-fetch or scrape.

    NOTE(review): this appears in a diff rendering without +/- markers;
    it may be the pre-2.0 version that the new main.py supersedes --
    confirm against the full file before relying on it.
    """
    # Create the progress-tracking Recording row (id=1) on first run.
    rec = self.session.query(Recording.id).all()
    if not rec:
        self.session.add(Recording(id=1, data=1))
        self.session.commit()
    # No Douban URL rows stored yet -> collect tags first;
    # otherwise go straight to scraping the stored URLs.
    n = self.session.query(Douban.url, Douban.tag).all()
    if not n:
        self.get_tags()
    else:
        print('[Spider]检测到现有TAG数据开始抓取...')
        self.get_data()

View File

@ -6,22 +6,27 @@ import os
try:
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
import requests
import random
import re
import time
import csv
import requests
import time
import sqlite3
import logging
import random
except:
print('[System]正在安装支持库...')
os.system('pip install SQLAlchemy')
os.system('pip install sqlite')
os.system('pip install csv')
os.system('pip install requests')
os.system('pip install logging')
import requests
import csv
import logging
os.system(r'pip install -r .\DoubanSpider\requirements.txt')
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
import random
import re
import time
import csv
import requests
import time
import sqlite3
import logging
finally:
print('[System]运行库加载完毕!')

BIN
DoubanSpider/douban.db Normal file

Binary file not shown.

View File

@ -0,0 +1,8 @@
certifi==2019.6.16
chardet==3.0.4
idna==2.8
requests==2.22.0
SQLAlchemy==1.3.8
sqlit==0.1.6
urllib3==1.25.3
wincertstore==0.2

54
main.py
View File

@ -1,37 +1,71 @@
import csv
import logging
import random
import time
from cmd import Cmd
from DoubanSpider.db import Douban
from DoubanSpider.db import Douban, Recording
from DoubanSpider.Spider import DoubanBook
class SpiderMain(Cmd):
intro = '豆瓣图书爬虫V2.0 ---------- 输入help获取帮助。'
def __init__(self):
super().__init__()
pass
def do_help(self, arg):
pass
print('[Help] start - 开始爬取任务,默认从上次结束的地方开始')
print('[Help] tag TAG - 添加/爬取 标签下的书籍TAG是你需要添加的标签')
print('[Help] tag all - 爬取所有标签下的书籍')
print('[Help] quit - 退出程序')
def do_start(self, arg):
pass
for row in url_pool():
douban.get_data(row)
print('爬取结束!')
def do_tag(self, arg):
    """Cmd handler: collect book URLs for one tag, or for every tag.

    :param arg: a single tag name, or the literal string "all"
                to crawl the complete tag list.
    """
    wants_all = (arg == "all")
    if wants_all:
        print("[WAR]请注意,在没有代理池的情况下,此操作通常无法完成!")
        douban.get_tags()
    else:
        print(f"[Spider]开始获取{arg}标签下的所有书籍,这需要一定时间!")
        douban.get_url(arg)
    # Both branches end with the same completion hint.
    print('[Spider]标签下所有书籍信息爬取完成请输入start开始抓取数据')
def do_quit(self, arg):
    """Cmd handler for 'quit': terminate the whole program immediately."""
    # NOTE(review): exit() kills the process outright; returning True is
    # the cmd.Cmd idiom for leaving cmdloop gracefully -- confirm which
    # behavior is intended.
    exit()
def main(self):
self.cmdloop()
def do_tag(self,arg):
pass
def url_pool():
for row in douban.session.query(Douban.url, Douban.tag).all():
if not n:
print('[Spider]你需要先获取tag数据!')
else:
for row in douban.session.query(Douban.url, Douban.tag, Douban.id).all():
ago = douban.session.query(Recording.data).first()
if row[2] > ago[0]:
yield row
if __name__ == '__main__':
sleeptime = random.randint(0, 3)
with open("results.csv", "a", encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"])
spider = SpiderMain()
douban = DoubanBook()
douban.main()
rec = douban.session.query(Recording.id).all()
if not rec:
douban.session.add(Recording(id=1, data=0))
douban.session.commit()
n = douban.session.query(Douban.url, Douban.tag).all()
if not n:
print('未检测到任何数据,请使用 tag 关键字获取标签数据输入help获取帮助。')
else:
print('检测到现有TAG数据输入start直接开始抓取...')
spider.main()