This commit is contained in:
岛风 2019-09-06 19:31:02 +08:00
parent 39879fa349
commit c9f8bfd3ba
2 changed files with 15 additions and 6 deletions

View File

@ -10,17 +10,24 @@ try:
import re
import time
import csv
import threading
import sqlite3
import logging
except:
print('[System]正在安装支持库...')
os.system('pip install SQLAlchemy')
os.system('pip install sqlite')
os.system('pip install threading')
os.system('pip install csv')
os.system('pip install requests')
os.system('pip install logging')
else:
os.system('pip3 install SQLAlchemy')
os.system('pip3 install sqlite')
os.system('pip3 install csv')
os.system('pip3 install requests')
os.system('pip3 install logging')
finally:
import requests
import csv
import logging

View File

@ -7,7 +7,6 @@ class DoubanBook(object):
def __init__(self):
self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T
self._lock = threading.Lock()
self.session = sessionmaker(engine)()
self.headers = {
'Referer': 'https://www.baidu.com/',
@ -20,6 +19,7 @@ class DoubanBook(object):
def get_url(self, tag_name):
for num in range(0, 10000, 20):
time.sleep(0.5)
url = self.base_url.format(tag_name) + f'?start={num}&type=T'
print(f'正在获取 TAG<{tag_name}> 书籍信息', num)
response = requests.get(url, headers=self.headers)
@ -46,6 +46,7 @@ class DoubanBook(object):
# executor.map(self.get_url, [i for i in tags])
for i in tags:
print(f'[Spider]正在获取<{i}>链接数据.....')
time.sleep(0.5)
self.get_url(i)
elif do_not_get_all == '2':
user_tag = input('请输入标签:')
@ -64,6 +65,7 @@ class DoubanBook(object):
def get_data(self):
for row in self.session.query(Douban.url, Douban.tag).all():
time.sleep(0.5)
print(f"正在解析:{row[0]}")
response = requests.get(row[0], headers=self.headers)
html = response.content.decode()
@ -75,10 +77,10 @@ class DoubanBook(object):
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
continue
try:
time = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
except:
print(f'{name}》未发现出版时间!')
time = 'N/A'
time_temp = 'N/A'
logger.warning(
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
try:
@ -103,7 +105,7 @@ class DoubanBook(object):
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
print(f'{name}》未发现简介!')
intro = ''
data = [name, author, time, price, score, row[1], intro]
data = [name, author, time_temp, price, score, row[1], intro]
print(f'正在保存:{name}')
self.save_csv(data)