This commit is contained in:
岛风 2019-09-06 19:31:02 +08:00
parent 39879fa349
commit c9f8bfd3ba
2 changed files with 15 additions and 6 deletions

View File

@ -10,17 +10,24 @@ try:
import re import re
import time import time
import csv import csv
import threading
import sqlite3 import sqlite3
import logging import logging
except: except:
print('[System]正在安装支持库...') print('[System]正在安装支持库...')
os.system('pip install SQLAlchemy') os.system('pip install SQLAlchemy')
os.system('pip install sqlite') os.system('pip install sqlite')
os.system('pip install threading')
os.system('pip install csv') os.system('pip install csv')
os.system('pip install requests') os.system('pip install requests')
os.system('pip install logging') os.system('pip install logging')
else:
os.system('pip3 install SQLAlchemy')
os.system('pip3 install sqlite')
os.system('pip3 install csv')
os.system('pip3 install requests')
os.system('pip3 install logging')
finally:
import requests import requests
import csv import csv
import logging import logging

View File

@ -7,7 +7,6 @@ class DoubanBook(object):
def __init__(self): def __init__(self):
self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all' self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T
self._lock = threading.Lock()
self.session = sessionmaker(engine)() self.session = sessionmaker(engine)()
self.headers = { self.headers = {
'Referer': 'https://www.baidu.com/', 'Referer': 'https://www.baidu.com/',
@ -20,6 +19,7 @@ class DoubanBook(object):
def get_url(self, tag_name): def get_url(self, tag_name):
for num in range(0, 10000, 20): for num in range(0, 10000, 20):
time.sleep(0.5)
url = self.base_url.format(tag_name) + f'?start={num}&type=T' url = self.base_url.format(tag_name) + f'?start={num}&type=T'
print(f'正在获取 TAG<{tag_name}> 书籍信息', num) print(f'正在获取 TAG<{tag_name}> 书籍信息', num)
response = requests.get(url, headers=self.headers) response = requests.get(url, headers=self.headers)
@ -46,6 +46,7 @@ class DoubanBook(object):
# executor.map(self.get_url, [i for i in tags]) # executor.map(self.get_url, [i for i in tags])
for i in tags: for i in tags:
print(f'[Spider]正在获取<{i}>链接数据.....') print(f'[Spider]正在获取<{i}>链接数据.....')
time.sleep(0.5)
self.get_url(i) self.get_url(i)
elif do_not_get_all == '2': elif do_not_get_all == '2':
user_tag = input('请输入标签:') user_tag = input('请输入标签:')
@ -64,6 +65,7 @@ class DoubanBook(object):
def get_data(self): def get_data(self):
for row in self.session.query(Douban.url, Douban.tag).all(): for row in self.session.query(Douban.url, Douban.tag).all():
time.sleep(0.5)
print(f"正在解析:{row[0]}") print(f"正在解析:{row[0]}")
response = requests.get(row[0], headers=self.headers) response = requests.get(row[0], headers=self.headers)
html = response.content.decode() html = response.content.decode()
@ -75,10 +77,10 @@ class DoubanBook(object):
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}") f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
continue continue
try: try:
time = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0] time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
except: except:
print(f'{name}》未发现出版时间!') print(f'{name}》未发现出版时间!')
time = 'N/A' time_temp = 'N/A'
logger.warning( logger.warning(
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}") f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
try: try:
@ -103,7 +105,7 @@ class DoubanBook(object):
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}") f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
print(f'{name}》未发现简介!') print(f'{name}》未发现简介!')
intro = '' intro = ''
data = [name, author, time, price, score, row[1], intro] data = [name, author, time_temp, price, score, row[1], intro]
print(f'正在保存:{name}') print(f'正在保存:{name}')
self.save_csv(data) self.save_csv(data)