diff --git a/DoubanSpider/__init__.py b/DoubanSpider/__init__.py
index 24f5d67..86c874d 100644
--- a/DoubanSpider/__init__.py
+++ b/DoubanSpider/__init__.py
@@ -10,17 +10,24 @@ try:
import re
import time
import csv
- import threading
import sqlite3
import logging
except:
print('[System]正在安装支持库...')
os.system('pip install SQLAlchemy')
os.system('pip install sqlite')
- os.system('pip install threading')
os.system('pip install csv')
os.system('pip install requests')
os.system('pip install logging')
+
+else:
+ os.system('pip3 install SQLAlchemy')
+ os.system('pip3 install sqlite')
+ os.system('pip3 install csv')
+ os.system('pip3 install requests')
+ os.system('pip3 install logging')
+
+finally:
import requests
import csv
import logging
diff --git a/Spider.py b/Spider.py
index 3eb06d8..a59c697 100644
--- a/Spider.py
+++ b/Spider.py
@@ -7,7 +7,6 @@ class DoubanBook(object):
def __init__(self):
self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T
- self._lock = threading.Lock()
self.session = sessionmaker(engine)()
self.headers = {
'Referer': 'https://www.baidu.com/',
@@ -20,6 +19,7 @@ class DoubanBook(object):
def get_url(self, tag_name):
for num in range(0, 10000, 20):
+ time.sleep(0.5)
url = self.base_url.format(tag_name) + f'?start={num}&type=T'
print(f'正在获取 TAG:<{tag_name}> 书籍信息', num)
response = requests.get(url, headers=self.headers)
@@ -46,6 +46,7 @@ class DoubanBook(object):
# executor.map(self.get_url, [i for i in tags])
for i in tags:
print(f'[Spider]正在获取<{i}>链接数据.....')
+ time.sleep(0.5)
self.get_url(i)
elif do_not_get_all == '2':
user_tag = input('请输入标签:')
@@ -64,6 +65,7 @@ class DoubanBook(object):
def get_data(self):
for row in self.session.query(Douban.url, Douban.tag).all():
+ time.sleep(0.5)
print(f"正在解析:{row[0]}")
response = requests.get(row[0], headers=self.headers)
html = response.content.decode()
@@ -75,10 +77,10 @@ class DoubanBook(object):
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
continue
try:
- time = re.findall('出版年: (.*?)
.*?', html)[0]
+ time_temp = re.findall('出版年: (.*?)
.*?', html)[0]
except:
print(f'《{name}》未发现出版时间!')
- time = 'N/A'
+ time_temp = 'N/A'
logger.warning(
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE