fix bugs
This commit is contained in:
parent
39879fa349
commit
c9f8bfd3ba
@ -10,17 +10,24 @@ try:
|
|||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import csv
|
import csv
|
||||||
import threading
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import logging
|
import logging
|
||||||
except:
|
except:
|
||||||
print('[System]正在安装支持库...')
|
print('[System]正在安装支持库...')
|
||||||
os.system('pip install SQLAlchemy')
|
os.system('pip install SQLAlchemy')
|
||||||
os.system('pip install sqlite')
|
os.system('pip install sqlite')
|
||||||
os.system('pip install threading')
|
|
||||||
os.system('pip install csv')
|
os.system('pip install csv')
|
||||||
os.system('pip install requests')
|
os.system('pip install requests')
|
||||||
os.system('pip install logging')
|
os.system('pip install logging')
|
||||||
|
|
||||||
|
else:
|
||||||
|
os.system('pip3 install SQLAlchemy')
|
||||||
|
os.system('pip3 install sqlite')
|
||||||
|
os.system('pip3 install csv')
|
||||||
|
os.system('pip3 install requests')
|
||||||
|
os.system('pip3 install logging')
|
||||||
|
|
||||||
|
finally:
|
||||||
import requests
|
import requests
|
||||||
import csv
|
import csv
|
||||||
import logging
|
import logging
|
||||||
|
10
Spider.py
10
Spider.py
@ -7,7 +7,6 @@ class DoubanBook(object):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
|
self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
|
||||||
self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T
|
self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T
|
||||||
self._lock = threading.Lock()
|
|
||||||
self.session = sessionmaker(engine)()
|
self.session = sessionmaker(engine)()
|
||||||
self.headers = {
|
self.headers = {
|
||||||
'Referer': 'https://www.baidu.com/',
|
'Referer': 'https://www.baidu.com/',
|
||||||
@ -20,6 +19,7 @@ class DoubanBook(object):
|
|||||||
|
|
||||||
def get_url(self, tag_name):
|
def get_url(self, tag_name):
|
||||||
for num in range(0, 10000, 20):
|
for num in range(0, 10000, 20):
|
||||||
|
time.sleep(0.5)
|
||||||
url = self.base_url.format(tag_name) + f'?start={num}&type=T'
|
url = self.base_url.format(tag_name) + f'?start={num}&type=T'
|
||||||
print(f'正在获取 TAG:<{tag_name}> 书籍信息', num)
|
print(f'正在获取 TAG:<{tag_name}> 书籍信息', num)
|
||||||
response = requests.get(url, headers=self.headers)
|
response = requests.get(url, headers=self.headers)
|
||||||
@ -46,6 +46,7 @@ class DoubanBook(object):
|
|||||||
# executor.map(self.get_url, [i for i in tags])
|
# executor.map(self.get_url, [i for i in tags])
|
||||||
for i in tags:
|
for i in tags:
|
||||||
print(f'[Spider]正在获取<{i}>链接数据.....')
|
print(f'[Spider]正在获取<{i}>链接数据.....')
|
||||||
|
time.sleep(0.5)
|
||||||
self.get_url(i)
|
self.get_url(i)
|
||||||
elif do_not_get_all == '2':
|
elif do_not_get_all == '2':
|
||||||
user_tag = input('请输入标签:')
|
user_tag = input('请输入标签:')
|
||||||
@ -64,6 +65,7 @@ class DoubanBook(object):
|
|||||||
|
|
||||||
def get_data(self):
|
def get_data(self):
|
||||||
for row in self.session.query(Douban.url, Douban.tag).all():
|
for row in self.session.query(Douban.url, Douban.tag).all():
|
||||||
|
time.sleep(0.5)
|
||||||
print(f"正在解析:{row[0]}")
|
print(f"正在解析:{row[0]}")
|
||||||
response = requests.get(row[0], headers=self.headers)
|
response = requests.get(row[0], headers=self.headers)
|
||||||
html = response.content.decode()
|
html = response.content.decode()
|
||||||
@ -75,10 +77,10 @@ class DoubanBook(object):
|
|||||||
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
|
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
time = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
|
time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
|
||||||
except:
|
except:
|
||||||
print(f'《{name}》未发现出版时间!')
|
print(f'《{name}》未发现出版时间!')
|
||||||
time = 'N/A'
|
time_temp = 'N/A'
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
|
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
|
||||||
try:
|
try:
|
||||||
@ -103,7 +105,7 @@ class DoubanBook(object):
|
|||||||
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
|
f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
|
||||||
print(f'《{name}》未发现简介!')
|
print(f'《{name}》未发现简介!')
|
||||||
intro = '无'
|
intro = '无'
|
||||||
data = [name, author, time, price, score, row[1], intro]
|
data = [name, author, time_temp, price, score, row[1], intro]
|
||||||
print(f'正在保存:{name}。')
|
print(f'正在保存:{name}。')
|
||||||
self.save_csv(data)
|
self.save_csv(data)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user