diff --git a/.gitignore b/.gitignore
index d06298a..9ee2a04 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,3 +137,4 @@ dmypy.json
/results.csv
.idea/dataSources.local.xml
.gitignore
+.vscode/launch.json
diff --git a/DoubanSpider/Spider.py b/DoubanSpider/Spider.py
new file mode 100644
index 0000000..d127097
--- /dev/null
+++ b/DoubanSpider/Spider.py
@@ -0,0 +1,122 @@
+from DoubanSpider import *
+from DoubanSpider.db import Douban, engine, Recording
+from sqlalchemy.orm import sessionmaker
+
+# Logger named "PAPA", used for this module's warning messages.
+logger = logging.getLogger("PAPA")
+# Delay in seconds handed to time.sleep() before each page request. The value
+# is drawn once, when the module is imported, so every request in a run waits
+# the same amount (possibly 0 seconds).
+# NOTE(review): if a fresh random delay per request was intended,
+# random.randint() has to be called at the point of use -- confirm.
+sleeptime = random.randint(0, 3)
+
+
+class DoubanBook(object):
+    def __init__(self):
+        """Set up URLs, a database session, request headers and file logging.
+
+        Attributes set here:
+            main_url: URL of the Douban book tag index page.
+            base_url: per-tag URL template; ``{}`` is filled with the tag
+                name and callers append ``?start=<offset>&type=T``.
+            session: a new session created from ``sessionmaker(engine)``.
+            headers: HTTP headers (Referer and User-Agent) sent with requests.
+            log: the module-level ``logger``.
+
+        Side effect: configures the root logger to append records of level
+        WARNING and above to ``papa.log``.
+        """
+        self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
+        self.base_url = 'https://book.douban.com/tag/{}'  # ?start={}&type=T
+        self.session = sessionmaker(engine)()
+        self.headers = {
+            'Referer': 'https://www.baidu.com/',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                          'Chrome/76.0.3809.132 Safari/537.36 '
+        }
+        # logging.basicConfig() returns None, so storing its result left
+        # self.log unusable. Call it only for its side effect and keep a
+        # reference to the real logger instead.
+        logging.basicConfig(filename='papa.log',
+                            filemode='a',
+                            format='%(name)s - %(levelname)s - %(message)s',
+                            level=logging.WARNING)
+        self.log = logger
+
+ def get_url(self, tag_name):
+ """
+ Request the listing pages of one tag and run a regex over each page's HTML.
+
+ Pages are requested with ``start`` offsets 0, 20, 40, ... up to 9980.
+
+ :param tag_name: tag name, as a string
+ :return: not determinable from this excerpt (no return statement is visible)
+ """
+ for num in range(0, 10000, 20):
+ # Uses the module-level ``sleeptime``, so the pause is the same on every iteration.
+ time.sleep(sleeptime)
+ url = self.base_url.format(tag_name) + f'?start={num}&type=T'
+ print(f'正在获取 TAG:<{tag_name}> 书籍信息', num)
+ # NOTE(review): assuming this is the ``requests`` library, no timeout is
+ # passed, so a stalled connection could block indefinitely -- confirm.
+ response = requests.get(url, headers=self.headers)
+ # decode() is called without arguments, i.e. with its default encoding.
+ html = response.content.decode()
+ # NOTE(review): this pattern has no literal text and no capture group, so as
+ # written it can only produce empty matches. It looks like markup was
+ # stripped from it -- restore the pattern from the original file.
+ books_url = re.findall('.*?.*?.*?', html)
+ # NOTE(review): the method appears to be cut off here in this excerpt; the
+ # code that consumes ``books_url`` is not visible.
+ # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
+ # executor.map(self.get_url, [i for i in tags])
+ for i in tags:
+ print(f'[Spider]正在获取<{i}>链接数据.....')
+ time.sleep(0.5)
+ self.get_url(i)
+
+ # def get_books_url(self, urls, tag_name):
+ # response = requests.get(url, headers=self.headers)
+ # html = response.content.decode()
+ # books_url = re.findall('.*?(.*?).*?', html)[0]
+ author = re.findall('出版年: (.*?)
.*?', html)[0]
+ except:
+ print(f'《{name}》未发现出版时间!')
+ time_temp = 'N/A'
+ logger.warning(
+ f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE