diff --git a/.gitignore b/.gitignore
index 96d1f30..03fcf7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,4 +132,6 @@ dmypy.json
.idea/inspectionProfiles/
.idea/misc.xml
.idea/modules.xml
-.idea/vcs.xml
\ No newline at end of file
+.idea/vcs.xml
+/papa.log
+/results.csv
diff --git a/.idea/dataSources.local.xml b/.idea/dataSources.local.xml
new file mode 100644
index 0000000..095e1fa
--- /dev/null
+++ b/.idea/dataSources.local.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+ "
+
+
+ false
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml
new file mode 100644
index 0000000..2af8e9e
--- /dev/null
+++ b/.idea/dataSources.xml
@@ -0,0 +1,22 @@
+
+
+
+
+ sqlite.xerial
+ true
+ org.sqlite.JDBC
+ jdbc:sqlite:E:\python\DoubanBookTAGSpider\douban.db
+
+
+
+
+
+ file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.25.1/license.txt
+
+
+ file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.25.1/sqlite-jdbc-3.25.1.jar
+
+
+
+
+
\ No newline at end of file
diff --git a/DoubanSpider/__init__.py b/DoubanSpider/__init__.py
new file mode 100644
index 0000000..bf457cf
--- /dev/null
+++ b/DoubanSpider/__init__.py
@@ -0,0 +1,27 @@
+import os
+
+# Environment check.
+#
+# Standard-library modules are imported unconditionally: they ship with
+# Python and are not pip-installable dependencies, so they never belong in
+# the install fallback. Only the real third-party packages are guarded.
+import csv
+import logging
+import re
+import sqlite3
+import threading
+import time
+
+try:
+    import requests
+    from sqlalchemy import create_engine, Column, Integer, String
+    from sqlalchemy.ext.declarative import declarative_base
+except ImportError:
+    # Best-effort bootstrap: install the missing packages, then retry the
+    # imports. A second ImportError propagates to the importer.
+    print('[System]正在安装支持库...')
+    os.system('pip install SQLAlchemy')
+    os.system('pip install requests')
+    import requests
+    from sqlalchemy import create_engine, Column, Integer, String
+    from sqlalchemy.ext.declarative import declarative_base
diff --git a/DoubanSpider/db.py b/DoubanSpider/db.py
new file mode 100644
index 0000000..695be2c
--- /dev/null
+++ b/DoubanSpider/db.py
@@ -0,0 +1,28 @@
+from sqlalchemy import create_engine, Column, Integer, String
+from sqlalchemy.ext.declarative import declarative_base
+import os
+
+engine = create_engine('sqlite:///douban.db')  # SQLite file 'douban.db', relative path
+Base = declarative_base(engine)  # NOTE(review): presumably binds Base.metadata to engine (SQLAlchemy 1.x) - confirm
+
+
+class Douban(Base):  # ORM model for table 'DouBan': a (tag, url) pair per row
+    __tablename__ = 'DouBan'
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    tag = Column(String, unique=False, nullable=False)  # may repeat across rows
+    url = Column(String, unique=True, nullable=False)  # unique across rows
+
+    def __repr__(self):  # debug text; needs one placeholder per tuple item
+        return "<Douban(id='%s', tag='%s', url='%s')>" % (self.id, self.tag, self.url)
+
+
+if not os.path.isfile('douban.db'):  # no database file yet: create the schema
+    print('正在创建数据库...')
+    Base.metadata.create_all(engine)  # explicit engine, same as drop_all below
+else:
+    print('检测到现有数据库,正在读取...')
+
+if __name__ == '__main__':
+    # Run as a script: reset the database by dropping every mapped table.
+    Base.metadata.drop_all(engine)
+    print('Done')
diff --git a/Spider.py b/Spider.py
index 7bcde39..3eb06d8 100644
--- a/Spider.py
+++ b/Spider.py
@@ -1,20 +1,6 @@
from DoubanSpider import *
-
-try:
- import requests
- import re
- import time
- import csv
-
- print('支持库检查完成...')
-
-except:
- print('正在安装支持库...')
- os.system('pip install SQLAlchemy')
- os.system('pip install csv')
- os.system('pip install requests')
- import requests
- import csv
+from DoubanSpider.db import Douban, engine
+from sqlalchemy.orm import sessionmaker
class DoubanBook(object):
@@ -22,38 +8,53 @@ class DoubanBook(object):
self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T
self._lock = threading.Lock()
+ self.session = sessionmaker(engine)()
self.headers = {
- 'DNT': '1',
- 'Host': 'book.douban.com',
- 'Referer': 'https://book.douban.com/',
- 'Sec-Fetch-Mode': 'navigate',
- 'Sec-Fetch-Site': 'none',
- 'Sec-Fetch-User': '?1',
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
+ 'Referer': 'https://www.baidu.com/',
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+ 'Chrome/76.0.3809.132 Safari/537.36 '
}
+ self.log = logging.basicConfig(filename='papa.log',
+ filemode='a',
+ format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)
def get_url(self, tag_name):
- for num in range(0, 1000, 20):
+ for num in range(0, 10000, 20):
url = self.base_url.format(tag_name) + f'?start={num}&type=T'
- print(f"正在获取{tag_name}分类下的书籍...当前页码:{num / 20 + 1}")
- try:
- response = requests.get(url, headers=self.headers)
- html = response.content.decode()
- books_url = re.findall('.*? 书籍信息', num)
+ response = requests.get(url, headers=self.headers)
+ html = response.content.decode()
+ books_url = re.findall('.*?.*?.*?', html)
- # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
- # executor.map(self.get_url, [i for i in tags])
- for i in tags:
- self.get_url(i)
+ print('[SQL]未发现TAGS数据!')
+ print('[Spider]正在准备TAG数据,这需要一定时间.....')
+ do_not_get_all = input('[Spider]请选择运行模式:\n1.获取所有TAG(需要大量时间)\n2.获取单一TAG\n请输入对应数字,回车确定\n')
+ if do_not_get_all == '1':
+ response = requests.get(self.main_url, headers=self.headers)
+ html = response.content.decode()
+ tags = re.findall('.*?.*?.*?', html)
+ # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
+ # executor.map(self.get_url, [i for i in tags])
+ for i in tags:
+ print(f'[Spider]正在获取<{i}>链接数据.....')
+ self.get_url(i)
+ elif do_not_get_all == '2':
+ user_tag = input('请输入标签:')
+ self.get_url(user_tag)
+ self.main()
+ else:
+ print("[Spider]输入有误,请重新输入!")
+ self.get_tags()
+ self.get_data()
# def get_books_url(self, urls, tag_name):
# response = requests.get(url, headers=self.headers)
@@ -61,19 +62,48 @@ class DoubanBook(object):
# books_url = re.findall('.*?(.*?).*?', html)[0]
- author = re.findall('出版年: (.*?)
.*?', html)[0]
- price = re.findall('定价: (.*?)
.*?', html)[0]
- score = re.findall('(.*?).*?', html)[0]
- intro = re.findall('内容简介[\\s\\S]*?([\\s\\S]*?)
', html)[0]
- intro = (re.sub('\s', '', intro)).replace('', '').replace('
', ' ')
- data = [name, author, time, price, score, tag_name, intro]
+ try:
+ name = re.findall('.*?(.*?).*?', html)[0]
+ author = re.findall('出版年: (.*?)
.*?', html)[0]
+ except:
+ print(f'《{name}》未发现出版时间!')
+ time = 'N/A'
+ logger.warning(
+ f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE