diff --git a/.gitignore b/.gitignore index 96d1f30..03fcf7f 100644 --- a/.gitignore +++ b/.gitignore @@ -132,4 +132,6 @@ dmypy.json .idea/inspectionProfiles/ .idea/misc.xml .idea/modules.xml -.idea/vcs.xml \ No newline at end of file +.idea/vcs.xml +/papa.log +/results.csv diff --git a/.idea/dataSources.local.xml b/.idea/dataSources.local.xml new file mode 100644 index 0000000..095e1fa --- /dev/null +++ b/.idea/dataSources.local.xml @@ -0,0 +1,17 @@ + + + + + + " + + + false + + + + + + + + \ No newline at end of file diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml new file mode 100644 index 0000000..2af8e9e --- /dev/null +++ b/.idea/dataSources.xml @@ -0,0 +1,22 @@ + + + + + sqlite.xerial + true + org.sqlite.JDBC + jdbc:sqlite:E:\python\DoubanBookTAGSpider\douban.db + + + + + + file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.25.1/license.txt + + + file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.25.1/sqlite-jdbc-3.25.1.jar + + + + + \ No newline at end of file diff --git a/DoubanSpider/__init__.py b/DoubanSpider/__init__.py new file mode 100644 index 0000000..bf457cf --- /dev/null +++ b/DoubanSpider/__init__.py @@ -0,0 +1,27 @@ +import os + +# 环境检查 + + +try: + from sqlalchemy import create_engine, Column, Integer, String + from sqlalchemy.ext.declarative import declarative_base + import requests + import re + import time + import csv + import threading + import sqlite3 + import logging +except: + print('[System]正在安装支持库...') + os.system('pip install SQLAlchemy') + os.system('pip install sqlite') + os.system('pip install threading') + os.system('pip install csv') + os.system('pip install requests') + os.system('pip install logging') + import requests + import csv + from sqlalchemy import create_engine, Column, Integer, String + from sqlalchemy.ext.declarative import declarative_base diff --git a/DoubanSpider/db.py b/DoubanSpider/db.py new file mode 100644 index 0000000..695be2c --- /dev/null +++ b/DoubanSpider/db.py @@ -0,0 +1,28 @@ +from sqlalchemy import create_engine, Column, Integer, String +from sqlalchemy.ext.declarative import declarative_base +import os + +engine = create_engine('sqlite:///douban.db') +Base = declarative_base(engine) + + +class Douban(Base): + __tablename__ = 'DouBan' + id = Column(Integer, primary_key=True, autoincrement=True) + tag = Column(String, unique=False, nullable=False) + url = Column(String, unique=True, nullable=False) + + def __repr__(self): + return "" % (self.id, self.tag, self.url) + + +if os.path.isfile('douban.db') is False: + print('正在创建数据库...') + Base.metadata.create_all() +else: + print('检测到现有数据库,正在读取...') + +if __name__ == '__main__': + # 重置数据库 + Base.metadata.drop_all(engine) + print('Done') diff --git a/Spider.py b/Spider.py index 7bcde39..3eb06d8 100644 --- a/Spider.py +++ b/Spider.py @@ -1,20 +1,6 @@ from DoubanSpider import * - -try: - import requests - import re - import time - import csv - - print('支持库检查完成...') - -except: - print('正在安装支持库...') - os.system('pip install SQLAlchemy') - os.system('pip install csv') - os.system('pip install requests') - import requests - import csv +from DoubanSpider.db import Douban, engine +from sqlalchemy.orm import sessionmaker class DoubanBook(object): @@ -22,38 +8,53 @@ class DoubanBook(object): self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all' self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T self._lock = threading.Lock() + self.session = sessionmaker(engine)() self.headers = { - 'DNT': '1', - 'Host': 'book.douban.com', - 'Referer': 'https://book.douban.com/', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36' + 'Referer': 'https://www.baidu.com/', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/76.0.3809.132 Safari/537.36 ' } + self.log = logging.basicConfig(filename='papa.log', + filemode='a', + format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING) def get_url(self, tag_name): - for num in range(0, 1000, 20): + for num in range(0, 10000, 20): url = self.base_url.format(tag_name) + f'?start={num}&type=T' - print(f"正在获取{tag_name}分类下的书籍...当前页码:{num / 20 + 1}") - try: - response = requests.get(url, headers=self.headers) - html = response.content.decode() - books_url = re.findall('.*? 书籍信息', num) + response = requests.get(url, headers=self.headers) + html = response.content.decode() + books_url = re.findall('.*?.*?.*?', html) - # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: - # executor.map(self.get_url, [i for i in tags]) - for i in tags: - self.get_url(i) + print('[SQL]未发现TAGS数据!') + print('[Spider]正在准备TAG数据,这需要一定时间.....') + do_not_get_all = input('[Spider]请选择运行模式:\n1.获取所有TAG(需要大量时间)\n2.获取单一TAG\n请输入对应数字,回车确定\n') + if do_not_get_all == '1': + response = requests.get(self.main_url, headers=self.headers) + html = response.content.decode() + tags = re.findall('.*?.*?.*?', html) + # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + # executor.map(self.get_url, [i for i in tags]) + for i in tags: + print(f'[Spider]正在获取<{i}>链接数据.....') + self.get_url(i) + elif do_not_get_all == '2': + user_tag = input('请输入标签:') + self.get_url(user_tag) + self.main() + else: + print("[Spider]输入有误,请重新输入!") + self.get_tags() + self.get_data() # def get_books_url(self, urls, tag_name): # response = requests.get(url, headers=self.headers) @@ -61,19 +62,48 @@ class DoubanBook(object): # books_url = re.findall('.*?(.*?).*?', html)[0] - author = re.findall('出版年: (.*?)
.*?', html)[0] - price = re.findall('定价: (.*?)
.*?', html)[0] - score = re.findall('(.*?).*?', html)[0] - intro = re.findall('内容简介[\\s\\S]*?
([\\s\\S]*?)
', html)[0] - intro = (re.sub('\s', '', intro)).replace('

', '').replace('

', ' ') - data = [name, author, time, price, score, tag_name, intro] + try: + name = re.findall('.*?(.*?).*?', html)[0] + author = re.findall('出版年: (.*?)
.*?', html)[0] + except: + print(f'《{name}》未发现出版时间!') + time = 'N/A' + logger.warning( + f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE