Merge branch 'dev' into 'master'
Dev See merge request nyaasuki/doubanbookspider!1
This commit is contained in:
commit
fc312a2e7f
2
.gitignore
vendored
2
.gitignore
vendored
@ -133,3 +133,5 @@ dmypy.json
|
|||||||
.idea/misc.xml
|
.idea/misc.xml
|
||||||
.idea/modules.xml
|
.idea/modules.xml
|
||||||
.idea/vcs.xml
|
.idea/vcs.xml
|
||||||
|
/papa.log
|
||||||
|
/results.csv
|
||||||
|
17
.idea/dataSources.local.xml
Normal file
17
.idea/dataSources.local.xml
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="dataSourceStorageLocal">
|
||||||
|
<data-source name="douban" uuid="c36e8c61-c777-41be-9e3b-9be7cbbae171">
|
||||||
|
<database-info product="SQLite" version="3.25.1" jdbc-version="2.1" driver-name="SQLite JDBC" driver-version="3.25.1" dbms="SQLITE" exact-version="3.25.1" exact-driver-version="3.25">
|
||||||
|
<identifier-quote-string>"</identifier-quote-string>
|
||||||
|
</database-info>
|
||||||
|
<case-sensitivity plain-identifiers="mixed" quoted-identifiers="mixed" />
|
||||||
|
<auth-required>false</auth-required>
|
||||||
|
<schema-mapping>
|
||||||
|
<introspection-scope>
|
||||||
|
<node kind="schema" qname="@" />
|
||||||
|
</introspection-scope>
|
||||||
|
</schema-mapping>
|
||||||
|
</data-source>
|
||||||
|
</component>
|
||||||
|
</project>
|
22
.idea/dataSources.xml
Normal file
22
.idea/dataSources.xml
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
|
||||||
|
<data-source source="LOCAL" name="douban" uuid="c36e8c61-c777-41be-9e3b-9be7cbbae171">
|
||||||
|
<driver-ref>sqlite.xerial</driver-ref>
|
||||||
|
<synchronize>true</synchronize>
|
||||||
|
<jdbc-driver>org.sqlite.JDBC</jdbc-driver>
|
||||||
|
<jdbc-url>jdbc:sqlite:E:\python\DoubanBookTAGSpider\douban.db</jdbc-url>
|
||||||
|
<driver-properties>
|
||||||
|
<property name="enable_load_extension" value="true" />
|
||||||
|
</driver-properties>
|
||||||
|
<libraries>
|
||||||
|
<library>
|
||||||
|
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.25.1/license.txt</url>
|
||||||
|
</library>
|
||||||
|
<library>
|
||||||
|
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.25.1/sqlite-jdbc-3.25.1.jar</url>
|
||||||
|
</library>
|
||||||
|
</libraries>
|
||||||
|
</data-source>
|
||||||
|
</component>
|
||||||
|
</project>
|
27
DoubanSpider/__init__.py
Normal file
27
DoubanSpider/__init__.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# Environment check.
#
# Standard-library modules ship with the interpreter and cannot be installed
# through pip, so they are imported unconditionally. Spider.py pulls these
# names in through ``from DoubanSpider import *``.
import csv
import logging
import os
import re
import sqlite3
import subprocess
import sys
import threading
import time

# Only the third-party packages can actually be missing.
try:
    import requests
    from sqlalchemy import create_engine, Column, Integer, String
    from sqlalchemy.ext.declarative import declarative_base
except ImportError:
    print('[System]正在安装支持库...')
    # Run pip through the current interpreter so the packages are installed
    # into the environment that is executing this module. The exit status is
    # not checked: if installation failed, the imports below raise ImportError.
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'SQLAlchemy', 'requests'])
    import requests
    from sqlalchemy import create_engine, Column, Integer, String
    from sqlalchemy.ext.declarative import declarative_base
|
28
DoubanSpider/db.py
Normal file
28
DoubanSpider/db.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
from sqlalchemy import create_engine, Column, Integer, String
|
||||||
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
|
import os
|
||||||
|
|
||||||
|
# SQLite database file, resolved relative to the current working directory.
engine = create_engine('sqlite:///douban.db')
# The base is left unbound and the engine is passed explicitly wherever the
# schema is created or dropped; binding an engine to the declarative base was
# removed in SQLAlchemy 2.0.
Base = declarative_base()


class Douban(Base):
    """A book page URL together with the tag it was found under."""

    __tablename__ = 'DouBan'
    id = Column(Integer, primary_key=True, autoincrement=True)
    tag = Column(String, unique=False, nullable=False)
    # unique: inserting a URL that is already stored fails at commit time
    url = Column(String, unique=True, nullable=False)

    def __repr__(self):
        # %s rather than %d: id is None on an object that has not been
        # flushed yet, and %d would raise TypeError for it.
        return "<Douban(id='%s', tag='%s',url='%s')>" % (self.id, self.tag, self.url)


if not os.path.isfile('douban.db'):
    print('正在创建数据库...')
    Base.metadata.create_all(engine)
else:
    print('检测到现有数据库,正在读取...')

if __name__ == '__main__':
    # Reset the database: drop every table, then recreate the empty schema.
    # Without the recreate step the file would still exist, so the next
    # import would take the "existing database" branch above and never
    # create the tables again.
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)
    print('Done')
|
113
Spider.py
113
Spider.py
@ -1,20 +1,6 @@
|
|||||||
from DoubanSpider import *
|
from DoubanSpider import *
|
||||||
|
from DoubanSpider.db import Douban, engine
|
||||||
try:
|
from sqlalchemy.orm import sessionmaker
|
||||||
import requests
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
import csv
|
|
||||||
|
|
||||||
print('支持库检查完成...')
|
|
||||||
|
|
||||||
except:
|
|
||||||
print('正在安装支持库...')
|
|
||||||
os.system('pip install SQLAlchemy')
|
|
||||||
os.system('pip install csv')
|
|
||||||
os.system('pip install requests')
|
|
||||||
import requests
|
|
||||||
import csv
|
|
||||||
|
|
||||||
|
|
||||||
class DoubanBook(object):
|
class DoubanBook(object):
|
||||||
@ -22,38 +8,53 @@ class DoubanBook(object):
|
|||||||
self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
|
self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
|
||||||
self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T
|
self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T
|
||||||
self._lock = threading.Lock()
|
self._lock = threading.Lock()
|
||||||
|
self.session = sessionmaker(engine)()
|
||||||
self.headers = {
|
self.headers = {
|
||||||
'DNT': '1',
|
'Referer': 'https://www.baidu.com/',
|
||||||
'Host': 'book.douban.com',
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||||
'Referer': 'https://book.douban.com/',
|
'Chrome/76.0.3809.132 Safari/537.36 '
|
||||||
'Sec-Fetch-Mode': 'navigate',
|
|
||||||
'Sec-Fetch-Site': 'none',
|
|
||||||
'Sec-Fetch-User': '?1',
|
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
|
|
||||||
}
|
}
|
||||||
|
self.log = logging.basicConfig(filename='papa.log',
|
||||||
|
filemode='a',
|
||||||
|
format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)
|
||||||
|
|
||||||
def get_url(self, tag_name):
    """Store the book page URLs listed under ``tag_name`` in the database.

    Requests the tag listing in steps of 20 (``start=0, 20, ...`` below
    10000) and stops at the first page that yields no book links. Each link
    is committed on its own as a ``Douban`` row, so one failed insert does
    not discard the others.

    :param tag_name: tag to substitute into ``self.base_url``.
    """
    from sqlalchemy.exc import SQLAlchemyError

    for num in range(0, 10000, 20):
        url = self.base_url.format(tag_name) + f'?start={num}&type=T'
        print(f'正在获取 TAG:<{tag_name}> 书籍信息', num)
        response = requests.get(url, headers=self.headers)
        html = response.content.decode()
        books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
        if not books_url:
            # No book links on this page: stop paging for this tag.
            break
        for book_url in books_url:
            try:
                self.session.add(Douban(tag=tag_name, url=book_url))
                self.session.commit()
            except SQLAlchemyError:
                # Best effort: a rejected row is rolled back and skipped.
                # Only database errors are absorbed here; anything else
                # (including KeyboardInterrupt) propagates.
                self.session.rollback()
|
||||||
|
|
||||||
def get_tags(self):
    """Ask for a run mode, collect book URLs accordingly, then scrape them.

    Mode ``1`` reads every tag from ``self.main_url`` and collects the URLs
    of each one; mode ``2`` collects the URLs of a single tag typed by the
    user. Any other answer prints an error and asks again.
    """
    print('[SQL]未发现TAGS数据!')
    print('[Spider]正在准备TAG数据,这需要一定时间.....')
    prompt = '[Spider]请选择运行模式:\n1.获取所有TAG(需要大量时间)\n2.获取单一TAG\n请输入对应数字,回车确定\n'
    # Re-prompt in a loop instead of calling get_tags() recursively, so
    # repeated invalid answers do not grow the call stack and the scraping
    # step is not repeated when the nested calls return.
    while True:
        do_not_get_all = input(prompt)
        if do_not_get_all in ('1', '2'):
            break
        print("[Spider]输入有误,请重新输入!")

    if do_not_get_all == '2':
        user_tag = input('请输入标签:')
        self.get_url(user_tag)
        # main() scrapes the stored URLs, or returns to this prompt when
        # the tag produced none; nothing more to do here afterwards.
        self.main()
        return

    response = requests.get(self.main_url, headers=self.headers)
    html = response.content.decode()
    tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
    for tag in tags:
        print(f'[Spider]正在获取<{tag}>链接数据.....')
        self.get_url(tag)
    self.get_data()
|
||||||
|
|
||||||
# def get_books_url(self, urls, tag_name):
|
# def get_books_url(self, urls, tag_name):
|
||||||
# response = requests.get(url, headers=self.headers)
|
# response = requests.get(url, headers=self.headers)
|
||||||
@ -61,19 +62,48 @@ class DoubanBook(object):
|
|||||||
# books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
|
# books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
|
||||||
# self.get_data(books_url, tag_name)
|
# self.get_data(books_url, tag_name)
|
||||||
|
|
||||||
def get_data(self):
    """Download every stored book page, extract its fields and save them.

    For each ``(url, tag)`` row in the ``Douban`` table the page is fetched
    and parsed with regular expressions. A page without a title or author is
    logged as an error and skipped. A missing publication date, price or
    score is replaced by ``'N/A'``, a missing introduction by ``'无'``; each
    is logged as a warning. The resulting row is handed to ``self.save_csv``.
    """
    # Same named logger the script entry point creates; looked up here so
    # the method does not depend on a module-level global.
    log = logging.getLogger("PAPA")

    def stamp():
        """Return the current local time formatted for log lines."""
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    def first_match(pattern, text):
        """Return the first capture of ``pattern`` in ``text``, or None."""
        found = re.findall(pattern, text)
        return found[0] if found else None

    # (log field name, pattern, label used in the console notice)
    optional_fields = (
        ('time', '<span class="pl">出版年:</span> (.*?)<br/>.*?', '出版时间'),
        ('price', '<span class="pl">定价:</span> (.*?)<br/>.*?', '定价'),
        ('score', '<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', '评分'),
    )

    for row in self.session.query(Douban.url, Douban.tag).all():
        book_url, tag = row[0], row[1]
        print(f"正在解析:{book_url}")
        response = requests.get(book_url, headers=self.headers)
        html = response.content.decode()

        name = first_match('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)
        author = first_match('<meta name="keywords" content=".*?,(.*?),.*?', html)
        if name is None or author is None:
            log.error(f"[{stamp()}]UNKNOWN URL:{book_url}")
            continue

        values = []
        for field, pattern, label in optional_fields:
            value = first_match(pattern, html)
            if value is None:
                log.warning(f"[{stamp()}]CAN'T GET TITLE <{field}>:{book_url}")
                print(f'《{name}》未发现{label}!')
                value = 'N/A'
            values.append(value)
        # pub_time, not "time": a local named "time" would shadow the time
        # module used for the log timestamps.
        pub_time, price, score = values

        intro = first_match('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)
        if intro is None:
            log.warning(f"[{stamp()}]CAN'T GET TITLE <intro>:{book_url}")
            print(f'《{name}》未发现简介!')
            intro = '无'
        else:
            # Strip all whitespace, then turn paragraph tags into spaces.
            intro = re.sub(r'\s', '', intro).replace('<p>', '').replace('</p>', ' ')

        data = [name, author, pub_time, price, score, tag, intro]
        print(f'正在保存:{name}。')
        self.save_csv(data)
|
||||||
|
|
||||||
@ -83,12 +113,19 @@ class DoubanBook(object):
|
|||||||
writer.writerow(data)
|
writer.writerow(data)
|
||||||
|
|
||||||
def main(self):
    """Start the crawl: collect tags first if no URLs are stored yet.

    When the ``Douban`` table is empty, ``get_tags`` is called to gather
    book URLs; otherwise the stored URLs are scraped with ``get_data``.
    """
    # Fetch at most one row: only emptiness matters here, so there is no
    # need to load the whole table.
    has_urls = self.session.query(Douban.url).first() is not None
    if not has_urls:
        self.get_tags()
    else:
        print('[Spider]检测到现有TAG数据,开始抓取...')
        self.get_data()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Module-level logger named "PAPA".
    logger = logging.getLogger("PAPA")
    # Append a run header to the results file. newline='' leaves line
    # endings to the csv writer, as the csv module requires for file objects.
    with open("results.csv", "a", encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # writerow() takes a sequence of fields; passing the bare string
        # would write the timestamp one character per column.
        writer.writerow([time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())])
        writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"])
    douban = DoubanBook()
    douban.main()
|
||||||
|
Loading…
Reference in New Issue
Block a user