Merge branch 'dev' into 'master'

Dev See merge request nyaasuki/doubanbookspider!1
2019-09-06 10:31:52 +00:00 · 2019-09-06 10:31:52 +00:00 · fc312a2e7f
commit fc312a2e7f
parent 7798f67655 fd22abaaff
7 changed files with 190 additions and 57 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -132,4 +132,6 @@ dmypy.json
 .idea/inspectionProfiles/
 .idea/misc.xml
 .idea/modules.xml
-.idea/vcs.xml
+.idea/vcs.xml
+/papa.log
+/results.csv
--- a/.idea/dataSources.local.xml
+++ b/.idea/dataSources.local.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="dataSourceStorageLocal">
+    <data-source name="douban" uuid="c36e8c61-c777-41be-9e3b-9be7cbbae171">
+      <database-info product="SQLite" version="3.25.1" jdbc-version="2.1" driver-name="SQLite JDBC" driver-version="3.25.1" dbms="SQLITE" exact-version="3.25.1" exact-driver-version="3.25">
+        <identifier-quote-string>&quot;</identifier-quote-string>
+      </database-info>
+      <case-sensitivity plain-identifiers="mixed" quoted-identifiers="mixed" />
+      <auth-required>false</auth-required>
+      <schema-mapping>
+        <introspection-scope>
+          <node kind="schema" qname="@" />
+        </introspection-scope>
+      </schema-mapping>
+    </data-source>
+  </component>
+</project>
--- a/.idea/dataSources.xml
+++ b/.idea/dataSources.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
+    <data-source source="LOCAL" name="douban" uuid="c36e8c61-c777-41be-9e3b-9be7cbbae171">
+      <driver-ref>sqlite.xerial</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
+      <jdbc-url>jdbc:sqlite:E:\python\DoubanBookTAGSpider\douban.db</jdbc-url>
+      <driver-properties>
+        <property name="enable_load_extension" value="true" />
+      </driver-properties>
+      <libraries>
+        <library>
+          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.25.1/license.txt</url>
+        </library>
+        <library>
+          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.25.1/sqlite-jdbc-3.25.1.jar</url>
+        </library>
+      </libraries>
+    </data-source>
+  </component>
+</project>
--- a/DoubanSpider/init.py
+++ b/DoubanSpider/init.py
@@ -0,0 +1,27 @@
+import os
+
+# 环境检查
+
+
+try:
+    from sqlalchemy import create_engine, Column, Integer, String
+    from sqlalchemy.ext.declarative import declarative_base
+    import requests
+    import re
+    import time
+    import csv
+    import threading
+    import sqlite3
+    import logging
+except:
+    print('[System]正在安装支持库...')
+    os.system('pip install SQLAlchemy')
+    os.system('pip install sqlite')
+    os.system('pip install threading')
+    os.system('pip install csv')
+    os.system('pip install requests')
+    os.system('pip install logging')
+    import requests
+    import csv
+    from sqlalchemy import create_engine, Column, Integer, String
+    from sqlalchemy.ext.declarative import declarative_base
--- a/DoubanSpider/db.py
+++ b/DoubanSpider/db.py
@@ -0,0 +1,28 @@
+from sqlalchemy import create_engine, Column, Integer, String
+from sqlalchemy.ext.declarative import declarative_base
+import os
+
+engine = create_engine('sqlite:///douban.db')
+Base = declarative_base(engine)
+
+
+class Douban(Base):
+    __tablename__ = 'DouBan'
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    tag = Column(String, unique=False, nullable=False)
+    url = Column(String, unique=True, nullable=False)
+
+    def __repr__(self):
+        return "<Douban(id='%d', tag='%s',url='%s')>" % (self.id, self.tag, self.url)
+
+
+if os.path.isfile('douban.db') is False:
+    print('正在创建数据库...')
+    Base.metadata.create_all()
+else:
+    print('检测到现有数据库，正在读取...')
+
+if __name__ == '__main__':
+    # 重置数据库
+    Base.metadata.drop_all(engine)
+    print('Done')
--- a/Spider.py
+++ b/Spider.py
@@ -1,20 +1,6 @@
 from DoubanSpider import *
-
-try:
-    import requests
-    import re
-    import time
-    import csv
-
-    print('支持库检查完成...')
-
-except:
-    print('正在安装支持库...')
-    os.system('pip install SQLAlchemy')
-    os.system('pip install csv')
-    os.system('pip install requests')
-    import requests
-    import csv
+from DoubanSpider.db import Douban, engine
+from sqlalchemy.orm import sessionmaker


 class DoubanBook(object):
@@ -22,38 +8,53 @@ class DoubanBook(object):
        self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        self.base_url = 'https://book.douban.com/tag/{}'  # ?start={}&type=T
        self._lock = threading.Lock()
+        self.session = sessionmaker(engine)()
        self.headers = {
-            'DNT': '1',
-            'Host': 'book.douban.com',
-            'Referer': 'https://book.douban.com/',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'none',
-            'Sec-Fetch-User': '?1',
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
+            'Referer': 'https://www.baidu.com/',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                          'Chrome/76.0.3809.132 Safari/537.36 '
        }
+        self.log = logging.basicConfig(filename='papa.log',
+                                       filemode='a',
+                                       format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)

    def get_url(self, tag_name):
-        for num in range(0, 1000, 20):
+        for num in range(0, 10000, 20):
            url = self.base_url.format(tag_name) + f'?start={num}&type=T'
-            print(f"正在获取{tag_name}分类下的书籍...当前页码:{num / 20 + 1}")
-            try:
-                response = requests.get(url, headers=self.headers)
-                html = response.content.decode()
-                books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
-                print(books_url)
-                self.get_data(books_url, tag_name)
-            except:
+            print(f'正在获取 TAG：<{tag_name}> 书籍信息', num)
+            response = requests.get(url, headers=self.headers)
+            html = response.content.decode()
+            books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
+            if not books_url:
                break
+            for i in books_url:
+                try:
+                    self.session.add(Douban(tag=tag_name, url=i))
+                    self.session.commit()
+                except:
+                    self.session.rollback()

    def get_tags(self):
-        print('开始获取tags。')
-        response = requests.get(self.main_url, headers=self.headers)
-        html = response.content.decode()
-        tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
-        # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
-        #     executor.map(self.get_url, [i for i in tags])
-        for i in tags:
-            self.get_url(i)
+        print('[SQL]未发现TAGS数据！')
+        print('[Spider]正在准备TAG数据，这需要一定时间.....')
+        do_not_get_all = input('[Spider]请选择运行模式：\n1.获取所有TAG（需要大量时间）\n2.获取单一TAG\n请输入对应数字，回车确定\n')
+        if do_not_get_all == '1':
+            response = requests.get(self.main_url, headers=self.headers)
+            html = response.content.decode()
+            tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
+            # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
+            #     executor.map(self.get_url, [i for i in tags])
+            for i in tags:
+                print(f'[Spider]正在获取<{i}>链接数据.....')
+                self.get_url(i)
+        elif do_not_get_all == '2':
+            user_tag = input('请输入标签：')
+            self.get_url(user_tag)
+            self.main()
+        else:
+            print("[Spider]输入有误，请重新输入！")
+            self.get_tags()
+        self.get_data()

    # def get_books_url(self, urls, tag_name):
    #     response = requests.get(url, headers=self.headers)
@@ -61,19 +62,48 @@ class DoubanBook(object):
    #     books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
    #     self.get_data(books_url, tag_name)

-    def get_data(self, urls, tag_name):
-        for url in urls:
-            print(f"正在解析：{url}")
-            response = requests.get(url, headers=self.headers)
+    def get_data(self):
+        for row in self.session.query(Douban.url, Douban.tag).all():
+            print(f"正在解析：{row[0]}")
+            response = requests.get(row[0], headers=self.headers)
            html = response.content.decode()
-            name = re.findall('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)[0]
-            author = re.findall('<meta name="keywords" content=".*?,(.*?),.*?', html)[0]
-            time = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
-            price = re.findall('<span class="pl">定价:</span> (.*?)<br/>.*?', html)[0]
-            score = re.findall('<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', html)[0]
-            intro = re.findall('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)[0]
-            intro = (re.sub('\s', '', intro)).replace('<p>', '').replace('</p>', ' ')
-            data = [name, author, time, price, score, tag_name, intro]
+            try:
+                name = re.findall('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)[0]
+                author = re.findall('<meta name="keywords" content=".*?,(.*?),.*?', html)[0]
+            except:
+                logger.error(
+                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
+                continue
+            try:
+                time = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
+            except:
+                print(f'《{name}》未发现出版时间！')
+                time = 'N/A'
+                logger.warning(
+                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
+            try:
+                price = re.findall('<span class="pl">定价:</span> (.*?)<br/>.*?', html)[0]
+            except:
+                logger.warning(
+                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <price>:{row[0]}")
+                print(f'《{name}》未发现定价！')
+                price = 'N/A'
+            try:
+                score = re.findall('<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', html)[0]
+            except:
+                logger.warning(
+                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <score>:{row[0]}")
+                print(f'《{name}》未发现评分！')
+                score = 'N/A'
+            try:
+                intro = re.findall('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)[0]
+                intro = (re.sub('\s', '', intro)).replace('<p>', '').replace('</p>', ' ')
+            except:
+                logger.warning(
+                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
+                print(f'《{name}》未发现简介！')
+                intro = '无'
+            data = [name, author, time, price, score, row[1], intro]
            print(f'正在保存：{name}。')
            self.save_csv(data)

@@ -83,12 +113,19 @@ class DoubanBook(object):
            writer.writerow(data)

    def main(self):
-        with open("results.csv", "w", encoding='utf-8') as f:
-            writer = csv.writer(f)
-            writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", '内容简介'])
-        self.get_tags()
+        n = self.session.query(Douban.url, Douban.tag).all()
+        if not n:
+            self.get_tags()
+        else:
+            print('[Spider]检测到现有TAG数据，开始抓取...')
+            self.get_data()


 if __name__ == '__main__':
+    logger = logging.getLogger("PAPA")
+    with open("results.csv", "a", encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
+        writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"])
    douban = DoubanBook()
    douban.main()
--- a/douban.db
+++ b/douban.db