Dev

2019-09-10 12:58:12 +00:00 · 2019-09-10 12:58:12 +00:00 · 1f1acbaca9
commit 1f1acbaca9
parent d96269fb5d
9 changed files with 223 additions and 144 deletions
--- a/.gitignore
+++ b/.gitignore
@ -137,3 +137,4 @@ dmypy.json
 /results.csv
 .idea/dataSources.local.xml
 .gitignore
+.vscode/launch.json
--- a/DoubanSpider/Spider.py
+++ b/DoubanSpider/Spider.py
@ -0,0 +1,122 @@
+from DoubanSpider import *
+from DoubanSpider.db import Douban, engine, Recording
+from sqlalchemy.orm import sessionmaker
+
+# Module-wide logger named "PAPA"; the crawler's error and warning records go through it.
+logger = logging.getLogger("PAPA")
+# Delay in whole seconds (0 to 3, both inclusive) passed to time.sleep() before requests.
+# NOTE: it is drawn once, when this module is imported, so every request in a
+# run waits the same amount of time (possibly 0 seconds).
+sleeptime = random.randint(0, 3)
+
+
+class DoubanBook(object):
+    """Crawler for Douban book tag listings and book detail pages.
+
+    Book URLs scraped from tag listing pages are stored as ``Douban`` rows
+    through the SQLAlchemy session; details parsed from each book page are
+    appended to ``results.csv``.
+    """
+
+    # Seconds to wait on a single HTTP request; without a timeout a stalled
+    # connection would block the crawler forever.
+    REQUEST_TIMEOUT = 10
+
+    def __init__(self):
+        self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
+        self.base_url = 'https://book.douban.com/tag/{}'  # ?start={}&type=T
+        self.session = sessionmaker(engine)()
+        self.headers = {
+            'Referer': 'https://www.baidu.com/',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                          'Chrome/76.0.3809.132 Safari/537.36 '
+        }
+        # logging.basicConfig() returns None, so storing its result is useless:
+        # configure logging first, then expose the module logger instead.
+        logging.basicConfig(filename='papa.log',
+                            filemode='a',
+                            format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)
+        self.log = logger
+
+    @staticmethod
+    def _timestamp():
+        """Return the current local time formatted for log messages."""
+        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
+
+    @staticmethod
+    def _first_match(pattern, html):
+        """Return the first capture of ``pattern`` in ``html``, or None if absent."""
+        found = re.findall(pattern, html)
+        return found[0] if found else None
+
+    def _fetch(self, url):
+        """Download ``url`` and return the response body decoded as text.
+
+        :raises requests.RequestException: when the request fails or times out
+        """
+        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
+        return response.content.decode()
+
+    def _optional_field(self, pattern, html, label, url, notice, default='N/A'):
+        """Extract an optional field, logging and printing a notice when missing.
+
+        :param pattern: regular expression with one capture group
+        :param html: page source to search
+        :param label: field label embedded in the warning log message
+        :param url: page URL embedded in the warning log message
+        :param notice: text printed to the console when the field is missing
+        :param default: value returned when the field is missing
+        :return: the captured text, or ``default``
+        """
+        value = self._first_match(pattern, html)
+        if value is None:
+            logger.warning(f"[{self._timestamp()}]CAN'T GET TITLE <{label}>:{url}")
+            print(notice)
+            return default
+        return value
+
+    def get_url(self, tag_name):
+        """Collect the book URLs listed under one tag and store them.
+
+        Listing pages are requested 20 results at a time until a page has no
+        book links, a request fails, or offset 10000 is reached.
+
+        :param tag_name: tag name as a string
+        :return: None
+        """
+        for num in range(0, 10000, 20):
+            time.sleep(sleeptime)
+            url = self.base_url.format(tag_name) + f'?start={num}&type=T'
+            print(f'正在获取 TAG：<{tag_name}> 书籍信息', num)
+            try:
+                html = self._fetch(url)
+            except requests.RequestException as err:
+                logger.error(f"[{self._timestamp()}]REQUEST FAILED:{url} ({err})")
+                break
+            books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
+            if not books_url:
+                break
+            for book_url in books_url:
+                try:
+                    self.session.add(Douban(tag=tag_name, url=book_url))
+                    self.session.commit()
+                except Exception:
+                    # Best-effort insert: a rejected row (presumably a duplicate
+                    # URL - confirm against the Douban model) is skipped and the
+                    # session rolled back so it stays usable. Unlike a bare
+                    # ``except:``, this lets Ctrl+C / SystemExit through.
+                    self.session.rollback()
+
+    def get_tags(self):
+        """Fetch the tag index page and collect book URLs for every tag on it.
+
+        :return: None
+        """
+        try:
+            html = self._fetch(self.main_url)
+        except requests.RequestException as err:
+            logger.error(f"[{self._timestamp()}]REQUEST FAILED:{self.main_url} ({err})")
+            return
+        tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
+        for tag in tags:
+            print(f'[Spider]正在获取<{tag}>链接数据.....')
+            time.sleep(0.5)
+            self.get_url(tag)
+
+    def get_data(self, row):
+        """Scrape one book page, append it to the CSV and record progress.
+
+        :param row: sequence used as follows: ``row[0]`` is the book page URL,
+            ``row[1]`` is written as the sixth CSV column and ``row[2]`` is
+            stored in ``Recording.data`` once the book has been saved
+        :return: 1 when the page cannot be fetched or has no title/author,
+            otherwise None
+        """
+        url = row[0]
+        time.sleep(sleeptime)
+        print(f"正在解析：{url}")
+        try:
+            html = self._fetch(url)
+        except requests.RequestException as err:
+            logger.error(f"[{self._timestamp()}]REQUEST FAILED:{url} ({err})")
+            return 1
+        # Title and author are mandatory: without them the page is skipped.
+        name = self._first_match('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)
+        author = self._first_match('<meta name="keywords" content=".*?,(.*?),.*?', html)
+        if name is None or author is None:
+            logger.error(f"[{self._timestamp()}]UNKNOWN URL:{url}")
+            return 1
+        time_temp = self._optional_field(
+            '<span class="pl">出版年:</span> (.*?)<br/>.*?', html, 'time', url,
+            f'《{name}》未发现出版时间！')
+        price = self._optional_field(
+            '<span class="pl">定价:</span> (.*?)<br/>.*?', html, 'price', url,
+            f'《{name}》未发现定价！')
+        score = self._optional_field(
+            '<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', html, 'score', url,
+            f'《{name}》未发现评分！')
+        intro = self._optional_field(
+            '内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html, 'intro', url,
+            f'《{name}》未发现简介！', default=None)
+        if intro is None:
+            intro = '无'
+        else:
+            # Drop all whitespace, then turn paragraph tags into single spaces.
+            intro = re.sub(r'\s', '', intro).replace('<p>', '').replace('</p>', ' ')
+        data = [name, author, time_temp, price, score, row[1], intro]
+        print(f'正在保存：{name}。')
+        self.save_csv(data)
+        # Remember the last saved row in the progress record (id=1), creating
+        # the record when it does not exist yet instead of failing on None.
+        rec = self.session.query(Recording).filter_by(id=1).scalar()
+        if rec is None:
+            self.session.add(Recording(id=1, data=row[2]))
+        else:
+            rec.data = row[2]
+        self.session.commit()
+
+    @staticmethod
+    def save_csv(data):
+        """Append one row to ``results.csv``.
+
+        :param data: sequence of field values forming the row
+        :return: None
+        """
+        # newline='' hands line-ending control to the csv module; without it
+        # every row is followed by a blank line on Windows.
+        with open('results.csv', 'a', encoding='utf-8', newline='') as f:
+            writer = csv.writer(f)
+            writer.writerow(data)
--- a/DoubanSpider/init.py
+++ b/DoubanSpider/init.py
@ -6,22 +6,27 @@ import os
 try:
    from sqlalchemy import create_engine, Column, Integer, String
    from sqlalchemy.ext.declarative import declarative_base
-    import requests
+    import random
    import re
    import time
    import csv
+    import requests
+    import time
    import sqlite3
    import logging
-    import random
 except:
    print('[System]正在安装支持库...')
-    os.system('pip install SQLAlchemy')
-    os.system('pip install sqlite')
-    os.system('pip install csv')
-    os.system('pip install requests')
-    os.system('pip install logging')
-    import requests
-    import csv
-    import logging
+    os.system(r'pip install -r .\DoubanSpider\requirements.txt')
    from sqlalchemy import create_engine, Column, Integer, String
    from sqlalchemy.ext.declarative import declarative_base
+    import random
+    import re
+    import time
+    import csv
+    import requests
+    import time
+    import sqlite3
+    import logging
+
+finally:
+    print('[System]运行库加载完毕！')
--- a/DoubanSpider/db.py
+++ b/DoubanSpider/db.py
@ -16,6 +16,12 @@ class Douban(Base):
        return "<Douban(id='%d', tag='%s',url='%s')>" % (self.id, self.tag, self.url)


+class Recording(Base):
+    """Table that stores the crawler's progress marker.
+
+    Elsewhere in this commit a single row with ``id=1`` is created with
+    ``data=0``, and ``data`` is then overwritten with the id of the last
+    ``Douban`` row whose book page was saved.
+    """
+    __tablename__ = 'Recording'
+    # Primary key of the record.
+    id = Column(Integer, primary_key=True)
+    # Integer marker value; declared unique and NOT NULL.
+    data = Column(Integer, unique=True, nullable=False)
+
+
 if os.path.isfile('douban.db') is False:
    print('正在创建数据库...')
    Base.metadata.create_all()
--- a/DoubanSpider/douban.db
+++ b/DoubanSpider/douban.db
--- a/DoubanSpider/requirements.txt
+++ b/DoubanSpider/requirements.txt
@ -0,0 +1,8 @@
+certifi==2019.6.16
+chardet==3.0.4
+idna==2.8
+requests==2.22.0
+SQLAlchemy==1.3.8
+sqlit==0.1.6
+urllib3==1.25.3
+wincertstore==0.2
--- a/Spider.py
+++ b/Spider.py
@ -1,134 +0,0 @@
-from DoubanSpider import *
-from DoubanSpider.db import Douban, engine
-from sqlalchemy.orm import sessionmaker
-
-
-class DoubanBook(object):
-    def __init__(self):
-        self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
-        self.base_url = 'https://book.douban.com/tag/{}'  # ?start={}&type=T
-        self.session = sessionmaker(engine)()
-        self.headers = {
-            'Referer': 'https://www.baidu.com/',
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
-                          'Chrome/76.0.3809.132 Safari/537.36 '
-        }
-        self.log = logging.basicConfig(filename='papa.log',
-                                       filemode='a',
-                                       format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING)
-
-    def get_url(self, tag_name):
-        for num in range(0, 10000, 20):
-            time.sleep(sleeptime)
-            url = self.base_url.format(tag_name) + f'?start={num}&type=T'
-            print(f'正在获取 TAG：<{tag_name}> 书籍信息', num)
-            response = requests.get(url, headers=self.headers)
-            html = response.content.decode()
-            books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
-            if not books_url:
-                break
-            for i in books_url:
-                try:
-                    self.session.add(Douban(tag=tag_name, url=i))
-                    self.session.commit()
-                except:
-                    self.session.rollback()
-
-    def get_tags(self):
-        print('[SQL]未发现TAGS数据！')
-        print('[Spider]正在准备TAG数据，这需要一定时间.....')
-        do_not_get_all = input('[Spider]请选择运行模式：\n1.获取所有TAG（需要大量时间）\n2.获取单一TAG\n请输入对应数字，回车确定\n')
-        if do_not_get_all == '1':
-            response = requests.get(self.main_url, headers=self.headers)
-            html = response.content.decode()
-            tags = re.findall('.*?<a href="/tag/(.*?)">.*?</a>.*?', html)
-            # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
-            #     executor.map(self.get_url, [i for i in tags])
-            for i in tags:
-                print(f'[Spider]正在获取<{i}>链接数据.....')
-                time.sleep(0.5)
-                self.get_url(i)
-        elif do_not_get_all == '2':
-            user_tag = input('请输入标签：')
-            self.get_url(user_tag)
-            self.main()
-        else:
-            print("[Spider]输入有误，请重新输入！")
-            self.get_tags()
-        self.get_data()
-
-    # def get_books_url(self, urls, tag_name):
-    #     response = requests.get(url, headers=self.headers)
-    #     html = response.content.decode()
-    #     books_url = re.findall('.*?<a class="nbg" href="(.*?)".*?', html)
-    #     self.get_data(books_url, tag_name)
-
-    def get_data(self):
-        for row in self.session.query(Douban.url, Douban.tag).all():
-            time.sleep(sleeptime)
-            print(f"正在解析：{row[0]}")
-            response = requests.get(row[0], headers=self.headers)
-            html = response.content.decode()
-            try:
-                name = re.findall('.*?<span property="v:itemreviewed">(.*?)</span>.*?', html)[0]
-                author = re.findall('<meta name="keywords" content=".*?,(.*?),.*?', html)[0]
-            except:
-                logger.error(
-                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]UNKNOWN URL:{row[0]}")
-                continue
-            try:
-                time_temp = re.findall('<span class="pl">出版年:</span> (.*?)<br/>.*?', html)[0]
-            except:
-                print(f'《{name}》未发现出版时间！')
-                time_temp = 'N/A'
-                logger.warning(
-                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <time>:{row[0]}")
-            try:
-                price = re.findall('<span class="pl">定价:</span> (.*?)<br/>.*?', html)[0]
-            except:
-                logger.warning(
-                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <price>:{row[0]}")
-                print(f'《{name}》未发现定价！')
-                price = 'N/A'
-            try:
-                score = re.findall('<strong class="ll rating_num " property="v:average">(.*?)</strong>.*?', html)[0]
-            except:
-                logger.warning(
-                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <score>:{row[0]}")
-                print(f'《{name}》未发现评分！')
-                score = 'N/A'
-            try:
-                intro = re.findall('内容简介[\\s\\S]*?<div class="intro">([\\s\\S]*?)</div>', html)[0]
-                intro = (re.sub('\s', '', intro)).replace('<p>', '').replace('</p>', ' ')
-            except:
-                logger.warning(
-                    f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE <intro>:{row[0]}")
-                print(f'《{name}》未发现简介！')
-                intro = '无'
-            data = [name, author, time_temp, price, score, row[1], intro]
-            print(f'正在保存：{name}。')
-            self.save_csv(data)
-
-    def save_csv(self, data):
-        with open('results.csv', 'a', encoding='utf-8') as f:
-            writer = csv.writer(f)
-            writer.writerow(data)
-
-    def main(self):
-        n = self.session.query(Douban.url, Douban.tag).all()
-        if not n:
-            self.get_tags()
-        else:
-            print('[Spider]检测到现有TAG数据，开始抓取...')
-            self.get_data()
-
-
-if __name__ == '__main__':
-    logger = logging.getLogger("PAPA")
-    sleeptime = random.randint(0,3)
-    with open("results.csv", "a", encoding='utf-8') as f:
-        writer = csv.writer(f)
-        writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
-        writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"])
-    douban = DoubanBook()
-    douban.main()
--- a/douban.db
+++ b/douban.db
--- a/main.py
+++ b/main.py
@ -0,0 +1,71 @@
+import csv
+import time
+from cmd import Cmd
+
+from DoubanSpider.db import Douban, Recording
+from DoubanSpider.Spider import DoubanBook
+
+
+class SpiderMain(Cmd):
+    """Interactive command shell that drives the Douban book crawler.
+
+    Commands: ``start``, ``tag TAG``, ``tag all``, ``quit`` and ``help``.
+    """
+
+    intro = '豆瓣图书爬虫V2.0 ---------- 输入help获取帮助。'
+
+    def do_help(self, arg):
+        """Print the list of supported commands (``arg`` is ignored)."""
+        print('[Help] start  - 开始爬取任务，默认从上次结束的地方开始')
+        print('[Help] tag TAG - 添加/爬取 标签下的书籍，TAG是你需要添加的标签')
+        print('[Help] tag all - 爬取所有标签下的书籍')
+        print('[Help] quit  - 退出程序')
+
+    def emptyline(self):
+        """Do nothing on an empty input line.
+
+        Cmd's default repeats the last command, which here would silently
+        restart a long crawl when Enter is pressed by accident.
+        """
+
+    def do_start(self, arg):
+        """Scrape every pending book page yielded by ``url_pool()``."""
+        for row in url_pool():
+            douban.get_data(row)
+        print('爬取结束！')
+
+    def do_tag(self, arg):
+        """Collect book URLs for one tag, or for every tag when ``arg`` is "all"."""
+        if not arg:
+            # An empty argument would otherwise be passed on as the tag name.
+            print('[Spider]请输入标签名称，输入help获取帮助。')
+            return
+        if arg == "all":
+            print("[WAR]请注意，在没有代理池的情况下，此操作通常无法完成！")
+            douban.get_tags()
+        else:
+            print(f"[Spider]开始获取{arg}标签下的所有书籍，这需要一定时间！")
+            douban.get_url(arg)
+        print('[Spider]标签下所有书籍信息爬取完成！请输入start开始抓取数据！')
+
+    def do_quit(self, arg):
+        """Leave the shell.
+
+        Returning a true value makes ``cmdloop()`` stop; this replaces the
+        ``exit()`` helper, which only exists when the ``site`` module is loaded.
+        """
+        return True
+
+    def main(self):
+        """Run the interactive command loop until ``quit`` is entered."""
+        self.cmdloop()
+
+
+def url_pool():
+    """Yield the stored book rows that still have to be scraped.
+
+    Each yielded row is ``(Douban.url, Douban.tag, Douban.id)``. Only rows whose
+    id is greater than the value saved in ``Recording.data`` are yielded, in
+    ascending id order. When no ``Douban`` row exists a hint is printed and
+    nothing is yielded.
+    """
+    # Check the database at call time. The module-level ``n`` is a snapshot
+    # taken once at start-up, so it is stale after a ``tag`` command has
+    # stored new URLs during the same session.
+    if douban.session.query(Douban.id).first() is None:
+        print('[Spider]你需要先获取tag数据!')
+        return
+    # Read the progress marker once instead of once per row; treat a missing
+    # record as "nothing scraped yet".
+    ago = douban.session.query(Recording.data).first()
+    last_id = ago[0] if ago else 0
+    # Filter and order in SQL so progress always moves forward through the ids.
+    pending = (douban.session.query(Douban.url, Douban.tag, Douban.id)
+               .filter(Douban.id > last_id)
+               .order_by(Douban.id)
+               .all())
+    yield from pending
+
+
+if __name__ == '__main__':
+    # Start a new section in the results file: a timestamp row, then the header.
+    # newline='' hands line-ending control to the csv module; without it every
+    # row is followed by a blank line on Windows.
+    with open("results.csv", "a", encoding='utf-8', newline='') as f:
+        writer = csv.writer(f)
+        # writerow() expects a sequence of fields. A bare string is split into
+        # one field per character, so the timestamp is wrapped in a list.
+        writer.writerow([time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())])
+        writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"])
+    spider = SpiderMain()
+    douban = DoubanBook()
+    # Make sure the progress record (id=1) exists before any crawling starts.
+    rec = douban.session.query(Recording.id).all()
+    if not rec:
+        douban.session.add(Recording(id=1, data=0))
+        douban.session.commit()
+    # Snapshot of the stored book URLs, used to pick the start-up hint below.
+    n = douban.session.query(Douban.url, Douban.tag).all()
+    if not n:
+        print('未检测到任何数据，请使用 tag 关键字获取标签数据，输入help获取帮助。')
+    else:
+        print('检测到现有TAG数据，输入start直接开始抓取...')
+    spider.main()