diff --git a/.gitignore b/.gitignore index d06298a..9ee2a04 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,4 @@ dmypy.json /results.csv .idea/dataSources.local.xml .gitignore +.vscode/launch.json diff --git a/Spider.py b/DoubanSpider/Spider.py similarity index 93% rename from Spider.py rename to DoubanSpider/Spider.py index d641f08..5517c12 100644 --- a/Spider.py +++ b/DoubanSpider/Spider.py @@ -1,5 +1,5 @@ from DoubanSpider import * -from DoubanSpider.db import Douban, engine +from DoubanSpider.db import Douban, engine, Recording from sqlalchemy.orm import sessionmaker @@ -51,7 +51,6 @@ class DoubanBook(object): elif do_not_get_all == '2': user_tag = input('请输入标签:') self.get_url(user_tag) - self.main() else: print("[Spider]输入有误,请重新输入!") self.get_tags() @@ -64,7 +63,7 @@ class DoubanBook(object): # self.get_data(books_url, tag_name) def get_data(self): - for row in self.session.query(Douban.url, Douban.tag).all(): + for row in self.session.query(Douban.url, Douban.tag, Douban.id).all(): time.sleep(sleeptime) print(f"正在解析:{row[0]}") response = requests.get(row[0], headers=self.headers) @@ -115,6 +114,10 @@ class DoubanBook(object): writer.writerow(data) def main(self): + rec = self.session.query(Recording.id).all() + if not rec: + self.session.add(Recording(id=1, data=1)) + self.session.commit() n = self.session.query(Douban.url, Douban.tag).all() if not n: self.get_tags() @@ -123,9 +126,14 @@ class DoubanBook(object): self.get_data() +def url_pool(): + for row in douban.session.query(Douban.url, Douban.tag).all(): + yield row + + if __name__ == '__main__': logger = logging.getLogger("PAPA") - sleeptime = random.randint(0,3) + sleeptime = random.randint(0, 3) with open("results.csv", "a", encoding='utf-8') as f: writer = csv.writer(f) writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) diff --git a/DoubanSpider/db.py b/DoubanSpider/db.py index 695be2c..6914814 100644 --- a/DoubanSpider/db.py +++ b/DoubanSpider/db.py @@ -15,6 +15,10 @@ class Douban(Base): def __repr__(self): return "" % (self.id, self.tag, self.url) +class Recording(Base): + __tablename__ = 'Recording' + id = Column(Integer, primary_key=True) + data = Column(Integer, unique=True, nullable=False) if os.path.isfile('douban.db') is False: print('正在创建数据库...') diff --git a/douban.db b/douban.db deleted file mode 100644 index 9a1cbc1..0000000 Binary files a/douban.db and /dev/null differ diff --git a/main.py b/main.py new file mode 100644 index 0000000..6eca3b4 --- /dev/null +++ b/main.py @@ -0,0 +1,16 @@ +import csv +import logging +import random +import time + +from DoubanSpider.Spider import DoubanBook + +if __name__ == '__main__': + logger = logging.getLogger("PAPA") + sleeptime = random.randint(0, 3) + with open("results.csv", "a", encoding='utf-8') as f: + writer = csv.writer(f) + writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) + writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"]) + douban = DoubanBook() + douban.main()