diff --git a/.gitignore b/.gitignore index d06298a..9ee2a04 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,4 @@ dmypy.json /results.csv .idea/dataSources.local.xml .gitignore +.vscode/launch.json diff --git a/DoubanSpider/Spider.py b/DoubanSpider/Spider.py new file mode 100644 index 0000000..d127097 --- /dev/null +++ b/DoubanSpider/Spider.py @@ -0,0 +1,122 @@ +from DoubanSpider import * +from DoubanSpider.db import Douban, engine, Recording +from sqlalchemy.orm import sessionmaker + +logger = logging.getLogger("PAPA") +sleeptime = random.randint(0, 3) + + +class DoubanBook(object): + def __init__(self): + self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all' + self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T + self.session = sessionmaker(engine)() + self.headers = { + 'Referer': 'https://www.baidu.com/', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/76.0.3809.132 Safari/537.36 ' + } + self.log = logging.basicConfig(filename='papa.log', + filemode='a', + format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING) + + def get_url(self, tag_name): + """ + + :param tag_name: 字符串格式 TAG名称 + :return: + """ + for num in range(0, 10000, 20): + time.sleep(sleeptime) + url = self.base_url.format(tag_name) + f'?start={num}&type=T' + print(f'正在获取 TAG:<{tag_name}> 书籍信息', num) + response = requests.get(url, headers=self.headers) + html = response.content.decode() + books_url = re.findall('.*?.*?.*?', html) + # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + # executor.map(self.get_url, [i for i in tags]) + for i in tags: + print(f'[Spider]正在获取<{i}>链接数据.....') + time.sleep(0.5) + self.get_url(i) + + # def get_books_url(self, urls, tag_name): + # response = requests.get(url, headers=self.headers) + # html = response.content.decode() + # books_url = re.findall('.*?(.*?).*?', html)[0] + author = re.findall('出版年: (.*?)
.*?', html)[0] + except: + print(f'《{name}》未发现出版时间!') + time_temp = 'N/A' + logger.warning( + f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE
.*?.*?', html) - # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: - # executor.map(self.get_url, [i for i in tags]) - for i in tags: - print(f'[Spider]正在获取<{i}>链接数据.....') - time.sleep(0.5) - self.get_url(i) - elif do_not_get_all == '2': - user_tag = input('请输入标签:') - self.get_url(user_tag) - self.main() - else: - print("[Spider]输入有误,请重新输入!") - self.get_tags() - self.get_data() - - # def get_books_url(self, urls, tag_name): - # response = requests.get(url, headers=self.headers) - # html = response.content.decode() - # books_url = re.findall('.*?(.*?).*?', html)[0] - author = re.findall('出版年: (.*?)
.*?', html)[0] - except: - print(f'《{name}》未发现出版时间!') - time_temp = 'N/A' - logger.warning( - f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE