from DoubanSpider import * from DoubanSpider.db import Douban, engine, Recording from sqlalchemy.orm import sessionmaker logger = logging.getLogger("PAPA") sleeptime = random.randint(0, 3) class DoubanBook(object): def __init__(self): self.main_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all' self.base_url = 'https://book.douban.com/tag/{}' # ?start={}&type=T self.session = sessionmaker(engine)() self.headers = { 'Referer': 'https://www.baidu.com/', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/76.0.3809.132 Safari/537.36 ' } self.log = logging.basicConfig(filename='papa.log', filemode='a', format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING) def get_url(self, tag_name): """ :param tag_name: 字符串格式 TAG名称 :return: """ for num in range(0, 10000, 20): time.sleep(sleeptime) url = self.base_url.format(tag_name) + f'?start={num}&type=T' print(f'正在获取 TAG:<{tag_name}> 书籍信息', num) response = requests.get(url, headers=self.headers) html = response.content.decode() books_url = re.findall('.*?.*?.*?', html) # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: # executor.map(self.get_url, [i for i in tags]) for i in tags: print(f'[Spider]正在获取<{i}>链接数据.....') time.sleep(0.5) self.get_url(i) # def get_books_url(self, urls, tag_name): # response = requests.get(url, headers=self.headers) # html = response.content.decode() # books_url = re.findall('.*?(.*?).*?', html)[0] author = re.findall('出版年: (.*?)
.*?', html)[0] except: print(f'《{name}》未发现出版时间!') time_temp = 'N/A' logger.warning( f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE