From c33ad58d34b713a574ad41773170ef05fa80fbfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B2=9B=E9=A3=8E?= Date: Sun, 8 Sep 2019 22:41:58 +0800 Subject: [PATCH 1/6] coding rebuild --- .gitignore | 1 + Spider.py => DoubanSpider/Spider.py | 16 ++++++++++++---- DoubanSpider/db.py | 4 ++++ douban.db | Bin 122880 -> 0 bytes main.py | 16 ++++++++++++++++ 5 files changed, 33 insertions(+), 4 deletions(-) rename Spider.py => DoubanSpider/Spider.py (93%) delete mode 100644 douban.db create mode 100644 main.py diff --git a/.gitignore b/.gitignore index d06298a..9ee2a04 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,4 @@ dmypy.json /results.csv .idea/dataSources.local.xml .gitignore +.vscode/launch.json diff --git a/Spider.py b/DoubanSpider/Spider.py similarity index 93% rename from Spider.py rename to DoubanSpider/Spider.py index d641f08..5517c12 100644 --- a/Spider.py +++ b/DoubanSpider/Spider.py @@ -1,5 +1,5 @@ from DoubanSpider import * -from DoubanSpider.db import Douban, engine +from DoubanSpider.db import Douban, engine, Recording from sqlalchemy.orm import sessionmaker @@ -51,7 +51,6 @@ class DoubanBook(object): elif do_not_get_all == '2': user_tag = input('请输入标签:') self.get_url(user_tag) - self.main() else: print("[Spider]输入有误,请重新输入!") self.get_tags() @@ -64,7 +63,7 @@ class DoubanBook(object): # self.get_data(books_url, tag_name) def get_data(self): - for row in self.session.query(Douban.url, Douban.tag).all(): + for row in self.session.query(Douban.url, Douban.tag, Douban.id).all(): time.sleep(sleeptime) print(f"正在解析:{row[0]}") response = requests.get(row[0], headers=self.headers) @@ -115,6 +114,10 @@ class DoubanBook(object): writer.writerow(data) def main(self): + rec = self.session.query(Recording.id).all() + if not rec: + self.session.add(Recording(id=1, data=1)) + self.session.commit() n = self.session.query(Douban.url, Douban.tag).all() if not n: self.get_tags() @@ -123,9 +126,14 @@ class DoubanBook(object): self.get_data() +def url_pool(): + for row in douban.session.query(Douban.url, Douban.tag).all(): + yield row + + if __name__ == '__main__': logger = logging.getLogger("PAPA") - sleeptime = random.randint(0,3) + sleeptime = random.randint(0, 3) with open("results.csv", "a", encoding='utf-8') as f: writer = csv.writer(f) writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) diff --git a/DoubanSpider/db.py b/DoubanSpider/db.py index 695be2c..6914814 100644 --- a/DoubanSpider/db.py +++ b/DoubanSpider/db.py @@ -15,6 +15,10 @@ class Douban(Base): def __repr__(self): return "" % (self.id, self.tag, self.url) +class Recording(Base): + __tablename__ = 'Recording' + id = Column(Integer, primary_key=True) + data = Column(Integer, unique=True, nullable=False) if os.path.isfile('douban.db') is False: print('正在创建数据库...') diff --git a/douban.db b/douban.db deleted file mode 100644 index 9a1cbc180a4d671c6f60ad4f2254d49d01a687d2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 122880 zcmeIb2ed3zwfDc*4 z&3og?r=57%{CSDDnCJW6X7lEGUU}N{yjA$$++RL_lbQPm{@?z2A%8K(2E9VH{ueK( zukY2@uRpjDg+>2)(F!bDfki8@XayFnz@imcv;vD(V9^RJT7g9?uxJGqt-$~J6&S4O z=a*f6d4IV0{KF1E;i#=nK4Xi+PMZ7~Y`MqQoA0&tyuCKxVyCU=tv>metIu1zR6O>` zdE4)@*Vfx^y~n&=cHL{ZFHq1NxI#Bo8Ppx zt`s*DCfIC5$yN?ZonFFEYDLMGx>C|=wR-IqziA5xrKHtqwiABS=DJecZMJzBJp-(x$pn(r7iioeoNy=t^<75qEi6QL?dv62GaD^mtDj=}Nq( zcB|9iJ#FZql(dp&(qJ3)9h4f)ZmZcyqPU0BLE4ww?8kAx%j~)i58`W2;tuQ3aZ=*T zZSj-Zx>DTfChbPE6U8n4C$aImNsphDxIC!e;O+8Z#JW<_NE&hamYOa~tQ<=oB@HK~ zq{W7alE_J^&u-;4ZJ;ahvB#Z`oki>GN-RXuj9Yw7>$xb!t!}&7jpB7t61Qus*Xr1z z7r#!|X(sJn&#swu945zC+w8aEC|TPJCattJ`Te_ySjRP-5ftx_0)@ zb5Y{-iuQUtE0){6?7%Gc*2>7QeIbLx5mBLws=lg;z_M$n>7pcbm>e6AuY%Jncoijl;ALjI{JG-vN&LCW^qTy+()d9B zTzb4De=aw*1Ao@0EJ45gsK=kn4o~LKm4~n8&-~yZ{;Us{;?K+uLG9aKrP{MFz*4nr z{n&b^{(SszT7CV_=m9D8@TYhtS{qS6WugVg+{c`L2!{F}hM{Dn@J?A}`iHpA}J(9_kzfky8 zc7E>W(piN)Du1p%?yr?QlexaqU+ag5_(B|@jn@}Pdn?j$yWflBb{y@sUU<+#$L6-` zu}`8sm7&>(jX1`t+C!9%+wFck+Fg<6yTxDUW8Y1Y##H0nYWJgEHR%}lrQe8lQI@2M zslde_?W_pLwkDl+w39NAxZQ2^JH2+aqaw}7lkm?Diu8iG&-Rk^T)<~LNqR2Qv#lb% zAjGqcBE2BCv$Z5W7u4BGkzNqZ*;1083*&5|NH2)rY%WR91#ULeq-P^GuUDiOglaZb zq!+|!Hj$*~f-@T{(hH(88%ff0;g}62>A5&eU->NO0x&&k=()&CSCpQFTsn&Mf>=vi znf6?erKJr$7hO@L7lc(}O?oz-(iEj9fs}?QJ&B-1iu8ie$p(t_f|$wrn)Gb2WIg5O z%tcAoRfOllBd?R@F&7tEN0Ockh^(zhFNlP!rARLbfxK3cUJ(0OQ(BVQpvM}@&@S; z8fEDzT_UHgq#JpksiTu!KRjrmKc?A=yOx{_mq%ksMI-6Qc0onK#rBXTB$Z|}?h%6u z-)t%?NWZ1iZ*~YuM*a`$4|&q9{O?_6PpK+TF-IxJ>4>~fqEb*)@KR!82i+(<6_uRw z^Wz4|mtGusccW319@0tr%&i>-JE0=|`hS1FRaAEHZuiLbzo&j!ef`>twR>up*7mQh zS^Zu0_UcS^kLt>mU;LlR|1-5$`)g7De^LHF1$Qj)9~b5S7v=xw&_0Xu|MSVeS80|`iiyR)b6TXRy(W) z{-yez>c^{RS9h$gRQXNi&dOz#BPy**w*1}l$IIuGcPOt^`fcg1(q*N?N^vPH{#WtZ z;#vM>#qEnL7JgH>tMHD(5ruXkpMNa>$>6MD`(VZV1^Hd`tL6TfyVw6s?uy*8xs7tw z?2oAo7-jd)u9bN)^Nq}hGN)#?%q$%~6Miv#Yj{u?g?{j8aIJr&-}Q^$4__@H@E`xp ze_&zZITF<1x#P&+ONq@)qzN;9U5Ot~q#Mzh^3cG0m4HtZz*WBk^ep_TG&JGY7WnS) z%aU~5?6iUJgkMsnTm60%-l0n~liNjU+}TDHepr%j0n{J_7?wn7@|w-a`@1IHqzt4J z1=@O(xahMc!QsH*=xQMv<2H44ZJ zD>Jcr2IhUC^ySO~qRIFvL$@htVdzJ-p(jApm9(MBp|pCDFaE})8+TbhfHcaqlMVou zC^%n~ZnnGZZW5Ks*l{2Dbrc*VNykLbsR#=GsY!zd2gw^et4X){aN3b4O0&7a%=rLV zgkKrQfOZ-C8BLmVt-&`;I#C(hq}fzA>(WUSkbG3f=2zaUOY<&2FG@E^;6~wDvNVXb z;1O9G{ClWfXfc&OcI%L%mevI6P86P@OLPCt*QJxl`>`h7X!StX`UOcE=vYrk_+t+a+XHZ{TAa6a%5;dJ3= z;?+gLTg1WRJ^+Bo`!7)(#0NLI|0!)Qtv*F#QScFM@D@OR+^|sGimjGeh>`zsZ7vl5 z*;xdTLs>=4p;kZgej`d_y>}yje@&VXnyMH7&!Th(S0eJ)6Qx_A!lF>a)Xi2jWX=EI zDcHx$?3P(Qd_KHCd|!BcxM{dp@U!6a^+)R0)X%JMS6`v_Z0$}W0*BXHwOsZ4)$6P0 zS9ht-tNfvIPvwfrv6YP~)nK3UkIOfeN9DcCYn5IsJy^ODc7V-GOBSCh-cr1{ctCN3 zqE~pNa82RN!ghrf^3Ue)%D*FjM82KR=N`*_GIv34m)xqsIQ#qT-Py~tM`!!lQs#%5 zPiHO+R`)*ZKkwh?zt=y`-`KBtKjGhOurRlt`W@i%II$n{yzoN{mj-Zy3l{uJS!hZ$ zIKp`?V4D}dol@x-V{da&n(7pc>mQ2JjYg9X{;QgFrwht3@_r^tvx1<5uJ^p)z=f}1 zr;V>31=r1vtvsUHGGG)R{ZFJbq7T(Uql@@ww81T~gK7IwUHX4f7|46KIBlw$iAUWk z-92-~7~0TQn8pD<5~T^ha5;QZl;-N;%KV-3;tdc5DFmDA31x7~S+RlmgrsZDq#M+p zML|=WOVW#zHlCgolVWr@9GLmt%KM2M)Ef~uI!gPQO+01h{aa<;AeSjZ?y}?(i2C0^ zXK`@s4s77yFRD0I-c0=k<%i;>^w=#?K$*(IkK1Oy!>NU7sNorjqCx@^`Tx+yZZ|k@ zBma0!I)NGj%itbOn%XhjwL?+5+a)d)yh+(tFDc+q7LX;xpb7TSCerLsVjBfinh9g~ zy40u>%GgwtZsDOuJ~&`$>?H0n_E$t{>@b3tt7*~{o7o-yD`~lVT!j>-ovsaSpC*Qi z_9MIfCg)Y;eN!5{+aO{C`=TffGY3As7PfA+xd3rT->;3`?s1hw!CsS%h&H%Zm*(wUqf0aIt5xX^2b+JPF3k$wrb}~Mc6Dh& ze|sDLe|zsn!~eg#en@?t+Vgn&Z?5f8TdDd?^(OrMZK{h^eo(o#atdC4w*0N~`^ra_ zHz@t3ba&}(r8ku36@OK{xp)y7|D_5)DqL4My|8Jakbg9PW&W6aocnw3zT9QG19EF- zpUZxc9RJSQ~}SxM&ns~=vr1uN&FrnkiqU~i3`uWf-(?}m%(nj5;R?1yHk?EuKN72viMn? z|H)t%U5hCFBy$uGc6ONHtSuT3cG8vFeasjVj>*7CJZ0`qt8+0bm4O4}CvO;c1IIBL zxKMuL4)U&IUY2owif@=uW6$#K$-ss4vrCxST&-l_K>8VDeZe?rABhX=r_t-!)sqaI zXg^8eG}7(l!220@+TGMUjR!{jDK6n$*;_JjA^s$3#N+HMc3}Rb_zfG0cbp7ds6U}v zWl!0Da^U_X2H9=!%Gggv{wXd7S5V5xk_;TkKRKMZ?!ceK17rUb9|d!2>JTLZ7xqsc z)#>vc#{;APR3F8WW~OXBaG?JrrQN28l2v!$|KvTz*ogL#I1zxR^>p^_8w05Rt5TCY z85jeoc%&^T@l6J99H5rrvzx?7K*b4?ftU~j4qQk;Co~FnI61L^LL~${5L#|DpzXM6 zOMqoQ|;{?SmA)Xiyj1*K{3o_&&wr%4YDX6#w`B_xo zLtqvUj22XURMP3RtpYz8xX^+cev8;KTE+{iPH@6FY1`eHK~*g*3>d%dE_UGt#T{%x z!o*@2IjFiA4N_G`$v1Ey2W8zMi#5v5fde}zpKzm3+zBlQdQjMfj2F}HRpSR0e^v@k z2X9{TPw=+#ZuI@S_mjdq!RxmQx%~I?pU9t=-#Nc(?)SN`=H8uMGk0{ZpDSmd$lj10 zWcTzpeHk*~OSMaCht!(2Fs&6hhgyM^E6-KFQh8VAgvwEsZsu#5_g6~g$IG87UsT?$ zyjtlGrLUGQ_iij5RqB-T#m9=*XZ}sdw8sN(&81`#zbfU;@OEBLrinmjK5dW>3IB%)J@g6P# zH^xQ_T^VwQP?(aM`jLX3{5N-_G#4VFJ!>K0ud0eSjSJdeP80_Q4GoI_HBr0;@`uph zjjDL74Iy*5ttf5;D};T?JS?0slXWx$vK1XK4&G>Yd8fgF|0%9SCSy$F4jVMo5HPrL zR#p&7sSC;#$UidnJ^wkk|7;n8sR)D-z)bjgY2u9zrw&uONt0$x$@h9v<~aosEdnUL ztfR}Eo$Ot&t@j)tOqG5aoat42%PG@1Hg=PEFq<60R^?aX4;pH3TS>avrA-9J`a_y@3p)|4F?1tLyXg zU#@RmU#9jI{SVL6-yt|9_nq9w za%bhX%dL?8b@q6xw4+W%X_w}c0T>jnP^9(qL>z@mStuYmNG z;GZ@cc0~SM6SpKO2!!x4NgS(`+96!2hedH3oLJ@mHzaZLLj?AK>1p0I(L7RloO~KW z9Pt#(;C)w`3;{+8^Z{&Lu8PNLWI$u^Uch~?4o*!NM)ilJQ+okHy*xOqtC9Z+QJkb4 zSa|PwRXm{+WE4onA#n%c9)OduqK;1KI9JH-qIkR8vVw&bRq?oQ(9ZQmaU%4?3ZON#9Z}^3(w!OMO!m9wg3%-3xFgI7<_c`_x+? z-WSDtbiKq8zfTk=dterjREtYg)>0JsZF{Jr17rXpy`v~TN!)qYN#aChIq_KcFN)%H zL$JV?=I7BQhiE0@9~DRELnYE0N~NI|-v0kVfLmG_@~W^-a0h=-7RMor!uN~fi^Aux z)Eb%qH>t$`4}{NOAw*3ZIIcYor8T52FcIh4dyOtmpW5IKQ5;{Ds7CONC=PhbPRPSV zaZFT3_a#ON(p)Y2GI2}pB92b-)vC<46UAZjAXep{tBS(|&rkiFDo%8R(Qi`40qJ2a zcV%%>EK%5z#jT;OcbO*Mg>lOE%x0=M);k3Y;#=)<#Z%bgzd;<`DVI8jD)6){vx(CY zAo4C1CqtUR_RP0L@iw(bd`;Jj;y5Wpxcw_6yNz#&7G2%w#8PD+@c-ZND{GfuEI&~G zK>397rsaC+C#9Q8!_uCmHHt3;rw3aV?<-#6zqxo!v0p3|epvWaVVA-M`9I{pn!h~s z@<-)6we|e{tdiezs=s2z0CVY_ONV{&1All`B>(x%yyX- z!e57Xgm3rP_5L0_9DFp43ai#1u79*Xe=hP+ySR41E9HY<-RYGU${?d;dZX|LEqpWw zQ1Lz}E$mze)uqJASUn$R-&Zwpa0)T7mv9G796A`9az|l96{mC&KV%(I9KV%1rtln1 z94`cvUlgvXic=5E=qqdDpblv@68Y~?#nVay?>m~fRU}%P`F)x=JPSnCqi`)v+yWBT zRpoq9yid0`zVx+KacbN#5%w3wd)V?EJ5P(-9m+twqC?fe;VU2j`ZH0SpglKL@C8vE zH^eflT7r_o8!9s}hTf=-jyMK|mWgF_^0xix_2S?h#+dT{0jhWkd-uK~i?iJ${~}eq znOG(8?V>oCWFx6+s^Te9)f2-;ydE1$_g`yu4Z=Q;@I#i>(4MKa!dS``OG zz?rm(Do!|psef4%hv?7D=iPL1qf9${z3`CYE8VR}oP1(*`Wc5$I;zak18ZY<6Kjzc z1!rjDxFK+!Mxm#PQ*KO2Nfe0p3?1#;Rvh|wZS>h<6;Hf@5l>q{y&#T`5od12&YCzr z3;FxVmxweL06)F=h@&TPV_-asyPZB%1W6-bd`+o1=c9VRPiVG$&2zPIdm}nY9K1=T5!cd#syOTi>@YEAG+WW--J_09xEWKBHKTg8khj)!hJ(5u9Sq0AH5LR%d?C7tpADvMio zEj+~X|9g0U@$l$Z&Hpa{mHa#NM`Y&bTlsA6d;U0gUGALR4!ISx&t|`ry(D``wvqKS zk7TaKTL&pn-=@B7?H9E#*4|P(ptfH1AJvDdS5;4`ZeCro^7G2gmFdbpX;k6C@(0T& zW|ovbYr~hT=)zhhoi$`L*3iEqwwTY!uAX zCPR;lmJw$yrilYgYEc;+_^LRq>~TUh4}@YYD#Zzlh;yH<-SVy$CqwBvFq~jxS)5-R z`H!jMJ<91LPx}(tVvyT087=>BJF-vdiRLE)NGB@7`M07tb&EK1x_(v^Ck{dJTlfJ{ z95^UB_wYhVoQj+#slO-`-E+J??Ec9Ayf}KtN(Z9wa8aCqJhg4U=;1(B*|V-9S#@;2 zC0x0yWO3uo^fjO?HS1~7u%{&-CgPa=qPa?98Gkc4@+DyLeP!wljaEO8zRq<}> zh>C`J-+(vO-Yv3tlbX~(+XHrpQDqzX`%0rD&c{KZLyH1-8%Ewidh3tk=wynmTT4q8 zw?;idO%}(=iGm|kaTq4q1gDDPmg3~x_^B#RnvjKjLKo+X_aE2ADUkKFbHnOsjgHvg zMjSn%Zh|d#rz~z&(w=qzW9L!ai}LrXkr0l=4ts>#3?}H?s%6tb>hs1lT*Jv2p=rsxsfL^8l8BQ+ihMETiC<|PUE!hXWZ;M*{<^s+j_G3;nHYherCstK zP)COpmSltG)xcqtG%X-?x;i@L`+XF|+A^Rv;XUq0YpR2% z1ZCc%>fqL;FjB2$u1UkceyUV^3;zG3etqfc$@uV_rC$8vl_x8ouZ$~isH|1~OZn?5 zDqxdxZ2>A^&ElWy4;Q~yd|&an;zq?v?Uuq5h0he;RC~I>P=(jzpRXN|zc+sc9Dx0N zIrn(()42>I0BWxLrz=CRE6ne#F`WmXBF3%?w`BRo88 zh1uY{!F8|!Ug{s}H~j!?!0Gj^n5y!Zw7lFVP`P1<-mQr9sbl*Ai43Hx*2Fp3*rrRW zqeDu==u0T?+D7N?b)&Py!L45bH?4Rm!wA|ke8eTn=RdLc5hz}jaiBk>4 zg#lj7_bZvLEF+hprMiz(=7KoZ5J!(M0YOY0d_*f!cL0HTUQ#G!=@~ppdS=Q6;?$v4 z1Ox7GrHHd8)V&+ZYF(+8h@4Uz(%$d@X>ckvpvh$FV&sV9+gR7UyQI;(usWJ=;m_B^ zJCvLn(ZY9B@h&yfk*_&vT|*dHZiWUfib(;Lx_wGeC2P|7bdBgBapo~WGIqefL~&rH zX0lGH;#BwGDtuZMr>q&y5-o7vr3IE%%!ga3qthCJclwMd-XQ43X1`Kgc8a%2aqXuL zo{}|s_o?EX0xUb7>Xa>D?ILK&Bs#|6^Z_hJzPMxXi>-Dj6s>K ztKx8oG5VXuWw+BYiMA7^@y-ZdXkG#QSv+?2BvY4&dmW{6Q?!xN(b+TqrHu~p7G-!*Xt~|>7nZw) z&q(~Y#TAtSZog79fgoI8o;&{6oLCT>5JYbKCzxR~q^Rv6#HGRo*CPD4xpPpbc5 z;+*UkE2Hyq;QH8yeSfL?Pj6T6k#G)qU;bBl%j&l(2UR~(J-xbhb=k@(Slz*VGPWdCHr%GQaz1ja#X@CEc(t5>z6dx*HRk+zdqH zPsneQujPK6yD>K?{M>K&&*t{bt&x2pdtdg7>@l+){yvqtAhSzm)$n)WSHgFON2H$r z_k!z!bMh}1rpu=l_N{DSzQ5x3&&enYz&?Uc(`s&S7R7Oh>=*E_{w>TGI_Z)=MX~JAcDVX86yf_yebetSt0)yD-n2B75UsFeic7|WN zjw+75#RVi~_H8ooW#OQy%WiB*Y#GT%XY>hAxp#*+_X*dycbhD3?YDf13BV>}33O`+@lF8Kk!x zNks#^C8`AN+xxya7dYcBZKa-&CeAOFE`FO#N>}dPAWnuflohUyb#Y#*FNI#13>TRl zGedFon5%|!%ooO4*2yttE$)K z)Wj{sZ9EhK4jJ||NB%BH2O}YzVL4HM>47Kajn?->{0Sq~t@o97EU7(KMHb*_& zQ``g8gTbF!76+d+Z1N;Dx-ATo??$JHQ)$3i*on^2r7bL}O5+!|qPi^Ik*&iV8I{r{6G-=R|k)p{vVJ(==!EzSUIB2EvH z(M7tJc@k-oEgoH{H)6n)OerMe(FM8^o}vXfncw*i^P58Q3maRMF@AUH;ovmvbpdc-qj7Mjor8dAImTb_rWOSy(qd<+M-1+h74E<5y zuUQS+dW_~fJc{b^wzU{YMyKmqtRTPxwBpfe`UK%MGmcaqb*jUoz~Wg02Of2b{wOM_ zY1i06>117rY#ye(ZHto}=SP?xtrK-EGKYN_VC`8a=vo*Klba?U9k2f^_C;c5SUfsT zS8Dbq&B)`?vAR;)caiff9v!1A+1s`Dz{%)nl+@NdWb4!Pdop^Xu9eW5hkz+~V#`+0p&KtQV;el1oln+{SNvIQc6j^j>$hgS^0L$O zz{+@%k#*x0wJc$l!Z?zVOFLeRC>iBfJhFDY;?Ig_Y21^Mb>kH$$ZndX815>4kJtFdXf|r^*!uB`i;=b|BfOc6tR1iTvtV?bv|CC>F8z36 z5Kfz2B_oG^yck@N?sGiFBkRYj{wxA8ZP42B$lCF$N+iCi_hBtuy76+YrezSx$hz^0 zE6B^@RI^6cBWuSiP7tOZTw@u03fcKxD*)&rK9|=DgzAo21~eTS^Gc&p2=Ww2QAufkZ9*<8!V=4!E0)VFW=ap z?xHnIRhbNGx>jl^TlOazR2{UipW2qIO9mAuEe<#8sggn2LCcOdU5hh-#)Rg04KlhG z7Jy-3?QMk)S`wy>YLomxi6 zX#R->{p5q5o02;=N-XH7DiL2y3j`8}pda=LFW5qv4lzI4>YUO%XT#3A0tyM>FvPEHH=iHm1|y0mC2ws4>LlP1J+2W1FA;L#M2Di~%9t-e^3r`a5xn?GY?Ws|FH>&>o+~1R;}HWKVsw7DY6c-NYfXXD(dd8pg*K*b^7e7)31^ zn>dB_;6h0A02b8~AJpl0AYtNDa*659ksrkt(o-Ll5LmVi?iA1?^KKZH#37y+Q&F5& zVCAsE}=Yjc$-j9l(O5aSNv)5bN(~Maps%FeX{Eo*D3t1@Xf+ig;NV#2B#L5%0HdI zwZ2_&RsJpc1M=(T{+auB?jyO=a$AP`W|JeB62qAWsn6^YxkZ@;CD-R&41MtEMWs>x&KSykA_ zHR&EjWJrIERf(i^Ov~~$l$V9pNaaWV<=Vo`M&i7;OW3-(J9rOuaA+=p%Swzhuzya1 z=>C_q(PslO-kU^m^BZC0dQkhNvu@_ex-?1c54)@EJ`O}rlMeJO`2GPaa)`zNgKq| zNb}yKNkhyD6IURfTayHs{}pX)P9iP~?_pIsy@=i>N|Q&%uDC#H>kjapYRGORwMQpm zOaUMFr?}h@U|0)UV9&xf0rUlIhASOsw}rngyRI1xg?E6!8feln@IC=D*D)bXJuWQ!Z*r{ z#CcOrA86KOni!=zrl60_yz{y=_xBeun+{y4bpUmp@tAn;LwbXSOHa^x{x8Ituwf~> zxK5O|cy1KFlbPrWnEUULrH$F)GF_Uwu1iH}>UJ?a#bXoKklQ$Xt3Ec%0xz8;4`!ij zq|?_*(&atKn#NqBBxaY}LF zc~_~O7Y{5>DJoI>V*zF6=Mbl41v%6?zmkDNpptAao*Y{7z$H>i{2oIPC5KRDibJ>9 z@W3HfiSIb^?vjB6-f#jAx9IS|@P=a0mberA6b9!T7~oKRSwTw5~^ z`93zrgas;|aY*=qcJIO*#G@QCeq;$b1uf$~<8rU2T9%NUv3pfqc2bLgsM)wP4%aH* zJ|#Q^?UEUXYZYuAU%o+QW*qWtfGR-(0?-y7b|obu@Mj@GY%K4 zg)h?>b28&_v06$vB{xfE>|#|9wYfG@$&B5ssumxA8}@R3m&4V{7EIW?0KI1HYE{2U zqL9`OIhk>|Suv|gZu0pjGfq(cF2qQLrji-At98O}7SGt#D*mjw992AHH>;>b{SHE6_wb5sM+!4g58q@9Oh?L7hsIznT_3)T82TI*~nGNR$@bxkoVXs!T8F5%|m6ZIyjWdsfyimb1St|F!&JX^q0C3Kv+<5evqDY?<#qDz;za;SwW1S#w&Ai3d0<*wFkrJQYTxkxlB~E>UHTOg& zENtMeHNx3W936%w_=o*tWN~BA^H)*Dt!rQu?5K*TsB2$j2TI_QvJP->QAf9;v&a`& zyfE@9H;a6cunb6og+{!eNa@iApm`YjqSaJs;?zk*4nZM@N%%O3?W`|K6JX_9T|rz# zLt?^f^-fV75^~}l{-LTk&4*y-zEPa|1YowSIyk6ZG8!wX;xX?i@2}E07sdcH?X*g-GHC9GjCm@p42>Xo6o~wvvurtmCvKMs$&3G@Dh}TwH`r%X z@fc>8$ltM~$tU)jNV8}eQ!DocT^{aTu4}{7z%tD!ouKVV*l+l-Jq;xTz6&S=IEm%O zO#ts7zKTSkKCv9I%R`aX6}UMWkSKhuxGs$jEXGl|x+b1bz+)-G)kJX$2Z-W^1yLM# z0$U&yO^4a4L+`EH=(CIi{==F$I4m|<6rM)rTsoC-gjjfymble~jFh`zS#fZZ7hLX} zYEz%Bh4=3h#V3h2?{Q7ss3q7k!9O$r0<6^ZNR5JyI67>%*i?bI2MB_8z=eiai=zW? zH;UOmh~kr&Vkq+6l461sF$%s_(@q&mV9ECcDi@`d0|JG)SDbr~8y-tpJj1Nu#NgGp zl0#Rv`)u*U8??#H(qZ|3)CQ-igOwwmqYXY=W9o_B$q2R4`YI9+NOn7&I-_7eZRWEz zks{gcEboW^jZ#qYcJ}V~g1&!4?)m)g`PKa=3L6$G`6qH$nCC zsT6lCu2ek}48ii@6X9nnzpmU7?pC>^a!937@ym~tuP)Ee-CN$eyiDmArQ1ty_wFwp z60YtKN^x*SDO3Dz@w(!ACxWq(TA3|2Xn@C}A_l3p3B0{V zhn#|6&K$S!4je#=A)4eOmKMcu;tu#`*04cXDhgjOt%?=;ndP>rBu)PYK#ojYoH!EN~)77k*gG=pwNfbVXS2d9lx6s#$J7cdYp`WE8gw9UkZ^uDZ$)9-<) zze^R5EuHM|E>3+?y{9#{z&4=?Vk?rjS4zF%0g)3fV0I&XN395aq$W+(N+a4(l%CYA z9->P3D2K+)v5HS5t1QF^{a-jAfYa6)spb6)HviE}Q}{EXY; zy2%A)T9lsz)Yr<1*Gs5`=%SUq@g<}#amF@M&A21ak$7qS4Dw#?3?vuE#sudq+Eq`jwD9r-6S1v6Y)6G`YNTS zmeu2Kp$-mOwH+-dO1B`T=tf(r(u9F}(aO?iVcrVI;lbkj!H&mq@*Apn$_wrPUKi&a z@~@M{jn&!L3g)s{u3mY*a$jy#d0*xD z{2wcuRO;oQmOodX&L3OexBR-?-lf0iuP8lK`f%y=?2DzXOUo92S-d^mzIaLQ$Hl{n zt?YyOdyBckV}(x^F8CkQ`-}d=q80exxdO_GF^3@zf2K&!VROSDDbjOn$o?bB1+c)> zmWGBHH%-1AsY%bdADZ<4<7Rv$rQZG>dXT0)7Y5r|lLp3Y>8Nc~Y3s(||3Q+T#j}Rr zR;1_Pis8MI^eoCQyi<`T0oWnA6op?AtF00>=7b+ZE{Y*_L4MVW|Q`QB}tS1 zqNXtNi5m(t!CJN;w!feBBZ(duZQK6Z(3AEs{&SM_Y#pTceMQ~EoUm~vHP-|Ey z!QflJ@_*pt@~!iOS=Zk5x{oY*@*@tpDHJXZ!!XwEq9JwEq8L z>i^%EZ{+@(_Wz^)Kl|Kl|GyP7KhNAiJ;7F)diYp)b$C+PqmJOA;C;c7LFB*af7O4x zzmLDV_Z#n)S9SA}@sfJ6gK=M`xw&|}1X^M>3t4ec*%W?^7uS{MUPCfoOrIYK5~y*l zo^o8*wW!gVv@TD^HC>CMxh`of$`!{|UCYdZl_F*HsbXRTx$)S6(m%2O3q%s*u~GWVziC1Um5lQa%SZe)w->5#twA;ICI#sAd=&; z(fR9-0v=15-IUS!i&a=;xgcf4PKc+B&|iHNC}Jy#jHfQpmH0kVlCF5_d|e6nF`uIS zsPi0tlyw1UTieB{a~&RqE7iBkvZ-?%9!13~&|bFE)Y)#2Y8z|!lmn+fF%`ahuE==m zO#M-F{AJ0M1FJuu6?#TcJmo;^Z$vF=tMz2+be9$Ev)j22;;Ga0p9RXM*QXzOGIgr{ zDC>&AuYzV|>J%3(3hbftV}d6;Opq!{0*O58B>ho5h;QF|22P#mpoMh<@XWT;3C>zn zyiXmkYw?aJ-O-aNCx(BJh*SadEEkG@B4$JeZ8ePJUp&!3@=u`6$<)!hmT}2|bGJ2g zVgE4v77gs!8pi%1o}N_KrBDw%R3|KRfBOF_$p>%W5ynOFzfDF^ls zb}n>A+~Dz)1N(=aGsc0KOgXTBSeiS{p~q7W>>t#oq>fZP^*_^a{Rc`1XlXwsyP0wz z|FAX~@JQKycOcbD*?1bzSSC|W>>m_2W89(T!2V(NE-BzTo^l}7A~Fdu%O2%I{y{DP zNP=lOk$=#gEbUl5mh=`zP{1grSQ5-;=CUzRnM7oxB!i4yRk97tpin{I;CXgkx zgLP(oT?+;c$S}+Zo>@=V!e*J&ha@xWI%v(YJtQ-)bI_W@F2^(Lpd}wdu=_%e7SF8h zs$|3}YdI>}9W(P<7o`bRemt|LgVHR8YCN-su0)%(Gwvzm)m7uy4)G@hBK|Ewk*)(jP%%&hA0C>&Y75_{At`lI;rX*^(G{>l!IYV(t< zn_@Dvl7kk#Iicw>wz3r+v>>t~gJm;Z!9mNOMf)7ImUq$OBAM`CEvIWSxe16ro>>+x z@f)7Q^2amF=t`7+r0|D$W@%js_A7G~*%wQ>DjB_@h2zEFls<{fP(0%hjh`slx#SRy zpD5vf%~&+vWqK*2mj&aUl#KAnB^FQoo^4^5oI@<$qW);v-R%^MH)d;$^BIR&JnNUz zVaGELvG|FSy(yPid`b#pH;7X#-nc>RH(4xRe^Y61nRvz_6rcX4)UI#}#gliMSb;8~ zcpS=3+GTRaLh<5l1;rCiIR&N3jK$(bE!#>YpKRN@1mh=KcpNhpj2E9}=U-YN8&6v_ zUVU4$b+XB{1>?m>S)mG)A(-FO7L6CRIQ@)Ng|Bhi!ttUOhAMFbiVEXti^r=!%P65a zY?EmV$BU1`k|qXamh`k+Jf0+%`G?aMk9T+$E-Z)4v<2kFpM^!jHEc=_@%U7U;^uhT z;_)s|f(tR8ws5?Y61UZ~MdL-KIdot$ZNYeX`DRHxl4*x%JdQ|AFv-@>qVeJc=KzVx zv<2hE2@=*IcSk+f#Q(o>lK(gV-}3(#lmB12`b_nv>Uq_ztM$qcD%X<#--!JGx61D; zA5mVv^yku7N^dQ_p|o1@+2R+91M2*jDg2~xePMoK(?VeZ|Nq}+^Z#p-|GzCemEAS_ z-}C<`hTYH)9t_?c91^VK|Iz=F|7L#=e5C=$3P^Y%<5k z&>`nQfOt~to(vsw4i>~F3ym=}w48(Z%A0-K%%u}_NI5_tK!2wmSH#dI<-l#k3gXZV z-BJz{GQMQ!lyV?K2hlFia!5JY!l#|clA)y>)DHk!8>o{o{f7=Y2kL6A$1-ll&>`o* zONLL6lxQ+^$~i!=pTGc;3|(>#ZTwH8%u9xra}Za9yKlm77Y{AzAZ|lK*eOCO9y;Y5 zCMdFGXgLRQe#T-;E{a&j(2@?~{K%uEw8~-{=P6-E; ztfleL5)SH8aGF_{d9*C!AZyVUFKrkybVxZEhAbs)NQRbjke@}j1`J;oV^@b2q~S$O zMPxFxl!N*x82Od_1|0rG)7OWA&4_~iq@q*ha zb8a%+RM+BU>KYc5XKmu51z$VX0$LlpXp!~kSs-(`k%Ja-V45}9cHhuJi}MqJmwj4& zUCTPTriIM$u!olTkpOA{vTZxVA!9eWN(k@_9Wr(_@PR7Qo@5z2b$Ts;g4V=i=#a2u zH%<7jlc6Q-)Pox0R(J{R4LfA)h)wX0p^Qm}ma&tch4J1ouM7VytxZ1_JHlv{(pzmUpL$T&+`Aj zu7069sP0%@y7J@7Cn~2?&tE7%TE4P;bh%mjTj^f%|NE8JC_Y!btvHqD|9@WiOyTUp zmW68md-;##Psn%jUhaY1<++1%>tz3!{Ze)&yL)z}%rlwKW-iEVn^`=3Jp5RAO1M#& z4Za2b|A=4%@c;Mtm-zcC`2Q&<69P;cC1PMW%!ClPiy=O(7%rJ|upj`hC)CLp8pA9I z`E3x^v$Dlx%Eg4hGJ@~gP&zIqM1yOIdo!Lg6GB`I0OdU^OpT`;EC^iQlx8%UG7Cao z3a$x2y_}9yW6GEI|YqlgPnR2lpNUd9AA13HvL0HI(s}rwi%E^KtenlWE zo^r4tjA+gHU*ahT3&LVG#vB|^Iam;+$I}|hWXi#U0P=(3VVR04Ckq02v$dg2rW`B? ztXOb3uxln$E*1nxN`99eGG;-Dn~9t{@I3ojE(Qeqmij6C932b@&P^*)uxFV8As!f% z4$wB#Oy4Q9AXF`ahFl7Ex;vN<6ek(h$~K{y5aJRhvt(q+lvxns1UWD$?lK!^%Eg38 zTfLbHG38)Fu<(E|)35lo4nGU?-1@2~Q)@c>tdy?^s-WN=%y4;K%lxmj*FrM2oUTP6l%#ixIbPOD z3tW0+_ghGqbpg( zIgew0q09WBCxK)Lt-wKxubyk9IqUy_danL|?b;t|chqKTyVq8%K2^P;dQNrAYOV5E z z`DX6#x%+eP$Q_Wg{QnoTqwLPvWivm`d@^%pX0uEw{IBps;jv*7{3EzOct>zRu%`bz z|2BWh-^E|f`Owkms;i_1Vg58x8M=aIP#;&FrT)aug$AsM-N3_J?w#Izh@jsT}y zMCUkrM-DCndw9~?E*Uwv3>=|Y|LN~?aTzdo`KY;xMh-3mFWLC-IkHFQGN=nr0M?3* zl95Buk(-?TPD6xbKn3-97I zuo_mf%?!oCQA1+)v4hiqi$@UNKu{wmr-75udV3}#2d9C626-EcFpkV= z5Lb|W0oWKzZ)84$s5HmxACJss5S5@{1x;@6%E4oxBU{=gAs#t+44h>tvtKeYk3oFF z%#Bvm7DXDF%OGlTgHKxDCL{A1WG$E^(zfs;a~eb~!weYSg&CUHplX2=S)$Un3cr+u)>?UUe>hL%YX`kv_}8Z%AhdFq|3s`V7s70@`UG ziRZ9{hFMENMi;p}3hS7(3Hjs;^+!#B2ArM43tW`CP!jR7hUe?^vjwEM9&2WJp034K z!ZU>YF&UogpoJyg0~mtVISyLH)U2}>T4%dx8Pd&cpy62#T3u`JoWh^aL`(g)c)?Bc zwuWaoJPKn5AQ=Y6aK3|9ml>ws_3(5Dt+YEY*J?66P1gdnf;R&%WHLNe*RoTFT7J_y zMc2YjAg4=+u@rpE+a%b+tKACffB)LL)xTFCs$Nw+rMd-m|39zXT$!%yQ(3$G*Ybnq z50+0XZ(3fg^wZL3OQX_W)c*gecz^M|#bb*b7R!al3!g4rSlG2NFaP`e-T8OrkIc8J z|Nma@y4*Rr9davXpUr+L_%Iv*hh!VsAlM=}HS=iZn#>uQZTy@4X*mDZ_FnWJs6YI2 zuiyXqtNquP%X(gT{=%9D8E;X#3H&pZCJYmCK2|@gil_Fnf0Zn5)d~K|syJXCdKi7q zK^(fylT`84==9&Kit`cD$4F`fXnfJj*Q15F&FJiP2EJihjff7Ze` z!R&?Z&(t6Gw-dw@|O zU4hJA=BA}_(|5GdXRAB>qgC;={J}q56*oj}6g(`7H{e!;0rdw;dzH!lHDgH~Jk8$( zQaeZ+JWbpLQo~LYPlJvA7qrPhEx~(#FQvEYe&XnpNS=SKD$ZQsP5!VbKEYf2 z8>`~5PI7KYow@m`6U_68ZyGJWa9&>Paw3+67VzR9@4BUN(?Ad7C&%qE@t(IQ0n&U~rf!PVo;%FpFXe``R!!aqC;0!*%;EpnN%8*|*Y>KdI?MmJ-AnlYGv$ZU z`u`0|e<^*n^!Cy|)c^mwcysZhm*xL+dE@^-n*9Hy{(ryRYqGz6N&f#+FUkKO6UOBK zzZSeRI52oE`TyJfvA?Uoy!WK{ng8zH+ds$CClTWi!$hnfrZUmq=@a$92=VAfw`oMr z(gGPd#QBheBUgDRl3pbI*&>|6q<8JGIsDqY~hjQtmb%=`=0Sk*jB-01$T9)#TDPBkGAYF^!1jIXi z)`1RMwi>KH@Av?;#CHrtmX0zu!Tt3|Vb+15NQrs&)3x}HCk{;K3xWd+A4_(X7 z&oow%Oz*C1k%6Hq*ANubyE$vwC%LPxMHLSFDScbJpe4U8?h?alP4BELkzFUGZ?AYK zU5PFO)bf%^nch+VQPx5x?$Askncl%g3vw;8@@Q?ZYo#wMCcU0aZ>MY7#z@0V$@I1k zT0qc9=92}S-Uco4Wf}R5_4Z7rx7HuU&oT?l7Go=2%PO=eB(XiXrLM)ZtYCly(exI& z7SwPc8;r)I48@pVz7ObL(e-Ax=G z1!{n=%S^6~^+#D91a2RX+DLztG2OHbJrz%Hs6Q%ZTUcr+neID0itmvESKGP>25E)y4AtpV~UXQ2zOw#!?F!k%Xt zu2x(_W(XRJR(i>_3$WI}i7A3=+67q48>ZgHK9h(Uil0eLVPtsKl*BnXs8A z2CEg7;N+)*&pt>8R4rgKavBEJns!0eT9Y+vf;4TYT5*1@*$%$Rv=gWnY-gJn98Vjl zR(urJ7QqnP6%LqMR*w@c-R*{{m3JV-_CS+o@&9*w^#|+ksUKEfulDEK-L<#Z_N}d6 z{Y~}O>ae;~b=k_#Dxa>LUD>iyD?e7grhIa_Uk*#(EWNjML}~rvpNn@F-&TA>akav; zg)bBag&hk^=YO34ME>;rCi#5sk=zG!N9W?~KeG2{-;q5ayJqIO%x#%*X4lN};m=e4 z|1HC6@crQG;H01zg#I`EEBwR#^}IiOcfXSV|4RQZ9-A+rE-|hjhXZ}YTwICNY2kVr zJGc^-rb{!7@z`7maelbK;J!J?$1a`(S%x$zk&GQY39EMEyaNV1cJL(T+DIp3^CZOG zhCODs1jotPTnTY4Fk66snUZ-DqEeGjAq7c}&5=-*jJK3DV={K|BY44tV(g3@n;#)Q zYOb+%GInqyFp!#@tlWcRb0fr62cyp=W9`w#4t@k-L5SEmgvRDah>x-toTiqNv6CY~ z!G~Sc@z@**aRr+#`n6k!sj>MHqS9Q4@?`AbMsQ6-{$h}^v5OnQcLf&{X4Tl-2zdqL z9(V){iLsL(fh`L%5G@x!g141s_TsVm5#kcU?UfP`#ba|LWF-u4qaGWZ7a=N97tPmb zThPIYU<=~15UY&G=0vEEf;WU%7LPI?LVZ`DF$wnC8+LLc5*}rg17mX{)Muqd>=sNN zyLb_E@PK4&UWE88tSVN}qF-YNH^Tf&t0F+l!HwWZBgA1i-Lbh5;_I?3E$umZl#3TZ zVg&R9F4fq)2=P(CKB<^DC37O2lx*j^_z*-ZQiM)2cJm>qvoq-I*n9|ag0uoN!g40) z;zF?BJJ!-O8Ji0s?cx_mMf3|vlb=&IVmB;D-cj_zh|IaJ_iv0hZUdI1l%s-mHGR^;6{r@|22j*Uz z{T=-ObA zlA%MGgVTC~OiYG$Mya1Bm{ZE$t-!<%DJLy!RXns)%1w(Kd1%L!n-*Wp(BYf{tU>?` ze{g8$lz7S;nB8Dh$;x83h2xO!%S0Ic0rsNi6XwmvaixC)or0 zS$0l|6SP1Z4RbM&hjvhjTDXJ8DvfzEw3ABIGUt=_ytagPREb*9D3HrHoX614DpAXz zX_Ose;0^7tlC_$oDbt1$c3O#AX{f+JtMSm`xWXl*7SLYulKP{Jafm8WTZ|=KwD3&K zCya-Sqa}VO1Q;wygwkTV(u9@YltvClEkA7x42q-RV;Ltob-PzCRb zWhsZAgVy8<$GIC^{B|`^ixjYK%I^~5MoT+O;#YfP| zIwuXz(6s=&zzSt?;K6(sEk0t4eI|p`9kfV8_fztZ!D$X!a6F{w^<;3WirVrcgClhbcde z=>PXvQ|Zg}|J%E?YVnsT|KE1SB??azK3+H#{=Z!Q+xZXV z-V-*rJwVYY zIr#B>BRDDcVLABm)YXu*!j76YKVDpR&Q5}4DPY^dkvF~^Sl(>e9US?z=T*wCJ#CJ> zxPY*(TF8*KFh5>YBBC*A`#EiHyto$FH4tm2H-dv7&!ST|V2QtJ^W)VCn!}qmuXb|d zfj?M8I2k**@tB90HYu{r!HsX?G_deI%G`Ky1=&q>`?33bY<|3`g+tkc($w-i4vsvS zm_8YG9_8T3rxP^F@MP@b$aioj*->bjBQO3e&UOfjEYvV|@#H&QoDB<|j?I%7w-a9@ z-O%i9IXLn*!8Y8LJjxt-`B8NHF@JaL=EpZNJ#B&xe!Lw{6l&SCoc#Dct;4Klb?o5B zgUulQY@MCQPJTShsrHWJvH9`h67sHSTgeSRHaA|}PIk_4vvMhno&0!1L)(6L@Z*VS zaAH_{!m)!NpLV;n>Z!4VA8&WU#Ks+)A1|)OM9W%2jLnf3wd_Y(C>||0Pu^gUrsd?x zlc7l)a*mxmdD`mIIEiODc=C3@6Oyx?;o!-0G(r4jO6JLnJC`s+p9lr>b8zI%aGKQA kj?Ix*-xlQ^05uF|<>1L<=f*&TP3s_sXEm@E?6&*A02uT_(*OVf diff --git a/main.py b/main.py new file mode 100644 index 0000000..6eca3b4 --- /dev/null +++ b/main.py @@ -0,0 +1,16 @@ +import csv +import logging +import random +import time + +from DoubanSpider.Spider import DoubanBook + +if __name__ == '__main__': + logger = logging.getLogger("PAPA") + sleeptime = random.randint(0, 3) + with open("results.csv", "a", encoding='utf-8') as f: + writer = csv.writer(f) + writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) + writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"]) + douban = DoubanBook() + douban.main() -- 2.45.2 From 48ea3536cf67bd5d6ada3848824f242e568a7578 Mon Sep 17 00:00:00 2001 From: daofeng Date: Mon, 9 Sep 2019 18:55:21 +0800 Subject: [PATCH 2/6] coding --- DoubanSpider/Spider.py | 16 ---------------- DoubanSpider/db.py | 2 ++ main.py | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/DoubanSpider/Spider.py b/DoubanSpider/Spider.py index 5517c12..d5cc846 100644 --- a/DoubanSpider/Spider.py +++ b/DoubanSpider/Spider.py @@ -124,19 +124,3 @@ class DoubanBook(object): else: print('[Spider]检测到现有TAG数据,开始抓取...') self.get_data() - - -def url_pool(): - for row in douban.session.query(Douban.url, Douban.tag).all(): - yield row - - -if __name__ == '__main__': - logger = logging.getLogger("PAPA") - sleeptime = random.randint(0, 3) - with open("results.csv", "a", encoding='utf-8') as f: - writer = csv.writer(f) - writer.writerow(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) - writer.writerow(["书名", "作者", "上市时间", "价格", "评分", "书籍分类", "内容简介"]) - douban = DoubanBook() - douban.main() diff --git a/DoubanSpider/db.py b/DoubanSpider/db.py index 6914814..31e0259 100644 --- a/DoubanSpider/db.py +++ b/DoubanSpider/db.py @@ -15,11 +15,13 @@ class Douban(Base): def __repr__(self): return "" % (self.id, self.tag, self.url) + class Recording(Base): __tablename__ = 'Recording' id = Column(Integer, primary_key=True) data = Column(Integer, unique=True, nullable=False) + if os.path.isfile('douban.db') is False: print('正在创建数据库...') Base.metadata.create_all() diff --git a/main.py b/main.py index 6eca3b4..28ca89e 100644 --- a/main.py +++ b/main.py @@ -2,9 +2,31 @@ import csv import logging import random import time +from cmd import Cmd +from DoubanSpider.db import Douban from DoubanSpider.Spider import DoubanBook + +class SpiderMain(Cmd): + def __init__(self): + super().__init__() + pass + + def do_help(self, arg): + pass + + def do_start(self, arg): + pass + + def do_tag(self,arg): + pass + +def url_pool(): + for row in douban.session.query(Douban.url, Douban.tag).all(): + yield row + + if __name__ == '__main__': logger = logging.getLogger("PAPA") sleeptime = random.randint(0, 3) -- 2.45.2 From 2a5c0320136746151082fc91d10615a35a2f5b5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B2=9B=E9=A3=8E?= Date: Tue, 10 Sep 2019 00:56:27 +0800 Subject: [PATCH 3/6] fix some bugs --- DoubanSpider/Spider.py | 5 ++++- main.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/DoubanSpider/Spider.py b/DoubanSpider/Spider.py index d5cc846..11e7869 100644 --- a/DoubanSpider/Spider.py +++ b/DoubanSpider/Spider.py @@ -2,6 +2,8 @@ from DoubanSpider import * from DoubanSpider.db import Douban, engine, Recording from sqlalchemy.orm import sessionmaker +logger = logging.getLogger("PAPA") + class DoubanBook(object): def __init__(self): @@ -108,7 +110,8 @@ class DoubanBook(object): print(f'正在保存:{name}。') self.save_csv(data) - def save_csv(self, data): + @staticmethod + def save_csv(data): with open('results.csv', 'a', encoding='utf-8') as f: writer = csv.writer(f) writer.writerow(data) diff --git a/main.py b/main.py index 28ca89e..f909a97 100644 --- a/main.py +++ b/main.py @@ -28,7 +28,6 @@ def url_pool(): if __name__ == '__main__': - logger = logging.getLogger("PAPA") sleeptime = random.randint(0, 3) with open("results.csv", "a", encoding='utf-8') as f: writer = csv.writer(f) -- 2.45.2 From d20cd9b0937f19f1657fbd5e6f0cb57b139bd4c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B2=9B=E9=A3=8E?= Date: Tue, 10 Sep 2019 20:49:20 +0800 Subject: [PATCH 4/6] release 2.0 --- DoubanSpider/Spider.py | 147 ++++++++++++++++------------------ DoubanSpider/__init__.py | 25 +++--- DoubanSpider/douban.db | Bin 0 -> 20480 bytes DoubanSpider/requirements.txt | 8 ++ main.py | 56 ++++++++++--- 5 files changed, 138 insertions(+), 98 deletions(-) create mode 100644 DoubanSpider/douban.db create mode 100644 DoubanSpider/requirements.txt diff --git a/DoubanSpider/Spider.py b/DoubanSpider/Spider.py index 11e7869..d127097 100644 --- a/DoubanSpider/Spider.py +++ b/DoubanSpider/Spider.py @@ -3,6 +3,7 @@ from DoubanSpider.db import Douban, engine, Recording from sqlalchemy.orm import sessionmaker logger = logging.getLogger("PAPA") +sleeptime = random.randint(0, 3) class DoubanBook(object): @@ -20,6 +21,11 @@ class DoubanBook(object): format='%(name)s - %(levelname)s - %(message)s', level=logging.WARNING) def get_url(self, tag_name): + """ + + :param tag_name: 字符串格式 TAG名称 + :return: + """ for num in range(0, 10000, 20): time.sleep(sleeptime) url = self.base_url.format(tag_name) + f'?start={num}&type=T' @@ -37,26 +43,15 @@ class DoubanBook(object): self.session.rollback() def get_tags(self): - print('[SQL]未发现TAGS数据!') - print('[Spider]正在准备TAG数据,这需要一定时间.....') - do_not_get_all = input('[Spider]请选择运行模式:\n1.获取所有TAG(需要大量时间)\n2.获取单一TAG\n请输入对应数字,回车确定\n') - if do_not_get_all == '1': - response = requests.get(self.main_url, headers=self.headers) - html = response.content.decode() - tags = re.findall('.*?.*?.*?', html) - # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: - # executor.map(self.get_url, [i for i in tags]) - for i in tags: - print(f'[Spider]正在获取<{i}>链接数据.....') - time.sleep(0.5) - self.get_url(i) - elif do_not_get_all == '2': - user_tag = input('请输入标签:') - self.get_url(user_tag) - else: - print("[Spider]输入有误,请重新输入!") - self.get_tags() - self.get_data() + response = requests.get(self.main_url, headers=self.headers) + html = response.content.decode() + tags = re.findall('.*?.*?.*?', html) + # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + # executor.map(self.get_url, [i for i in tags]) + for i in tags: + print(f'[Spider]正在获取<{i}>链接数据.....') + time.sleep(0.5) + self.get_url(i) # def get_books_url(self, urls, tag_name): # response = requests.get(url, headers=self.headers) @@ -64,66 +59,64 @@ class DoubanBook(object): # books_url = re.findall('.*?(.*?).*?', html)[0] - author = re.findall('出版年: (.*?)
.*?', html)[0] - except: - print(f'《{name}》未发现出版时间!') - time_temp = 'N/A' - logger.warning( - f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE
.*?.*?', html) + # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + # executor.map(self.get_url, [i for i in tags]) + for i in tags: + print(f'[Spider]正在获取<{i}>链接数据.....') + time.sleep(0.5) + self.get_url(i) + + # def get_books_url(self, urls, tag_name): + # response = requests.get(url, headers=self.headers) + # html = response.content.decode() + # books_url = re.findall('.*?(.*?).*?', html)[0] + author = re.findall('出版年: (.*?)
.*?', html)[0] + except: + print(f'《{name}》未发现出版时间!') + time_temp = 'N/A' + logger.warning( + f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE
.*?.*?', html) - # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: - # executor.map(self.get_url, [i for i in tags]) - for i in tags: - print(f'[Spider]正在获取<{i}>链接数据.....') - time.sleep(0.5) - self.get_url(i) - elif do_not_get_all == '2': - user_tag = input('请输入标签:') - self.get_url(user_tag) - self.main() - else: - print("[Spider]输入有误,请重新输入!") - self.get_tags() - self.get_data() - - # def get_books_url(self, urls, tag_name): - # response = requests.get(url, headers=self.headers) - # html = response.content.decode() - # books_url = re.findall('.*?(.*?).*?', html)[0] - author = re.findall('出版年: (.*?)
.*?', html)[0] - except: - print(f'《{name}》未发现出版时间!') - time_temp = 'N/A' - logger.warning( - f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}]CAN'T GET TITLE