From 41f75f0cd61c7ec7fa55d93f9cb64a4a7104ccfc Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Wed, 13 Feb 2019 00:52:26 +0800 Subject: [PATCH 01/14] add net and Requester --- setup.py | 2 +- webcollector/config.py | 2 ++ webcollector/crawler.py | 12 +++++++++--- webcollector/fetch.py | 23 +++++++++++------------ webcollector/net.py | 7 +++++++ webcollector/plugin/net.py | 36 ++++++++++++++++++++++++++++++++++++ webcollector/plugin/ram.py | 4 ++-- 7 files changed, 68 insertions(+), 18 deletions(-) create mode 100644 webcollector/config.py create mode 100644 webcollector/plugin/net.py diff --git a/setup.py b/setup.py index f200231..f5720a5 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="webcollector", - version="0.0.1-alpha", + version="0.0.2-alpha", author="Jun Hu", packages=find_packages( exclude=[ diff --git a/webcollector/config.py b/webcollector/config.py new file mode 100644 index 0000000..35e24c5 --- /dev/null +++ b/webcollector/config.py @@ -0,0 +1,2 @@ +# coding=utf-8 +DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" diff --git a/webcollector/crawler.py b/webcollector/crawler.py index e6e27e9..6c1320c 100644 --- a/webcollector/crawler.py +++ b/webcollector/crawler.py @@ -4,6 +4,7 @@ from webcollector.fetch import Fetcher from webcollector.generate import StatusGeneratorFilter from webcollector.model import Page, CrawlDatums +from webcollector.plugin.net import HttpRequester from webcollector.utils import RegexRule import logging @@ -13,8 +14,12 @@ class Crawler(object): - def __init__(self, db_manager, generator_filter=StatusGeneratorFilter()): + def __init__(self, + db_manager, + requester=HttpRequester(), + generator_filter=StatusGeneratorFilter()): self.db_manager = db_manager + self.requester = requester self.generator_filter = generator_filter self.fetcher = None self.num_threads = 10 @@ -47,6 +52,7 @@ def start_once(self, depth_index): self.db_manager.merge() self.fetcher = Fetcher( self.db_manager, + self.requester, execute_func=self.execute, generator_filter=self.generator_filter, num_threads=self.num_threads @@ -70,8 +76,8 @@ def start(self, depth): class AutoDetectCrawler(Crawler): - def __init__(self, db_manager, auto_detect): - super().__init__(db_manager) + def __init__(self, db_manager, auto_detect, **kwargs): + super().__init__(db_manager, **kwargs) self.auto_detect = auto_detect self.regex_rule = RegexRule() diff --git a/webcollector/fetch.py b/webcollector/fetch.py index dcfca79..c78f91b 100644 --- a/webcollector/fetch.py +++ b/webcollector/fetch.py @@ -1,7 +1,6 @@ # coding=utf-8 import queue import asyncio -import aiohttp import logging from webcollector.model import Page, CrawlDatums, CrawlDatum @@ -9,7 +8,12 @@ class Fetcher(object): - def __init__(self, db_manager, execute_func, generator_filter=None, num_threads=10): + def __init__(self, + db_manager, + requester, + execute_func, + generator_filter=None, + num_threads=10): self.fetch_queue = None self.feed_stopped = None self.generator = None @@ -18,6 +22,7 @@ def __init__(self, db_manager, execute_func, generator_filter=None, num_threads= self.buffer_size = 1000 self.db_manager = db_manager + self.requester = requester self.execute_func = execute_func self.num_threads = num_threads @@ -27,8 +32,8 @@ async def async_start(self): self.db_manager.init_fetch_and_detect() self.generator = self.db_manager.create_generator() self.generator.generator_filter = self.generator_filter - async with 
aiohttp.ClientSession() as session: - coroutines = [self.fetch_coroutine(session, self.execute_func) for _ in range(self.num_threads)] + async with self.requester.create_async_context_manager(): + coroutines = [self.fetch_coroutine(self.execute_func) for _ in range(self.num_threads)] await asyncio.gather(*coroutines) def start(self): @@ -45,7 +50,7 @@ def feed(self): else: self.fetch_queue.put(crawl_datum) - async def fetch_coroutine(self, session, execute_func): + async def fetch_coroutine(self, execute_func): while True: if self.fetch_queue.empty(): if self.feed_stopped: @@ -54,13 +59,7 @@ async def fetch_coroutine(self, session, execute_func): else: crawl_datum = self.fetch_queue.get(block=False) try: - async with session.get(crawl_datum.url) as response: - code = response.status - content = await response.content.read() - encoding = response.get_encoding() - content_type = response.content_type - crawl_datum.code = code - page = Page(crawl_datum, content, content_type=content_type, http_charset=encoding) + page = await self.requester.get_response(crawl_datum) detected = CrawlDatums() execute_func(page, detected) diff --git a/webcollector/net.py b/webcollector/net.py index 87062c3..36c6378 100644 --- a/webcollector/net.py +++ b/webcollector/net.py @@ -1,3 +1,10 @@ # coding=utf-8 +class Requester(object): + + async def get_response(self, crawl_datum): + return None + + def create_async_context_manager(self): + return None \ No newline at end of file diff --git a/webcollector/plugin/net.py b/webcollector/plugin/net.py new file mode 100644 index 0000000..3a81aaa --- /dev/null +++ b/webcollector/plugin/net.py @@ -0,0 +1,36 @@ +# coding=utf-8 +from webcollector.config import DEFAULT_USER_AGENT +from webcollector.model import Page +from webcollector.net import Requester +import aiohttp + + +class HttpRequester(Requester): + + def __init__(self): + self.session = None + + def create_async_context_manager(self): + self.session = aiohttp.ClientSession() + return self.session + + def request(self, crawl_datum): + return self.session.get( + crawl_datum.url, + headers={"User-Agent": DEFAULT_USER_AGENT} + ) + + async def get_response(self, crawl_datum): + # async with self.session.get(crawl_datum.url) as response: + async with self.request(crawl_datum) as response: + code = response.status + content = await response.content.read() + encoding = response.get_encoding() + content_type = response.content_type + crawl_datum.code = code + page = Page(crawl_datum, content, content_type=content_type, http_charset=encoding) + return page + + + + diff --git a/webcollector/plugin/ram.py b/webcollector/plugin/ram.py index 10bd7ec..69aba84 100644 --- a/webcollector/plugin/ram.py +++ b/webcollector/plugin/ram.py @@ -70,7 +70,7 @@ def merge(self): class RamCrawler(AutoDetectCrawler): - def __init__(self, auto_detect): + def __init__(self, auto_detect, **kwargs): self.ram_db = RamDB() - super().__init__(RamDBManager(self.ram_db), auto_detect) + super().__init__(RamDBManager(self.ram_db), auto_detect, **kwargs) From 98b6980063a2437b40c474f622868a11e7008204 Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Wed, 13 Feb 2019 01:41:25 +0800 Subject: [PATCH 02/14] add pypi --- .gitignore | 3 ++- dist/webcollector-0.0.2a0.tar.gz | Bin 0 -> 6068 bytes setup.py | 4 +--- 3 files changed, 3 insertions(+), 4 deletions(-) create mode 100644 dist/webcollector-0.0.2a0.tar.gz diff --git a/.gitignore b/.gitignore index 646a869..9825a30 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /.idea/ -**/__pycache__/ \ No newline 
at end of file +**/__pycache__/ +/webcollector.egg-info/ \ No newline at end of file diff --git a/dist/webcollector-0.0.2a0.tar.gz b/dist/webcollector-0.0.2a0.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..795f297b7c553f93c835624944767d7cb46aa747 GIT binary patch literal 6068 zcma)=RacY^!$#@ulB%R2}kM1p_T4#n2|>522shOyJKjW`+MGh@SR;J z`|w(8ug#E%h2@UwV2J`ay88KB207U~czbv_Irw|~@(J<_^1rbY5tTzHLu(PuOi-p~K<@W=Ff5`ZA z&{NN?(4U$;Ah+^P9KsRkAqFH5jN(5tx#iZ^FN0-9lvo8)>dnm~1k&r5eclwVY{H0M zk^~5T8^bU8d-1|{R7YJU(i}n{ZM2r%AW>Bq>^m293Z{;TD0Df>f7h9MLaXYufd?MZm4)9N~wquBe z#LBW$792Ac#5Q!j#}!F=t$Myjo0OU|GW_n8NR3P)KZ~Em*60!Nz zEKw+p^Kxf?Dq0HYR<3y7eMa1w=f6_{Qo#^#Ae;AJ1TU)&!qb~Yaf#te)7bXXUR(Iu zVt2|P!rS&_aSpZzflU~CiXAOcfv(laP_G}<$U1gXArAS8_DK*!sZGOK^e7sf z0;6GY8AKHwSn0+;r#Opvem;e++qMWm-7fI_h_|K+ka{i_SYE8VNit9M# zfIxBLdNXn^Kx_?uDvO%D<~R|IIRp0+g3>_;L?g2wTW0m-dJ+#3!c4DY!?-E(LZDz$ zZ6pL|eC1Rj#7*8);nn`2L<@`mc2l#zG!5rI)@ORboMf7jXUCZbQsh)doeMIvtmCd` zClTfo9Tf}>gO<-BG2bYP@bhO2QgnZz(HNRlxj&Ar+Go%f6T3k(JVM_zGbeE*ceZ@z z@3w_lhzSoVNe@6Dubk`m)@MM%psIN;lKhT97O@JMB?aY!bH;zx(%{fNUDpci>wQae zje~kcroU`lfexmfEEs5Ckx@R+iuO!YkoHGlxXM-2IM@3L4MUr$hU8iCSqM>ah8I(f zIZ5xRzmr?*M9}noB-TD-6=$*z?l+^kv3YqUs449%#_YlRf}3vGL=jsxt@u4gs zTjHZw#jZ;qJeok3Ha{s<*X8vZyw$ETfZ$Y6r*&E)(H2&^`djEXUV55-DeTSt6~!NT zf#EudVh9njT!4LuYv|lUWkoi7DeR}!k>n^HOsW}lPu&S&DiRZO0jph ztK4g-@PJ-F+N}NuL1Bjwp~rJL_DfJmWCDVLMQ6Zv`kBXvD42L_nJ>IGvZ6t*nF8X6 zu8I4%@$d;_UNbB-MJ?bX^4}oSX_0H=q$7U?5l?R)*F@>uM`g8uJr;6r^b`XE2NomH zMg2A#skG#TyYfu5CLkROn*KyhD}1N<`Qtq(o(2U68?EZQU*moUWeFX;{1ihOeX1w) zOY!EJ@MTJ*W3!2~#J-fGr!HM<>0{x554+^~YY@Du7u;z62SX<4?fNPm+JO7px`(Gr z(8KQ8!X}V=3fxMD2MH#H!K*sK$ioPb?Xifze2zW;?j8Vm@cchuhhVPJwb=~?ghm!l_5o2T zr`2?w-S2wdh!^0zR5r|cD+6DLP62Ds?72VIOVDE`0dudfb!#^$AoL!2pNN8mK#kY zx+|ii85XPL=)!{@h!8g+Qg9 zP7KE(it2Snz|}yNYMQhDRiEXXh`fAfvq9PnAJQj&LZc-)uq|@cvlsY1QZS|_ zV;L-ajVgLBjVG>oaXLB1>B&(pJLBJE^s5BV^2?>}2=JlJwjcxxY~hu(|A0+=lX(5k z;~rA6^!QF~`Ms7>p@^kWqEr3dDyJ9c2S-6ehI`#SJaH5MvHSE-8x|*@FZ+De!jPr4 z8`Q+o>iUC}|6*GYX7`V=-v?q}3+{{OZN?aAZd!g7zM9ba(o#C6vy_}RCgq|-x=uBh z_MB|k&`;GqsaehXx)3~`+>#bk5Okq?EhAAX*m>2@{S%x1ePVJwGHSAFRrzeNOBdtl z2$Y~gwnp`(G}bd16_;{}@!tlMYQF;!vO!$^*Y}e*@%?FYyAr9wmr<8d5JFN*m$tS{ z4XKfu;u)JGc5KJC?U?S*WL^UlG~n(g#ZiXz=Sr`xv}VfjVdV=E*Zgs;KRp}jnDioS zlMmF|kGYPd3_VU?@kIC9QgIP#%uG~-Gg#4d< z60EjD8cF#Ls*2zl8_=d$77^J2S=2L0nMNM>d_iV|*iNWQ_*4i7Qry~fV&31e#K(`# zMd8F|-Gb04YUVVDSjE=~0fVfXJh7@B8&ypGR7x^GD}Rbu@@Y(fJfWj*Pd4_zoj-Ej z6>!rPDvjuZxkqji-hx2-4?u6?OQ8>9fbx8)P3yheFb%7YE#_-uTWpRFYz0hbg~0BF zI%k1ubv6{^lq#>O(GntoN;S1w&zKbBjHWuP^uv}~>b=jw=xEjKW>x(Gitkrs@c{<9 zVN7(?I!R!T9v%hs&ASpmo)a#Uzknzo_Y3%%3Q6*0m*(=%e<;imLaIx3KmADE);5h= zpZ2Mk$e0>#r*(}A3qW@wZH93lL?zA46)!wND+WbKhL8`?NW0yOb6Os-NnDot4a#3cDLmb(mg`8P z6iB*cjbSro$T~vQ(H$aH2rs8I)Bs(46)C8(ptj?FS^s6 z9}&{*V%hwQ5oP%Uiha1Fmtn-~BK}!s?<9{w@yH`|T;&6my7FVAF6@S0+;xA3qbbEr zqK%jiFXJv6%AyCu*b)oUQWbZjP!wtTP~?8G^@ioQo3C2TWoB_yPr5cc(#FvU=#r&A zT#g~e4x0n+e@V*9h`(ZBIR06s{Z^FB|5!(7a%61TZ5L};Z=G<$DmVn5_i((Cm1MiN zt&U|JSTODoK-?Lux!v`J&9%txh;-U=V272Io_{D|fT&C-+znMj=bw@`)_gJH7ANDA zNwm!~@|IlG*UmodAHNXCW&3jdL?;l9=R4a*<0a6#0HSoYWFpu57q6BQMq5M4l|zbR zYkH8Ixvz0eP1Rq`=nhp=b3(&Ff0{mbEYD(gGWeOV_OY!+giq%52#sxvoVz-_Jyu;- z@$gYn=7UPB7R9erTm38(&UJ7lu5OoyZ-dCx^6n(nRUZ0}%coJAZ}b~@G&a*Kh7n#mOt_aAlUzyZ|PKRilOiKOOCsoFFm zy6-96nby*_-Ytd>^Xa7y15&c$GE@w2=vWJ%e1R#c@AL2c#IvL8=qDuaKZrw8Wb;f% z97rVBJ#fe1pTb(ad*}drI>6T>|M|*C8|ApPE6(fVLN=PwEZ-ha*vRU*neqm~_w7Xp zdjsjUbB?BT8ckDV&evCmlIHqDNW?2^@77Sgg7pxX*vf!9&7-o5%h7ByKyoTX;J%${VHfq=zKdMNybK7lvpQ;|kK31#pBt|P9$~2CS zduRtNlc+PlsrYpJF!AvyLbG^mx*_ZVdY=+puOGQKdi9V9Zq&|h&k;|i=}S3c#|Mh_ 
z@q2ROsaf5`yfOTqcQ+^R3TL|yp|{;xDtgEE<-77d_zvCIuYH?85tYOaK^sZe3iy#B z7Nzm?HUe+M@gKg@k55x;;ABDh1hnag!^PAJMoOalkXK5Ok0U%)9;x;QLLtJAh0h}p z)Un1bjhJigE>7qRz`C}KPTxg(&mTxT8D&MhGSSbi3h`i z714e)Nm9F2y6$n_jlUyWaBJ@c!h45IGM}q5GFEWecGb$uR8T~3z97AW1c4QQQ{d(a ztJx<}f>|xbNGCtUjfJN#EE@WkYX=&z=V_Q)_4v>6|3We~3FBy7#OC)SuAX;nq2i2Y zqwXW%tl7v^j6midpQ;uPN!zQZ{Ovl&n>|BQc-E1Psv~r>Y&LCIkvp1KzQFi3aF$F- zY>ba)=`S^##bzD&P3?7ARTpQ=Vj|1&?iWn z(P6UbWka#eU7Ez?o}Y%>4ljr?s{7ErU(=afx(J!baM-PF{>ig1`V)G1Xy#v}i7|JP z_o{>PAIdBAzK~89hs2r_Gkf$ z_KhQr;$!i$n)tp@_c9A z^@tb=k_Esp`Xa9*XM^Al@#gZ-?D4Z&Uq(d+-A+eLDD$dXDJT#;e1@vw(E8N(*yB~% z2^?!o0jPuhNB4~~H;6`rZQ~Z8$AUYd-!I^IARhx^(#dFC zJO6;P$U;SLA}Ds9AD=BXN9as2T<`YzVyH%^rzN=q*NPTL%3tML_=)McgpA9=Rf~eL z5MdqyA|RrXw+PWl30X}yX7Y_j==;JMJ zEUk2PZglHVm_PV;ph1)B8GW#??V&gUg}42iv>k^Wt<{}za;}b6c6_D3yP^z}NPH*7eZ#{uaI@bM_3s}r^cgmQ zW%}?Uum_yNpH97U^MI3Co}b(=fX1YpjO``gIq0Zt9ppyX84dx&s(Jx<&n@u7iQLy~ z3Uzfn!clymZT}Mbj|N_-#o@*+qakAE9Qoz`1k6gq)ZN z9;L#dpq&WyH#*B9AvUR?c9+BQ61b$kOoE?&zCmnC*DeO-tY#tIZ6LMp9ORlxE+5L| zu$BMFFSkUY4_DK1nJ1ONNtiiPl@0l;nHLX>CKk&jxa}#+&!c5Pp8CBSnXM0FqW~u7 zC-)-|%u7H9SaDB(0g$DQuYk}|AWa}&IS7oH5O@~iu@x#0?Wnfn@Cs(7|BNf;n~&O= zISJ)e5bD1TL;E_#S%9X&+bbT|*ephJvqszV+%L4qRX=y#Cvq}coSmn3;EivdXG!j= ztQ4HL&B?45UAk}Vv1w8aBAt1!RsH0X$vlXn_uW0l4L@sK_0Z0u1 qy$d0Kk&^@(qrx+5-j;J4S8E#A82Hh#)ObI;z literal 0 HcmV?d00001 diff --git a/setup.py b/setup.py index f5720a5..009e6df 100644 --- a/setup.py +++ b/setup.py @@ -14,9 +14,7 @@ "aiohttp", "BeautifulSoup4" ], - description=""" - An open source web crawler framework. - """, + description="WebCollector-Python is an open source web crawler framework based on Python.It provides some simple interfaces for crawling the Web,you can setup a multi-threaded web crawler in less than 5 minutes.", license="GNU General Public License v3.0 (See LICENSE)", url="https://github.com/CrawlScript/WebCollector-Python" ) \ No newline at end of file From 8b1d4c8189633047745988167fee3132172967ce Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Wed, 13 Feb 2019 02:28:21 +0800 Subject: [PATCH 03/14] add detected filter plugin --- README.md | 42 +++++++++++++++++++++++++++++++ dist/webcollector-0.0.3a0.tar.gz | Bin 0 -> 6500 bytes examples/demo_detected_filter.py | 35 ++++++++++++++++++++++++++ setup.py | 2 +- webcollector/crawler.py | 5 +++- webcollector/fetch.py | 13 +++++++++- webcollector/filter.py | 15 +++++++++++ webcollector/generate.py | 9 +++---- 8 files changed, 112 insertions(+), 9 deletions(-) create mode 100644 dist/webcollector-0.0.3a0.tar.gz create mode 100644 examples/demo_detected_filter.py create mode 100644 webcollector/filter.py diff --git a/README.md b/README.md index cb86568..6f6b0eb 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,48 @@ class NewsCrawler(wc.RamCrawler): print("CONTENT: ", content[:50], "...") +crawler = NewsCrawler() +crawler.start(10) +``` + +### Filter Detected URLs by detected_filter Plugin + +[demo_detected_filter.py](examples/demo_detected_filter.py): + +```python +# coding=utf-8 +import webcollector as wc +from webcollector.filter import Filter +import re + + +class RegexDetectedFilter(Filter): + def filter(self, crawl_datum): + if re.fullmatch("https://github.blog/2019-02.*", crawl_datum.url): + return crawl_datum + else: + print("filtered by detected_filter: {}".format(crawl_datum.brief_info())) + return None + + +class NewsCrawler(wc.RamCrawler): + def __init__(self): + super().__init__(auto_detect=True, detected_filter=RegexDetectedFilter()) + self.num_threads = 10 + self.add_seed("https://github.blog/") + + def visit(self, 
page, detected): + + detected.extend(page.links("https://github.blog/[0-9]+.*")) + + if page.match_url("https://github.blog/[0-9]+.*"): + title = page.select("h1.lh-condensed")[0].text.strip() + content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip() + print("\nURL: ", page.url) + print("TITLE: ", title) + print("CONTENT: ", content[:50], "...") + + crawler = NewsCrawler() crawler.start(10) ``` \ No newline at end of file diff --git a/dist/webcollector-0.0.3a0.tar.gz b/dist/webcollector-0.0.3a0.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..613d6aaf229e63e30779d967974284a86bfd94da GIT binary patch literal 6500 zcmV-q8Jp%GiwFoZ5Mx{d|72-%bX<33VqzGULjBzn&K`7!3H3 zWLcMBTxAiMnVx6&boX>O?(oF+T^HMtAM7`bhS9bf2QMBsH6Zo-efk%Y`j`H0wma=c zui0qyS}z*SR=3l6fx0iAAjM&11qeOLAmzdRKS_7G^*??0djIXotB;Sk{=HtWaQ*wO zuCo5^UZ?v4HOlq>>gfZHtf>`Q`~Sj0==k0MHI2qj<;0rf0g^YzP9-V9x5-Xr9M9)g zupFRY@Z>1jMW@T?+V_wXBFjVm0(&U*<^J2Q$lt*E2t0fE@1%Mc0_Lzqj;bWJ8M~j^YKf&^&e{ zXFs|Qur-AStcDu*%r4F>4v*h6n(@w|`cmg5f4`N_MJkH4JkRNgx__Cjdm^~pJU zjXfM#E;@}Tu45yfhi=G3plt@T)hO!8 z+`aVxM_eE?ywAJvZs9D*&wu~>->}%%I~90_HK670K{^ktz`k~F@xj~*VJeK@33PzX zJ23ie(f#rkhWuG{YZv`wg^o=gzDE~RJoim2j(pR@ccIC*g|S#()-bCb9u$_Try%B* z7hCQ+Ao@~b!T%XM_6;E`k?1KL1pXZ6&xYw)qGKGv;K@cm|MWhjJMv-OAMmAEaO~Yh zwm${H2vV29l<4&g&`UzD zSB^`Vkz>BP6tmD|nJl+CV+Bs#_{#B8oKx~O_?9p65L}xD{ydXm@Q(31Y#a%Q0L$q4 zi5wMTh^%Cd{gLsL0pzGaXUzmrIi4%0Lx7E0?7DOV>o;$!(fo10(SqHZfkU@}vYi89 zbpuZ>mF8nW{DF2yu%5zd4w_3+{Lve@+^8M zGOV&Y67g2FG0e1BlX&0Hwi{3J@FL3bN`#wxE+sP9iG2GGtbAPPv+rk5Ve8-uYT}{wG%>U(5cZ(HHg~jefID=zqV}?Unfd zE%2Yp1HiTcPi!o#0Ju9Y5x~3u_)OuiQN82HzD_jnARX~5;4g%>#`nQC?x&Kf1u?;<@sJ#)?u3G z3Awj~2?%t#ff(K}GuT>PSNKXyyU@04S3_S{Rn(K4f{d=8Xx`Jl!qlV>N~xsJP$)>3 zCsRi4Dil>;sckFQRYSGQO8FVvv_xY}FFm%oVIa2)BtzPfwam3bas|O|766q@=r_HF z*+bECfphc6FJMyehP7Cbn+CaV9kL>m)UT2Mv`vO-n_*HhENE4_9wOtpxJ-Ik_4?YL$Yi+b-mVN_G#$Y7)~@LMh! zaHi;r`6uJRpRpZBc&(+)0%)ULk6zN4zGsW7Q@n_-fkIC4=M}`pnB(9I*Xo5B&JUcDV0=Gt0Qkp6^ULARRWfltn0}QpC^7r zR?jVnycWt$xQp5TWZR~WhC|d)@3$=nc#6?y8&h>zt$ONW3*Nw9M;4s0@DL#&ONfBT znR!Wz?0Dqtl$*;ED`~F~S`}Cx?1@~brJF@MI95%*GMD$9LZI0GBuoHoD)Pr5|Eag2 zWbOsuGZmyz+S+mh>eXzh1CRElma{b&BVy1?{i+xW(c(Jc^#J|m{T4-jWVw`l41SBg z{E@yQ!00RJAh|patD0k3a-MHCJJ)xq=IT-kV1uplAdym4seua5YsFBguTpj3e1j? 
zATys#mr7GDpma*Iii%Z_WWG+TWkD(Rm2bY&h}+u=h6_JrEn(FfnYo(cD^ zH;v&=bZ}G|2)P#x#WY?jF+?xFf2m`ANyKpIAPWM1(HvaL!jhYZvR?=V#!&zz#A{p5 zMKLEiJW#iozEu+|?6Cu)51lU@Sf;dI4mjy51D;HN-eEjl77g&UUOoNj2Hix6M8{4( zsIiacKp}We<^u)LJz7*LNwyv@?JY_7q(X@a4G#65K!gpXJ&T!PoH~%a#3)LyK!RdR zBqf+2X`N2G+`6r!!+3hylH2z1Y^#A=P z=YPSM&cbD=|MDCE%o@l{$z5(D*og?gL<2CfLdQO$SIPX!5;y~F&4&Ql2&1VVN8*da z_!hfEQSa@^tB*W`d1(z_{;g((HvO_li2hci#Yw2(dN_c`ISxbODjdEnZ3`a-{|$U~ zwNHHA4jzOWTiXBhTdmUm=b5el6cZ*2d-jrjhsB6KM~@``H?!;CYW4f2{P!fOq>&~4 z-w^*#-@V=^$MK`{fBjZh!GH2i@Dl%@B7MM-HMJsZ|6e#DhFJsDG#WdV6JV$Y8BTAf zl9b@vWT!HY=W{Dq4$v=na+K_%(`9t+dqf7bJmkZfLA>$;8xwFySQE^EW5joUaD#v| z#8Z%{5vymsjnE?SZ=ER)q2e5)&>_*#K_HEAFtcoUo!piX>nq~s3+?YM{TSI6ky@j8 zfh>^5-N@Nz!QG(&xpDF(F5nZPDwOY{x#PtV4vn448)$EzeOQcGNQHxggDWSxjweKK za6sAGF%434pc(N_{_F4`kiMgy|2+^1 zO8kF<^l5Z>{9$Cwrxki`8jEG6Nc)$~U~U2V*X)SRz0Z6_u%i9ISVbRpyiyd6xS|bH zSkX8yRyOleB9Xr^sg5gGMsqV?DfZl{JhD-5nf_|@KeH1b zmH%t1@?W#lE%iT7zWyb%SEe$RsZ3=mQ<=(CrZSbOOl2xlnaWhAGL@-JWh&F3BmEyW Ke&_fA$N&ItRpz+> literal 0 HcmV?d00001 diff --git a/examples/demo_detected_filter.py b/examples/demo_detected_filter.py new file mode 100644 index 0000000..2d44ff4 --- /dev/null +++ b/examples/demo_detected_filter.py @@ -0,0 +1,35 @@ +# coding=utf-8 +import webcollector as wc +from webcollector.filter import Filter +import re + + +class RegexDetectedFilter(Filter): + def filter(self, crawl_datum): + if re.fullmatch("https://github.blog/2019-02.*", crawl_datum.url): + return crawl_datum + else: + print("filtered by detected_filter: {}".format(crawl_datum.brief_info())) + return None + + +class NewsCrawler(wc.RamCrawler): + def __init__(self): + super().__init__(auto_detect=True, detected_filter=RegexDetectedFilter()) + self.num_threads = 10 + self.add_seed("https://github.blog/") + + def visit(self, page, detected): + + detected.extend(page.links("https://github.blog/[0-9]+.*")) + + if page.match_url("https://github.blog/[0-9]+.*"): + title = page.select("h1.lh-condensed")[0].text.strip() + content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip() + print("\nURL: ", page.url) + print("TITLE: ", title) + print("CONTENT: ", content[:50], "...") + + +crawler = NewsCrawler() +crawler.start(10) diff --git a/setup.py b/setup.py index 009e6df..e0f5662 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="webcollector", - version="0.0.2-alpha", + version="0.0.3-alpha", author="Jun Hu", packages=find_packages( exclude=[ diff --git a/webcollector/crawler.py b/webcollector/crawler.py index 6c1320c..0747725 100644 --- a/webcollector/crawler.py +++ b/webcollector/crawler.py @@ -17,10 +17,12 @@ class Crawler(object): def __init__(self, db_manager, requester=HttpRequester(), - generator_filter=StatusGeneratorFilter()): + generator_filter=StatusGeneratorFilter(), + detected_filter=None): self.db_manager = db_manager self.requester = requester self.generator_filter = generator_filter + self.detected_filter = detected_filter self.fetcher = None self.num_threads = 10 self.seeds = CrawlDatums() @@ -55,6 +57,7 @@ def start_once(self, depth_index): self.requester, execute_func=self.execute, generator_filter=self.generator_filter, + detected_filter=self.detected_filter, num_threads=self.num_threads ) return self.fetcher.start() diff --git a/webcollector/fetch.py b/webcollector/fetch.py index c78f91b..8d10256 100644 --- a/webcollector/fetch.py +++ b/webcollector/fetch.py @@ -13,11 +13,13 @@ def __init__(self, requester, execute_func, generator_filter=None, + detected_filter=None, num_threads=10): self.fetch_queue = None 
self.feed_stopped = None self.generator = None self.generator_filter = generator_filter + self.detected_filter = detected_filter self.feeder = None self.buffer_size = 1000 self.db_manager = db_manager @@ -66,7 +68,16 @@ async def fetch_coroutine(self, execute_func): crawl_datum.status = CrawlDatum.STATUS_DB_SUCCESS self.db_manager.write_fetch(crawl_datum) - for detected_crawl_datum in detected: + if self.detected_filter is not None: + filtered_detected = CrawlDatums() + for detected_crawl_datum in detected: + detected_crawl_datum = self.detected_filter.filter(detected_crawl_datum) + if detected_crawl_datum is not None: + filtered_detected.append(detected_crawl_datum) + else: + filtered_detected = detected + + for detected_crawl_datum in filtered_detected: self.db_manager.write_detect(detected_crawl_datum) logger.info("done: {}".format(crawl_datum.brief_info())) except Exception as e: diff --git a/webcollector/filter.py b/webcollector/filter.py new file mode 100644 index 0000000..372dfe2 --- /dev/null +++ b/webcollector/filter.py @@ -0,0 +1,15 @@ +# coding=utf-8 +class Filter(object): + def filter(self, crawl_datum): + return None + + +class HistoryFilter(Filter): + def __init__(self, history): + self.history = history + + def filter(self, crawl_datum): + if crawl_datum.key in self.history: + return crawl_datum + else: + return None diff --git a/webcollector/generate.py b/webcollector/generate.py index 9d83012..510772a 100644 --- a/webcollector/generate.py +++ b/webcollector/generate.py @@ -1,11 +1,13 @@ # coding=utf-8 from webcollector.model import CrawlDatum +from webcollector.filter import Filter class Generator(object): def __init__(self): self.num_generated = 0 + self.generator_filter = None def next(self): while True: @@ -27,12 +29,7 @@ def _next(self): return None -class GeneratorFilter(object): - def filter(self, crawl_datum): - pass - - -class StatusGeneratorFilter(GeneratorFilter): +class StatusGeneratorFilter(Filter): def filter(self, crawl_datum): if crawl_datum.status != CrawlDatum.STATUS_DB_SUCCESS: return crawl_datum From 3dcd5f0860bda264276beb9c0db3e662821cb55a Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Wed, 13 Feb 2019 02:29:24 +0800 Subject: [PATCH 04/14] remove dist --- dist/webcollector-0.0.2a0.tar.gz | Bin 6068 -> 0 bytes dist/webcollector-0.0.3a0.tar.gz | Bin 6500 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 dist/webcollector-0.0.2a0.tar.gz delete mode 100644 dist/webcollector-0.0.3a0.tar.gz diff --git a/dist/webcollector-0.0.2a0.tar.gz b/dist/webcollector-0.0.2a0.tar.gz deleted file mode 100644 index 795f297b7c553f93c835624944767d7cb46aa747..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6068 zcma)=RacY^!$#@ulB%R2}kM1p_T4#n2|>522shOyJKjW`+MGh@SR;J z`|w(8ug#E%h2@UwV2J`ay88KB207U~czbv_Irw|~@(J<_^1rbY5tTzHLu(PuOi-p~K<@W=Ff5`ZA z&{NN?(4U$;Ah+^P9KsRkAqFH5jN(5tx#iZ^FN0-9lvo8)>dnm~1k&r5eclwVY{H0M zk^~5T8^bU8d-1|{R7YJU(i}n{ZM2r%AW>Bq>^m293Z{;TD0Df>f7h9MLaXYufd?MZm4)9N~wquBe z#LBW$792Ac#5Q!j#}!F=t$Myjo0OU|GW_n8NR3P)KZ~Em*60!Nz zEKw+p^Kxf?Dq0HYR<3y7eMa1w=f6_{Qo#^#Ae;AJ1TU)&!qb~Yaf#te)7bXXUR(Iu zVt2|P!rS&_aSpZzflU~CiXAOcfv(laP_G}<$U1gXArAS8_DK*!sZGOK^e7sf z0;6GY8AKHwSn0+;r#Opvem;e++qMWm-7fI_h_|K+ka{i_SYE8VNit9M# zfIxBLdNXn^Kx_?uDvO%D<~R|IIRp0+g3>_;L?g2wTW0m-dJ+#3!c4DY!?-E(LZDz$ zZ6pL|eC1Rj#7*8);nn`2L<@`mc2l#zG!5rI)@ORboMf7jXUCZbQsh)doeMIvtmCd` zClTfo9Tf}>gO<-BG2bYP@bhO2QgnZz(HNRlxj&Ar+Go%f6T3k(JVM_zGbeE*ceZ@z z@3w_lhzSoVNe@6Dubk`m)@MM%psIN;lKhT97O@JMB?aY!bH;zx(%{fNUDpci>wQae 
diff --git a/dist/webcollector-0.0.3a0.tar.gz b/dist/webcollector-0.0.3a0.tar.gz
deleted file mode 100644
index 613d6aaf229e63e30779d967974284a86bfd94da..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 6500
zATys#mr7GDpma*Iii%Z_WWG+TWkD(Rm2bY&h}+u=h6_JrEn(FfnYo(cD^ zH;v&=bZ}G|2)P#x#WY?jF+?xFf2m`ANyKpIAPWM1(HvaL!jhYZvR?=V#!&zz#A{p5 zMKLEiJW#iozEu+|?6Cu)51lU@Sf;dI4mjy51D;HN-eEjl77g&UUOoNj2Hix6M8{4( zsIiacKp}We<^u)LJz7*LNwyv@?JY_7q(X@a4G#65K!gpXJ&T!PoH~%a#3)LyK!RdR zBqf+2X`N2G+`6r!!+3hylH2z1Y^#A=P z=YPSM&cbD=|MDCE%o@l{$z5(D*og?gL<2CfLdQO$SIPX!5;y~F&4&Ql2&1VVN8*da z_!hfEQSa@^tB*W`d1(z_{;g((HvO_li2hci#Yw2(dN_c`ISxbODjdEnZ3`a-{|$U~ zwNHHA4jzOWTiXBhTdmUm=b5el6cZ*2d-jrjhsB6KM~@``H?!;CYW4f2{P!fOq>&~4 z-w^*#-@V=^$MK`{fBjZh!GH2i@Dl%@B7MM-HMJsZ|6e#DhFJsDG#WdV6JV$Y8BTAf zl9b@vWT!HY=W{Dq4$v=na+K_%(`9t+dqf7bJmkZfLA>$;8xwFySQE^EW5joUaD#v| z#8Z%{5vymsjnE?SZ=ER)q2e5)&>_*#K_HEAFtcoUo!piX>nq~s3+?YM{TSI6ky@j8 zfh>^5-N@Nz!QG(&xpDF(F5nZPDwOY{x#PtV4vn448)$EzeOQcGNQHxggDWSxjweKK za6sAGF%434pc(N_{_F4`kiMgy|2+^1 zO8kF<^l5Z>{9$Cwrxki`8jEG6Nc)$~U~U2V*X)SRz0Z6_u%i9ISVbRpyiyd6xS|bH zSkX8yRyOleB9Xr^sg5gGMsqV?DfZl{JhD-5nf_|@KeH1b zmH%t1@?W#lE%iT7zWyb%SEe$RsZ3=mQ<=(CrZSbOOl2xlnaWhAGL@-JWh&F3BmEyW Ke&_fA$N&ItRpz+> From 695158393ab9b4d98d8afb450b933bd4f9c2a28f Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Wed, 13 Feb 2019 02:29:47 +0800 Subject: [PATCH 05/14] remove dist --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9825a30..fa2f82c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /.idea/ **/__pycache__/ -/webcollector.egg-info/ \ No newline at end of file +/webcollector.egg-info/ +/dist/ \ No newline at end of file From 7c6d63d8df0df800ef78dcd5f6ef15d98e4e0b74 Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Wed, 13 Feb 2019 02:34:35 +0800 Subject: [PATCH 06/14] update demo --- examples/demo_auto_news_crawler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/demo_auto_news_crawler.py b/examples/demo_auto_news_crawler.py index f428f07..3af08f2 100644 --- a/examples/demo_auto_news_crawler.py +++ b/examples/demo_auto_news_crawler.py @@ -7,7 +7,8 @@ def __init__(self): super().__init__(auto_detect=True) self.num_threads = 10 self.add_seed("https://github.blog/") - self.add_regex("https://github.blog/[0-9]+.*") + self.add_regex("+https://github.blog/[0-9]+.*") + self.add_regex("-.*#.*") # do not detect urls that contain "#" def visit(self, page, detected): if page.match_url("https://github.blog/[0-9]+.*"): From 88143114fa1c7240b122bc4e50b2a374bf89d9ef Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Wed, 13 Feb 2019 02:36:08 +0800 Subject: [PATCH 07/14] update demo --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6f6b0eb..aed6c5b 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,8 @@ class NewsCrawler(wc.RamCrawler): super().__init__(auto_detect=True) self.num_threads = 10 self.add_seed("https://github.blog/") - self.add_regex("https://github.blog/[0-9]+.*") + self.add_regex("+https://github.blog/[0-9]+.*") + self.add_regex("-.*#.*") # do not detect urls that contain "#" def visit(self, page, detected): if page.match_url("https://github.blog/[0-9]+.*"): @@ -57,7 +58,6 @@ class NewsCrawler(wc.RamCrawler): crawler = NewsCrawler() crawler.start(10) - ``` ### Manually Detecting URLs From 241889147e60d0207bf07f1a0d831e9a3fcd509b Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Wed, 13 Feb 2019 14:22:04 +0800 Subject: [PATCH 08/14] add redis crawler --- .gitignore | 3 +- README.md | 39 +++++++++++++++ examples/demo_redis_crawler.py | 29 +++++++++++ setup.py | 5 +- webcollector/__init__.py | 2 + webcollector/crawler.py | 24 ++++++--- 
webcollector/db_manager.py | 14 ++++-- webcollector/fetch.py | 8 +-- webcollector/model.py | 41 +++++++++++++++- webcollector/plugin/ram.py | 12 ++--- webcollector/plugin/redis.py | 90 ++++++++++++++++++++++++++++++++++ 11 files changed, 242 insertions(+), 25 deletions(-) create mode 100644 examples/demo_redis_crawler.py create mode 100644 webcollector/plugin/redis.py diff --git a/.gitignore b/.gitignore index fa2f82c..64849ec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /.idea/ **/__pycache__/ /webcollector.egg-info/ -/dist/ \ No newline at end of file +/dist/ +**/*.p \ No newline at end of file diff --git a/README.md b/README.md index aed6c5b..51ad163 100644 --- a/README.md +++ b/README.md @@ -131,4 +131,43 @@ class NewsCrawler(wc.RamCrawler): crawler = NewsCrawler() crawler.start(10) +``` + + +### Resume Crawling by RedisCrawler + +[demo_redis_crawler.py](examples/demo_redis_crawler.py): + + +```python +# coding=utf-8 +from redis import StrictRedis + +import webcollector as wc + + +class NewsCrawler(wc.RedisCrawler): + + def __init__(self): + super().__init__(redis_client=StrictRedis("127.0.0.1"), + db_prefix="news", + auto_detect=True) + self.num_threads = 10 + self.resumable = True # you can resume crawling after shutdown + self.add_seed("https://github.blog/") + self.add_regex("+https://github.blog/[0-9]+.*") + self.add_regex("-.*#.*") # do not detect urls that contain "#" + + def visit(self, page, detected): + if page.match_url("https://github.blog/[0-9]+.*"): + title = page.select("h1.lh-condensed")[0].text.strip() + content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip() + print("\nURL: ", page.url) + print("TITLE: ", title) + print("CONTENT: ", content[:50], "...") + + +crawler = NewsCrawler() +crawler.start(10) + ``` \ No newline at end of file diff --git a/examples/demo_redis_crawler.py b/examples/demo_redis_crawler.py new file mode 100644 index 0000000..4c25d42 --- /dev/null +++ b/examples/demo_redis_crawler.py @@ -0,0 +1,29 @@ +# coding=utf-8 +from redis import StrictRedis + +import webcollector as wc + + +class NewsCrawler(wc.RedisCrawler): + + def __init__(self): + super().__init__(redis_client=StrictRedis("127.0.0.1"), + db_prefix="news", + auto_detect=True) + self.num_threads = 10 + self.resumable = True # you can resume crawling after shutdown + self.add_seed("https://github.blog/") + self.add_regex("+https://github.blog/[0-9]+.*") + self.add_regex("-.*#.*") # do not detect urls that contain "#" + + def visit(self, page, detected): + if page.match_url("https://github.blog/[0-9]+.*"): + title = page.select("h1.lh-condensed")[0].text.strip() + content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip() + print("\nURL: ", page.url) + print("TITLE: ", title) + print("CONTENT: ", content[:50], "...") + + +crawler = NewsCrawler() +crawler.start(10) diff --git a/setup.py b/setup.py index e0f5662..5db3757 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="webcollector", - version="0.0.3-alpha", + version="0.0.4-alpha", author="Jun Hu", packages=find_packages( exclude=[ @@ -12,7 +12,8 @@ install_requires=[ "html5lib", "aiohttp", - "BeautifulSoup4" + "BeautifulSoup4", + "redis" ], description="WebCollector-Python is an open source web crawler framework based on Python.It provides some simple interfaces for crawling the Web,you can setup a multi-threaded web crawler in less than 5 minutes.", license="GNU General Public License v3.0 (See LICENSE)", diff --git a/webcollector/__init__.py b/webcollector/__init__.py 
index 38184e2..d25ed3f 100644 --- a/webcollector/__init__.py +++ b/webcollector/__init__.py @@ -2,8 +2,10 @@ import logging import sys +from webcollector.plugin.redis import RedisCrawler from webcollector.plugin.ram import RamCrawler + logging.basicConfig( stream=sys.stdout, level=logging.INFO, diff --git a/webcollector/crawler.py b/webcollector/crawler.py index 0747725..3544d0c 100644 --- a/webcollector/crawler.py +++ b/webcollector/crawler.py @@ -25,18 +25,28 @@ def __init__(self, self.detected_filter = detected_filter self.fetcher = None self.num_threads = 10 + self.resumable = None + self.seeds = CrawlDatums() + self.forced_seeds = CrawlDatums() - def add_seed(self, url_or_datum, type=None): - return self.seeds.append(url_or_datum).set_type(type) + def add_seed(self, url_or_datum, type=None, forced=False): + if forced: + return self.forced_seeds.append(url_or_datum).set_type(type) + else: + return self.seeds.append(url_or_datum).set_type(type) - def add_seeds(self, urls_or_datums, type=None): + def add_seeds(self, urls_or_datums, type=None, forced=False): crawl_datums = [] for url_or_datum in urls_or_datums: - crawl_datum = self.add_seed(url_or_datum, type=type) + crawl_datum = self.add_seed(url_or_datum, type=type, forced=forced) crawl_datums.append(crawl_datum) return crawl_datums + def inject(self): + self.db_manager.inject(self.seeds, forced=False) + self.db_manager.inject(self.forced_seeds, forced=True) + # def add_seed_and_return(self, url_or_datum): # crawl_datum = CrawlDatum.convert_from_item(url_or_datum) # self.seeds.append(crawl_datum) @@ -63,9 +73,11 @@ def start_once(self, depth_index): return self.fetcher.start() def start(self, depth): - if len(self.seeds) == 0: + if not self.resumable: + self.db_manager.clear() + if len(self.seeds) == 0 and len(self.forced_seeds) == 0: raise Exception("Please add at least one seed") - self.db_manager.inject(self.seeds) + self.inject() for depth_index in range(depth): print("start depth {}".format(depth_index)) start_time = time.time() diff --git a/webcollector/db_manager.py b/webcollector/db_manager.py index 7fec29e..f3990fb 100644 --- a/webcollector/db_manager.py +++ b/webcollector/db_manager.py @@ -3,10 +3,7 @@ class DBManager(object): - def inject(self, seeds): - pass - - def write_crawl(self, crawl_datum): + def inject(self, seeds, forced=False): pass def init_fetch_and_detect(self): @@ -23,3 +20,12 @@ def merge(self): def create_generator(self): return None + + def clear(self): + pass + + def open(self): + pass + + def close(self): + pass diff --git a/webcollector/fetch.py b/webcollector/fetch.py index 8d10256..40a3ade 100644 --- a/webcollector/fetch.py +++ b/webcollector/fetch.py @@ -31,12 +31,14 @@ def __init__(self, async def async_start(self): self.fetch_queue = queue.Queue() self.feed_stopped = False + self.db_manager.open() self.db_manager.init_fetch_and_detect() self.generator = self.db_manager.create_generator() self.generator.generator_filter = self.generator_filter async with self.requester.create_async_context_manager(): coroutines = [self.fetch_coroutine(self.execute_func) for _ in range(self.num_threads)] await asyncio.gather(*coroutines) + self.db_manager.close() def start(self): loop = asyncio.get_event_loop() @@ -66,7 +68,6 @@ async def fetch_coroutine(self, execute_func): execute_func(page, detected) crawl_datum.status = CrawlDatum.STATUS_DB_SUCCESS - self.db_manager.write_fetch(crawl_datum) if self.detected_filter is not None: filtered_detected = CrawlDatums() @@ -82,6 +83,7 @@ async def fetch_coroutine(self, 
execute_func): logger.info("done: {}".format(crawl_datum.brief_info())) except Exception as e: logger.error("failed: {}".format(crawl_datum.brief_info()), exc_info=True) + crawl_datum.status = CrawlDatum.STATUS_DB_FAILED - - + crawl_datum.num_fetched += 1 + self.db_manager.write_fetch(crawl_datum) diff --git a/webcollector/model.py b/webcollector/model.py index cd70182..ec68667 100644 --- a/webcollector/model.py +++ b/webcollector/model.py @@ -2,6 +2,7 @@ from urllib.parse import urljoin import chardet from bs4 import BeautifulSoup +import json # A CrawlDatum corresponds to a task description (usually for a webpage) @@ -16,13 +17,20 @@ class CrawlDatum(object): META_KEY_SYS_TYPE = "sys_type" - def __init__(self, url, key=None, type=None, meta_dict=None, code=CODE_NOT_SET, status=STATUS_DB_UNEXECUTED): + def __init__(self, url, + key=None, + type=None, + meta_dict=None, + code=CODE_NOT_SET, + status=STATUS_DB_UNEXECUTED, + num_fetched=0): self.url = url self.key = key if key is not None else url self.type = type self.meta_dict = meta_dict self.code = code self.status = status + self.num_fetched = num_fetched def set_key(self, key): self.key = key @@ -66,6 +74,37 @@ def brief_info(self): infos.append("Key: {} (URL: {})".format(self.key, self.url)) return " ".join(infos) + def to_dict(self): + dict_data = { + "url": self.url, + "key": self.key, + "type": self.type, + "meta_dict": self.meta_dict, + "code": self.code, + "status": self.status, + "num_fetched": self.num_fetched + } + return dict_data + + @classmethod + def from_dict(cls, dict_data): + return CrawlDatum( + url=dict_data["url"], + key=dict_data["key"], + type=dict_data["type"], + meta_dict=dict_data["meta_dict"], + code=dict_data["code"], + status=dict_data["status"], + num_fetched=dict_data["num_fetched"] + ) + + def to_json(self): + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str): + return CrawlDatum.from_dict(json.loads(json_str)) + class CrawlDatums(list): diff --git a/webcollector/plugin/ram.py b/webcollector/plugin/ram.py index 69aba84..26e25eb 100644 --- a/webcollector/plugin/ram.py +++ b/webcollector/plugin/ram.py @@ -17,11 +17,9 @@ class RamDBGenerator(Generator): def __init__(self, ram_db): super().__init__() self.ram_db = ram_db - self.iter = None + self.iter = iter(self.ram_db.crawl_db.values()) def _next(self) -> CrawlDatum: - if self.iter is None: - self.iter = iter(self.ram_db.crawl_db.values()) try: return next(self.iter) except StopIteration: @@ -32,19 +30,17 @@ class RamDBManager(DBManager): def __init__(self, ram_db): self.ram_db = ram_db - def inject(self, seeds): + def inject(self, seeds, forced=False): for seed in seeds: if isinstance(seed, str): seed = CrawlDatum(seed) + if not forced and seed.key in self.ram_db.crawl_db: + continue self.ram_db.crawl_db[seed.key] = seed def create_generator(self): return RamDBGenerator(self.ram_db) - def write_crawl(self, crawl_datum): - if crawl_datum.key not in self.ram_db.crawl_db: - self.ram_db.crawl_db[crawl_datum.key] = crawl_datum - def init_fetch_and_detect(self): self.ram_db.fetch_db = {} self.ram_db.detect_db = {} diff --git a/webcollector/plugin/redis.py b/webcollector/plugin/redis.py new file mode 100644 index 0000000..03de39e --- /dev/null +++ b/webcollector/plugin/redis.py @@ -0,0 +1,90 @@ +# coding=utf-8 +from redis import StrictRedis + +from webcollector.crawler import AutoDetectCrawler +from webcollector.db_manager import DBManager +from webcollector.generate import Generator +from webcollector.model import CrawlDatum + + 
+class RedisDBGenerator(Generator): + + def __init__(self, redis_db_manager): + super().__init__() + self.history_keys = set() + self.iter = redis_db_manager.redis_client.hscan_iter( + redis_db_manager.crawl_db + ) + + def _next(self) -> CrawlDatum: + try: + while True: + key, crawl_datum_json = next(self.iter) + if key in self.history_keys: + continue + else: + self.history_keys.add(key) + return CrawlDatum.from_json(crawl_datum_json) + except StopIteration: + return None + + +class RedisDBManager(DBManager): + def __init__(self, redis_client: StrictRedis, db_prefix): + self.redis_client = redis_client + self.db_prefix = db_prefix + self.crawl_db = "{}_crawl".format(db_prefix) + self.fetch_db = "{}_fetch".format(db_prefix) + self.detect_db = "{}_detect".format(db_prefix) + + def open(self): + pass + + def close(self): + pass + + def clear(self): + self.redis_client.delete(self.crawl_db) + self.redis_client.delete(self.fetch_db) + self.redis_client.delete(self.detect_db) + + def inject(self, seeds, forced=False): + for seed in seeds: + if isinstance(seed, str): + seed = CrawlDatum(seed) + if not forced and self.redis_client.hexists(self.crawl_db, seed.key): + continue + self.redis_client.hset(self.crawl_db, seed.key, seed.to_json()) + + def create_generator(self): + return RedisDBGenerator(self) + + def init_fetch_and_detect(self): + pass + + def write_fetch(self, crawl_datum): + self.redis_client.hset(self.fetch_db, crawl_datum.key, crawl_datum.to_json()) + + def write_detect(self, crawl_datum): + self.redis_client.hset(self.detect_db, crawl_datum.key, crawl_datum.to_json()) + + def merge(self): + print("merging......") + if self.redis_client.exists(self.fetch_db): + for _, crawl_datum_json in self.redis_client.hscan_iter(self.fetch_db): + crawl_datum = CrawlDatum.from_json(crawl_datum_json) + self.redis_client.hset(self.crawl_db, crawl_datum.key, crawl_datum.to_json()) + self.redis_client.delete(self.fetch_db) + + if self.redis_client.exists(self.detect_db): + for key, crawl_datum_json in self.redis_client.hscan_iter(self.detect_db): + if not self.redis_client.hexists(self.crawl_db, key): + crawl_datum = CrawlDatum.from_json(crawl_datum_json) + self.redis_client.hset(self.crawl_db, crawl_datum.key, crawl_datum.to_json()) + self.redis_client.delete(self.detect_db) + + +class RedisCrawler(AutoDetectCrawler): + def __init__(self, redis_client, db_prefix, auto_detect, **kwargs): + super().__init__(RedisDBManager(redis_client, db_prefix), auto_detect, **kwargs) + From 9d854e3ed8ca9a3a6dbd6a928b2712d8639b5422 Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Wed, 13 Feb 2019 14:23:54 +0800 Subject: [PATCH 09/14] add RedisCrawler --- README.md | 1 - examples/demo_redis_crawler.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 51ad163..84ca1cf 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,6 @@ crawler.start(10) ```python # coding=utf-8 from redis import StrictRedis - import webcollector as wc diff --git a/examples/demo_redis_crawler.py b/examples/demo_redis_crawler.py index 4c25d42..08d689a 100644 --- a/examples/demo_redis_crawler.py +++ b/examples/demo_redis_crawler.py @@ -1,6 +1,5 @@ # coding=utf-8 from redis import StrictRedis - import webcollector as wc @@ -11,7 +10,7 @@ def __init__(self): db_prefix="news", auto_detect=True) self.num_threads = 10 - self.resumable = True # you can resume crawling after shutdown + self.resumable = True # you can resume crawling after shutdown self.add_seed("https://github.blog/") 
self.add_regex("+https://github.blog/[0-9]+.*") self.add_regex("-.*#.*") # do not detect urls that contain "#" From 3931f5a89fc6aa42edea1cae0ff28f849cc869fc Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Mon, 19 Aug 2019 13:54:35 +0800 Subject: [PATCH 10/14] use requests instead of aiohttp --- examples/demo_server.py | 12 +++++ examples/demo_speed.py | 23 +++++++++ test.py | 95 ++++++++++++++++++++++++++++++++++++++ webcollector/fetch.py | 23 ++++++--- webcollector/plugin/net.py | 30 +++++++++++- 5 files changed, 175 insertions(+), 8 deletions(-) create mode 100644 examples/demo_server.py create mode 100644 examples/demo_speed.py create mode 100644 test.py diff --git a/examples/demo_server.py b/examples/demo_server.py new file mode 100644 index 0000000..dcde4c9 --- /dev/null +++ b/examples/demo_server.py @@ -0,0 +1,12 @@ +# coding=utf-8 +from flask import Flask +import time +import random +app = Flask(__name__) + +@app.route("/") +def index(): + time.sleep(2) + return "ok" + +app.run() diff --git a/examples/demo_speed.py b/examples/demo_speed.py new file mode 100644 index 0000000..15a27f5 --- /dev/null +++ b/examples/demo_speed.py @@ -0,0 +1,23 @@ +# coding=utf-8 +import webcollector as wc +import time + + +class RubyChinaCrawler(wc.RamCrawler): + def __init__(self): + super().__init__(auto_detect=False) + self.num_threads = 10 + self.add_seeds(["https://ruby-china.org/topics?page={}".format(i) for i in range(1, 40)]) + + def visit(self, page, detected): + print("start_visit", page.url) + time.sleep(4) + print("end_visit", page.url) + + +crawler = RubyChinaCrawler() +start = time.time() +crawler.start(10) +print(time.time() - start) + + diff --git a/test.py b/test.py new file mode 100644 index 0000000..7f22256 --- /dev/null +++ b/test.py @@ -0,0 +1,95 @@ +# coding=utf-8 + +import asyncio +import requests +import random +import threading + + + +import asyncio +import requests +from concurrent.futures import ThreadPoolExecutor +import time + +url = "http://127.0.0.1:5000" + +loop = asyncio.get_event_loop() + + +async def cor(): + print("c-start") + await asyncio.sleep(4) + print("c-end") + + +async def main(): + tasks = [loop.create_task(cor()) for _ in range(10)] + print("finish tasks======") + for i, task in enumerate(tasks): + print("start", i) + time.sleep(5) + await task + print("end", i) + +loop.run_until_complete(main()) + +adfads + +pool = ThreadPoolExecutor(20) + + + +def request(i): + print("start", i) + time.sleep(5) + text = requests.get(url).text + print("content:", i, text) + print(threading.get_ident()) + return text + + +# f0 = loop.run_in_executor(pool, request) +# f1 = loop.run_in_executor(pool, request) +# futures = [loop.run_in_executor(pool, request, i) for i in range(20)] +# futures = [loop.run_in_executor(None, requests.get, "http://127.0.0.1:5000") for _ in range(10)] +# print("======") + + +async def cor(i): + for j in range(20): + future = loop.run_in_executor(pool, request, "{}_{}".format(i, j)) + await future + print("cor", i) + print("end-cor", i) + +loop.run_until_complete(asyncio.gather(*[cor(i) for i in range(10)])) + + +# async def main(): +# # for future in futures: +# for i in range(10): +# # future = loop.run_in_executor(pool, request) +# await futures[i] +# print("end", i) +# +# loop.run_until_complete(main()) + +# async def test(): +# print("start") +# # await asyncio.sleep(2) +# request_future = loop.run_in_executor(pool, request) +# result = await request_future +# print("end") +# +# +# +# +# async def main(): +# tasks = [loop.create_task(test()) 
for _ in range(10)] +# for i in range(10): +# await tasks[i] +# print("task end", i) +# +# +# loop.run_until_complete(main()) \ No newline at end of file diff --git a/webcollector/fetch.py b/webcollector/fetch.py index 40a3ade..0b7fe6b 100644 --- a/webcollector/fetch.py +++ b/webcollector/fetch.py @@ -28,6 +28,9 @@ def __init__(self, self.execute_func = execute_func self.num_threads = num_threads + self.loop = None + + async def async_start(self): self.fetch_queue = queue.Queue() self.feed_stopped = False @@ -35,14 +38,19 @@ async def async_start(self): self.db_manager.init_fetch_and_detect() self.generator = self.db_manager.create_generator() self.generator.generator_filter = self.generator_filter - async with self.requester.create_async_context_manager(): - coroutines = [self.fetch_coroutine(self.execute_func) for _ in range(self.num_threads)] - await asyncio.gather(*coroutines) + + # async with self.requester.create_async_context_manager(): + # coroutines = [self.fetch_coroutine(self.execute_func) for _ in range(self.num_threads)] + # await asyncio.gather(*coroutines) + + coroutines = [self.fetch_coroutine(self.execute_func) for _ in range(self.num_threads)] + await asyncio.gather(*coroutines) + self.db_manager.close() def start(self): - loop = asyncio.get_event_loop() - loop.run_until_complete(self.async_start()) + self.loop = asyncio.get_event_loop() + self.loop.run_until_complete(self.async_start()) return self.generator.num_generated def feed(self): @@ -63,7 +71,10 @@ async def fetch_coroutine(self, execute_func): else: crawl_datum = self.fetch_queue.get(block=False) try: - page = await self.requester.get_response(crawl_datum) + # loop = asyncio.get_event_loop() + request_future = self.loop.run_in_executor(None, self.requester.get_response, crawl_datum) + page = await request_future + # page = await self.requester.get_response(crawl_datum) detected = CrawlDatums() execute_func(page, detected) diff --git a/webcollector/plugin/net.py b/webcollector/plugin/net.py index 3a81aaa..991d87c 100644 --- a/webcollector/plugin/net.py +++ b/webcollector/plugin/net.py @@ -3,9 +3,9 @@ from webcollector.model import Page from webcollector.net import Requester import aiohttp +import requests - -class HttpRequester(Requester): +class AioHttpRequester(Requester): def __init__(self): self.session = None @@ -32,5 +32,31 @@ async def get_response(self, crawl_datum): return page +class HttpRequester(Requester): + + def __init__(self): + self.session = None + + + # def create_async_context_manager(self): + # self.session = aiohttp.ClientSession() + # return self.session + + # def request(self, crawl_datum): + # headers = {"User-Agent": DEFAULT_USER_AGENT} + # return requests.get(crawl_datum.url, headers=headers) + + def get_response(self, crawl_datum): + headers = {"User-Agent": DEFAULT_USER_AGENT} + response = requests.get(crawl_datum.url, headers=headers) + + code = response.status_code + content = response.content + encoding = response.encoding + content_type = response.headers["Content-Type"] + crawl_datum.code = code + page = Page(crawl_datum, content, content_type=content_type, http_charset=encoding) + + return page From fbcb316cbdb11dc208f947b866d99c0cb390f0bc Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Mon, 19 Aug 2019 21:52:12 +0800 Subject: [PATCH 11/14] 0.5 --- README.md | 64 +++++++++++++++++++++++++ examples/demo_custom_http_request.py | 56 ++++++++++++++++++++++ examples/demo_speed.py | 4 +- setup.py | 5 +- webcollector/fetch.py | 7 ++- webcollector/net.py | 16 +++++-- 
webcollector/plugin/net.py | 70 ++++++++++++---------------- 7 files changed, 170 insertions(+), 52 deletions(-) create mode 100644 examples/demo_custom_http_request.py diff --git a/README.md b/README.md index 84ca1cf..f6ee201 100644 --- a/README.md +++ b/README.md @@ -169,4 +169,68 @@ class NewsCrawler(wc.RedisCrawler): crawler = NewsCrawler() crawler.start(10) +``` + +### Custom Http Request with Requests + +[demo_custom_http_request.py](examples/demo_custom_http_request.py): + + +``` +# coding=utf-8 + +import webcollector as wc +from webcollector.model import Page +from webcollector.plugin.net import HttpRequester + +import requests + + +class MyRequester(HttpRequester): + def get_response(self, crawl_datum): + # custom http request + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36" + } + + print("sending request with MyRequester") + + # send request and get response + response = requests.get(crawl_datum.url, headers=headers) + + # update code + crawl_datum.code = response.status_code + + # wrap http response as a Page object + page = Page(crawl_datum, + response.content, + content_type=response.headers["Content-Type"], + http_charset=response.encoding) + + return page + + +class NewsCrawler(wc.RamCrawler): + def __init__(self): + super().__init__(auto_detect=True) + self.num_threads = 10 + + # set requester to enable MyRequester + self.requester = MyRequester() + + self.add_seed("https://github.blog/") + self.add_regex("+https://github.blog/[0-9]+.*") + self.add_regex("-.*#.*") # do not detect urls that contain "#" + + def visit(self, page, detected): + if page.match_url("https://github.blog/[0-9]+.*"): + title = page.select("h1.lh-condensed")[0].text.strip() + content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip() + print("\nURL: ", page.url) + print("TITLE: ", title) + print("CONTENT: ", content[:50], "...") + + +crawler = NewsCrawler() +crawler.start(10) ``` \ No newline at end of file diff --git a/examples/demo_custom_http_request.py b/examples/demo_custom_http_request.py new file mode 100644 index 0000000..d9068e4 --- /dev/null +++ b/examples/demo_custom_http_request.py @@ -0,0 +1,56 @@ +# coding=utf-8 + +import webcollector as wc +from webcollector.model import Page +from webcollector.plugin.net import HttpRequester + +import requests + + +class MyRequester(HttpRequester): + def get_response(self, crawl_datum): + # custom http request + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36" + } + + print("sending request with MyRequester") + + # send request and get response + response = requests.get(crawl_datum.url, headers=headers) + + # update code + crawl_datum.code = response.status_code + + # wrap http response as a Page object + page = Page(crawl_datum, + response.content, + content_type=response.headers["Content-Type"], + http_charset=response.encoding) + + return page + + +class NewsCrawler(wc.RamCrawler): + def __init__(self): + super().__init__(auto_detect=True) + self.num_threads = 10 + + # set requester to enable MyRequester + self.requester = MyRequester() + + self.add_seed("https://github.blog/") + self.add_regex("+https://github.blog/[0-9]+.*") + self.add_regex("-.*#.*") # do not detect urls that contain "#" + + def visit(self, page, detected): + if page.match_url("https://github.blog/[0-9]+.*"): + title = page.select("h1.lh-condensed")[0].text.strip() + 
content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip() + print("\nURL: ", page.url) + print("TITLE: ", title) + print("CONTENT: ", content[:50], "...") + + +crawler = NewsCrawler() +crawler.start(10) \ No newline at end of file diff --git a/examples/demo_speed.py b/examples/demo_speed.py index 15a27f5..9e894e5 100644 --- a/examples/demo_speed.py +++ b/examples/demo_speed.py @@ -11,7 +11,7 @@ def __init__(self): def visit(self, page, detected): print("start_visit", page.url) - time.sleep(4) + # time.sleep(4) print("end_visit", page.url) @@ -19,5 +19,3 @@ def visit(self, page, detected): start = time.time() crawler.start(10) print(time.time() - start) - - diff --git a/setup.py b/setup.py index 5db3757..ecf1e73 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="webcollector", - version="0.0.4-alpha", + version="0.0.5-alpha", author="Jun Hu", packages=find_packages( exclude=[ @@ -13,7 +13,8 @@ "html5lib", "aiohttp", "BeautifulSoup4", - "redis" + "redis", + "requests" ], description="WebCollector-Python is an open source web crawler framework based on Python.It provides some simple interfaces for crawling the Web,you can setup a multi-threaded web crawler in less than 5 minutes.", license="GNU General Public License v3.0 (See LICENSE)", diff --git a/webcollector/fetch.py b/webcollector/fetch.py index 0b7fe6b..aa88016 100644 --- a/webcollector/fetch.py +++ b/webcollector/fetch.py @@ -42,10 +42,9 @@ async def async_start(self): # async with self.requester.create_async_context_manager(): # coroutines = [self.fetch_coroutine(self.execute_func) for _ in range(self.num_threads)] # await asyncio.gather(*coroutines) - - coroutines = [self.fetch_coroutine(self.execute_func) for _ in range(self.num_threads)] - await asyncio.gather(*coroutines) - + with self.requester: + coroutines = [self.fetch_coroutine(self.execute_func) for _ in range(self.num_threads)] + await asyncio.gather(*coroutines) self.db_manager.close() def start(self): diff --git a/webcollector/net.py b/webcollector/net.py index 36c6378..439446b 100644 --- a/webcollector/net.py +++ b/webcollector/net.py @@ -3,8 +3,18 @@ class Requester(object): - async def get_response(self, crawl_datum): - return None + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + def get_response(self, crawl_datum): + raise NotImplementedError() def create_async_context_manager(self): - return None \ No newline at end of file + return None + + +with Requester() as r: + print(r) \ No newline at end of file diff --git a/webcollector/plugin/net.py b/webcollector/plugin/net.py index 991d87c..eb9c3a0 100644 --- a/webcollector/plugin/net.py +++ b/webcollector/plugin/net.py @@ -2,50 +2,11 @@ from webcollector.config import DEFAULT_USER_AGENT from webcollector.model import Page from webcollector.net import Requester -import aiohttp import requests -class AioHttpRequester(Requester): - - def __init__(self): - self.session = None - - def create_async_context_manager(self): - self.session = aiohttp.ClientSession() - return self.session - - def request(self, crawl_datum): - return self.session.get( - crawl_datum.url, - headers={"User-Agent": DEFAULT_USER_AGENT} - ) - - async def get_response(self, crawl_datum): - # async with self.session.get(crawl_datum.url) as response: - async with self.request(crawl_datum) as response: - code = response.status - content = await response.content.read() - encoding = response.get_encoding() - content_type = response.content_type - crawl_datum.code = code - page 
= Page(crawl_datum, content, content_type=content_type, http_charset=encoding) - return page - class HttpRequester(Requester): - def __init__(self): - self.session = None - - - # def create_async_context_manager(self): - # self.session = aiohttp.ClientSession() - # return self.session - - # def request(self, crawl_datum): - # headers = {"User-Agent": DEFAULT_USER_AGENT} - # return requests.get(crawl_datum.url, headers=headers) - def get_response(self, crawl_datum): headers = {"User-Agent": DEFAULT_USER_AGENT} response = requests.get(crawl_datum.url, headers=headers) @@ -57,6 +18,35 @@ def get_response(self, crawl_datum): crawl_datum.code = code page = Page(crawl_datum, content, content_type=content_type, http_charset=encoding) - + return page + +# class AioHttpRequester(Requester): +# +# def __init__(self): +# self.session = None +# +# def create_async_context_manager(self): +# self.session = aiohttp.ClientSession() +# return self.session +# +# def request(self, crawl_datum): +# return self.session.get( +# crawl_datum.url, +# headers={"User-Agent": DEFAULT_USER_AGENT} +# ) +# +# async def get_response(self, crawl_datum): +# # async with self.session.get(crawl_datum.url) as response: +# async with self.request(crawl_datum) as response: +# code = response.status +# content = await response.content.read() +# encoding = response.get_encoding() +# content_type = response.content_type +# crawl_datum.code = code +# page = Page(crawl_datum, content, content_type=content_type, http_charset=encoding) +# return page + + + From fc7a2a0dab70d8f967f59dd9e6b162806d0c9f5c Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Mon, 19 Aug 2019 21:53:58 +0800 Subject: [PATCH 12/14] readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f6ee201..69c0707 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ crawler.start(10) [demo_custom_http_request.py](examples/demo_custom_http_request.py): -``` +```python # coding=utf-8 import webcollector as wc From d906be09cf93db20feed11163eedd9793217847d Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Mon, 19 Aug 2019 21:58:20 +0800 Subject: [PATCH 13/14] bug --- webcollector/net.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/webcollector/net.py b/webcollector/net.py index 439446b..39c9fb5 100644 --- a/webcollector/net.py +++ b/webcollector/net.py @@ -14,7 +14,3 @@ def get_response(self, crawl_datum): def create_async_context_manager(self): return None - - -with Requester() as r: - print(r) \ No newline at end of file From c7388a5e4dd668dc7224d88665ecfa99e5f8d10e Mon Sep 17 00:00:00 2001 From: hujunxianligong Date: Mon, 19 Aug 2019 22:02:57 +0800 Subject: [PATCH 14/14] bug --- webcollector/net.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/webcollector/net.py b/webcollector/net.py index 39c9fb5..684aa7b 100644 --- a/webcollector/net.py +++ b/webcollector/net.py @@ -10,7 +10,4 @@ def __exit__(self, exc_type, exc_val, exc_tb): pass def get_response(self, crawl_datum): - raise NotImplementedError() - - def create_async_context_manager(self): - return None + raise NotImplementedError() \ No newline at end of file
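
PATCH 10/14 and 11/14 replace the aiohttp coroutines with blocking `requests` calls that the Fetcher hands to the default thread-pool executor via `run_in_executor`, so the asyncio workers can keep waiting on many requests at once. A minimal, self-contained sketch of that pattern under the same assumptions (the `fetch_status`/`worker` names and the example URLs below are illustrative only and not part of the patches):

```python
# coding=utf-8
# Sketch of the run_in_executor pattern adopted by the patched Fetcher:
# blocking requests.get calls run on the default executor while several
# coroutine workers drain one shared queue of URLs.
import asyncio
import requests


def fetch_status(url):
    # Blocking call; it runs inside a thread of the default executor.
    return requests.get(url, timeout=10).status_code


async def worker(url_queue):
    loop = asyncio.get_running_loop()
    while url_queue:
        url = url_queue.pop()
        # Await the blocking call without blocking the event loop.
        status = await loop.run_in_executor(None, fetch_status, url)
        print(url, status)


async def main(urls, num_workers=3):
    url_queue = list(urls)
    # The workers share one queue, much like Fetcher.fetch_queue.
    await asyncio.gather(*[worker(url_queue) for _ in range(num_workers)])


if __name__ == "__main__":
    asyncio.run(main(["https://github.blog/", "https://ruby-china.org/topics"]))
```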
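
The `__enter__`/`__exit__` hooks that PATCH 11/14 adds to `Requester` are no-ops in the base class, but they let a subclass acquire and release resources around the whole crawl, since `Fetcher.async_start` now wraps its workers in `with self.requester:`. One way such a subclass could look, as a hedged sketch (`SessionRequester` and its use of a shared `requests.Session` are an assumption for illustration, not part of the patches):

```python
# coding=utf-8
# Illustrative Requester subclass: reuse one requests.Session for the whole
# crawl and close it when the `with` block in Fetcher.async_start exits.
import requests

from webcollector.config import DEFAULT_USER_AGENT
from webcollector.model import Page
from webcollector.net import Requester


class SessionRequester(Requester):

    def __init__(self):
        self.session = None

    def __enter__(self):
        # Called once when the Fetcher enters `with self.requester:`.
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": DEFAULT_USER_AGENT})
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Called once when the crawl round finishes.
        self.session.close()

    def get_response(self, crawl_datum):
        response = self.session.get(crawl_datum.url)
        crawl_datum.code = response.status_code
        return Page(crawl_datum,
                    response.content,
                    content_type=response.headers["Content-Type"],
                    http_charset=response.encoding)
```

A crawler would enable it the same way the README example enables `MyRequester`, by assigning `self.requester = SessionRequester()` in its `__init__`.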