9
9
# import undetected_chromedriver as uc
10
10
from utils import detect_optimizable , download_image , extract_text_from_html , get_output_code , isnotnull , lowercase_tags_in_xpath , myMySQL , new_line , \
11
11
on_press_creator , on_release_creator , readCode , rename_downloaded_file , replace_field_values , send_email , split_text_by_lines , write_to_csv , write_to_excel , write_to_json
12
- from constants import WriteMode , DataWriteMode
12
+ from constants import WriteMode , DataWriteMode , GraphOption
13
13
from myChrome import MyChrome
14
14
from threading import Thread , Event
15
15
from PIL import Image
@@ -235,8 +235,10 @@ def __init__(self, browser_t, id, service, version, event, saveName, config, opt
235
235
# 检测如果没有复杂的操作,优化提取数据流程
236
236
def preprocess (self ):
237
237
for index_node , node in enumerate (self .procedure ):
238
- parameters = node ["parameters" ]
238
+ parameters : dict = node ["parameters" ]
239
239
iframe = parameters .get ('iframe' )
240
+ option = node ["option" ]
241
+
240
242
parameters ["iframe" ] = False if not iframe else ...
241
243
if parameters .get ("xpath" ):
242
244
parameters ["xpath" ] = lowercase_tags_in_xpath (parameters ["xpath" ])
@@ -248,150 +250,112 @@ def preprocess(self):
248
250
parameters ["waitElementTime" ] = 10
249
251
parameters ["waitElementIframeIndex" ] = 0
250
252
251
- if node ["option" ] == 1 : # 打开网页操作
252
- try :
253
- cookies = node ["parameters" ]["cookies" ]
254
- except :
255
- node ["parameters" ]["cookies" ] = ""
256
- elif node ["option" ] == 2 : # 点击操作
257
- try :
258
- alertHandleType = node ["parameters" ]["alertHandleType" ]
259
- except :
260
- node ["parameters" ]["alertHandleType" ] = 0
261
- if node ["parameters" ]["useLoop" ]:
253
+ if option == GraphOption .Get .value : # 打开网页操作
254
+ parameters ["cookies" ] = parameters .get ("cookies" , "" )
255
+ elif option == GraphOption .Click .value : # 点击操作
256
+ parameters ["alertHandleType" ] = parameters .get ("alertHandleType" , 0 )
257
+ if parameters .get ("useLoop" ):
262
258
if self .task_version <= "0.3.5" :
263
259
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
264
- node ["parameters" ]["xpath" ] = ""
265
- self .print_and_log ("您的任务版本号为" + self .task_version +
266
- ",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath" )
267
- elif node ["option" ] == 3 : # 提取数据操作
268
- node ["parameters" ]["recordASField" ] = 0
269
- try :
270
- params = node ["parameters" ]["params" ]
271
- except :
272
- node ["parameters" ]["params" ] = node ["parameters" ]["paras" ] # 兼容0.5.0及以下版本的EasySpider
273
- params = node ["parameters" ]["params" ]
274
- try :
275
- clear = node ["parameters" ]["clear" ]
276
- except :
277
- node ["parameters" ]["clear" ] = 0
278
- try :
279
- newLine = node ["parameters" ]["newLine" ]
280
- except :
281
- node ["parameters" ]["newLine" ] = 1
260
+ parameters ["xpath" ] = ""
261
+ self .print_and_log (f"您的任务版本号为{ self .task_version } ,循环点击不支持相对XPath写法,已自动切换为纯循环的XPath" )
262
+ elif option == GraphOption .Extract .value : # 提取数据操作
263
+ parameters ["recordASField" ] = 0
264
+ parameters ["params" ] = parameters .get ("params" , parameters ["paras" ]) # 兼容0.5.0及以下版本的EasySpider
265
+ parameters ["clear" ] = parameters .get ("clear" , 0 )
266
+ parameters ["newLine" ] = parameters .get ("newLine" , 1 )
267
+
268
+ params = parameters ["params" ]
282
269
for param in params :
283
- try :
284
- iframe = param ["iframe" ]
285
- except :
286
- param ["iframe" ] = False
287
- try :
270
+ param ["iframe" ] = param .get ("iframe" , False )
271
+
272
+ if param .get ("relativeXPath" ):
288
273
param ["relativeXPath" ] = lowercase_tags_in_xpath (param ["relativeXPath" ])
289
- except :
290
- pass
291
- try :
292
- node ["parameters" ]["recordASField" ] = param ["recordASField" ]
293
- except :
294
- node ["parameters" ]["recordASField" ] = 1
295
- try :
296
- splitLine = int (param ["splitLine" ])
297
- except :
298
- param ["splitLine" ] = 0
299
- if param ["contentType" ] == 8 :
300
- self .print_and_log (
301
- "默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。" )
302
- self .print_and_log (
303
- "If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \" Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program." )
274
+
275
+ parameters ["recordASField" ] = param .get ("recordASField" , 1 )
276
+
277
+ param ["splitLine" ] = 0 if not param .get ("splitLine" ) else ...
278
+
279
+ if param .get ("contentType" ) == 8 :
280
+ self .print_and_log ("默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType =="
281
+ "8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片"
282
+ "保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用"
283
+ "的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。" )
284
+ self .print_and_log ("If you think the default ddddocr function is not good enough, you can "
285
+ "modify the source code get_content function -> contentType == 8 position "
286
+ "to your own OCR model and then compile and run it; or you can first set "
287
+ "the content type of the crawler to \" Element Screenshot\" to save the "
288
+ "picture, and then call your own program with custom operations. The "
289
+ "function of the program is to read the latest generated picture, then use "
290
+ "a good model, such as PaddleOCR to recognize the picture, and then return "
291
+ "the return value as a parameter output to the program." )
304
292
param ["optimizable" ] = detect_optimizable (param )
305
- elif node ["option" ] == 4 : # 输入文字
306
- try :
307
- index = node ["parameters" ]["index" ] # 索引值
308
- except :
309
- node ["parameters" ]["index" ] = 0
310
- elif node ["option" ] == 5 : # 自定义操作
311
- try :
312
- clear = node ["parameters" ]["clear" ]
313
- except :
314
- node ["parameters" ]["clear" ] = 0
315
- try :
316
- newLine = node ["parameters" ]["newLine" ]
317
- except :
318
- node ["parameters" ]["newLine" ] = 1
319
- elif node ["option" ] == 7 : # 移动到元素
320
- if node ["parameters" ]["useLoop" ]:
321
- if self .task_version <= "0.3.5" :
322
- # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
323
- node ["parameters" ]["xpath" ] = ""
324
- self .print_and_log ("您的任务版本号为" + self .task_version +
325
- ",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath" )
326
- elif node ["option" ] == 8 : # 循环操作
327
- try :
328
- exitElement = node ["parameters" ]["exitElement" ]
329
- if exitElement == "" :
330
- node ["parameters" ]["exitElement" ] = "//body"
331
- except :
332
- node ["parameters" ]["exitElement" ] = "//body"
333
- node ["parameters" ]["quickExtractable" ] = False # 是否可以快速提取
334
- try :
335
- skipCount = node ["parameters" ]["skipCount" ]
336
- except :
337
- node ["parameters" ]["skipCount" ] = 0
293
+ elif option == GraphOption .Input .value : # 输入文字
294
+ parameters ['index' ] = parameters .get ('index' , 0 )
295
+ elif option == GraphOption .Custom .value : # 自定义操作
296
+ parameters ['clear' ] = parameters .get ('clear' , 0 )
297
+ parameters ['newLine' ] = parameters .get ('newLine' , 1 )
298
+ elif option == GraphOption .Move .value : # 移动到元素
299
+ if parameters .get ('useLoop' ):
300
+ if self .task_version <= "0.3.5" : # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
301
+ parameters ["xpath" ] = ""
302
+ self .print_and_log (f"您的任务版本号为{ self .task_version } ,循环点击不支持相对XPath写法,已自动切换为纯循环的XPath" )
303
+ elif option == GraphOption .Loop .value : # 循环操作
304
+ parameters ['exitElement' ] = "//body" if not parameters .get ('exitElement' ) or parameters .get ('exitElement' ) == "" else ...
305
+ parameters ["quickExtractable" ] = False # 是否可以快速提取
306
+ parameters ['skipCount' ] = parameters .get ('skipCount' , 0 )
307
+
338
308
# 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
339
- if len (node ["sequence" ]) == 1 and self .procedure [node ["sequence" ][0 ]]["option" ] == 3 and (int (node ["parameters" ]["loopType" ]) == 1 or int (node ["parameters" ]["loopType" ]) == 2 ):
340
- try :
341
- params = self .procedure [node ["sequence" ][0 ]]["parameters" ]["params" ]
342
- except :
343
- params = self .procedure [node ["sequence" ][0 ]]["parameters" ]["paras" ] # 兼容0.5.0及以下版本的EasySpider
344
- try :
345
- waitElement = self .procedure [node ["sequence" ][0 ]]["parameters" ]["waitElement" ]
346
- except :
347
- waitElement = ""
348
- if node ["parameters" ]["iframe" ]:
349
- node ["parameters" ]["quickExtractable" ] = False # 如果是iframe,那么不可以快速提取
309
+ if len (node ["sequence" ]) == 1 and self .procedure [node ["sequence" ][0 ]]["option" ] == 3 \
310
+ and (int (node ["parameters" ]["loopType" ]) == 1 or int (node ["parameters" ]["loopType" ]) == 2 ):
311
+ params = self .procedure [node ["sequence" ][0 ]].get ("parameters" ).get ("params" )
312
+ if not params :
313
+ params = self .procedure [node ["sequence" ][0 ]]["parameters" ]["paras" ] # 兼容0.5.0及以下版本的EasySpider
314
+
315
+ waitElement = self .procedure [node ["sequence" ][0 ]]["parameters" ].get ("waitElement" , "" )
316
+
317
+ if parameters ["iframe" ]:
318
+ parameters ["quickExtractable" ] = False # 如果是iframe,那么不可以快速提取
350
319
else :
351
- node ["parameters" ]["quickExtractable" ] = True # 先假设可以快速提取
352
- if node ["parameters" ]["skipCount" ] > 0 :
353
- node ["parameters" ]["quickExtractable" ] = False # 如果有跳过的元素,那么不可以快速提取
320
+ parameters ["quickExtractable" ] = True # 先假设可以快速提取
321
+
322
+ if parameters ["skipCount" ] > 0 :
323
+ parameters ["quickExtractable" ] = False # 如果有跳过的元素,那么不可以快速提取
324
+
354
325
for param in params :
355
326
optimizable = detect_optimizable (param , ignoreWaitElement = False , waitElement = waitElement )
356
- try :
357
- iframe = param ["iframe" ]
358
- except :
359
- param ["iframe" ] = False
360
- if param ["iframe" ] and not param ["relative" ]: # 如果是iframe,那么不可以快速提取
327
+ param ['iframe' ] = param .get ('iframe' , False )
328
+ if param ["iframe" ] and not param ["relative" ]: # 如果是iframe,那么不可以快速提取
361
329
optimizable = False
362
- if not optimizable : # 如果有一个不满足优化条件,那么就不能快速提取
363
- node [ " parameters" ] ["quickExtractable" ] = False
330
+ if not optimizable : # 如果有一个不满足优化条件,那么就不能快速提取
331
+ parameters ["quickExtractable" ] = False
364
332
break
365
- if node ["parameters" ]["quickExtractable" ]:
366
- self .print_and_log ("循环操作<" + node ["title" ] + ">可以快速提取数据" )
367
- self .print_and_log ("Loop operation <" + node ["title" ] + "> can extract data quickly" )
368
- try :
369
- node ["parameters" ]["clear" ] = self .procedure [node ["sequence" ][0 ]]["parameters" ]["clear" ]
370
- except :
371
- node ["parameters" ]["clear" ] = 0
372
- try :
373
- node ["parameters" ]["newLine" ] = self .procedure [node ["sequence" ][0 ]]["parameters" ]["newLine" ]
374
- except :
375
- node ["parameters" ]["newLine" ] = 1
376
- if int (node ["parameters" ]["loopType" ]) == 1 : # 不固定元素列表
333
+
334
+ if parameters ["quickExtractable" ]:
335
+ self .print_and_log (f"循环操作<{ node ['title' ]} >可以快速提取数据" )
336
+ self .print_and_log (f"Loop operation <{ node ['title' ]} > can extract data quickly" )
337
+ parameters ["clear" ] = self .procedure [node ["sequence" ][0 ]]["parameters" ].get ("clear" , 0 )
338
+ parameters ["newLine" ] = self .procedure [node ["sequence" ][0 ]]["parameters" ].get ("newLine" , 1 )
339
+
340
+ if int (node ["parameters" ]["loopType" ]) == 1 : # 不固定元素列表
377
341
node ["parameters" ]["baseXPath" ] = node ["parameters" ]["xpath" ]
378
- elif int (node ["parameters" ]["loopType" ]) == 2 : # 固定元素列表
342
+ elif int (node ["parameters" ]["loopType" ]) == 2 : # 固定元素列表
379
343
node ["parameters" ]["baseXPath" ] = node ["parameters" ]["pathList" ]
380
344
node ["parameters" ]["quickParams" ] = []
381
345
for param in params :
382
346
content_type = ""
383
- if param ["relativeXPath" ].find ("/@href" ) >= 0 or param ["relativeXPath" ].find ("/text()" ) >= 0 or param [ "relativeXPath" ]. find (
384
- "::text()" ) >= 0 :
347
+ if param ["relativeXPath" ].find ("/@href" ) >= 0 or param ["relativeXPath" ].find ("/text()" ) >= 0 \
348
+ or param [ "relativeXPath" ]. find ( "::text()" ) >= 0 :
385
349
content_type = ""
386
350
elif param ["nodeType" ] == 2 :
387
351
content_type = "//@href"
388
- elif param ["nodeType" ] == 4 : # 图片链接
352
+ elif param ["nodeType" ] == 4 : # 图片链接
389
353
content_type = "//@src"
390
354
elif param ["contentType" ] == 1 :
391
355
content_type = "/text()"
392
356
elif param ["contentType" ] == 0 :
393
357
content_type = "//text()"
394
- if param ["relative" ]: # 如果是相对XPath
358
+ if param ["relative" ]: # 如果是相对XPath
395
359
xpath = "." + param ["relativeXPath" ] + content_type
396
360
else :
397
361
xpath = param ["relativeXPath" ] + content_type
0 commit comments