9
9
# import undetected_chromedriver as uc
10
10
from utils import detect_optimizable , download_image , extract_text_from_html , get_output_code , isnotnull , lowercase_tags_in_xpath , myMySQL , new_line , \
11
11
on_press_creator , on_release_creator , readCode , rename_downloaded_file , replace_field_values , send_email , split_text_by_lines , write_to_csv , write_to_excel , write_to_json
12
+ from constants import WriteMode , DataWriteMode
12
13
from myChrome import MyChrome
13
14
from threading import Thread , Event
14
15
from PIL import Image
@@ -132,13 +133,12 @@ def __init__(self, browser_t, id, service, version, event, saveName, config, opt
132
133
with open (stealth_path , 'r' ) as f :
133
134
js = f .read ()
134
135
self .print_and_log ("Loading stealth.min.js" )
135
- self .browser .execute_cdp_cmd ('Page.addScriptToEvaluateOnNewDocument' , {
136
- 'source' : js }) # TMALL 反扒
136
+ self .browser .execute_cdp_cmd ('Page.addScriptToEvaluateOnNewDocument' , {'source' : js }) # TMALL 反扒
137
137
self .browser .execute_cdp_cmd ("Page.addScriptToEvaluateOnNewDocument" , {
138
- "source" : """
139
- Object.defineProperty(navigator, 'webdriver', {
140
- get: () => undefined
141
- })
138
+ "source" : """
139
+ Object.defineProperty(navigator, 'webdriver', {
140
+ get: () => undefined
141
+ })
142
142
"""
143
143
})
144
144
WebDriverWait (self .browser , 10 )
@@ -154,59 +154,62 @@ def __init__(self, browser_t, id, service, version, event, saveName, config, opt
154
154
self .maxViewLength = service .get ("maxViewLength" , 15 ) # 最大显示长度
155
155
self .outputFormat = service .get ("outputFormat" , "csv" ) # 输出格式
156
156
self .save_threshold = service .get ("saveThreshold" , 10 ) # 保存最低阈值
157
- self .dataWriteMode = service .get ("dataWriteMode" , 1 ) # 数据写入模式,1为追加,2为覆盖,3为重命名文件
157
+ self .dataWriteMode = service .get ("dataWriteMode" , DataWriteMode .Append .value ) # 数据写入模式,1为追加,2为覆盖,3为重命名文件
158
+ self .task_version = service .get ("version" , "" ) # 任务版本
158
159
159
- try :
160
- self .task_version = service ["version" ] # 任务版本
161
- if service ["version" ] >= "0.3.1" : # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
162
- pass
163
- else : # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
164
- if service ["version" ] != version :
165
- self .print_and_log ("版本不一致,请使用" +
166
- service ["version" ] + "版本的EasySpider运行该任务!" )
167
- self .print_and_log ("Version not match, please use EasySpider " +
168
- service ["version" ] + " to run this task!" )
169
- self .browser .quit ()
170
- sys .exit ()
171
- except : # 0.2.0版本没有version字段,所以直接退出
160
+ if not self .task_version :
172
161
self .print_and_log ("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!" )
173
162
self .print_and_log ("Version not match, please use EasySpider v0.2.0 to run this task!" )
174
163
self .browser .quit ()
175
164
sys .exit ()
176
- try :
177
- self .links = list (filter (isnotnull , service ["links" ].split ("\n " ))) # 要执行的link的列表
178
- except :
165
+
166
+ if self .task_version >= "0.3.1" : # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
167
+ pass
168
+ elif self .task_version != version : # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
169
+ self .print_and_log (f"版本不一致,请使用{ self .task_version } 版本的EasySpider运行该任务!" )
170
+ self .print_and_log (f"Version not match, please use EasySpider { self .task_version } to run this task!" )
171
+ self .browser .quit ()
172
+ sys .exit ()
173
+
174
+ service_links = service .get ("links" )
175
+ if service_links :
176
+ self .links = list (filter (isnotnull , service_links .split ("\n " ))) # 要执行的link的列表
177
+ else :
179
178
self .links = list (filter (isnotnull , service ["url" ])) # 要执行的link
179
+
180
180
self .OUTPUT = [] # 采集的数据
181
181
if self .outputFormat in ["csv" , "txt" , "xlsx" , "json" ]:
182
182
if os .path .exists ("Data/Task_" + str (self .id ) + "/" + self .saveName + '.' + self .outputFormat ):
183
- if self .dataWriteMode == 2 :
183
+ if self .dataWriteMode == DataWriteMode . Cover . value :
184
184
os .remove ("Data/Task_" + str (self .id ) + "/" + self .saveName + '.' + self .outputFormat )
185
- elif self .dataWriteMode == 3 :
185
+ elif self .dataWriteMode == DataWriteMode . Rename . value :
186
186
i = 2
187
187
while os .path .exists ("Data/Task_" + str (self .id ) + "/" + self .saveName + '_' + str (i ) + '.' + self .outputFormat ):
188
188
i = i + 1
189
189
self .saveName = self .saveName + '_' + str (i )
190
190
self .print_and_log ("文件已存在,已重命名为" , self .saveName )
191
- self .writeMode = 1 # 写入模式,0为新建,1为追加
192
- if self .outputFormat == " csv" or self . outputFormat == " txt" or self . outputFormat == " xlsx" :
193
- if not os .path .exists ("Data/Task_" + str (self .id ) + "/" + self .saveName + '.' + self .outputFormat ):
191
+ self .writeMode = WriteMode . Create . value # 写入模式,0为新建,1为追加
192
+ if self .outputFormat in [ ' csv' , ' txt' , ' xlsx' ] :
193
+ if not os .path .exists (f "Data/Task_{ str (self .id )} / { self .saveName } . { self .outputFormat } " ):
194
194
self .OUTPUT .append ([]) # 添加表头
195
- self .writeMode = 0
195
+ self .writeMode = WriteMode . Create . value
196
196
elif self .outputFormat == "json" :
197
- self .writeMode = 3 # JSON模式无需判断是否存在文件
197
+ self .writeMode = WriteMode . Json . value # JSON模式无需判断是否存在文件
198
198
elif self .outputFormat == "mysql" :
199
199
self .mysql = myMySQL (config ["mysql_config_path" ])
200
- self .mysql .create_table (self .saveName , service ["outputParameters" ], remove_if_exists = self .dataWriteMode == 2 )
201
- self .writeMode = 2
202
- if self .writeMode == 0 :
200
+ self .mysql .create_table (self .saveName , service ["outputParameters" ],
201
+ remove_if_exists = self .dataWriteMode == DataWriteMode .Cover .value )
202
+ self .writeMode = WriteMode .MySQL .value # MySQL模式
203
+
204
+ if self .writeMode == WriteMode .Create .value :
203
205
self .print_and_log ("新建模式|Create Mode" )
204
- elif self .writeMode == 1 :
206
+ elif self .writeMode == WriteMode . Append . value :
205
207
self .print_and_log ("追加模式|Append Mode" )
206
- elif self .writeMode == 2 :
208
+ elif self .writeMode == WriteMode . MySQL . value :
207
209
self .print_and_log ("MySQL模式|MySQL Mode" )
208
- elif self .writeMode == 3 :
210
+ elif self .writeMode == WriteMode . Json . value :
209
211
self .print_and_log ("JSON模式|JSON Mode" )
212
+
210
213
self .containJudge = service ["containJudge" ] # 是否含有判断语句
211
214
self .outputParameters = {}
212
215
self .service = service
@@ -222,7 +225,7 @@ def __init__(self, browser_t, id, service, version, event, saveName, config, opt
222
225
self .outputParametersTypes .append (param .get ("type" , "text" ))
223
226
self .outputParametersRecord .append (bool (param .get ("recordASField" , True )))
224
227
# 文件叠加的时候不添加表头
225
- if self .outputFormat in ["csv" , "txt" , "xlsx" ] and self .writeMode == 0 :
228
+ if self .outputFormat in ["csv" , "txt" , "xlsx" ] and self .writeMode == WriteMode . Create . value :
226
229
self .OUTPUT [0 ].append (param ["name" ])
227
230
self .urlId = 0 # 全局记录变量
228
231
self .preprocess () # 预处理,优化提取数据流程
0 commit comments