Skip to content

Commit a8e77b5

Browse files
authored
Merge pull request #359 from touero/master
Define constants using enumeration classes
2 parents 2860bc7 + 606de75 commit a8e77b5

File tree

2 files changed

+56
-37
lines changed

2 files changed

+56
-37
lines changed

ExecuteStage/constants.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from enum import unique, IntEnum
2+
3+
4+
@unique
class WriteMode(IntEnum):
    """Integer codes describing how the execute stage writes collected data.

    The execute stage stores ``WriteMode.<member>.value`` in its
    ``writeMode`` attribute and compares against these values.
    """

    Create = 0  # Create mode
    Append = 1  # Append mode
    # Spelled ``MySQL`` because the execute stage refers to
    # ``WriteMode.MySQL``; the former ``Mysql`` spelling made those
    # lookups raise AttributeError.
    MySQL = 2  # MySQL mode
    Json = 3  # JSON mode


# Keep the earlier ``Mysql`` spelling usable. It is attached after class
# creation so it is a plain class attribute, not a second enum member
# (an in-class alias would be rejected by ``@unique``).
WriteMode.Mysql = WriteMode.MySQL
10+
11+
12+
@unique
class DataWriteMode(IntEnum):
    """Integer codes for handling an output file that already exists."""

    # Append mode
    Append = 1
    # Cover (overwrite) mode
    Cover = 2
    # Rename mode
    Rename = 3

ExecuteStage/easyspider_executestage.py

+40-37
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# import undetected_chromedriver as uc
1010
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
1111
on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
12+
from constants import WriteMode, DataWriteMode
1213
from myChrome import MyChrome
1314
from threading import Thread, Event
1415
from PIL import Image
@@ -132,13 +133,12 @@ def __init__(self, browser_t, id, service, version, event, saveName, config, opt
132133
with open(stealth_path, 'r') as f:
133134
js = f.read()
134135
self.print_and_log("Loading stealth.min.js")
135-
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
136-
'source': js}) # TMALL 反扒
136+
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js}) # TMALL 反扒
137137
self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
138-
"source": """
139-
Object.defineProperty(navigator, 'webdriver', {
140-
get: () => undefined
141-
})
138+
"source": """
139+
Object.defineProperty(navigator, 'webdriver', {
140+
get: () => undefined
141+
})
142142
"""
143143
})
144144
WebDriverWait(self.browser, 10)
@@ -154,59 +154,62 @@ def __init__(self, browser_t, id, service, version, event, saveName, config, opt
154154
self.maxViewLength = service.get("maxViewLength", 15) # 最大显示长度
155155
self.outputFormat = service.get("outputFormat", "csv") # 输出格式
156156
self.save_threshold = service.get("saveThreshold", 10) # 保存最低阈值
157-
self.dataWriteMode = service.get("dataWriteMode", 1) # 数据写入模式,1为追加,2为覆盖,3为重命名文件
157+
self.dataWriteMode = service.get("dataWriteMode", DataWriteMode.Append.value) # 数据写入模式,1为追加,2为覆盖,3为重命名文件
158+
self.task_version = service.get("version", "") # 任务版本
158159

159-
try:
160-
self.task_version = service["version"] # 任务版本
161-
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
162-
pass
163-
else: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
164-
if service["version"] != version:
165-
self.print_and_log("版本不一致,请使用" +
166-
service["version"] + "版本的EasySpider运行该任务!")
167-
self.print_and_log("Version not match, please use EasySpider " +
168-
service["version"] + " to run this task!")
169-
self.browser.quit()
170-
sys.exit()
171-
except: # 0.2.0版本没有version字段,所以直接退出
160+
if not self.task_version:
172161
self.print_and_log("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
173162
self.print_and_log("Version not match, please use EasySpider v0.2.0 to run this task!")
174163
self.browser.quit()
175164
sys.exit()
176-
try:
177-
self.links = list(filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
178-
except:
165+
166+
if self.task_version >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
167+
pass
168+
elif self.task_version != version: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
169+
self.print_and_log(f"版本不一致,请使用{self.task_version}版本的EasySpider运行该任务!")
170+
self.print_and_log(f"Version not match, please use EasySpider {self.task_version} to run this task!")
171+
self.browser.quit()
172+
sys.exit()
173+
174+
service_links = service.get("links")
175+
if service_links:
176+
self.links = list(filter(isnotnull, service_links.split("\n"))) # 要执行的link的列表
177+
else:
179178
self.links = list(filter(isnotnull, service["url"])) # 要执行的link
179+
180180
self.OUTPUT = [] # 采集的数据
181181
if self.outputFormat in ["csv", "txt", "xlsx", "json"]:
182182
if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
183-
if self.dataWriteMode == 2:
183+
if self.dataWriteMode == DataWriteMode.Cover.value:
184184
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
185-
elif self.dataWriteMode == 3:
185+
elif self.dataWriteMode == DataWriteMode.Rename.value:
186186
i = 2
187187
while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat):
188188
i = i + 1
189189
self.saveName = self.saveName + '_' + str(i)
190190
self.print_and_log("文件已存在,已重命名为", self.saveName)
191-
self.writeMode = 1 # 写入模式,0为新建,1为追加
192-
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
193-
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
191+
self.writeMode = WriteMode.Create.value # 写入模式,0为新建,1为追加
192+
if self.outputFormat in ['csv', 'txt', 'xlsx']:
193+
if not os.path.exists(f"Data/Task_{str(self.id)}/{self.saveName}.{self.outputFormat}"):
194194
self.OUTPUT.append([]) # 添加表头
195-
self.writeMode = 0
195+
self.writeMode = WriteMode.Create.value
196196
elif self.outputFormat == "json":
197-
self.writeMode = 3 # JSON模式无需判断是否存在文件
197+
self.writeMode = WriteMode.Json.value # JSON模式无需判断是否存在文件
198198
elif self.outputFormat == "mysql":
199199
self.mysql = myMySQL(config["mysql_config_path"])
200-
self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2)
201-
self.writeMode = 2
202-
if self.writeMode == 0:
200+
self.mysql.create_table(self.saveName, service["outputParameters"],
201+
remove_if_exists=self.dataWriteMode == DataWriteMode.Cover.value)
202+
self.writeMode = WriteMode.MySQL.value # MySQL模式
203+
204+
if self.writeMode == WriteMode.Create.value:
203205
self.print_and_log("新建模式|Create Mode")
204-
elif self.writeMode == 1:
206+
elif self.writeMode == WriteMode.Append.value:
205207
self.print_and_log("追加模式|Append Mode")
206-
elif self.writeMode == 2:
208+
elif self.writeMode == WriteMode.MySQL.value:
207209
self.print_and_log("MySQL模式|MySQL Mode")
208-
elif self.writeMode == 3:
210+
elif self.writeMode == WriteMode.Json.value:
209211
self.print_and_log("JSON模式|JSON Mode")
212+
210213
self.containJudge = service["containJudge"] # 是否含有判断语句
211214
self.outputParameters = {}
212215
self.service = service
@@ -222,7 +225,7 @@ def __init__(self, browser_t, id, service, version, event, saveName, config, opt
222225
self.outputParametersTypes.append(param.get("type", "text"))
223226
self.outputParametersRecord.append(bool(param.get("recordASField", True)))
224227
# 文件叠加的时候不添加表头
225-
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == 0:
228+
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create.value:
226229
self.OUTPUT[0].append(param["name"])
227230
self.urlId = 0 # 全局记录变量
228231
self.preprocess() # 预处理,优化提取数据流程

0 commit comments

Comments
 (0)