forked from browser-use/web-ui
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_dom.py
52 lines (38 loc) · 1.41 KB
/
filter_dom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python3
import sys
import os
from bs4 import BeautifulSoup
def extract_external_js(html_content):
    """
    Collect the ``src`` value of every <script> tag found in an HTML string.

    The markup is parsed with BeautifulSoup's 'html.parser' backend. Script
    tags that have no ``src`` attribute, or an empty one, are skipped. The
    values are returned as a list; no URL resolution or de-duplication is
    performed.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    # Look up each tag's src once, then keep only the non-empty values.
    script_sources = (tag.get('src') for tag in document.find_all('script'))
    return [source for source in script_sources if source]
def dom_main(html_file_path, output_file_path):
    """
    Extract external script URLs from an HTML file into a text file.

    Reads the HTML at ``html_file_path`` as UTF-8, collects the ``src`` of
    every <script src="..."> tag via ``extract_external_js`` and writes them
    to ``output_file_path`` (UTF-8, one per line, overwriting any existing
    file). A summary line is printed to stdout on success.

    If ``html_file_path`` is not an existing file, an error message is
    written to stderr and the process exits with status 1 (``SystemExit``);
    the output file is not touched in that case.
    """
    if not os.path.isfile(html_file_path):
        # Diagnostics go to stderr so they never mix with piped stdout output.
        print(f"Error: File '{html_file_path}' does not exist.", file=sys.stderr)
        sys.exit(1)

    with open(html_file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    external_scripts = extract_external_js(html_content)

    # Write external script URLs to a txt file, one per line.
    with open(output_file_path, 'w', encoding='utf-8') as out:
        out.writelines(src + "\n" for src in external_scripts)

    print(f"Extracted {len(external_scripts)} external scripts and saved to '{output_file_path}'.")
if __name__ == "__main__":
    # Script entry point: process the default input file and write the
    # default output file. Edit the two paths below to use other locations.
    dom_main(
        html_file_path="final_dom.html",
        output_file_path="external_scripts.txt",
    )