logglob.py
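"""Utilities for summarizing the scraper logs in `config.scraper_logs_dir` with `loggi`."""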
from datetime import datetime
from typing import Generator

import loggi
from pathier import Pathier

from config import Config

root = Pathier(__file__).parent
config = Config.load()


def load_log(company: str) -> loggi.models.Log:
    """Returns a `loggi.models.Log` object for the scraper associated with `company`."""
    stem = company.lower().replace(" ", "_")
    return loggi.load_log(config.scraper_logs_dir / f"{stem}.log")


def get_all_logs() -> Generator[loggi.models.Log, None, None]:
    """Generator yielding a `loggi.models.Log` object for every `*.log` file in `config.scraper_logs_dir`."""
    for file in config.scraper_logs_dir.glob("*.log"):
        yield loggi.load_log(file)


def get_failed_scrapers(start_time: datetime) -> list[str]:
    """Returns a list of scrapers that have logged an `ERROR` or `EXCEPTION` event since `start_time`."""
    fails: list[str] = []
    for log in get_all_logs():
        if log.filter_dates(start_time).filter_levels(["ERROR", "EXCEPTION"]).events:
            assert log.path
            fails.append(log.path.stem)
    return fails


def get_resurrected_listings_count(start_time: datetime) -> int:
    """Returns the number of resurrected listings logged since `start_time`."""
    count = 0
    for log in get_all_logs():
        # Expected message format: 'Resurrected x listings.'
        log = log.filter_dates(start_time).filter_messages(["Resurrected*"])
        if log.events:
            # Only the most recent resurrection message in each log is counted.
            count += int(log.events[-1].message.split()[1])
    return count


def get_scrapers_with_errors(start_time: datetime) -> dict[str, list[str]]:
    """Returns scrapers that have errors after `start_time`.

    Output is a dictionary where the error type is the key and the values are lists of scrapers.
    Error keys: `redirects`, `404s`, `no_listings`, `parse_fails`, and `misc_fails`."""
    scrapers: dict[str, list[str]] = {
        "redirects": [],
        "404s": [],
        "no_listings": [],
        "parse_fails": [],
        "misc_fails": [],
    }
    for log in get_all_logs():
        log = log.filter_dates(start_time)
        assert log.path
        error_exceptions = log.filter_levels(["ERROR", "EXCEPTION"])
        # Each scraper is filed under the first matching category only.
        if (
            log.filter_levels(["WARNING", "EXCEPTION"])
            .filter_messages(["Board url * resolved to *"])
            .events
        ):
            scrapers["redirects"].append(log.path.stem)
        elif error_exceptions.filter_messages(["*returned status code 404*"]).events:
            scrapers["404s"].append(log.path.stem)
        elif log.filter_messages(["*get_parsable_items() returned 0 items*"]).events:
            scrapers["no_listings"].append(log.path.stem)
        elif error_exceptions.filter_messages(["*Failure to parse item*"]).events:
            scrapers["parse_fails"].append(log.path.stem)
        elif error_exceptions.events:
            scrapers["misc_fails"].append(log.path.stem)
    return scrapers


def get_empty_boards() -> list[str]:
    """Return the stems of scrapers that found no listings on their last run."""
    empty_boards: list[str] = []
    for log in get_all_logs():
        events = log.filter_messages(["*get_parsable_items() returned*"]).events
        # Guard against logs with no matching messages, then check whether the
        # most recent `get_parsable_items()` message reported 0 items.
        if events and "get_parsable_items() returned 0 items" in events[-1].message:
            assert log.path
            empty_boards.append(log.path.stem)
    return sorted(empty_boards)
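

if __name__ == "__main__":
    # Minimal usage sketch (an assumption, not part of the original module):
    # assumes `config.scraper_logs_dir` contains `*.log` files and summarizes
    # scraper health since midnight using the functions defined above.
    start = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    print("Failed scrapers:", get_failed_scrapers(start))
    print("Resurrected listings:", get_resurrected_listings_count(start))
    for error, names in get_scrapers_with_errors(start).items():
        print(f"{error}: {names}")
    print("Empty boards:", get_empty_boards())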