Skip to content

Commit a6286a0

Browse files
authored
Merge pull request soxoj#83 from soxoj/executors-update
Created async requests executors, some sites fixes
2 parents faa03b6 + 314eb25 commit a6286a0

File tree

5 files changed

+238
-58
lines changed

5 files changed

+238
-58
lines changed

maigret/checking.py

Lines changed: 119 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import re
44
import ssl
55
import sys
6+
import tqdm
7+
import time
8+
from typing import Callable, Any, Iterable, Tuple
69

710
import aiohttp
811
import tqdm.asyncio
@@ -37,6 +40,95 @@
3740

3841
unsupported_characters = '#'
3942

43+
QueryDraft = Tuple[Callable, Any, Any]
44+
QueriesDraft = Iterable[QueryDraft]
45+
46+
class AsyncExecutor:
    """Base class for executors that run a batch of query drafts.

    A query draft is a ``(callable, args, kwargs)`` triple. Subclasses
    override :meth:`_run` with the actual scheduling strategy; callers use
    :meth:`run`, which wraps ``_run`` with timing and a debug log line.
    """

    def __init__(self, *args, **kwargs):
        """Store the logger used for timing output.

        Keyword Args:
            logger: object exposing ``debug()``. Optional; when it is missing
                or ``None`` a standard-library logger is used instead of
                failing with ``KeyError``.
        """
        import logging

        self.logger = kwargs.get('logger') or logging.getLogger(__name__)
        # Duration in seconds of the most recent run() call.
        self.execution_time = 0.0

    async def run(self, tasks: 'QueriesDraft'):
        """Execute ``tasks`` via ``_run`` and return whatever it returns.

        The elapsed time is stored in ``self.execution_time`` and logged at
        debug level even when ``_run`` raises.
        """
        # monotonic() is not affected by system clock adjustments, unlike
        # time(), so the measured duration can never be negative or skewed.
        start_time = time.monotonic()
        try:
            return await self._run(tasks)
        finally:
            self.execution_time = time.monotonic() - start_time
            self.logger.debug(f'Spent time: {self.execution_time}')

    async def _run(self, tasks: 'QueriesDraft'):
        """Default strategy: run nothing, yield to the event loop once."""
        await asyncio.sleep(0)
59+
60+
61+
class AsyncioSimpleExecutor(AsyncExecutor):
    """Executor that starts every query at once and waits for all of them."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: QueriesDraft):
        """Call each draft's callable and gather the resulting awaitables.

        Returns the list produced by ``asyncio.gather``.
        """
        pending = []
        for func, positional, named in tasks:
            pending.append(func(*positional, **named))
        return await asyncio.gather(*pending)
68+
69+
70+
class AsyncioProgressbarExecutor(AsyncExecutor):
    """Executor that runs all queries at once behind a tqdm progress bar."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: QueriesDraft):
        """Start every draft and collect results as tqdm yields them.

        Results are appended in the order ``tqdm.asyncio.tqdm.as_completed``
        hands the awaitables back (presumably completion order, not
        submission order).
        """
        awaitables = []
        for func, positional, named in tasks:
            awaitables.append(func(*positional, **named))
        return [
            await finished
            for finished in tqdm.asyncio.tqdm.as_completed(awaitables)
        ]
80+
81+
82+
class AsyncioProgressbarSemaphoreExecutor(AsyncExecutor):
    """Progress-bar executor whose queries are throttled by a semaphore."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 'in_parallel' caps how many queries hold the semaphore at once.
        parallel_limit = kwargs.get('in_parallel', 1)
        self.semaphore = asyncio.Semaphore(parallel_limit)

    async def _run(self, tasks: QueriesDraft):
        """Run every draft under the semaphore and collect the results.

        Results are appended in the order ``tqdm.asyncio.tqdm.as_completed``
        yields the wrapped coroutines.
        """
        async def _limited(query: QueryDraft):
            # The draft is unpacked and awaited only while the semaphore is held.
            async with self.semaphore:
                func, positional, named = query
                return await func(*positional, **named)

        wrapped = [_limited(query) for query in tasks]
        collected = []
        for finished in tqdm.asyncio.tqdm.as_completed(wrapped):
            collected.append(await finished)
        return collected
101+
102+
103+
class AsyncioProgressbarQueueExecutor(AsyncExecutor):
    """Executor that feeds queries to a fixed pool of workers via a queue.

    Keyword Args:
        in_parallel: number of worker tasks and queue capacity (default 10).
        progress_func: factory called as ``progress_func(total=N)``; the
            returned object must provide ``update(n)`` and ``close()``
            (default ``tqdm.tqdm``).
        timeout: optional per-query time limit in seconds. ``None`` (the
            default) means no limit.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workers_count = kwargs.get('in_parallel', 10)
        self.progress_func = kwargs.get('progress_func', tqdm.tqdm)
        self.queue = asyncio.Queue(self.workers_count)
        # Previously accepted by callers but silently ignored.
        self.timeout = kwargs.get('timeout')

    async def worker(self):
        """Consume drafts from the queue forever; cancelled by ``_run``.

        Every dequeued draft is always acknowledged with ``task_done()``,
        even when the query fails or times out; otherwise ``queue.join()``
        in ``_run`` would block forever. A failed or timed-out query
        contributes ``None`` to ``self.results``.
        """
        while True:
            query = await self.queue.get()
            result = None
            try:
                f, args, kwargs = query
                result = await asyncio.wait_for(
                    f(*args, **kwargs), timeout=self.timeout)
            except asyncio.TimeoutError:
                self.logger.debug(f'Query timed out after {self.timeout}s')
            except asyncio.CancelledError:
                # Never swallow cancellation (it is an Exception subclass
                # on Python 3.7).
                raise
            except Exception as error:
                # Keep the worker alive: a dead worker shrinks the pool and
                # can stall the producer on a full queue.
                self.logger.error(error, exc_info=True)
            finally:
                self.results.append(result)
                self.progress.update(1)
                self.queue.task_done()

    async def _run(self, tasks: QueriesDraft):
        """Run all drafts through the worker pool and return their results.

        Results are appended as workers finish, so their order is not
        guaranteed to match the order of ``tasks``.
        """
        self.results = []
        task_list = list(tasks)
        self.progress = self.progress_func(total=len(task_list))
        # At least one worker is required, or join() could never complete.
        workers = [asyncio.create_task(self.worker())
                   for _ in range(max(1, self.workers_count))]
        try:
            for t in task_list:
                await self.queue.put(t)
            await self.queue.join()
        finally:
            for w in workers:
                w.cancel()
            # Let the cancellations be processed so no task is left pending.
            await asyncio.gather(*workers, return_exceptions=True)
            self.progress.close()
        return self.results
131+
40132

41133
async def get_response(request_future, site_name, logger):
42134
html_text = None
@@ -87,19 +179,18 @@ async def get_response(request_future, site_name, logger):
87179
return html_text, status_code, error_text, expection_text
88180

89181

90-
async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify):
91-
async with semaphore:
92-
site_obj = site_dict[sitename]
93-
future = site_obj.request_future
94-
if not future:
95-
# ignore: search by incompatible id type
96-
return
182+
async def update_site_dict_from_response(sitename, site_dict, results_info, logger, query_notify):
183+
site_obj = site_dict[sitename]
184+
future = site_obj.request_future
185+
if not future:
186+
# ignore: search by incompatible id type
187+
return
97188

98-
response = await get_response(request_future=future,
99-
site_name=sitename,
100-
logger=logger)
189+
response = await get_response(request_future=future,
190+
site_name=sitename,
191+
logger=logger)
101192

102-
site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
193+
return sitename, process_site_result(response, query_notify, logger, results_info, site_obj)
103194

104195

105196
# TODO: move to separate class
@@ -454,32 +545,33 @@ async def maigret(username, site_dict, query_notify, logger,
454545
# Add this site's results into final dictionary with all of the other results.
455546
results_total[site_name] = results_site
456547

457-
# TODO: move into top-level function
458-
459-
sem = asyncio.Semaphore(max_connections)
460-
461-
tasks = []
548+
coroutines = []
462549
for sitename, result_obj in results_total.items():
463-
update_site_coro = update_site_dict_from_response(sitename, site_dict, result_obj, sem, logger, query_notify)
464-
future = asyncio.ensure_future(update_site_coro)
465-
tasks.append(future)
550+
coroutines.append((update_site_dict_from_response, [sitename, site_dict, result_obj, logger, query_notify], {}))
466551

467552
if no_progressbar:
468-
await asyncio.gather(*tasks)
553+
executor = AsyncioSimpleExecutor(logger=logger)
469554
else:
470-
for f in tqdm.asyncio.tqdm.as_completed(tasks, timeout=timeout):
471-
try:
472-
await f
473-
except asyncio.exceptions.TimeoutError:
474-
# TODO: write timeout to results
475-
pass
555+
executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=max_connections, timeout=timeout+0.5)
556+
557+
results = await executor.run(coroutines)
476558

477559
await session.close()
478560

479561
# Notify caller that all queries are finished.
480562
query_notify.finish()
481563

482-
return results_total
564+
data = {}
565+
for result in results:
566+
# TODO: still can be empty
567+
if result:
568+
try:
569+
data[result[0]] = result[1]
570+
except Exception as e:
571+
logger.error(e, exc_info=True)
572+
logger.info(result)
573+
574+
return data
483575

484576

485577
def timeout_check(value):

maigret/maigret.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ async def main():
261261
print('Maigret sites database self-checking...')
262262
is_need_update = await self_check(db, site_data, logger, max_connections=args.connections)
263263
if is_need_update:
264-
if input('Do you want to save changes permanently? [yYnN]\n').lower() == 'y':
264+
if input('Do you want to save changes permanently? [Yn]\n').lower() == 'y':
265265
db.save_to_file(args.db_file)
266266
print('Database was successfully updated.')
267267
else:
@@ -337,14 +337,13 @@ async def main():
337337
max_connections=args.connections,
338338
)
339339

340-
username_result = (username, id_type, results)
341340
general_results.append((username, id_type, results))
342341

343342
# TODO: tests
344343
for website_name in results:
345344
dictionary = results[website_name]
346345
# TODO: fix no site data issue
347-
if not dictionary:
346+
if not dictionary or not recursive_search_enabled:
348347
continue
349348

350349
new_usernames = dictionary.get('ids_usernames')

maigret/resources/data.json

Lines changed: 50 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2426,8 +2426,6 @@
24262426
},
24272427
"Ccmixter": {
24282428
"tags": [
2429-
"global",
2430-
"in",
24312429
"us"
24322430
],
24332431
"checkType": "message",
@@ -2443,13 +2441,18 @@
24432441
},
24442442
"Cent": {
24452443
"tags": [
2446-
"in",
2447-
"mx",
2448-
"tw",
2449-
"us"
2444+
"us",
2445+
"art",
2446+
"writing"
24502447
],
2448+
"urlProbe": "https://beta.cent.co/data/user/profile?userHandles={username}",
24512449
"checkType": "message",
2452-
"absenceStrs": "<title>Cent</title>",
2450+
"presenseStrs": [
2451+
"display_name"
2452+
],
2453+
"absenceStrs": [
2454+
"\"results\":[]"
2455+
],
24532456
"alexaRank": 31175,
24542457
"url": "https://beta.cent.co/@{username}",
24552458
"urlMain": "https://cent.co/",
@@ -11713,15 +11716,13 @@
1171311716
"usernameClaimed": "vitaline",
1171411717
"usernameUnclaimed": "noonewouldeverusethis7"
1171511718
},
11716-
"Sevenforums": {
11719+
"SevenForums": {
1171711720
"tags": [
1171811721
"gb",
1171911722
"us"
1172011723
],
11721-
"checkType": "message",
11722-
"absenceStrs": "<title>Just a moment...</title>",
11724+
"engine": "vBulletin",
1172311725
"alexaRank": 20828,
11724-
"url": "https://www.sevenforums.com/members/{username}.html",
1172511726
"urlMain": "https://www.sevenforums.com",
1172611727
"usernameClaimed": "adam",
1172711728
"usernameUnclaimed": "noonewouldeverusethis7"
@@ -12349,7 +12350,7 @@
1234912350
"us"
1235012351
],
1235112352
"headers": {
12352-
"authorization": "Bearer BQA6sdhtUg3hadjln7DCoAK6sLn7KrHfsn2DObW2gr-W3HgF0h1KZGVYgwispRDR1tqRntVeTd0Duvb2q4g"
12353+
"authorization": "Bearer BQBQDhCkzUqE4QBPyqrSyRZbBRp5pdttS7rj9J8qT7OllWuJazqP6CcE-1eGcNoRkxNl9Ds9JCdgY3soi6U"
1235312354
},
1235412355
"errors": {
1235512356
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -12774,6 +12775,7 @@
1277412775
"usernameUnclaimed": "noonewouldeverusethis7"
1277512776
},
1277612777
"TJournal": {
12778+
"similarSearch": true,
1277712779
"tags": [
1277812780
"ru"
1277912781
],
@@ -12900,18 +12902,6 @@
1290012902
"usernameClaimed": "taplink.ru",
1290112903
"usernameUnclaimed": "noonewouldeverusethis77777"
1290212904
},
12903-
"Taringa": {
12904-
"tags": [
12905-
"ar"
12906-
],
12907-
"checkType": "message",
12908-
"absenceStrs": "Moved Permanently",
12909-
"alexaRank": 4125,
12910-
"url": "https://www.taringa.net/{username}",
12911-
"urlMain": "https://taringa.net/",
12912-
"usernameClaimed": "blue",
12913-
"usernameUnclaimed": "noonewouldeverusethis7"
12914-
},
1291512905
"TechPowerUp": {
1291612906
"tags": [
1291712907
"us"
@@ -13690,7 +13680,7 @@
1369013680
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
1369113681
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
1369213682
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
13693-
"x-guest-token": "1372637128920825857"
13683+
"x-guest-token": "1373308975769391104"
1369413684
},
1369513685
"errors": {
1369613686
"Bad guest token": "x-guest-token update required"
@@ -13865,6 +13855,7 @@
1386513855
"usernameUnclaimed": "noonewouldeverusethis7"
1386613856
},
1386713857
"VC.ru": {
13858+
"similarSearch": true,
1386813859
"tags": [
1386913860
"ru"
1387013861
],
@@ -14062,7 +14053,7 @@
1406214053
"video"
1406314054
],
1406414055
"headers": {
14065-
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTYxMDcyNjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.kzWxBf1qCJwjpZYUP6w-Pf4VptBMKpKUaMw8VnYwtPU"
14056+
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTYyNjMwODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.YKtLE0-AGmaXJNF99dVKjPW8z5_-wDs6tnnjVOybDaQ"
1406614057
},
1406714058
"activation": {
1406814059
"url": "https://vimeo.com/_rv/viewer",
@@ -14091,7 +14082,7 @@
1409114082
"usernameClaimed": "blue",
1409214083
"usernameUnclaimed": "noonewouldeverusethis7"
1409314084
},
14094-
"Virtualireland": {
14085+
"VirtualIreland": {
1409514086
"tags": [
1409614087
"ie",
1409714088
"ru"
@@ -23657,6 +23648,21 @@
2365723648
"urlMain": "https://linuxpip.org",
2365823649
"usernameClaimed": "diehard",
2365923650
"usernameUnclaimed": "noonewouldeverusethis7"
23651+
},
23652+
"Taringa": {
23653+
"checkType": "message",
23654+
"presenseStrs": [
23655+
"User",
23656+
" user-username",
23657+
" UserFeed"
23658+
],
23659+
"absenceStrs": [
23660+
"problema"
23661+
],
23662+
"url": "https://www.taringa.net/{username}",
23663+
"urlMain": "https://www.taringa.net",
23664+
"usernameClaimed": "UniversoGIA",
23665+
"usernameUnclaimed": "noonewouldeverusethis7"
2366023666
}
2366123667
},
2366223668
"engines": {
@@ -23727,6 +23733,7 @@
2372723733
],
2372823734
"checkType": "message",
2372923735
"errors": {
23736+
"\u041f\u0440\u043e\u0441\u0442\u0438\u0442\u0435, \u043d\u043e \u0432\u0430\u0448 IP \u0432 \u0441\u043f\u0438\u0441\u043a\u0435 \u0437\u0430\u043f\u0440\u0435\u0449\u0435\u043d\u043d\u044b\u0445 \u0430\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0446\u0438\u0435\u0439 \u0444\u043e\u0440\u0443\u043c\u0430": "IP ban",
2373023737
"You have been banned": "IP ban",
2373123738
"The administrator has banned your IP address": "IP ban",
2373223739
"\u0418\u0437\u0432\u0438\u043d\u0438\u0442\u0435, \u0441\u0435\u0440\u0432\u0435\u0440 \u043f\u0435\u0440\u0435\u0433\u0440\u0443\u0436\u0435\u043d. \u041f\u043e\u0436\u0430\u043b\u0443\u0439\u0441\u0442\u0430, \u043f\u043e\u043f\u0440\u043e\u0431\u0443\u0439\u0442\u0435 \u0437\u0430\u0439\u0442\u0438 \u043f\u043e\u0437\u0436\u0435.": "Server is overloaded"
@@ -23762,13 +23769,29 @@
2376223769
"error404"
2376323770
],
2376423771
"checkType": "message",
23772+
"requestHeadOnly": false,
2376523773
"url": "{urlMain}/author/{username}/"
2376623774
},
2376723775
"presenseStrs": [
2376823776
"/wp-admin",
2376923777
"/wp-includes/wlwmanifest.xml"
2377023778
]
2377123779
},
23780+
"Flarum": {
23781+
"name": "Flarum",
23782+
"site": {
23783+
"presenseStrs": [
23784+
"\"attributes\":{\"username\""
23785+
],
23786+
"absenceStrs": [
23787+
"NotFound"
23788+
],
23789+
"checkType": "message"
23790+
},
23791+
"presenseStrs": [
23792+
"flarum-loading-error"
23793+
]
23794+
},
2377223795
"engine404": {
2377323796
"name": "engine404",
2377423797
"site": {

0 commit comments

Comments
 (0)