Skip to content

Commit c7977f1

Browse files
committed
Draft of graph report
1 parent 49708da commit c7977f1

10 files changed

+231
-24
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ test:
66
coverage html
77

88
rerun-tests:
9-
pytest --lf
9+
pytest --lf -vv
1010

1111
lint:
1212
@echo 'syntax errors or undefined names'

example.ipynb

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"id": "8v6PEfyXb0Gx"
8+
},
9+
"outputs": [],
10+
"source": [
11+
"# clone the repo\n",
12+
"!git clone https://github.com/soxoj/maigret\n",
13+
"!pip3 install -r maigret/requirements.txt"
14+
]
15+
},
16+
{
17+
"cell_type": "code",
18+
"execution_count": null,
19+
"metadata": {
20+
"id": "cXOQUAhDchkl"
21+
},
22+
"outputs": [],
23+
"source": [
24+
"# help\n",
25+
"!python3 maigret/maigret.py --help"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": null,
31+
"metadata": {
32+
"id": "SjDmpN4QGnJu"
33+
},
34+
"outputs": [],
35+
"source": [
36+
"# search\n",
37+
"!python3 maigret/maigret.py user"
38+
]
39+
}
40+
],
41+
"metadata": {
42+
"colab": {
43+
"collapsed_sections": [],
44+
"include_colab_link": true,
45+
"name": "maigret.ipynb",
46+
"provenance": []
47+
},
48+
"kernelspec": {
49+
"display_name": "Python 3",
50+
"language": "python",
51+
"name": "python3"
52+
},
53+
"language_info": {
54+
"codemirror_mode": {
55+
"name": "ipython",
56+
"version": 3
57+
},
58+
"file_extension": ".py",
59+
"mimetype": "text/x-python",
60+
"name": "python",
61+
"nbconvert_exporter": "python",
62+
"pygments_lexer": "ipython3",
63+
"version": "3.7.10"
64+
}
65+
},
66+
"nbformat": 4,
67+
"nbformat_minor": 1
68+
}

maigret/checking.py

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636

3737

3838
SUPPORTED_IDS = (
39+
"username",
3940
"yandex_public_id",
4041
"gaia_id",
4142
"vk_id",

maigret/maigret.py

+15-12
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
save_json_report,
3535
get_plaintext_report,
3636
sort_report_by_data_points,
37+
save_graph_report,
3738
)
3839
from .sites import MaigretDatabase
3940
from .submit import Submitter
@@ -62,17 +63,6 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify):
6263
)
6364

6465

65-
def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict:
    """Collect every identifier that the sites in *db* can extract from *url*.

    Each site in ``db.sites`` is asked to parse the URL via its
    ``extract_id_from_url`` method; falsy answers are ignored.

    Returns a dict mapping the extracted identifier to its id type. When
    several sites yield the same identifier, the last site wins.
    """
    found = {}
    for site in db.sites:
        extracted = site.extract_id_from_url(url)
        if extracted:
            identifier, id_type = extracted
            found[identifier] = id_type
    return found
74-
75-
7666
def extract_ids_from_page(url, logger, timeout=5) -> dict:
7767
results = {}
7868
# url, headers
@@ -118,7 +108,7 @@ def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) -
118108
ids_results[u] = utype
119109

120110
for url in dictionary.get('ids_links', []):
121-
ids_results.update(extract_ids_from_url(url, db))
111+
ids_results.update(db.extract_ids_from_url(url))
122112

123113
return ids_results
124114

@@ -431,6 +421,14 @@ def setup_arguments_parser():
431421
default=False,
432422
help="Generate a PDF report (general report on all usernames).",
433423
)
424+
report_group.add_argument(
425+
"-G",
426+
"--graph",
427+
action="store_true",
428+
dest="graph",
429+
default=False,
430+
help="Generate a graph report (general report on all usernames).",
431+
)
434432
report_group.add_argument(
435433
"-J",
436434
"--json",
@@ -693,6 +691,11 @@ async def main():
693691
save_pdf_report(filename, report_context)
694692
query_notify.warning(f'PDF report on all usernames saved in {filename}')
695693

694+
if args.graph:
695+
filename = report_filepath_tpl.format(username=username, postfix='.html')
696+
save_graph_report(filename, general_results, db)
697+
query_notify.warning(f'Graph report on all usernames saved in {filename}')
698+
696699
text_report = get_plaintext_report(report_context)
697700
if text_report:
698701
query_notify.info('Short text report:')

maigret/report.py

+120
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import ast
12
import csv
23
import io
34
import json
@@ -11,8 +12,12 @@
1112
from dateutil.parser import parse as parse_datetime_str
1213
from jinja2 import Template
1314
from xhtml2pdf import pisa
15+
from pyvis.network import Network
16+
import networkx as nx
1417

18+
from .checking import SUPPORTED_IDS
1519
from .result import QueryStatus
20+
from .sites import MaigretDatabase
1621
from .utils import is_country_tag, CaseConverter, enrich_link_str
1722

1823
SUPPORTED_JSON_REPORT_FORMATS = [
@@ -82,6 +87,121 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
8287
generate_json_report(username, results, f, report_type=report_type)
8388

8489

90+
class MaigretGraph:
    """Thin helper around a graph object that names and styles report nodes.

    The wrapped graph only needs ``add_node(name, **attrs)`` and
    ``add_edge(a, b, **attrs)`` (a ``networkx.Graph`` in this module).
    """

    # Attribute presets attached to each node, chosen by node category.
    other_params = {'size': 10, 'group': 3}
    site_params = {'size': 15, 'group': 2}
    username_params = {'size': 20, 'group': 1}

    def __init__(self, graph):
        self.G = graph

    def add_node(self, key, value):
        """Add a ``'<key>: <value>'`` node and return that node name.

        The node gets the username preset when *key* is one of
        ``SUPPORTED_IDS``, the site preset when *value* starts with
        ``'http'``, and the generic preset otherwise. If *value* contains
        upper-case characters, a lower-cased twin node is added as well and
        linked to this one.
        """
        node_name = f'{key}: {value}'

        if key in SUPPORTED_IDS:
            preset = self.username_params
        elif value.startswith('http'):
            preset = self.site_params
        else:
            preset = self.other_params

        self.G.add_node(node_name, title=node_name, **preset)

        lowered = value.lower()
        if lowered != value:
            # Connect the case-sensitive spelling to its normalized form.
            self.link(node_name, self.add_node(key, lowered))

        return node_name

    def link(self, node1_name, node2_name):
        """Connect two existing nodes with an edge of weight 2."""
        self.G.add_edge(node1_name, node2_name, weight=2)
117+
118+
119+
def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
    """Build a graph of accounts and extracted identifiers and render it.

    Args:
        filename: target passed to pyvis ``Network.show``.
        username_results: iterable of ``(username, id_type, results)``
            tuples, where ``results`` maps a website name to that site's
            result dict (keys read here: ``status``, ``is_similar``,
            ``url_user``).
        db: sites database, used to extract further ids from URLs found in
            the per-site ``ids_data``.

    Only sites whose status is ``QueryStatus.CLAIMED`` are added. Nodes whose
    name is longer than 100 characters are dropped before rendering.
    """
    G = nx.Graph()
    graph = MaigretGraph(G)

    def process_ids(parent_node, ids, site_fallback_name):
        """Attach the id data in *ids* under *parent_node*, recursing into URLs."""
        for k, v in ids.items():
            # Counters, flags and timestamps say nothing about identity.
            if k.endswith(('_count', '_at')) or k.startswith('is_'):
                continue
            # Exact comparison: a substring test against 'image' would also
            # drop unrelated keys such as 'age' or 'im'.
            if k == 'image':
                continue

            v_data = v
            # List values may arrive serialized as their Python repr.
            if isinstance(v, str) and v.startswith('['):
                try:
                    v_data = ast.literal_eval(v)
                except (
                    ValueError,
                    TypeError,
                    SyntaxError,
                    MemoryError,
                    RecursionError,
                ) as e:
                    # Best effort: fall back to treating the value as text.
                    logging.error(e)

            if isinstance(v_data, list):
                # NOTE(review): the list node is not linked to parent_node,
                # same as before this change - confirm whether intended.
                list_node_name = graph.add_node(k, site_fallback_name)
                for vv in v_data:
                    # Elements may be non-strings after literal_eval.
                    vv = str(vv)
                    data_node_name = graph.add_node(vv, site_fallback_name)
                    graph.link(list_node_name, data_node_name)

                    add_ids = {
                        id_type: id_value
                        for id_value, id_type in db.extract_ids_from_url(vv).items()
                    }
                    if add_ids:
                        process_ids(data_node_name, add_ids, site_fallback_name)
                continue

            # Scalar value: coerce so non-string ids do not break node naming.
            value = str(v)
            ids_data_name = graph.add_node(k, value)
            graph.link(parent_node, ids_data_name)

            # A value that is itself an identifier also gets a username node.
            if 'username' in k or k in SUPPORTED_IDS:
                new_username_node_name = graph.add_node('username', value)
                # For k == 'username' both names coincide; skip the self-loop.
                if new_username_node_name != ids_data_name:
                    graph.link(ids_data_name, new_username_node_name)

            add_ids = {
                id_type: id_value
                for id_value, id_type in db.extract_ids_from_url(value).items()
            }
            if add_ids:
                process_ids(ids_data_name, add_ids, site_fallback_name)

    for username, id_type, results in username_results:
        username_node_name = graph.add_node(id_type, username)

        for website_name, dictionary in results.items():
            # TODO: fix no site data issue
            if not dictionary or dictionary.get("is_similar"):
                continue

            status = dictionary.get("status")
            if not status:  # FIXME: currently in case of timeout
                continue

            if status.status != QueryStatus.CLAIMED:
                continue

            site_fallback_name = dictionary.get(
                'url_user', f'{website_name}: {username.lower()}'
            )
            site_node_name = graph.add_node('site', site_fallback_name)
            graph.link(username_node_name, site_node_name)

            if status.ids_data:
                process_ids(site_node_name, status.ids_data, site_fallback_name)

    # Collect first, then remove: the node view must not change while iterated.
    nodes_to_remove = [node for node in G.nodes if len(str(node)) > 100]
    for node in nodes_to_remove:
        G.remove_node(node)

    nt = Network(notebook=True, height="750px", width="100%")
    nt.from_nx(G)
    nt.show(filename)
203+
204+
85205
def get_plaintext_report(context: dict) -> str:
86206
output = (context['brief'] + " ").replace('. ', '.\n')
87207
interests = list(map(lambda x: x[0], context.get('interests_tuple_list', [])))

maigret/resources/data.json

+4-3
Original file line numberDiff line numberDiff line change
@@ -3643,6 +3643,7 @@
36433643
"errors": {
36443644
"Invalid API key": "New API key needed"
36453645
},
3646+
"regexCheck": "^[^/]+$",
36463647
"urlProbe": "https://disqus.com/api/3.0/users/details?user=username%3A{username}&attach=userFlaggedUser&api_key=E8Uh5l5fHZ6gD8U3KycjAIAk46f68Zw7C6eW8WSjZvCLXebZ7p0r1yrYDrLilk2F",
36473648
"checkType": "status_code",
36483649
"presenseStrs": [
@@ -13036,7 +13037,7 @@
1303613037
"us"
1303713038
],
1303813039
"headers": {
13039-
"authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE"
13040+
"authorization": "Bearer BQBbhm9gxBxIDmwZvO8mzV28G7V07L57WlKILvhXijRaTxwh9N03yHxSLADfioU3uWYDAjjq_mMWQSbQ2OA"
1304013041
},
1304113042
"errors": {
1304213043
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -14463,7 +14464,7 @@
1446314464
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
1446414465
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
1446514466
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
14466-
"x-guest-token": "1403829602053771266"
14467+
"x-guest-token": "1404906435025195008"
1446714468
},
1446814469
"errors": {
1446914470
"Bad guest token": "x-guest-token update required"
@@ -14870,7 +14871,7 @@
1487014871
"video"
1487114872
],
1487214873
"headers": {
14873-
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI"
14874+
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM3OTYyNjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.ZUCw6WWIPIoVy1zoj8AKA1EMfX6ao7hJI2pWxgAZlac"
1487414875
},
1487514876
"activation": {
1487614877
"url": "https://vimeo.com/_rv/viewer",

maigret/sites.py

+12
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,18 @@ def get_scan_stats(self, sites_dict):
400400

401401
return found_flags
402402

403+
404+
def extract_ids_from_url(self, url: str) -> dict:
    """Collect every identifier the known sites can extract from *url*.

    Each entry of ``self._sites`` is asked to parse the URL via its
    ``extract_id_from_url`` method; falsy answers are ignored.

    Returns a dict mapping the extracted identifier to its id type. When
    several sites yield the same identifier, the last site wins.
    """
    found = {}
    for site in self._sites:
        extracted = site.extract_id_from_url(url)
        if extracted:
            identifier, id_type = extracted
            found[identifier] = id_type
    return found
413+
414+
403415
def get_db_stats(self, sites_dict):
404416
if not sites_dict:
405417
sites_dict = self.sites_dict()

requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,5 @@ webencodings==0.5.1
3737
xhtml2pdf==0.2.5
3838
XMind==1.2.0
3939
yarl==1.6.3
40+
networkx==2.5.1
41+
pyvis==0.1.9

tests/test_cli.py

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
'disable_recursive_search': False,
1414
'folderoutput': 'reports',
1515
'html': False,
16+
'graph': False,
1617
'id_type': 'username',
1718
'ignore_ids_list': [],
1819
'info': False,

tests/test_maigret.py

+7-8
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from maigret.maigret import (
1010
extract_ids_from_page,
1111
extract_ids_from_results,
12-
extract_ids_from_url,
1312
)
1413
from maigret.sites import MaigretSite
1514
from maigret.result import QueryResult, QueryStatus
@@ -144,18 +143,18 @@ def test_maigret_results(test_db):
144143

145144

146145
def test_extract_ids_from_url(default_db):
147-
assert extract_ids_from_url('https://www.reddit.com/user/test', default_db) == {
146+
assert default_db.extract_ids_from_url('https://www.reddit.com/user/test') == {
148147
'test': 'username'
149148
}
150-
assert extract_ids_from_url('https://vk.com/id123', default_db) == {'123': 'vk_id'}
151-
assert extract_ids_from_url('https://vk.com/ida123', default_db) == {
149+
assert default_db.extract_ids_from_url('https://vk.com/id123') == {'123': 'vk_id'}
150+
assert default_db.extract_ids_from_url('https://vk.com/ida123') == {
152151
'ida123': 'username'
153152
}
154-
assert extract_ids_from_url(
155-
'https://my.mail.ru/yandex.ru/dipres8904/', default_db
153+
assert default_db.extract_ids_from_url(
154+
'https://my.mail.ru/yandex.ru/dipres8904/'
156155
) == {'dipres8904': 'username'}
157-
assert extract_ids_from_url(
158-
'https://reviews.yandex.ru/user/adbced123', default_db
156+
assert default_db.extract_ids_from_url(
157+
'https://reviews.yandex.ru/user/adbced123'
159158
) == {'adbced123': 'yandex_public_id'}
160159

161160

0 commit comments

Comments
 (0)