Skip to content

Commit 578f591

Browse files
committed
feat: add leetcode spider scripts
添加LeetCode爬虫脚本
1 parent a3ae61d commit 578f591

8 files changed

+182
-3
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Complete solutions to [LeetCode](https://leetcode-cn.com/problemset/all/), [LCOF
2020
[中文文档](./README_CN.md)
2121

2222
## Maintainer
23-
[@yanglbme](https://github.com/yanglbme): Creator of [@Doocs](https://github.com/doocs) technical community.
23+
[@yanglbme](https://github.com/yanglbme): Creator of [@Doocs](https://github.com/doocs) technical community; member of [@TheAlgorithms](https://github.com/TheAlgorithms) organization.
2424

2525
## Sites
2626
- Gitee Pages: https://doocs.gitee.io/leetcode

README_CN.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
[English Version](./README.md)
2121

2222
## 维护者
23-
[@yanglbme](https://github.com/yanglbme): [Doocs](https://github.com/doocs) 技术社区创建者
23+
[@yanglbme](https://github.com/yanglbme): GitHub 技术社区 [@Doocs](https://github.com/doocs) 创建者;[@TheAlgorithms](https://github.com/TheAlgorithms) 组织成员
2424

2525
## 站点
2626
- Gitee Pages: https://doocs.gitee.io/leetcode

index.html

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
} else {
6262
url = 'https://github.com/doocs/leetcode/blob/master/' + vm.route.file
6363
}
64-
var editHtml = '[:memo: Edit Document](' + url + ')\n'
64+
var editHtml = '[:memo: Edit on GitHub](' + url + ')\n'
6565
return editHtml + html
6666
})
6767
},

scripts/__init__.py

Whitespace-only changes.

scripts/config.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# urls
2+
from enum import unique, Enum
3+
4+
# API endpoints
cn_graphql_url = 'https://leetcode-cn.com/graphql'
all_problems_url = 'https://leetcode.com/api/problems/all/'
lcof_problems_url = 'https://leetcode-cn.com/api/problems/lcof/'
lcci_problems_url = 'https://leetcode-cn.com/api/problems/lcci/'

# HTTP settings
fetch_timeout = 5
async_headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'content-type': 'application/json',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/77.0.3865.120 Safari/537.36'
    ),
    'x-requested-with': 'XMLHttpRequest',
}

# Retry policy
retry_max_number = 8
retry_min_random_wait = 50  # ms
retry_max_random_wait = 100  # ms

# Difficulty level (as a string key) -> label shown in the generated tables
difficulties = {
    str(level): label
    for level, label in enumerate(('简单', '中等', '困难'), start=1)
}
30+
31+
32+
# request
33+
@unique
class Req(Enum):
    """HTTP request methods selectable by callers; values are unique."""

    GET = 0
    POST = 1

scripts/fetch.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import requests
2+
from requests import Response
3+
from retrying import retry
4+
5+
from config import Req, retry_max_number, retry_min_random_wait, retry_max_random_wait, fetch_timeout
6+
7+
8+
def need_retry(exception):
    """Tell whether ``exception`` is a transient network failure worth retrying.

    :param exception: the exception raised by the attempted request.
    :return: ``True`` for connection errors and timeouts (a notice is printed
        in that case), ``False`` for anything else.
    """
    # ConnectTimeout and ReadTimeout are both subclasses of Timeout, so these
    # two base classes match every exception type that is retried.
    retryable = (requests.ConnectionError, requests.exceptions.Timeout)
    if not isinstance(exception, retryable):
        return False
    print('Exception:{} occurred, retrying...'.format(type(exception)))
    return True
17+
18+
19+
def fetch(url, method=Req.GET, **kwargs):
    """Request ``url`` and return the response, retrying transient failures.

    :param url: address to request.
    :param method: ``Req.POST`` issues a POST; any other value issues a GET.
    :param kwargs: extra keyword arguments forwarded to ``requests.get`` /
        ``requests.post``. ``timeout`` defaults to ``fetch_timeout`` when the
        caller does not provide one.
    :return: the ``Response`` when the status code is 200, or ``None`` when
        the request ultimately fails (the error is printed, not raised).
    """
    # Respect a caller-supplied timeout instead of silently overwriting it.
    kwargs.setdefault('timeout', fetch_timeout)
    send = requests.post if method == Req.POST else requests.get

    @retry(stop_max_attempt_number=retry_max_number, wait_random_min=retry_min_random_wait,
           wait_random_max=retry_max_random_wait, retry_on_exception=need_retry)
    def _fetch() -> Response:
        response = send(url, **kwargs)
        if response.status_code != 200:
            # Raised as ConnectionError so that need_retry treats a bad status as retryable.
            raise requests.ConnectionError('Expected status code 200, but got {}'.format(response.status_code))
        return response

    try:
        return _fetch()
    except Exception as e:
        # Best-effort boundary: callers test the result for None instead of catching.
        print('Something got wrong, error msg:{}'.format(e))
        return None

scripts/leetcode_spider.py

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import json
2+
from urllib.parse import quote
3+
4+
from config import async_headers, cn_graphql_url, Req, all_problems_url, difficulties, lcof_problems_url, \
5+
lcci_problems_url
6+
from fetch import fetch
7+
8+
9+
def get_cn_questions() -> dict:
    """Fetch the ID and Chinese title of every question.

    :return: mapping of question ID (as a string, zero-padded to at least
        four characters) to the translated title; an empty dict when the
        request fails.
    """
    query = ('query getQuestionTranslation($lang: String) {\n translations: '
             'allAppliedQuestionTranslations(lang: $lang) {\n title\n questionId\n __typename\n }\n}\n')
    payload = json.dumps({
        'operationName': 'getQuestionTranslation',
        'variables': {},
        'query': query,
    })

    resp = fetch(url=cn_graphql_url, method=Req.POST, headers=async_headers, data=payload)
    if resp is None:
        return {}

    translations = resp.json()['data']['translations']
    return {str(item['questionId']).zfill(4): item['title'] for item in translations}
30+
31+
32+
def get_all_questions():
    """Yield one markdown table row for each problem in the full problem list.

    Every row is ``[number link, solution link, difficulty label]``. Nothing
    is yielded when the problem list cannot be fetched.
    """
    cn_titles = get_cn_questions()
    resp = fetch(url=all_problems_url, headers=async_headers)
    if resp is None:
        return

    for entry in resp.json()['stat_status_pairs']:
        stat = entry['stat']
        qid = str(stat['question_id']).zfill(4)
        title = stat['question__title']
        problem_url = 'https://leetcode-cn.com/problems/' + stat['question__title_slug']
        readme_path = '/solution/{}/README.md'.format(qid + '.' + quote(title))
        # Show the English title when no Chinese translation is known.
        shown_title = cn_titles.get(qid) or title
        level = difficulties.get(str(entry['difficulty']['level']))
        yield [
            '[{}]({})'.format(qid, problem_url),
            '[{}]({})'.format(shown_title, readme_path),
            level,
        ]
51+
52+
53+
def get_lcof_questions():
    """Yield one markdown table row for each LCOF (剑指 Offer) problem.

    Every row is ``[number link, solution link, difficulty label]``. Nothing
    is yielded when the problem list cannot be fetched.
    """
    resp = fetch(url=lcof_problems_url, headers=async_headers)
    if resp is None:
        return

    for entry in resp.json()['stat_status_pairs']:
        stat = entry['stat']
        fe_qid = stat['frontend_question_id']
        # The displayed number drops the '面试题' prefix; the title drops the ' LCOF' marker.
        qno = fe_qid.replace('面试题', '').strip()
        title = stat['question__title'].replace(' LCOF', '').strip()
        problem_url = 'https://leetcode-cn.com/problems/' + stat['question__title_slug']
        readme_path = '/lcof/{}/README.md'.format(quote(fe_qid + '. ' + title))
        level = difficulties.get(str(entry['difficulty']['level']))
        yield [
            '[{}]({})'.format(qno, problem_url),
            '[{}]({})'.format(title, readme_path),
            level,
        ]
70+
71+
72+
def get_lcci_questions():
    """Yield one markdown table row for each LCCI (程序员面试金典) problem.

    Every row is ``[number link, solution link, difficulty label]``. Nothing
    is yielded when the problem list cannot be fetched. When a problem has no
    Chinese translation (including when the translation request itself
    failed), the title reported by the problems API is used instead.
    """
    cn_res = get_cn_questions()
    resp = fetch(url=lcci_problems_url, headers=async_headers)
    if resp is None:
        return

    for question in resp.json()['stat_status_pairs']:
        stat = question['stat']
        fe_qid = stat['frontend_question_id']
        qno = fe_qid.replace('面试题', '').strip()
        # get_cn_questions() zero-pads its keys to four characters, so the
        # lookup key has to be padded the same way to match.
        qid = str(stat['question_id']).zfill(4)
        # Without a fallback a missing translation leaves None here and the
        # string concatenation below raises TypeError.
        cn_title = cn_res.get(qid) or stat['question__title']
        link = 'https://leetcode-cn.com/problems/' + stat['question__title_slug']
        git_link = '/lcci/{}/README.md'.format(quote(fe_qid + '. ' + cn_title))
        col1 = '[{}]({})'.format(qno, link)
        col2 = '[{}]({})'.format(cn_title, git_link)
        col3 = difficulties.get(str(question['difficulty']['level']))
        yield [col1, col2, col3]
91+
92+
93+
def generate_md_table_for_questions(res):
    """Print the given rows as a markdown table, ordered by their first column.

    :param res: iterable of rows; the first three items of each row fill the
        number, solution and difficulty columns.
    """
    rows = list(res)
    rows.sort(key=lambda row: row[0])

    print('\n| 题号 | 题解 | 难度 |\n| --- | --- | --- |')
    for row in rows:
        print("| {} | {} | {} |".format(row[0], row[1], row[2]))

    # Separator between consecutive tables.
    print('-------------------------')
102+
103+
104+
if __name__ == '__main__':
    # Print one table per problem set: the full list, then LCOF, then LCCI.
    for collect in (get_all_questions, get_lcof_questions, get_lcci_questions):
        generate_md_table_for_questions(collect())

scripts/requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
retrying

0 commit comments

Comments
 (0)