feat: add leetcode spider scripts

yanglbme · yanglbme · commit 578f591d587b · 2020-03-15T16:43:06.000+08:00
添加LeetCode爬虫脚本
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ Complete solutions to [LeetCode](https://leetcode-cn.com/problemset/all/), [LCOF
 [中文文档](./README_CN.md)
 
 ## Maintainer
-[@yanglbme](https://github.com/yanglbme): Creator of [@Doocs](https://github.com/doocs) technical community.
+[@yanglbme](https://github.com/yanglbme): Creator of [@Doocs](https://github.com/doocs) technical community; member of [@TheAlgorithms](https://github.com/TheAlgorithms) organization.
 
 ## Sites
 - Gitee Pages: https://doocs.gitee.io/leetcode
diff --git a/README_CN.md b/README_CN.md
@@ -20,7 +20,7 @@
 [English Version](./README.md)
 
 ## 维护者
-[@yanglbme](https://github.com/yanglbme): [Doocs](https://github.com/doocs) 技术社区创建者。
+[@yanglbme](https://github.com/yanglbme): GitHub 技术社区 [@Doocs](https://github.com/doocs) 创建者；[@TheAlgorithms](https://github.com/TheAlgorithms) 组织成员。
 
 ## 站点
 - Gitee Pages: https://doocs.gitee.io/leetcode
diff --git a/index.html b/index.html
@@ -61,7 +61,7 @@
             } else {
               url = 'https://github.com/doocs/leetcode/blob/master/' + vm.route.file
             }
-            var editHtml = '[:memo: Edit Document](' + url + ')\n'
+            var editHtml = '[:memo: Edit on GitHub](' + url + ')\n'
             return editHtml + html
           })
         },
diff --git a/scripts/__init__.py b/scripts/__init__.py
diff --git a/scripts/config.py b/scripts/config.py
@@ -0,0 +1,36 @@
+# Shared settings for the spider scripts: endpoints, HTTP options,
+# retry policy and lookup tables.
+from enum import unique, Enum
+
+# urls
+# GraphQL endpoint; the spider POSTs the question-translation query to it.
+cn_graphql_url = 'https://leetcode-cn.com/graphql'
+# Problem-list endpoints. Note that the full list is read from leetcode.com,
+# while the LCOF and LCCI lists are read from leetcode-cn.com.
+all_problems_url = 'https://leetcode.com/api/problems/all/'
+lcof_problems_url = 'https://leetcode-cn.com/api/problems/lcof/'
+lcci_problems_url = 'https://leetcode-cn.com/api/problems/lcci/'
+
+# http
+# Passed to requests as the `timeout` argument (seconds).
+fetch_timeout = 5
+# Headers sent with every spider request; they mimic a browser XHR call.
+async_headers = {
+    'accept': 'application/json, text/javascript, */*; q=0.01',
+    'content-type': 'application/json',
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                  'Chrome/77.0.3865.120 Safari/537.36',
+    'x-requested-with': 'XMLHttpRequest'
+}
+
+# retrying
+# Fed to retrying.retry: maximum attempts and the random wait window
+# between attempts.
+retry_max_number = 8
+retry_min_random_wait = 50  # ms
+retry_max_random_wait = 100  # ms
+
+# maps
+# Difficulty level (stringified) -> Chinese label shown in the generated
+# table: easy / medium / hard.
+difficulties = {
+    '1': '简单',
+    '2': '中等',
+    '3': '困难'
+}
+
+
+# request
+# HTTP method selector accepted by fetch() through its `method` parameter.
+# @unique guarantees that no two members share a value.
+@unique
+class Req(Enum):
+    GET = 0
+    POST = 1
diff --git a/scripts/fetch.py b/scripts/fetch.py
@@ -0,0 +1,35 @@
+import requests
+from requests import Response
+from retrying import retry
+
+from config import Req, retry_max_number, retry_min_random_wait, retry_max_random_wait, fetch_timeout
+
+
+def need_retry(exception):
+    """Tell the retry decorator whether a failure is worth another attempt.
+
+    Used as the ``retry_on_exception`` predicate of ``retrying.retry``.
+
+    Args:
+        exception: The exception raised by the wrapped call.
+
+    Returns:
+        True for connection errors and timeouts (after printing a notice),
+        False for anything else.
+    """
+    # ConnectTimeout and ReadTimeout both derive from Timeout, so the two
+    # base classes are enough to match every timeout flavour.
+    transient_errors = (requests.ConnectionError, requests.exceptions.Timeout)
+    if not isinstance(exception, transient_errors):
+        return False
+    print('Exception:{} occurred, retrying...'.format(type(exception)))
+    return True
+
+
+def fetch(url, method=Req.GET, **kwargs):
+    """Send an HTTP request with automatic retries.
+
+    Args:
+        url: Address to request.
+        method: ``Req.POST`` issues a POST; any other value issues a GET.
+        **kwargs: Extra keyword arguments forwarded to ``requests``. A
+            ``timeout`` supplied here is honoured; otherwise
+            ``fetch_timeout`` is used.
+
+    Returns:
+        The ``Response`` when the server answers with HTTP 200, or ``None``
+        when the request keeps failing after all retry attempts or raises
+        an error that is not retried.
+    """
+    # Only fill in the default so a caller-supplied timeout is not clobbered.
+    kwargs.setdefault('timeout', fetch_timeout)
+    send = requests.post if method == Req.POST else requests.get
+
+    @retry(stop_max_attempt_number=retry_max_number, wait_random_min=retry_min_random_wait,
+           wait_random_max=retry_max_random_wait, retry_on_exception=need_retry)
+    def _fetch() -> Response:
+        response = send(url, **kwargs)
+        if response.status_code != 200:
+            # Raised as ConnectionError so that need_retry treats a bad
+            # status like any other transient failure.
+            raise requests.ConnectionError('Expected status code 200, but got {}'.format(response.status_code))
+        return response
+
+    try:
+        return _fetch()
+    except Exception as e:
+        # Deliberate best effort: callers test for None instead of handling
+        # exceptions themselves.
+        print('Something got wrong, error msg:{}'.format(e))
+        return None
diff --git a/scripts/leetcode_spider.py b/scripts/leetcode_spider.py
@@ -0,0 +1,107 @@
+import json
+from urllib.parse import quote
+
+from config import async_headers, cn_graphql_url, Req, all_problems_url, difficulties, lcof_problems_url, \
+    lcci_problems_url
+from fetch import fetch
+
+
+def get_cn_questions() -> dict:
+    """Fetch the ID and Chinese title of every question.
+
+    Returns:
+        A dict mapping each question ID (as a string zero-padded to four
+        characters) to its translated title. Empty when the request fails.
+    """
+    query = ('query getQuestionTranslation($lang: String) {\n  translations: '
+             'allAppliedQuestionTranslations(lang: $lang) {\n    title\n    questionId\n    __typename\n  }\n}\n')
+    payload = json.dumps({
+        'operationName': 'getQuestionTranslation',
+        'variables': {},
+        'query': query,
+    })
+
+    resp = fetch(url=cn_graphql_url, method=Req.POST, headers=async_headers, data=payload)
+    if resp is None:
+        return {}
+
+    translations = resp.json()['data']['translations']
+    return {str(item['questionId']).zfill(4): item['title'] for item in translations}
+
+
+def get_all_questions():
+    """Yield one table row for every problem in the full problem list.
+
+    Yields:
+        ``[number_cell, title_cell, difficulty]``: a markdown link to the
+        problem page, a markdown link to the solution README, and the
+        difficulty label. Nothing is yielded when the list cannot be fetched.
+    """
+    translated = get_cn_questions()
+    resp = fetch(url=all_problems_url, headers=async_headers)
+    if resp is None:
+        return
+
+    for entry in resp.json()['stat_status_pairs']:
+        stat = entry['stat']
+        qid = str(stat['question_id']).zfill(4)
+        title = stat['question__title']
+        problem_url = 'https://leetcode-cn.com/problems/' + stat['question__title_slug']
+        readme_path = '/solution/{}/README.md'.format(qid + '.' + quote(title))
+        # Fall back to the English title when no translation is available.
+        shown_title = translated.get(qid) or title
+        yield [
+            '[{}]({})'.format(qid, problem_url),
+            '[{}]({})'.format(shown_title, readme_path),
+            difficulties.get(str(entry['difficulty']['level'])),
+        ]
+
+
+def get_lcof_questions():
+    """Yield one table row for every problem in the LCOF problem list.
+
+    Yields:
+        ``[number_cell, title_cell, difficulty]``: a markdown link to the
+        problem page, a markdown link to the solution README, and the
+        difficulty label. Nothing is yielded when the list cannot be fetched.
+    """
+    resp = fetch(url=lcof_problems_url, headers=async_headers)
+    if resp is None:
+        return None
+
+    for entry in resp.json()['stat_status_pairs']:
+        stat = entry['stat']
+        frontend_id = stat['frontend_question_id']
+        # The number column drops the Chinese "interview question" prefix;
+        # the README path below keeps the full front-end ID.
+        number = frontend_id.replace('面试题', '').strip()
+        title = stat['question__title'].replace(' LCOF', '').strip()
+        problem_url = 'https://leetcode-cn.com/problems/' + stat['question__title_slug']
+        readme_path = '/lcof/{}/README.md'.format(quote(frontend_id + '. ' + title))
+        yield [
+            '[{}]({})'.format(number, problem_url),
+            '[{}]({})'.format(title, readme_path),
+            difficulties.get(str(entry['difficulty']['level'])),
+        ]
+
+
+def get_lcci_questions():
+    """Yield one table row for every problem in the LCCI problem list.
+
+    Yields:
+        ``[number_cell, title_cell, difficulty]``: a markdown link to the
+        problem page, a markdown link to the solution README, and the
+        difficulty label. Nothing is yielded when the list cannot be fetched.
+    """
+    cn_res = get_cn_questions()
+    resp = fetch(url=lcci_problems_url, headers=async_headers)
+    if resp is None:
+        return None
+    res = resp.json()
+    questions = res['stat_status_pairs']
+    for question in questions:
+        stat = question['stat']
+        # get_cn_questions() keys are zero-padded to four characters, so the
+        # lookup key has to be padded the same way.
+        qid = str(stat['question_id']).zfill(4)
+        fe_qid = stat['frontend_question_id']
+        # Strip the Chinese "interview question" prefix for the number column.
+        qno = fe_qid.replace('面试题', '').strip()
+        # Fall back to the untranslated title (as get_all_questions does) so a
+        # missing translation no longer crashes the string concatenation.
+        cn_title = cn_res.get(qid) or stat['question__title']
+        link = 'https://leetcode-cn.com/problems/' + stat['question__title_slug']
+        git_link = '/lcci/{}/README.md'.format(quote(fe_qid + '. ' + cn_title))
+        col1 = '[{}]({})'.format(qno, link)
+        col2 = '[{}]({})'.format(cn_title, git_link)
+        col3 = difficulties.get(str(question['difficulty']['level']))
+        yield [col1, col2, col3]
+
+
+def generate_md_table_for_questions(res):
+    """Print the rows as a markdown table, sorted by their first column.
+
+    Args:
+        res: Iterable of ``[number_cell, title_cell, difficulty]`` rows.
+    """
+    # Header first, so it precedes any output produced while the rows are
+    # being pulled from a lazy iterable.
+    print('\n|  题号  |  题解  |  难度  |\n| --- | --- | --- |')
+    rows = list(res)
+    rows.sort(key=lambda row: row[0])
+    for row in rows:
+        print("|  {}  |  {}  |  {}  |".format(row[0], row[1], row[2]))
+
+    print('-------------------------')
+
+
+if __name__ == '__main__':
+    # Print one table per problem list: full list, then LCOF, then LCCI.
+    for collect in (get_all_questions, get_lcof_questions, get_lcci_questions):
+        generate_md_table_for_questions(collect())
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
@@ -0,0 +1,2 @@
+requests
+retrying

Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,7 @@`
`61`	`61`	`} else {`
`62`	`62`	`url = 'https://github.com/doocs/leetcode/blob/master/' + vm.route.file`
`63`	`63`	`}`
`64`		`- var editHtml = '[:memo: Edit Document](' + url + ')\n'`
	`64`	`+ var editHtml = '[:memo: Edit on GitHub](' + url + ')\n'`
`65`	`65`	`return editHtml + html`
`66`	`66`	`})`
`67`	`67`	`},`