From 993deb533b5c03e7fad8b631c108f8299e93a727 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Fri, 16 Jun 2017 10:51:49 +0800 Subject: [PATCH 001/399] =?UTF-8?q?[update]=E6=90=9C=E7=B4=A2censys.io?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 172 +++++++++++------------------------- 1 file changed, 52 insertions(+), 120 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 7954c81dc..836dc32e2 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -3,41 +3,23 @@ """ ------------------------------------------------- File Name: GetFreeProxy.py - Description : 抓取免费代理 + Description : 通过关键字扫描censys.io中的疑似ip Author : JHao date: 2016/11/25 ------------------------------------------------- Change Activity: - 2016/11/25: - 这一部分考虑用scrapy框架代替 + 2017/06/15: 通过关键字扫描censys.io中的疑似ip ------------------------------------------------- """ -import re -import requests - -try: - from importlib import reload #py3 实际不会实用,只是为了不显示语法错误 -except: - import sys # py2 - reload(sys) - sys.setdefaultencoding('utf-8') - - - -from Util.utilFunction import robustCrawl, getHtmlTree, getHTMLText - -# for debug to disable insecureWarning -requests.packages.urllib3.disable_warnings() +import json +import requests +import threading -HEADER = {'Connection': 'keep-alive', - 'Cache-Control': 'max-age=0', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, sdch', - 'Accept-Language': 'zh-CN,zh;q=0.8', - } +API_URL = "https://www.censys.io/api/v1/search/" +UID = "45bbe9db-87c9-4256-b1f0-0509037f1e84" +SECRET = "DqtxZX43liWHZPY0gNLkorptTCIaAgyu" +KEY_WORD = ['Squid', 'CCProxy', 'Tinyproxy', 'Wingate', 'Pound', 'Proxy', 'Mikrotik'] class GetFreeProxy(object): @@ -49,99 +31,49 @@ def __init__(self): pass @staticmethod - @robustCrawl #decoration print error if exception happen - def freeProxyFirst(page=10): - """ - 抓取快代理IP http://www.kuaidaili.com/ - :param page: 翻页数 - :return: - """ - url_list = ('http://www.kuaidaili.com/proxylist/{page}/'.format(page=page) for page in range(1, page + 1)) - # 页数不用太多, 后面的全是历史IP, 可用性不高 - - for url in url_list: - tree = getHtmlTree(url) - proxy_list = tree.xpath('.//div[@id="index_free_list"]//tbody/tr') - for proxy in proxy_list: - yield ':'.join(proxy.xpath('./td/text()')[0:2]) - - @staticmethod - @robustCrawl - def freeProxySecond(proxy_number=100): - """ - 抓取代理66 http://www.66ip.cn/ - :param proxy_number: 代理数量 - :return: - """ - url = "http://m.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( - proxy_number) + def scanner_ip(): + for key in KEY_WORD: + print('Search Key: {key}'.format(key=key)) + t = ThreadScanner(key) + t.start() + + +class ThreadScanner(threading.Thread): + def __init__(self, key): + super(ThreadScanner, self).__init__() + self.key = key + self.query = {'query': self.key, 'page': 1, 'fields': ['ip']} + + def run(self): + self.scanner_ip() + + def get_total_page(self): + res = requests.post(API_URL + 'ipv4', data=json.dumps(self.query), + auth=(UID, SECRET), timeout=30) + res_result = res.json() + total_page = res_result.get('metadata').get('pages', 400) + return total_page + + def scanner_ip(self): + total_page = self.get_total_page() + for page in range(1, total_page): + self.query.update({'page': page}) + try: + res = requests.post(API_URL + 'ipv4', data=json.dumps(self.query), + auth=(UID, SECRET), timeout=30) + res_result = res.json() + total_page = res_result.get('metadata').get('pages') + ip_results = res_result.get('results') + except Exception as e: + ip_results = list() + for each_ip in ip_results: + ip = each_ip.get('ip') + print(ip) + if page >= total_page: + break + print('Key {k} page: {p}'.format(k=self.key, p=page)) - html = getHTMLText(url, headers=HEADER) - for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): - yield proxy - - @staticmethod - @robustCrawl - def freeProxyThird(days=1): - """ - 抓取有代理 http://www.youdaili.net/Daili/http/ - :param days: - :return: - """ - url = "http://www.youdaili.net/Daili/http/" - tree = getHtmlTree(url) - page_url_list = tree.xpath('.//div[@class="chunlist"]/ul/li/p/a/@href')[0:days] - for page_url in page_url_list: - html = requests.get(page_url, headers=HEADER).content - # print html - proxy_list = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html) - for proxy in proxy_list: - yield proxy - - @staticmethod - @robustCrawl - def freeProxyFourth(): - """ - 抓取西刺代理 http://api.xicidaili.com/free2016.txt - :return: - """ - url_list = ['http://www.xicidaili.com/nn', # 高匿 - 'http://www.xicidaili.com/nt', # 透明 - ] - for each_url in url_list: - tree = getHtmlTree(each_url) - proxy_list = tree.xpath('.//table[@id="ip_list"]//tr') - for proxy in proxy_list: - yield ':'.join(proxy.xpath('./td/text()')[0:2]) - - @staticmethod - @robustCrawl - def freeProxyFifth(): - """ - 抓取guobanjia http://www.goubanjia.com/free/gngn/index.shtml - :return: - """ - url = "http://www.goubanjia.com/free/gngn/index{page}.shtml" - for page in range(1, 10): - page_url = url.format(page=page) - tree = getHtmlTree(page_url) - proxy_list = tree.xpath('//td[@class="ip"]') - for each_proxy in proxy_list: - yield ''.join(each_proxy.xpath('.//text()')) if __name__ == '__main__': - gg = GetFreeProxy() - # for e in gg.freeProxyFirst(): - # print e - - # for e in gg.freeProxySecond(): - # print e - - # for e in gg.freeProxyThird(): - # print e - # - # for e in gg.freeProxyFourth(): - # print e - - for e in gg.freeProxyFifth(): - print(e) \ No newline at end of file + g = GetFreeProxy() + g.scanner_ip() \ No newline at end of file From f509ba3760b76440f8f6b763289c7f527ae1ca21 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Fri, 16 Jun 2017 17:34:30 +0800 Subject: [PATCH 002/399] =?UTF-8?q?[update]=E6=90=9C=E7=B4=A2censys.io?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 76 ++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 836dc32e2..6034e46a1 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -12,13 +12,22 @@ ------------------------------------------------- """ -import json +from lxml import etree import requests import threading -API_URL = "https://www.censys.io/api/v1/search/" -UID = "45bbe9db-87c9-4256-b1f0-0509037f1e84" -SECRET = "DqtxZX43liWHZPY0gNLkorptTCIaAgyu" +API_URL = "https://www.censys.io/ipv4/_search?q={k}&page={p}" +header = { + 'Host': 'www.censys.io', + 'Connection': 'keep-alive', + 'Accept': '*/*', + 'X-Requested-With': 'XMLHttpRequest', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', + 'Referer': 'https://www.censys.io/', + 'Accept-Encoding': 'gzip, deflate, sdch, br', + 'Accept-Language': 'zh-CN,zh;q=0.8' +} + KEY_WORD = ['Squid', 'CCProxy', 'Tinyproxy', 'Wingate', 'Pound', 'Proxy', 'Mikrotik'] @@ -30,12 +39,26 @@ class GetFreeProxy(object): def __init__(self): pass - @staticmethod - def scanner_ip(): + def scanner_ip(self): + """ + 根据关键字搜索ip + :return: + """ for key in KEY_WORD: - print('Search Key: {key}'.format(key=key)) - t = ThreadScanner(key) - t.start() + for page in range(1, 200): + url = API_URL.format(k=key, p=page) + try: + res = requests.get(url, headers=header, timeout=30, ) + if res.status_code != 200: + break + tree = etree.HTML(res.content) + ip_list_el = tree.xpath('//span[@class="ip"]/a/text()') + ip_list = [each.strip() for each in ip_list_el if each.strip()] + for each in ip_list: + yield each + except Exception as e: + print(e) + print('Key {k} page: {p}'.format(k=key, p=page)) class ThreadScanner(threading.Thread): @@ -47,33 +70,24 @@ def __init__(self, key): def run(self): self.scanner_ip() - def get_total_page(self): - res = requests.post(API_URL + 'ipv4', data=json.dumps(self.query), - auth=(UID, SECRET), timeout=30) - res_result = res.json() - total_page = res_result.get('metadata').get('pages', 400) - return total_page - def scanner_ip(self): - total_page = self.get_total_page() - for page in range(1, total_page): - self.query.update({'page': page}) + for page in range(1, 200): + url = API_URL.format(k=self.key, p=page) try: - res = requests.post(API_URL + 'ipv4', data=json.dumps(self.query), - auth=(UID, SECRET), timeout=30) - res_result = res.json() - total_page = res_result.get('metadata').get('pages') - ip_results = res_result.get('results') + res = requests.get(url, headers=header, timeout=30, proxies={'https': 'https://106.75.87.49:53100'}) + if res.status_code == 429: + break + tree = etree.HTML(res.content) + ip_list_el = tree.xpath('//span[@class="ip"]/a/text()') + ip_list = [each.strip() for each in ip_list_el if each.strip()] + for each in ip_list: + print(each) except Exception as e: - ip_results = list() - for each_ip in ip_results: - ip = each_ip.get('ip') - print(ip) - if page >= total_page: - break + print(e) print('Key {k} page: {p}'.format(k=self.key, p=page)) if __name__ == '__main__': g = GetFreeProxy() - g.scanner_ip() \ No newline at end of file + for each in g.scanner_ip(): + print(each) From 9fa5d8c77d857c8f7d5bd983980a46dde2a3118c Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 21 Jun 2017 11:57:50 +0800 Subject: [PATCH 003/399] bug fix --- Config.ini | 5 ++--- Schedule/ProxyValidSchedule.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Config.ini b/Config.ini index 44dc2c013..59155b6e8 100644 --- a/Config.ini +++ b/Config.ini @@ -1,7 +1,7 @@ [DB] -type = SSDB +type = REDIS host = localhost -port = 8888 +port = 6379 name = proxy [ProxyGetter] @@ -11,4 +11,3 @@ freeProxySecond = 1 freeProxyThird = 1 freeProxyFourth = 1 freeProxyFifth = 1 - diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index b1da94469..e82661bc5 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -47,7 +47,7 @@ def __validProxy(self): # self.db.delete(each_proxy) self.log.info('validProxy_b: {} validation fail'.format(each_proxy)) value = self.db.getvalue(each_proxy) - if value and value < -5: + if value and int(value) < -5: # 计数器小于-5删除该代理 self.db.delete(each_proxy) self.log.info('validProxy_a running normal') From 95ce966b2af75938a96daad7d1a4b5f6e53d1d8b Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 21 Jun 2017 12:02:28 +0800 Subject: [PATCH 004/399] bug fix --- Util/utilFunction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Util/utilFunction.py b/Util/utilFunction.py index 11edd1be9..f368d43ea 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -28,7 +28,8 @@ def getHTMLText(url, headers={'user': 'Mozilla/5.0'}): response.encoding = response.apparent_encoding return response.text except: - return response.status_code + return + # return response.status_code # noinspection PyPep8Naming From b4400903664b34ad5e8e436b731bb0dae800d90c Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Tue, 4 Jul 2017 12:58:50 +0800 Subject: [PATCH 005/399] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b00fd34b6..357d088ad 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ * 代理IP从何而来?   刚自学爬虫的时候没有代理IP就去西刺、快代理之类有免费代理的网站去爬,还是有个别代理能用。当然,如果你有更好的代理接口也可以自己接入。 +     免费代理的采集也很简单,无非就是:访问页面页面 —> 正则/xpath提取 —> 保存 * 如何保证代理质量? From 2f2646d476a1b74292c51d31ac8a98adce612ae4 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Wed, 26 Jul 2017 17:30:41 +0800 Subject: [PATCH 006/399] Update Config.ini --- Config.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Config.ini b/Config.ini index 59155b6e8..bb74f3880 100644 --- a/Config.ini +++ b/Config.ini @@ -1,7 +1,7 @@ [DB] -type = REDIS +type = SSDB host = localhost -port = 6379 +port = 8888 name = proxy [ProxyGetter] From 3ed97d84d503d6b79d8f35e51e4f005f6cb95a6f Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Mon, 31 Jul 2017 15:14:51 +0800 Subject: [PATCH 007/399] [add] testGetConfig.py --- Test/testGetConfig.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 Test/testGetConfig.py diff --git a/Test/testGetConfig.py b/Test/testGetConfig.py new file mode 100644 index 000000000..7f44fa6b4 --- /dev/null +++ b/Test/testGetConfig.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: testGetConfig + Description : test all function in GetConfig.py + Author : J_hao + date: 2017/7/31 +------------------------------------------------- + Change Activity: + 2017/7/31: +------------------------------------------------- +""" +__author__ = 'J_hao' + +from Util.GetConfig import GetConfig + + +# noinspection PyPep8Naming +def testGetConfig(): + """ + test class GetConfig in Util/GetConfig + :return: + """ + gg = GetConfig() + print(gg.db_type) + print(gg.db_name) + print(gg.db_host) + print(gg.db_port) + assert isinstance(gg.proxy_getter_functions, list) + print(gg.proxy_getter_functions) + +if __name__ == '__main__': + testGetConfig() From f35d9e50a284254a60298cf362c0799315fd4a42 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Mon, 31 Jul 2017 15:31:10 +0800 Subject: [PATCH 008/399] [add] testGetFreeProxy.py --- Test/testGetFreeProxy.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 Test/testGetFreeProxy.py diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py new file mode 100644 index 000000000..df99c79f3 --- /dev/null +++ b/Test/testGetFreeProxy.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: testGetFreeProxy + Description : test model ProxyGetter/getFreeProxy + Author : J_hao + date: 2017/7/31 +------------------------------------------------- + Change Activity: + 2017/7/31:function testGetFreeProxy +------------------------------------------------- +""" +__author__ = 'J_hao' + +from ProxyGetter.getFreeProxy import GetFreeProxy +from Util.GetConfig import GetConfig + + +# noinspection PyPep8Naming +def testGetFreeProxy(): + """ + test class GetFreeProxy in ProxyGetter/GetFreeProxy + :return: + """ + gc = GetConfig() + proxy_getter_functions = gc.proxy_getter_functions + for proxyGetter in proxy_getter_functions: + proxy_count = 0 + for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): + if proxy: + print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) + proxy_count += 1 + assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) + + +if __name__ == '__main__': + testGetFreeProxy() From a3827cff50545bdee323ba1f87603ba551f9bffa Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Mon, 31 Jul 2017 16:00:33 +0800 Subject: [PATCH 009/399] [add] add WebRequest --- .gitignore | 1 - Test/testWebRequest.py | 30 ++++++++++++++++ Util/WebRequest.py | 82 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 Test/testWebRequest.py create mode 100644 Util/WebRequest.py diff --git a/.gitignore b/.gitignore index 13d6f2b6d..f09264408 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ .idea/ *.pyc *.log -test/ diff --git a/Test/testWebRequest.py b/Test/testWebRequest.py new file mode 100644 index 000000000..07dd54762 --- /dev/null +++ b/Test/testWebRequest.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: testWebRequest + Description : test class WebRequest + Author : J_hao + date: 2017/7/31 +------------------------------------------------- + Change Activity: + 2017/7/31: function testWebRequest +------------------------------------------------- +""" +__author__ = 'J_hao' + +from Util.WebRequest import WebRequest + + +# noinspection PyPep8Naming +def testWebRequest(): + """ + test class WebRequest in Util/WebRequest.py + :return: + """ + wr = WebRequest() + request_object = wr.get('https://www.baidu.com/') + assert request_object.status_code == 200 + + +if __name__ == '__main__': + testWebRequest() diff --git a/Util/WebRequest.py b/Util/WebRequest.py new file mode 100644 index 000000000..5f012111b --- /dev/null +++ b/Util/WebRequest.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: WebRequest + Description : Network Requests Class + Author : J_hao + date: 2017/7/31 +------------------------------------------------- + Change Activity: + 2017/7/31: +------------------------------------------------- +""" +__author__ = 'J_hao' + +import requests +import random +import time + + +class WebRequest(object): + def __init__(self, *args, **kwargs): + pass + + @property + def user_agent(self): + """ + return an User-Agent at random + :return: + """ + ua_list = [ + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71', + 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', + 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', + 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', + ] + return random.choice(ua_list) + + @property + def header(self): + """ + basic header + :return: + """ + return {'User-Agent': self.user_agent, + 'Accept': '*/*', + 'Connection': 'keep-alive', + 'Accept-Language': 'zh-CN,zh;q=0.8'} + + def get(self, url, header=None, retry_time=5, timeout=30, + retry_flag=list(), retry_interval=5, *args, **kwargs): + """ + get method + :param url: target url + :param header: headers + :param retry_time: retry time when network error + :param timeout: network timeout + :param retry_flag: if retry_flag in content. do retry + :param retry_interval: retry interval(second) + :param args: + :param kwargs: + :return: + """ + headers = self.header + if header and isinstance(header, dict): + headers.update(header) + while True: + try: + html = requests.get(url, headers=headers, timeout=timeout) + if filter(lambda key: key in html.content, retry_flag): + raise Exception + else: + return html + except Exception as e: + print(e) + retry_time -= 1 + if retry_time <= 0: + return + time.sleep(retry_interval) From 45c2aefc722f100dec234c51c0ebdae378d509c1 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Mon, 31 Jul 2017 16:43:56 +0800 Subject: [PATCH 010/399] [update] update request --- Util/utilFunction.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Util/utilFunction.py b/Util/utilFunction.py index f368d43ea..75f04e454 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -12,11 +12,10 @@ ------------------------------------------------- """ import requests -from requests.packages.urllib3.exceptions import InsecureRequestWarning - -requests.packages.urllib3.disable_warnings(InsecureRequestWarning) +from lxml import etree from Util.LogHandler import LogHandler +from Util.WebRequest import WebRequest logger = LogHandler(__name__) @@ -44,7 +43,8 @@ def decorate(*args, **kwargs): return decorate -def verifyProxy(proxy): +# noinspection PyPep8Naming +def verifyProxyFormat(proxy): """ 检查代理格式 :param proxy: @@ -55,6 +55,7 @@ def verifyProxy(proxy): return True if re.findall(verify_regex, proxy) else False +# noinspection PyPep8Naming def getHtmlTree(url, **kwargs): """ 获取html树 @@ -62,8 +63,7 @@ def getHtmlTree(url, **kwargs): :param kwargs: :return: """ - import requests - from lxml import etree + header = {'Connection': 'keep-alive', 'Cache-Control': 'max-age=0', 'Upgrade-Insecure-Requests': '1', @@ -73,19 +73,21 @@ def getHtmlTree(url, **kwargs): 'Accept-Language': 'zh-CN,zh;q=0.8', } # TODO 取代理服务器用代理服务器访问 - html = requests.get(url=url, headers=header, timeout=30).content + wr = WebRequest() + html = wr.get(url=url, header=header).content return etree.HTML(html) +# noinspection PyPep8Naming def validUsefulProxy(proxy): """ - 检验代理可以性 + 检验代理是否可用 :param proxy: :return: """ proxies = {"https": "https://{proxy}".format(proxy=proxy)} try: - # 超过20秒的代理就不要了 + # 超过40秒的代理就不要了 r = requests.get('https://www.baidu.com', proxies=proxies, timeout=40, verify=False) if r.status_code == 200: logger.debug('%s is ok' % proxy) From 0a7ad305de01599592b1c2e5be98381ad944f329 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Tue, 1 Aug 2017 14:22:55 +0800 Subject: [PATCH 011/399] =?UTF-8?q?[update]=E4=BF=AE=E6=94=B9=E4=BB=A3?= =?UTF-8?q?=E7=90=86IP=E6=8A=93=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 2 + ProxyGetter/getFreeProxy.py | 79 ++++++++++++++++--------------------- Test/__init__.py | 13 ++++++ Util/utilFunction.py | 11 ------ 4 files changed, 48 insertions(+), 57 deletions(-) create mode 100644 Test/__init__.py diff --git a/Config.ini b/Config.ini index bb74f3880..81d6b6a1c 100644 --- a/Config.ini +++ b/Config.ini @@ -1,4 +1,6 @@ [DB] +;Configure the database information +;type: SSDB/REDIS type = SSDB host = localhost port = 8888 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 7954c81dc..e30a8b749 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -8,37 +8,26 @@ date: 2016/11/25 ------------------------------------------------- Change Activity: - 2016/11/25: - 这一部分考虑用scrapy框架代替 + 2016/11/25: ------------------------------------------------- """ import re import requests try: - from importlib import reload #py3 实际不会实用,只是为了不显示语法错误 + from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 except: - import sys # py2 + import sys # py2 + reload(sys) sys.setdefaultencoding('utf-8') - - - -from Util.utilFunction import robustCrawl, getHtmlTree, getHTMLText +from Util.utilFunction import robustCrawl, getHtmlTree +from Util.WebRequest import WebRequest # for debug to disable insecureWarning requests.packages.urllib3.disable_warnings() -HEADER = {'Connection': 'keep-alive', - 'Cache-Control': 'max-age=0', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, sdch', - 'Accept-Language': 'zh-CN,zh;q=0.8', - } - class GetFreeProxy(object): """ @@ -49,21 +38,22 @@ def __init__(self): pass @staticmethod - @robustCrawl #decoration print error if exception happen + @robustCrawl # decoration print error if exception happen def freeProxyFirst(page=10): """ - 抓取快代理IP http://www.kuaidaili.com/ - :param page: 翻页数 + 抓取无忧代理 http://www.data5u.com/ + :param page: 页数 :return: """ - url_list = ('http://www.kuaidaili.com/proxylist/{page}/'.format(page=page) for page in range(1, page + 1)) - # 页数不用太多, 后面的全是历史IP, 可用性不高 - + url_list = ['http://www.data5u.com/', + 'http://www.data5u.com/free/', + 'http://www.data5u.com/free/gngn/index.shtml', + 'http://www.data5u.com/free/gnpt/index.shtml'] for url in url_list: - tree = getHtmlTree(url) - proxy_list = tree.xpath('.//div[@id="index_free_list"]//tbody/tr') - for proxy in proxy_list: - yield ':'.join(proxy.xpath('./td/text()')[0:2]) + html_tree = getHtmlTree(url) + ul_list = html_tree.xpath('//ul[@class="l2"]') + for ul in ul_list: + yield ':'.join(ul.xpath('.//li/text()')[0:2]) @staticmethod @robustCrawl @@ -73,10 +63,10 @@ def freeProxySecond(proxy_number=100): :param proxy_number: 代理数量 :return: """ - url = "http://m.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( - proxy_number) - - html = getHTMLText(url, headers=HEADER) + url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( + proxy_number) + request = WebRequest() + html = request.get(url).content for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): yield proxy @@ -84,19 +74,15 @@ def freeProxySecond(proxy_number=100): @robustCrawl def freeProxyThird(days=1): """ - 抓取有代理 http://www.youdaili.net/Daili/http/ + 抓取ip181 http://www.ip181.com/ :param days: :return: """ - url = "http://www.youdaili.net/Daili/http/" - tree = getHtmlTree(url) - page_url_list = tree.xpath('.//div[@class="chunlist"]/ul/li/p/a/@href')[0:days] - for page_url in page_url_list: - html = requests.get(page_url, headers=HEADER).content - # print html - proxy_list = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html) - for proxy in proxy_list: - yield proxy + url = 'http://www.ip181.com/' + html_tree = getHtmlTree(url) + tr_list = html_tree.xpath('//tr')[1:] + for tr in tr_list: + yield ':'.join(tr.xpath('./td/text()')[0:2]) @staticmethod @robustCrawl @@ -129,6 +115,7 @@ def freeProxyFifth(): for each_proxy in proxy_list: yield ''.join(each_proxy.xpath('.//text()')) + if __name__ == '__main__': gg = GetFreeProxy() # for e in gg.freeProxyFirst(): @@ -139,9 +126,9 @@ def freeProxyFifth(): # for e in gg.freeProxyThird(): # print e - # - # for e in gg.freeProxyFourth(): - # print e - for e in gg.freeProxyFifth(): - print(e) \ No newline at end of file + for e in gg.freeProxyFourth(): + print e + + # for e in gg.freeProxyFifth(): + # print(e) diff --git a/Test/__init__.py b/Test/__init__.py new file mode 100644 index 000000000..898942953 --- /dev/null +++ b/Test/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: __init__.py + Description : + Author : J_hao + date: 2017/7/31 +------------------------------------------------- + Change Activity: + 2017/7/31: +------------------------------------------------- +""" +__author__ = 'J_hao' diff --git a/Util/utilFunction.py b/Util/utilFunction.py index 75f04e454..eef32c098 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -20,17 +20,6 @@ logger = LogHandler(__name__) -def getHTMLText(url, headers={'user': 'Mozilla/5.0'}): - try: - response = requests.get(url, headers=headers, timeout=10) - response.raise_for_status() - response.encoding = response.apparent_encoding - return response.text - except: - return - # return response.status_code - - # noinspection PyPep8Naming def robustCrawl(func): def decorate(*args, **kwargs): From 20e39bfd1a1b580fde51f6fefe6635626f66f809 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Wed, 2 Aug 2017 09:48:50 +0800 Subject: [PATCH 012/399] [update]new function restName in LogHandler --- Test/testLogHandler.py | 35 +++++++++++++++++++++++++++++++++++ Util/LogHandler.py | 14 ++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 Test/testLogHandler.py diff --git a/Test/testLogHandler.py b/Test/testLogHandler.py new file mode 100644 index 000000000..da309b707 --- /dev/null +++ b/Test/testLogHandler.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: testLogHandler + Description : + Author : J_hao + date: 2017/8/2 +------------------------------------------------- + Change Activity: + 2017/8/2: +------------------------------------------------- +""" +__author__ = 'J_hao' + +from Util.LogHandler import LogHandler + + +# noinspection PyPep8Naming +def testLogHandler(): + """ + test function LogHandler in Util/LogHandler + :return: + """ + log = LogHandler('test') + log.info('this is a log from test') + + log.resetName(name='test1') + log.info('this is a log from test1') + + log.resetName(name='test2') + log.info('this is a log from test2') + + +if __name__ == '__main__': + testLogHandler() diff --git a/Util/LogHandler.py b/Util/LogHandler.py index b62ba6dbe..2ce7b9d47 100644 --- a/Util/LogHandler.py +++ b/Util/LogHandler.py @@ -62,6 +62,7 @@ def __setFileHandler__(self, level=None): formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') file_handler.setFormatter(formatter) + self.file_handler = file_handler self.addHandler(file_handler) def __setStreamHandler__(self, level=None): @@ -79,8 +80,17 @@ def __setStreamHandler__(self, level=None): stream_handler.setLevel(level) self.addHandler(stream_handler) + def resetName(self, name): + """ + reset name + :param name: + :return: + """ + self.name = name + self.removeHandler(self.file_handler) + self.__setFileHandler__() + if __name__ == '__main__': - log = LogHandler(__name__) + log = LogHandler('test') log.info('this is a test msg') - pass From c137cb80013495501be91b3d17207d84230bbad1 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Thu, 3 Aug 2017 17:46:40 +0800 Subject: [PATCH 013/399] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 357d088ad..e1bae08c0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ 爬虫代理IP池 ======= -[![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool)[![Yii2](https://img.shields.io/badge/Powered_by-Yii_Framework-green.svg?style=flat)](http://www.spiderpy.cn/) +[![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool)[![Yii2](https://img.shields.io/badge/Powered_by-Yii_Framework-green.svg?style=flat)](http://www.spiderpy.cn/) [![Requirements Status](https://requires.io/github/jhao104/proxy_pool/requirements.svg?branch=master)](https://requires.io/github/jhao104/proxy_pool/requirements/?branch=master) > 在公司做分布式深网爬虫,搭建了一套稳定的代理池服务,为上千个爬虫提供有效的代理,保证各个爬虫拿到的都是对应网站有效的代理IP,从而保证爬虫快速稳定的运行,当然在公司做的东西不能开源出来。不过呢,闲暇时间手痒,所以就想利用一些免费的资源搞一个简单的代理池服务。 From c71fa1e6aaa89a8f673a8ba40fce965fb1dce398 Mon Sep 17 00:00:00 2001 From: bobobo80 Date: Thu, 24 Aug 2017 22:45:33 +0800 Subject: [PATCH 014/399] =?UTF-8?q?if=20filter()=E7=9A=84=E5=88=A4?= =?UTF-8?q?=E6=96=AD=E5=9C=A8python3=E4=B8=AD=E8=BF=94=E5=9B=9E=E4=B8=BAfi?= =?UTF-8?q?lter=E5=AF=B9=E8=B1=A1=EF=BC=8C=E5=8D=B3=E4=BD=BF=E4=B8=BA?= =?UTF-8?q?=E7=A9=BA=EF=BC=8Cif=E5=88=A4=E6=96=AD=E4=B9=9F=E4=BC=9A?= =?UTF-8?q?=E5=88=A4=E5=AE=9A=E4=B8=BATrue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Util/WebRequest.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Util/WebRequest.py b/Util/WebRequest.py index 5f012111b..b7c70b12e 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -70,12 +70,15 @@ def get(self, url, header=None, retry_time=5, timeout=30, while True: try: html = requests.get(url, headers=headers, timeout=timeout) - if filter(lambda key: key in html.content, retry_flag): - raise Exception - else: - return html + # if filter(lambda key: key in html.content, retry_flag): + # 原filter语句执行if判断所有情况均为True情况,python3与python2的区别? + # python3中filter返回filter对象,即使为空,if会判断为True + # python2中filter返回list对象,为空,if判断为False + for f in retry_flag: + if f in html.content: + raise Exception + return html except Exception as e: - print(e) retry_time -= 1 if retry_time <= 0: return From a6d40f797027542de00e63f002a205d5a77d5cc0 Mon Sep 17 00:00:00 2001 From: bobobo80 Date: Thu, 24 Aug 2017 22:48:58 +0800 Subject: [PATCH 015/399] =?UTF-8?q?if=20filter()=E7=9A=84=E5=88=A4?= =?UTF-8?q?=E6=96=AD=E5=9C=A8python3=E4=B8=AD=E8=BF=94=E5=9B=9E=E4=B8=BAfi?= =?UTF-8?q?lter=E5=AF=B9=E8=B1=A1=EF=BC=8C=E5=8D=B3=E4=BD=BF=E4=B8=BA?= =?UTF-8?q?=E7=A9=BA=EF=BC=8Cif=E5=88=A4=E6=96=AD=E4=B9=9F=E4=BC=9A?= =?UTF-8?q?=E5=88=A4=E5=AE=9A=E4=B8=BATrue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Util/WebRequest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Util/WebRequest.py b/Util/WebRequest.py index b7c70b12e..82ef20ef9 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -79,6 +79,7 @@ def get(self, url, header=None, retry_time=5, timeout=30, raise Exception return html except Exception as e: + print(e) retry_time -= 1 if retry_time <= 0: return From d961375ada205f99abce9abc2fd05168f3c0668d Mon Sep 17 00:00:00 2001 From: bobobo80 Date: Fri, 25 Aug 2017 17:34:08 +0800 Subject: [PATCH 016/399] =?UTF-8?q?redis=E5=9C=A8py3=E4=B8=AD=E8=BF=94?= =?UTF-8?q?=E5=9B=9Ebytes=E7=B1=BB=E5=9E=8B=EF=BC=8C=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=B8=A4=E5=A4=84=E8=BF=94=E5=9B=9E=E6=97=B6=E7=9A=84=E8=A7=A3?= =?UTF-8?q?=E7=A0=81=E6=93=8D=E4=BD=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DB/RedisClient.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/DB/RedisClient.py b/DB/RedisClient.py index 512103954..20e4bb4bb 100644 --- a/DB/RedisClient.py +++ b/DB/RedisClient.py @@ -9,7 +9,7 @@ import json import random import redis - +import sys class RedisClient(object): """ @@ -33,7 +33,14 @@ def get(self): :return: """ key = self.__conn.hgetall(name=self.name) - return random.choice(key.keys()) if key else None + # return random.choice(key.keys()) if key else None + # key.keys()在python3中返回dict_keys,不支持index,不能直接使用random.choice + # 另:python3中,redis返回为bytes,需要解码 + rkey = random.choice(list(key.keys())) if key else None + if isinstance(rkey, bytes): + return rkey.decode('utf-8') + else: + return rkey # return self.__conn.srandmember(name=self.name) def put(self, key): @@ -74,7 +81,12 @@ def inckey(self, key, value): self.__conn.hincrby(self.name, key, value) def getAll(self): - return self.__conn.hgetall(self.name).keys() + # return self.__conn.hgetall(self.name).keys() + # python3 redis返回bytes类型,需要解码 + if sys.version_info.major == 3: + return [key.decode('utf-8') for key in self.__conn.hgetall(self.name).keys()] + else: + return self.__conn.hgetall(self.name).keys() # return self.__conn.smembers(self.name) def get_status(self): From 551b35a1c32edf2c7476edb511fb94b2aa71a073 Mon Sep 17 00:00:00 2001 From: bobobo80 Date: Fri, 25 Aug 2017 21:53:54 +0800 Subject: [PATCH 017/399] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E7=AC=AC=E4=BA=94?= =?UTF-8?q?=E4=B8=AA=E4=BB=A3=E7=90=86=E7=BD=91=E7=AB=99goubanjia.com?= =?UTF-8?q?=E7=9A=84ip:port=E6=8A=93=E5=8F=96=E6=96=B9=E5=BC=8F=EF=BC=8C?= =?UTF-8?q?=E7=AD=9B=E9=80=89=E9=A1=B5=E9=9D=A2=E4=B8=AD=E9=9A=90=E8=97=8F?= =?UTF-8?q?=E7=9A=84=E6=B7=B7=E6=B7=86=E6=95=B0=E5=AD=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index e30a8b749..717eedcc6 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -66,7 +66,9 @@ def freeProxySecond(proxy_number=100): url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( proxy_number) request = WebRequest() - html = request.get(url).content + # html = request.get(url).content + # content为未解码,text为解码后的字符串 + html = request.get(url).text for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): yield proxy @@ -112,8 +114,18 @@ def freeProxyFifth(): page_url = url.format(page=page) tree = getHtmlTree(page_url) proxy_list = tree.xpath('//td[@class="ip"]') + # 此网站有隐藏的数字干扰,或抓取到多余的数字或.符号 + # 需要过滤掉

的内容 + xpath_str = """.//*[not(contains(@style, 'display: none')) + and not(contains(@style, 'display:none')) + and not(contains(@class, 'port')) + ]/text() + """ for each_proxy in proxy_list: - yield ''.join(each_proxy.xpath('.//text()')) + # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port + ip_addr = ''.join(each_proxy.xpath(xpath_str)) + port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0] + yield '{}:{}'.format(ip_addr, port) if __name__ == '__main__': @@ -127,8 +139,8 @@ def freeProxyFifth(): # for e in gg.freeProxyThird(): # print e - for e in gg.freeProxyFourth(): - print e + # for e in gg.freeProxyFourth(): + # print(e) - # for e in gg.freeProxyFifth(): - # print(e) + for e in gg.freeProxyFifth(): + print(e) From 492b17df0f7b108a867dca9b943df074dedfeab5 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Tue, 29 Aug 2017 16:47:43 +0800 Subject: [PATCH 018/399] Create LICENSE --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..158b3ac08 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 J_hao104 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From 3dfff5d7a1077d6c3f3be895c9a436103c12f16e Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Tue, 29 Aug 2017 16:49:31 +0800 Subject: [PATCH 019/399] Set theme jekyll-theme-time-machine --- _config.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 _config.yml diff --git a/_config.yml b/_config.yml new file mode 100644 index 000000000..ddeb671b6 --- /dev/null +++ b/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-time-machine \ No newline at end of file From 1de1d7d84111231f2b0a71d79e7ff3c29f5642a1 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Tue, 29 Aug 2017 17:16:47 +0800 Subject: [PATCH 020/399] Update README.md --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e1bae08c0..8102b5a83 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,12 @@ 爬虫代理IP池 ======= -[![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool)[![Yii2](https://img.shields.io/badge/Powered_by-Yii_Framework-green.svg?style=flat)](http://www.spiderpy.cn/) [![Requirements Status](https://requires.io/github/jhao104/proxy_pool/requirements.svg?branch=master)](https://requires.io/github/jhao104/proxy_pool/requirements/?branch=master) +[![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool) +[![](https://img.shields.io/badge/Powered%20by-@j_hao104-green.svg)](http://www.spiderpy.cn/blog/) +[![Requirements Status](https://requires.io/github/jhao104/proxy_pool/requirements.svg?branch=master)](https://requires.io/github/jhao104/proxy_pool/requirements/?branch=master) +[![Packagist](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/jhao104/proxy_pool/blob/master/LICENSE) +[![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool) +[![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool) > 在公司做分布式深网爬虫,搭建了一套稳定的代理池服务,为上千个爬虫提供有效的代理,保证各个爬虫拿到的都是对应网站有效的代理IP,从而保证爬虫快速稳定的运行,当然在公司做的东西不能开源出来。不过呢,闲暇时间手痒,所以就想利用一些免费的资源搞一个简单的代理池服务。 From 5d05635012e8f65f829b8b3cd2bdac5176d174e1 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Tue, 29 Aug 2017 17:17:43 +0800 Subject: [PATCH 021/399] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8102b5a83..e32239e32 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![](https://img.shields.io/badge/Powered%20by-@j_hao104-green.svg)](http://www.spiderpy.cn/blog/) [![Requirements Status](https://requires.io/github/jhao104/proxy_pool/requirements.svg?branch=master)](https://requires.io/github/jhao104/proxy_pool/requirements/?branch=master) [![Packagist](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/jhao104/proxy_pool/blob/master/LICENSE) -[![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool) +[![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool/graphs/contributors) [![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool) From 11d0580fc35bc6fbb310dcf42b92a7a158be990a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E5=BB=BA=E5=88=9A?= Date: Fri, 1 Sep 2017 18:16:33 +0800 Subject: [PATCH 022/399] build docker image with Dockerfile, or run docker run -p 5000:5000 -d gladmo/proxy:latest --- Dockerfile | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..34c49f836 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,39 @@ +FROM python:2.7 + +WORKDIR /usr/src/app + +COPY . . + +ENV DEBIAN_FRONTEND noninteractive +ENV TZ Asia/Shanghai + +RUN pip install --no-cache-dir -r requirements.txt && \ + apt-get update && \ + apt-get install -y --force-yes git make gcc g++ autoconf && apt-get clean && \ + git clone --depth 1 https://github.com/ideawu/ssdb.git ssdb && \ + cd ssdb && make && make install && cp ssdb-server /usr/bin && \ + apt-get remove -y --force-yes git make gcc g++ autoconf && \ + apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + cp ssdb.conf /etc && cd .. && yes | rm -r ssdb && \ + + mkdir -p /var/lib/ssdb && \ + sed \ + -e 's@home.*@home /var/lib@' \ + -e 's/loglevel.*/loglevel info/' \ + -e 's@work_dir = .*@work_dir = /var/lib/ssdb@' \ + -e 's@pidfile = .*@pidfile = /run/ssdb.pid@' \ + -e 's@level:.*@level: info@' \ + -e 's@ip:.*@ip: 0.0.0.0@' \ + -i /etc/ssdb.conf && \ + + echo "# ! /bin/sh " > /usr/src/app/run.sh && \ + echo "cd Run" >> /usr/src/app/run.sh && \ + echo "/usr/bin/ssdb-server /etc/ssdb.conf &" >> /usr/src/app/run.sh && \ + echo "python main.py" >> /usr/src/app/run.sh && \ + + chmod 777 run.sh + +EXPOSE 5000 + +CMD [ "sh", "run.sh" ] \ No newline at end of file From b8fb4b70978258e459d6d6bbe81e71a90c4bad26 Mon Sep 17 00:00:00 2001 From: Ye Date: Sat, 2 Sep 2017 22:41:42 -0700 Subject: [PATCH 023/399] =?UTF-8?q?=E6=AF=8F=E6=AC=A1=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E6=96=B0=E4=BB=A3=E7=90=86=E5=8E=BB=E9=87=8D=E6=97=B6=EF=BC=8C?= =?UTF-8?q?=E9=87=8D=E6=96=B0=E8=AE=A1=E7=AE=97=E5=89=A9=E4=BD=99=E4=BB=A3?= =?UTF-8?q?=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyRefreshSchedule.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index db996a35c..bfca1a68f 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -48,9 +48,10 @@ def validProxy(self): self.db.changeTable(self.raw_proxy_queue) raw_proxy = self.db.pop() self.log.info('%s start validProxy_a' % time.ctime()) - exist_proxy = self.db.getAll() + # 计算剩余代理,用来 + remaining_proxies = self.db.getAll() while raw_proxy: - if validUsefulProxy(raw_proxy) and (raw_proxy not in exist_proxy): + if (raw_proxy not in remaining_proxies) and validUsefulProxy(raw_proxy): self.db.changeTable(self.useful_proxy_queue) self.db.put(raw_proxy) self.log.info('validProxy_a: %s validation pass' % raw_proxy) @@ -58,6 +59,7 @@ def validProxy(self): self.log.debug('validProxy_a: %s validation fail' % raw_proxy) self.db.changeTable(self.raw_proxy_queue) raw_proxy = self.db.pop() + remaining_proxies = self.db.getAll() self.log.info('%s validProxy_a complete' % time.ctime()) From 26225f7b8e8c06c9ab77a8b66b13aeef38b1ad81 Mon Sep 17 00:00:00 2001 From: Ye Date: Sat, 2 Sep 2017 22:44:05 -0700 Subject: [PATCH 024/399] =?UTF-8?q?=E5=87=8F=E5=B0=91=E9=87=8D=E5=A4=8D?= =?UTF-8?q?=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyRefreshSchedule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index bfca1a68f..a21db88ba 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -48,7 +48,7 @@ def validProxy(self): self.db.changeTable(self.raw_proxy_queue) raw_proxy = self.db.pop() self.log.info('%s start validProxy_a' % time.ctime()) - # 计算剩余代理,用来 + # 计算剩余代理,用来减少重复计算 remaining_proxies = self.db.getAll() while raw_proxy: if (raw_proxy not in remaining_proxies) and validUsefulProxy(raw_proxy): From 626f9117caa149f33d70c8eb0175fa8f8d9213d6 Mon Sep 17 00:00:00 2001 From: gladmo Date: Mon, 4 Sep 2017 19:21:52 +0800 Subject: [PATCH 025/399] build docker images with Dockerfile git clone git@github.com:jhao104/proxy_pool.git cd proxy_pool docker build -t proxy:latest -f Dockerfile . docker run -p 5000:5000 -d proxy:latest # Wait a few minutes curl localhost:5000/get/ # result: xxx.xxx.xxx.xxx:xxxx curl localhost:5000/get_all/ --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index e32239e32..103981c09 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,23 @@ pip install -r requirements.txt 你也可以分别运行他们,依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可 ``` +docker: +``` +git clone git@github.com:jhao104/proxy_pool.git + +cd proxy_pool + +docker build -t proxy:latest -f Dockerfile . + +docker run -p 5000:5000 -d proxy:latest + +# Wait a few minutes +curl localhost:5000/get/ +# result: xxx.xxx.xxx.xxx:xxxx + +curl localhost:5000/get_all/ +``` + ### 5、使用   定时任务启动后,会通过代理获取方法fetch所有代理放入数据库并验证。此后默认每20分钟会重复执行一次。定时任务启动大概一两分钟后,便可在[SSDB](https://github.com/jhao104/SSDBAdmin)中看到刷新出来的可用的代理: From 175a7732a4b1a548a7c0d7bb4cade43ef4cf89c1 Mon Sep 17 00:00:00 2001 From: Ning Kang Date: Mon, 18 Sep 2017 15:49:27 +1000 Subject: [PATCH 026/399] add host ip&port config --- Config.ini | 4 ++++ DB/RedisClient.py | 2 +- Util/GetConfig.py | 9 +++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Config.ini b/Config.ini index 81d6b6a1c..7aa5fa6da 100644 --- a/Config.ini +++ b/Config.ini @@ -13,3 +13,7 @@ freeProxySecond = 1 freeProxyThird = 1 freeProxyFourth = 1 freeProxyFifth = 1 + +[HOST] +ip = 0.0.0.0 +port = 5010 \ No newline at end of file diff --git a/DB/RedisClient.py b/DB/RedisClient.py index 20e4bb4bb..12b692f11 100644 --- a/DB/RedisClient.py +++ b/DB/RedisClient.py @@ -25,7 +25,7 @@ def __init__(self, name, host, port): :return: """ self.name = name - self.__conn = redis.Redis(host=host, port=port, db=0) + self.__conn = redis.Redis(host=host, port=port, db=0,password='b840fc02d524045429941cc15f59e41cb7be6c52Kn@#9873#$%') def get(self): """ diff --git a/Util/GetConfig.py b/Util/GetConfig.py index 8d9dc09ab..d7a734658 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -49,6 +49,13 @@ def db_port(self): def proxy_getter_functions(self): return self.config_file.options('ProxyGetter') + @LazyProperty + def host_ip(self): + return self.config_file.get('HOST','ip') + + @LazyProperty + def host_port(self): + return self.config_file.get('HOST', 'port') if __name__ == '__main__': gg = GetConfig() @@ -57,3 +64,5 @@ def proxy_getter_functions(self): print(gg.db_host) print(gg.db_port) print(gg.proxy_getter_functions) + print(gg.host_ip) + print(gg.host_port) From 8bd3b7fde44b1aaf12928f6b773b145803b45201 Mon Sep 17 00:00:00 2001 From: kangnwh Date: Mon, 18 Sep 2017 15:56:49 +1000 Subject: [PATCH 027/399] Update RedisClient.py remove password --- DB/RedisClient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DB/RedisClient.py b/DB/RedisClient.py index 12b692f11..d4c75a20f 100644 --- a/DB/RedisClient.py +++ b/DB/RedisClient.py @@ -25,7 +25,7 @@ def __init__(self, name, host, port): :return: """ self.name = name - self.__conn = redis.Redis(host=host, port=port, db=0,password='b840fc02d524045429941cc15f59e41cb7be6c52Kn@#9873#$%') + self.__conn = redis.Redis(host=host, port=port, db=0,password='') def get(self): """ From 6ff99409cb81b072a56f88dc2d25a67dba59a451 Mon Sep 17 00:00:00 2001 From: Ning Kang Date: Mon, 18 Sep 2017 16:06:19 +1000 Subject: [PATCH 028/399] add host ip&port config --- Api/ProxyApi.py | 4 +++- Run/main.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index b29cc9a16..d150353b0 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -16,6 +16,7 @@ import sys from flask import Flask, jsonify, request +from Util.GetConfig import GetConfig sys.path.append('../') @@ -71,7 +72,8 @@ def get_status(): def run(): - app.run(host='0.0.0.0', port=5000) + config = GetConfig() + app.run(host=config.host_ip, port=config.host_port) if __name__ == '__main__': run() diff --git a/Run/main.py b/Run/main.py index 2526e3e07..fabd24967 100644 --- a/Run/main.py +++ b/Run/main.py @@ -14,6 +14,7 @@ import sys from multiprocessing import Process +from Util.GetConfig import GetConfig sys.path.append('../') @@ -22,7 +23,7 @@ from Schedule.ProxyRefreshSchedule import run as RefreshRun -def run(): +def run(host,port): p_list = list() p1 = Process(target=ProxyApiRun, name='ProxyApiRun') p_list.append(p1) @@ -30,10 +31,11 @@ def run(): p_list.append(p2) p3 = Process(target=RefreshRun, name='RefreshRun') p_list.append(p3) + for p in p_list: p.start() for p in p_list: p.join() if __name__ == '__main__': - run() + run() From 977d080d0f93f61a8afd60445b6a7aca3b7b5b52 Mon Sep 17 00:00:00 2001 From: Ning Kang Date: Mon, 18 Sep 2017 16:07:20 +1000 Subject: [PATCH 029/399] remove password option for redis --- DB/RedisClient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DB/RedisClient.py b/DB/RedisClient.py index d4c75a20f..20e4bb4bb 100644 --- a/DB/RedisClient.py +++ b/DB/RedisClient.py @@ -25,7 +25,7 @@ def __init__(self, name, host, port): :return: """ self.name = name - self.__conn = redis.Redis(host=host, port=port, db=0,password='') + self.__conn = redis.Redis(host=host, port=port, db=0) def get(self): """ From bf163f45d1e7a28db34396b20209778668103f0a Mon Sep 17 00:00:00 2001 From: Ning Kang Date: Mon, 18 Sep 2017 16:09:56 +1000 Subject: [PATCH 030/399] remove password option for redis --- Run/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Run/main.py b/Run/main.py index fabd24967..567035f2e 100644 --- a/Run/main.py +++ b/Run/main.py @@ -14,14 +14,14 @@ import sys from multiprocessing import Process -from Util.GetConfig import GetConfig + sys.path.append('../') from Api.ProxyApi import run as ProxyApiRun from Schedule.ProxyValidSchedule import run as ValidRun from Schedule.ProxyRefreshSchedule import run as RefreshRun - +from Util.GetConfig import GetConfig def run(host,port): p_list = list() From bc5d05796961ab0f6dddbc814f1201623dd9d130 Mon Sep 17 00:00:00 2001 From: kangnwh Date: Mon, 18 Sep 2017 16:19:41 +1000 Subject: [PATCH 031/399] Update main.py fix run function --- Run/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Run/main.py b/Run/main.py index 567035f2e..a4093289a 100644 --- a/Run/main.py +++ b/Run/main.py @@ -23,7 +23,7 @@ from Schedule.ProxyRefreshSchedule import run as RefreshRun from Util.GetConfig import GetConfig -def run(host,port): +def run(): p_list = list() p1 = Process(target=ProxyApiRun, name='ProxyApiRun') p_list.append(p1) From c74ec26cf968218c7f846498132e083b74433800 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Mon, 18 Sep 2017 15:51:23 +0800 Subject: [PATCH 032/399] =?UTF-8?q?[update]=E6=AF=8F=E6=AC=A1=E4=BC=91?= =?UTF-8?q?=E7=9C=A05=E5=88=86=E9=92=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyValidSchedule.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index e82661bc5..b39aadf59 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -13,6 +13,7 @@ __author__ = 'JHao' import sys +from time import sleep sys.path.append('../') @@ -50,7 +51,8 @@ def __validProxy(self): if value and int(value) < -5: # 计数器小于-5删除该代理 self.db.delete(each_proxy) - self.log.info('validProxy_a running normal') + self.log.info('validProxy_a running normal') + sleep(60 * 5) def main(self): self.__validProxy() From 78ab6ef362e7742cb3807949c6f1fed23007942c Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Mon, 18 Sep 2017 16:18:08 +0800 Subject: [PATCH 033/399] =?UTF-8?q?[update]=20=E4=BF=AE=E6=94=B9ssdb=20mod?= =?UTF-8?q?ule=20repo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Api/ProxyApi.py | 2 +- DB/SsdbClient.py | 25 ++++++++++++++----------- Schedule/ProxyValidSchedule.py | 2 +- Util/utilFunction.py | 2 ++ requirements.txt | 1 - 5 files changed, 18 insertions(+), 14 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index b29cc9a16..a7c8a6ffc 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -54,7 +54,7 @@ def refresh(): @app.route('/get_all/') def getAll(): proxies = ProxyManager().getAll() - return jsonify(list(proxies)) + return jsonify([proxy.decode('utf8') for proxy in proxies]) @app.route('/delete/', methods=['GET']) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index ae6b6f76d..b99ba18e0 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -14,8 +14,10 @@ """ __author__ = 'JHao' -from ssdb.connection import BlockingConnectionPool -from ssdb import SSDB +from Util import EnvUtil + +from redis.connection import BlockingConnectionPool +from redis import Redis import random import json @@ -39,7 +41,7 @@ def __init__(self, name, host, port): :return: """ self.name = name - self.__conn = SSDB(connection_pool=BlockingConnectionPool(host=host, port=port)) + self.__conn = Redis(connection_pool=BlockingConnectionPool(host=host, port=port)) def get(self): """ @@ -48,8 +50,9 @@ def get(self): 从useful_proxy_queue随机获取一个可用代理, 使用前需要调用changeTable("useful_proxy_queue") :return: """ - values = self.__conn.hgetall(name=self.name) - return random.choice(values.keys()) if values else None + values = self.__conn.hkeys(name=self.name) + keys = list(values) if EnvUtil.PY3 else values + return random.choice(keys) if values else None def put(self, key): """ @@ -59,9 +62,8 @@ def put(self, key): :param key: :return: """ - key = json.dump(key, ensure_ascii=False).encode('utf-8') if isinstance(key, (dict, list)) else key - return self.__conn.hincr(self.name, key, 1) - # return self.__conn.hset(self.name, value, None) + key = json.dump(key, ensure_ascii=False) if isinstance(key, (dict, list)) else key + return self.__conn.hincrby(self.name, key, 1) def getvalue(self, key): value = self.__conn.hget(self.name, key) @@ -88,17 +90,18 @@ def delete(self, key): self.__conn.hdel(self.name, key) def inckey(self, key, value): - self.__conn.hincr(self.name, key, value) + self.__conn.hincrby(self.name, key, value) def getAll(self): - return self.__conn.hgetall(self.name).keys() + keys = self.__conn.hkeys(self.name) + return list(keys) if EnvUtil.PY3 else keys def get_status(self): """ Return the number of elements in hash ``name`` :return: """ - return self.__conn.hsize(self.name) + return self.__conn.hlen(self.name) def changeTable(self, name): self.name = name diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index b39aadf59..0ee4d669a 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -52,7 +52,7 @@ def __validProxy(self): # 计数器小于-5删除该代理 self.db.delete(each_proxy) self.log.info('validProxy_a running normal') - sleep(60 * 5) + sleep(60 * 1) def main(self): self.__validProxy() diff --git a/Util/utilFunction.py b/Util/utilFunction.py index eef32c098..66360e8f3 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -74,6 +74,8 @@ def validUsefulProxy(proxy): :param proxy: :return: """ + if isinstance(proxy, bytes): + proxy = proxy.decode('utf8') proxies = {"https": "https://{proxy}".format(proxy=proxy)} try: # 超过40秒的代理就不要了 diff --git a/requirements.txt b/requirements.txt index d8d62887b..698cc8197 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ Flask==0.11.1 requests==2.11.0 lxml==3.7.1 -ssdb==0.0.3 pymongo==3.2.2 redis==2.10.5 From 0c7b1493181d709d3bb28a71c65bc2346f75ab75 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Mon, 18 Sep 2017 16:26:54 +0800 Subject: [PATCH 034/399] [add] EnvUtil --- Util/EnvUtil.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 Util/EnvUtil.py diff --git a/Util/EnvUtil.py b/Util/EnvUtil.py new file mode 100644 index 000000000..b9df83c55 --- /dev/null +++ b/Util/EnvUtil.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: EnvUtil + Description : 环境相关 + Author : J_hao + date: 2017/9/18 +------------------------------------------------- + Change Activity: + 2017/9/18: 区分Python版本 +------------------------------------------------- +""" +__author__ = 'J_hao' + +import sys + +PY3 = sys.version_info >= (3,) \ No newline at end of file From d13f41ffbd2c7d67a0b8a156b733fb619f0aa79e Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Thu, 21 Sep 2017 17:26:10 +0800 Subject: [PATCH 035/399] =?UTF-8?q?[update]=20LogHandler=E8=BE=93=E5=87=BA?= =?UTF-8?q?=E5=8F=AF=E9=80=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Util/LogHandler.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Util/LogHandler.py b/Util/LogHandler.py index 2ce7b9d47..6e7341c1b 100644 --- a/Util/LogHandler.py +++ b/Util/LogHandler.py @@ -2,12 +2,13 @@ """ ------------------------------------------------- File Name: LogHandler.py - Description : + Description : 日志操作模块 Author : JHao date: 2017/3/6 ------------------------------------------------- Change Activity: 2017/3/6: log handler + 2017/9/21: 屏幕输出/文件输出 可选(默认屏幕和文件均输出) ------------------------------------------------- """ __author__ = 'JHao' @@ -38,12 +39,14 @@ class LogHandler(logging.Logger): LogHandler """ - def __init__(self, name, level=DEBUG): + def __init__(self, name, level=DEBUG, stream=True, file=True): self.name = name self.level = level logging.Logger.__init__(self, self.name, level=level) - self.__setFileHandler__() - self.__setStreamHandler__() + if stream: + self.__setStreamHandler__() + if file: + self.__setFileHandler__() def __setFileHandler__(self, level=None): """ From c590e64e47cedf82cfa4f194545d30bbadfe5cad Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Thu, 21 Sep 2017 17:52:31 +0800 Subject: [PATCH 036/399] =?UTF-8?q?[update]=20=E4=BF=AE=E6=94=B9=E9=AA=8C?= =?UTF-8?q?=E8=AF=81=E8=AE=A1=E6=95=B0=EF=BC=8C=E6=94=BE=E6=9D=BE=E9=AA=8C?= =?UTF-8?q?=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 1 + Manager/ProxyManager.py | 3 ++- Run/main.py | 2 +- Schedule/ProxyRefreshSchedule.py | 16 ++++++++++------ Schedule/ProxyValidSchedule.py | 23 +++++++++++++---------- Util/utilFunction.py | 6 +++--- 6 files changed, 30 insertions(+), 21 deletions(-) diff --git a/Config.ini b/Config.ini index 7aa5fa6da..c6c99e8f4 100644 --- a/Config.ini +++ b/Config.ini @@ -15,5 +15,6 @@ freeProxyFourth = 1 freeProxyFifth = 1 [HOST] +; API接口配置 http://127.0.0.1:5051 ip = 0.0.0.0 port = 5010 \ No newline at end of file diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index 6edb98c64..c6f9b7c17 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -40,7 +40,7 @@ def refresh(self): proxy_set = set() # fetch raw proxy for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): - if proxy.strip(): + if proxy: self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) proxy_set.add(proxy.strip()) @@ -76,6 +76,7 @@ def getAll(self): return self.db.getAll() def get_status(self): + # TODO rename get_count.. self.db.changeTable(self.raw_proxy_queue) total_raw_proxy = self.db.get_status() self.db.changeTable(self.useful_proxy_queue) diff --git a/Run/main.py b/Run/main.py index a4093289a..7e31566dc 100644 --- a/Run/main.py +++ b/Run/main.py @@ -21,7 +21,7 @@ from Api.ProxyApi import run as ProxyApiRun from Schedule.ProxyValidSchedule import run as ValidRun from Schedule.ProxyRefreshSchedule import run as RefreshRun -from Util.GetConfig import GetConfig + def run(): p_list = list() diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index a21db88ba..785cc6434 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -47,20 +47,24 @@ def validProxy(self): """ self.db.changeTable(self.raw_proxy_queue) raw_proxy = self.db.pop() - self.log.info('%s start validProxy_a' % time.ctime()) + self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime()) # 计算剩余代理,用来减少重复计算 remaining_proxies = self.db.getAll() while raw_proxy: + if isinstance(raw_proxy, bytes): + # 兼容Py3 + raw_proxy = raw_proxy.decode('utf8') + if (raw_proxy not in remaining_proxies) and validUsefulProxy(raw_proxy): self.db.changeTable(self.useful_proxy_queue) self.db.put(raw_proxy) - self.log.info('validProxy_a: %s validation pass' % raw_proxy) + self.log.info('ProxyRefreshSchedule: %s validation pass' % raw_proxy) else: - self.log.debug('validProxy_a: %s validation fail' % raw_proxy) + self.log.info('ProxyRefreshSchedule: %s validation fail' % raw_proxy) self.db.changeTable(self.raw_proxy_queue) raw_proxy = self.db.pop() remaining_proxies = self.db.getAll() - self.log.info('%s validProxy_a complete' % time.ctime()) + self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime()) def refreshPool(): @@ -88,9 +92,9 @@ def main(process_num=30): def run(): - # main() + main() sched = BlockingScheduler() - sched.add_job(main, 'interval', minutes=5) + sched.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次 sched.start() diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 0ee4d669a..947d8e5cd 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -36,22 +36,25 @@ def __validProxy(self): self.db.changeTable(self.useful_proxy_queue) for each_proxy in self.db.getAll(): if isinstance(each_proxy, bytes): + # 兼容PY3 each_proxy = each_proxy.decode('utf-8') + value = self.db.getvalue(each_proxy) if validUsefulProxy(each_proxy): # 成功计数器加1 - self.db.inckey(each_proxy, 1) - self.log.debug('validProxy_b: {} validation pass'.format(each_proxy)) + if value and int(value) < 1: + self.db.inckey(each_proxy, 1) + self.log.info('ProxyValidSchedule: {} validation pass'.format(each_proxy)) else: # 失败计数器减一 - self.db.inckey(each_proxy, -1) - # self.db.delete(each_proxy) - self.log.info('validProxy_b: {} validation fail'.format(each_proxy)) - value = self.db.getvalue(each_proxy) - if value and int(value) < -5: - # 计数器小于-5删除该代理 - self.db.delete(each_proxy) - self.log.info('validProxy_a running normal') + if value and int(value) < -5: + # 计数器小于-5删除该代理 + self.db.delete(each_proxy) + else: + self.db.inckey(each_proxy, -1) + self.log.info('ProxyValidSchedule: {} validation fail'.format(each_proxy)) + + self.log.info('ProxyValidSchedule running normal') sleep(60 * 1) def main(self): diff --git a/Util/utilFunction.py b/Util/utilFunction.py index 66360e8f3..a09ad12bb 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -17,7 +17,7 @@ from Util.LogHandler import LogHandler from Util.WebRequest import WebRequest -logger = LogHandler(__name__) +logger = LogHandler(__name__, stream=False) # noinspection PyPep8Naming @@ -81,8 +81,8 @@ def validUsefulProxy(proxy): # 超过40秒的代理就不要了 r = requests.get('https://www.baidu.com', proxies=proxies, timeout=40, verify=False) if r.status_code == 200: - logger.debug('%s is ok' % proxy) + logger.info('%s is ok' % proxy) return True except Exception as e: - logger.info(e) + logger.debug(e) return False From 88e8fab0d3f9be8dab85f818f020b14c327c2857 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Thu, 21 Sep 2017 18:03:37 +0800 Subject: [PATCH 037/399] [update] Readme --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 103981c09..2ce9632bc 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ * 代理IP从何而来?   刚自学爬虫的时候没有代理IP就去西刺、快代理之类有免费代理的网站去爬,还是有个别代理能用。当然,如果你有更好的代理接口也可以自己接入。 -   +   免费代理的采集也很简单,无非就是:访问页面页面 —> 正则/xpath提取 —> 保存 * 如何保证代理质量? @@ -155,10 +155,10 @@ curl localhost:5000/get_all/ import requests def get_proxy(): - return requests.get("http://127.0.0.1:5000/get/").content + return requests.get("http://127.0.0.1:5010/get/").content def delete_proxy(proxy): - requests.get("http://127.0.0.1:5000/delete/?proxy={}".format(proxy)) + requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) # your spider code @@ -169,7 +169,7 @@ def spider(): ``` -  测试地址:http://123.207.35.36:5000 单机勿压测。谢谢 +  测试地址:http://123.207.35.36:5010 单机勿压测。谢谢 ### 6、最后   时间仓促,功能和代码都比较简陋,以后有时间再改进。喜欢的在github上给个star。感谢! From 0add262f4e8edea0704814933cf100555a281194 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Fri, 22 Sep 2017 15:54:31 +0800 Subject: [PATCH 038/399] =?UTF-8?q?[update]=20=E5=81=9C=E7=94=A8Redis=20ty?= =?UTF-8?q?pe=20SsdbClient=E6=94=AF=E6=8C=81redis?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Api/ProxyApi.py | 9 ++-- Config.ini | 2 +- DB/DbClient.py | 43 +++++++++++--------- DB/RedisClient.py | 10 ++++- DB/SsdbClient.py | 70 ++++++++++++++++---------------- Manager/ProxyManager.py | 31 ++++++++++---- Schedule/ProxyRefreshSchedule.py | 4 +- Schedule/ProxyValidSchedule.py | 6 +-- Util/utilFunction.py | 4 +- 9 files changed, 100 insertions(+), 79 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 57918e239..12e7d54df 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -27,9 +27,10 @@ api_list = { 'get': u'get an usable proxy', - 'refresh': u'refresh proxy pool', + # 'refresh': u'refresh proxy pool', 'get_all': u'get all proxy from proxy pool', 'delete?proxy=127.0.0.1:8080': u'delete an unable proxy', + 'get_status': u'proxy statistics' } @@ -55,7 +56,7 @@ def refresh(): @app.route('/get_all/') def getAll(): proxies = ProxyManager().getAll() - return jsonify([proxy.decode('utf8') for proxy in proxies]) + return jsonify(proxies) @app.route('/delete/', methods=['GET']) @@ -66,8 +67,8 @@ def delete(): @app.route('/get_status/') -def get_status(): - status = ProxyManager().get_status() +def getStatus(): + status = ProxyManager().getNumber() return jsonify(status) diff --git a/Config.ini b/Config.ini index c6c99e8f4..f430a94b7 100644 --- a/Config.ini +++ b/Config.ini @@ -1,6 +1,6 @@ [DB] ;Configure the database information -;type: SSDB/REDIS +;type: SSDB/REDIS if use redis, only modify the host port,the type should be SSDB type = SSDB host = localhost port = 8888 diff --git a/DB/DbClient.py b/DB/DbClient.py index e1eef8bdd..e730cc741 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -2,7 +2,7 @@ # !/usr/bin/env python """ ------------------------------------------------- - File Name: DbClient.py + File Name: DbClient.py Description : DB工厂类 Author : JHao date: 2016/12/2 @@ -31,13 +31,16 @@ class DbClient(object): useful_proxy_queue: 存放检验后的代理; 抽象方法定义: - get: 随机返回一个代理; - put: 放回一个代理; - getvalue: 返回代理属性(一个计数器); - inckey: 修改代理属性计数器的值; - delete: 删除指定代理; - getAll: 返回所有代理; - changeTable: 切换 table or collection or hash; + get(proxy): 返回proxy的信息; + put(proxy): 存入一个代理; + pop(): 弹出一个代理 + exists(proxy): 判断代理是否存在 + getNumber(raw_proxy): 返回代理总数(一个计数器); + update(proxy, num): 修改代理属性计数器的值; + delete(proxy): 删除指定代理; + getAll(): 返回所有代理; + changeTable(name): 切换 table or collection or hash; + 所有方法需要相应类去具体实现: SSDB:SsdbClient.py @@ -72,32 +75,32 @@ def __initDbClient(self): host=self.config.db_host, port=self.config.db_port) - def get(self, **kwargs): - return self.client.get(**kwargs) + def get(self, key, **kwargs): + return self.client.get(key, **kwargs) def put(self, key, **kwargs): return self.client.put(key, **kwargs) - def getvalue(self, key, **kwargs): - return self.client.getvalue(key, **kwargs) - - def pop(self, **kwargs): - return self.client.pop(**kwargs) - - def inckey(self, key, value, **kwargs): - return self.client.inckey(key, value, **kwargs) + def update(self, key, value, **kwargs): + return self.client.update(key, value, **kwargs) def delete(self, key, **kwargs): return self.client.delete(key, **kwargs) + def exists(self, key, **kwargs): + return self.client.exists(key, **kwargs) + + def pop(self, **kwargs): + return self.client.pop(**kwargs) + def getAll(self): return self.client.getAll() def changeTable(self, name): self.client.changeTable(name) - def get_status(self): - return self.client.get_status() + def getNumber(self): + return self.client.getNumber() if __name__ == "__main__": diff --git a/DB/RedisClient.py b/DB/RedisClient.py index 20e4bb4bb..7d9af4386 100644 --- a/DB/RedisClient.py +++ b/DB/RedisClient.py @@ -6,11 +6,17 @@ 2017/4/17 修改pop ''' +# ############################ +# 已弃用, +# SsdbClient.py 支持redis +############################## + import json import random import redis import sys + class RedisClient(object): """ Reids client @@ -41,7 +47,7 @@ def get(self): return rkey.decode('utf-8') else: return rkey - # return self.__conn.srandmember(name=self.name) + # return self.__conn.srandmember(name=self.name) def put(self, key): """ @@ -87,7 +93,7 @@ def getAll(self): return [key.decode('utf-8') for key in self.__conn.hgetall(self.name).keys()] else: return self.__conn.hgetall(self.name).keys() - # return self.__conn.smembers(self.name) + # return self.__conn.smembers(self.name) def get_status(self): return self.__conn.hlen(self.name) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index b99ba18e0..6f5fb2dd2 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -9,7 +9,7 @@ ------------------------------------------------- Change Activity: 2016/12/2: - 2017/04/26: 添加get_status方法获取hash长度 + 2017/09/22: PY3中 redis-py返回的数据是bytes型 ------------------------------------------------- """ __author__ = 'JHao' @@ -19,7 +19,6 @@ from redis.connection import BlockingConnectionPool from redis import Redis import random -import json class SsdbClient(object): @@ -27,8 +26,8 @@ class SsdbClient(object): SSDB client SSDB中代理存放的容器为hash: - 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为None,以后扩展可能会加入代理属性; - 验证后供flask使用的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为None,以后扩展可能会加入代理属性; + 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为为None,以后扩展可能会加入代理属性; + 验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1; """ @@ -43,43 +42,28 @@ def __init__(self, name, host, port): self.name = name self.__conn = Redis(connection_pool=BlockingConnectionPool(host=host, port=port)) - def get(self): + def get(self, proxy): """ get an item - - 从useful_proxy_queue随机获取一个可用代理, 使用前需要调用changeTable("useful_proxy_queue") + 从hash中获取对应的proxy, 使用前需要调用changeTable() + :param proxy: :return: """ - values = self.__conn.hkeys(name=self.name) - keys = list(values) if EnvUtil.PY3 else values - return random.choice(keys) if values else None + data = self.__conn.hget(name=self.name, key=proxy) + if data: + return data.decode('utf-8') if EnvUtil.PY3 else data + else: + return None - def put(self, key): + def put(self, proxy, num=1): """ - put an item - 将代理放入hash, 使用changeTable指定hash name - :param key: + :param proxy: + :param num: :return: """ - key = json.dump(key, ensure_ascii=False) if isinstance(key, (dict, list)) else key - return self.__conn.hincrby(self.name, key, 1) - - def getvalue(self, key): - value = self.__conn.hget(self.name, key) - return value if value else None - - def pop(self): - """ - pop an item - - 弹出一个代理, 使用changeTable指定hash name - :return: - """ - key = self.get() - if key: - self.__conn.hdel(self.name, key) - return key + data = self.__conn.hincrby(self.name, proxy, num) + return data def delete(self, key): """ @@ -89,14 +73,28 @@ def delete(self, key): """ self.__conn.hdel(self.name, key) - def inckey(self, key, value): + def update(self, key, value): self.__conn.hincrby(self.name, key, value) + def pop(self): + proxies = self.__conn.hkeys(self.name) + if proxies: + proxy = random.choice(proxies) + self.delete(proxy) + return proxy + return None + + def exists(self, key): + return self.__conn.hexists(self.name, key) + def getAll(self): - keys = self.__conn.hkeys(self.name) - return list(keys) if EnvUtil.PY3 else keys + item_dict = self.__conn.hgetall(self.name) + if EnvUtil.PY3: + return {key.decode('utf8'): value.decode('utf8') for key, value in item_dict.items()} + else: + return item_dict - def get_status(self): + def getNumber(self): """ Return the number of elements in hash ``name`` :return: diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index c6f9b7c17..90ca120c3 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -13,6 +13,9 @@ """ __author__ = 'JHao' +import random + +from Util import EnvUtil from DB.DbClient import DbClient from Util.GetConfig import GetConfig from Util.LogHandler import LogHandler @@ -45,8 +48,11 @@ def refresh(self): proxy_set.add(proxy.strip()) # store raw proxy - self.db.changeTable(self.raw_proxy_queue) for proxy in proxy_set: + self.db.changeTable(self.useful_proxy_queue) + if self.db.exists(proxy): + continue + self.db.changeTable(self.raw_proxy_queue) self.db.put(proxy) def get(self): @@ -55,7 +61,13 @@ def get(self): :return: """ self.db.changeTable(self.useful_proxy_queue) - return self.db.get() + item_dict = self.db.getAll() + if item_dict: + if EnvUtil.PY3: + return random.choice(list(item_dict.keys())) + else: + return random.choice(item_dict.keys()) + return None # return self.db.pop() def delete(self, proxy): @@ -69,21 +81,22 @@ def delete(self, proxy): def getAll(self): """ - get all proxy from pool + get all proxy from pool as list :return: """ self.db.changeTable(self.useful_proxy_queue) - return self.db.getAll() + item_dict = self.db.getAll() + if EnvUtil.PY3: + return list(item_dict.keys()) if item_dict else list() + return item_dict.keys() if item_dict else list() - def get_status(self): - # TODO rename get_count.. + def getNumber(self): self.db.changeTable(self.raw_proxy_queue) - total_raw_proxy = self.db.get_status() + total_raw_proxy = self.db.getNumber() self.db.changeTable(self.useful_proxy_queue) - total_useful_queue = self.db.get_status() + total_useful_queue = self.db.getNumber() return {'raw_proxy': total_raw_proxy, 'useful_proxy': total_useful_queue} if __name__ == '__main__': pp = ProxyManager() pp.refresh() - print(pp.get_status()) diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index 785cc6434..5795e17a8 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -49,7 +49,7 @@ def validProxy(self): raw_proxy = self.db.pop() self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime()) # 计算剩余代理,用来减少重复计算 - remaining_proxies = self.db.getAll() + remaining_proxies = self.getAll() while raw_proxy: if isinstance(raw_proxy, bytes): # 兼容Py3 @@ -63,7 +63,7 @@ def validProxy(self): self.log.info('ProxyRefreshSchedule: %s validation fail' % raw_proxy) self.db.changeTable(self.raw_proxy_queue) raw_proxy = self.db.pop() - remaining_proxies = self.db.getAll() + remaining_proxies = self.getAll() self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime()) diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 947d8e5cd..f6fa58893 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -39,11 +39,11 @@ def __validProxy(self): # 兼容PY3 each_proxy = each_proxy.decode('utf-8') - value = self.db.getvalue(each_proxy) + value = self.db.get(each_proxy) if validUsefulProxy(each_proxy): # 成功计数器加1 if value and int(value) < 1: - self.db.inckey(each_proxy, 1) + self.db.update(each_proxy, 1) self.log.info('ProxyValidSchedule: {} validation pass'.format(each_proxy)) else: # 失败计数器减一 @@ -51,7 +51,7 @@ def __validProxy(self): # 计数器小于-5删除该代理 self.db.delete(each_proxy) else: - self.db.inckey(each_proxy, -1) + self.db.update(each_proxy, -1) self.log.info('ProxyValidSchedule: {} validation fail'.format(each_proxy)) self.log.info('ProxyValidSchedule running normal') diff --git a/Util/utilFunction.py b/Util/utilFunction.py index a09ad12bb..268bb67a5 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -78,8 +78,8 @@ def validUsefulProxy(proxy): proxy = proxy.decode('utf8') proxies = {"https": "https://{proxy}".format(proxy=proxy)} try: - # 超过40秒的代理就不要了 - r = requests.get('https://www.baidu.com', proxies=proxies, timeout=40, verify=False) + # 超过20秒的代理就不要了 + r = requests.get('https://www.baidu.com', proxies=proxies, timeout=20, verify=False) if r.status_code == 200: logger.info('%s is ok' % proxy) return True From f033b2d032850bd2dbd4acef6d409ad823942a71 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Fri, 22 Sep 2017 17:41:07 +0800 Subject: [PATCH 039/399] =?UTF-8?q?[update]=E6=9B=B4=E6=96=B0=E6=96=87?= =?UTF-8?q?=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 187 ++++++++++++++++------------------------------- doc/introduce.md | 173 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+), 123 deletions(-) create mode 100644 doc/introduce.md diff --git a/README.md b/README.md index 2ce9632bc..ad6eef0cf 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -爬虫代理IP池 +爬虫IP代理池 ======= [![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool) [![](https://img.shields.io/badge/Powered%20by-@j_hao104-green.svg)](http://www.spiderpy.cn/blog/) @@ -8,150 +8,81 @@ [![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool/graphs/contributors) [![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool) +##### [介绍文档](https://github.com/jhao104/proxy_pool/blob/master/doc/introduce.md) -> 在公司做分布式深网爬虫,搭建了一套稳定的代理池服务,为上千个爬虫提供有效的代理,保证各个爬虫拿到的都是对应网站有效的代理IP,从而保证爬虫快速稳定的运行,当然在公司做的东西不能开源出来。不过呢,闲暇时间手痒,所以就想利用一些免费的资源搞一个简单的代理池服务。 - +* 支持版本 ![](https://img.shields.io/badge/Python-2.x.svg) ![](https://img.shields.io/badge/Python-3.x.svg) -### 1、问题 +* 测试地址: http://123.207.35.36:5010 单机勿压测。谢谢 -* 代理IP从何而来? +### 下载安装 -  刚自学爬虫的时候没有代理IP就去西刺、快代理之类有免费代理的网站去爬,还是有个别代理能用。当然,如果你有更好的代理接口也可以自己接入。 +* 下载源码: -  免费代理的采集也很简单,无非就是:访问页面页面 —> 正则/xpath提取 —> 保存 - -* 如何保证代理质量? - -  可以肯定免费的代理IP大部分都是不能用的,不然别人为什么还提供付费的(不过事实是很多代理商的付费IP也不稳定,也有很多是不能用)。所以采集回来的代理IP不能直接使用,可以写检测程序不断的去用这些代理访问一个稳定的网站,看是否可以正常使用。这个过程可以使用多线程或异步的方式,因为检测代理是个很慢的过程。 - -* 采集回来的代理如何存储? - -  这里不得不推荐一个高性能支持多种数据结构的NoSQL数据库[SSDB](http://ssdb.io/docs/zh_cn/),用于替代Redis。支持队列、hash、set、k-v对,支持T级别数据。是做分布式爬虫很好中间存储工具。 - -* 如何让爬虫更简单的使用这些代理? - -  答案肯定是做成服务咯,python有这么多的web框架,随便拿一个来写个api供爬虫调用。这样有很多好处,比如:当爬虫发现代理不能使用可以主动通过api去delete代理IP,当爬虫发现代理池IP不够用时可以主动去refresh代理池。这样比检测程序更加靠谱。 - -### 2、代理池设计 - -  代理池由四部分组成: - -* ProxyGetter: - -  代理获取接口,目前有5个免费代理源,每调用一次就会抓取这个5个网站的最新代理放入DB,可自行添加额外的代理获取接口; - -* DB: - -  用于存放代理IP,现在暂时只支持SSDB。至于为什么选择SSDB,大家可以参考这篇[文章](https://www.sdk.cn/news/2684),个人觉得SSDB是个不错的Redis替代方案,如果你没有用过SSDB,安装起来也很简单,可以参考[这里](https://github.com/jhao104/memory-notes/blob/master/SSDB/SSDB%E5%AE%89%E8%A3%85%E9%85%8D%E7%BD%AE%E8%AE%B0%E5%BD%95.md); - -* Schedule: - -  计划任务用户定时去检测DB中的代理可用性,删除不可用的代理。同时也会主动通过ProxyGetter去获取最新代理放入DB; - -* ProxyApi: - -  代理池的外部接口,由于现在这么代理池功能比较简单,花两个小时看了下[Flask](http://flask.pocoo.org/),愉快的决定用Flask搞定。功能是给爬虫提供get/delete/refresh等接口,方便爬虫直接使用。 - -![设计](https://pic2.zhimg.com/v2-f2756da2986aa8a8cab1f9562a115b55_b.png) - -### 3、代码模块 - -  Python中高层次的数据结构,动态类型和动态绑定,使得它非常适合于快速应用开发,也适合于作为胶水语言连接已有的软件部件。用Python来搞这个代理IP池也很简单,代码分为6个模块: - -* Api: - -  api接口相关代码,目前api是由Flask实现,代码也非常简单。客户端请求传给Flask,Flask调用ProxyManager中的实现,包括`get/delete/refresh/get_all`; - -* DB: - -  数据库相关代码,目前数据库是采用SSDB。代码用工厂模式实现,方便日后扩展其他类型数据库; - -* Manager: - -  `get/delete/refresh/get_all`等接口的具体实现类,目前代理池只负责管理proxy,日后可能会有更多功能,比如代理和爬虫的绑定,代理和账号的绑定等等; - -* ProxyGetter: - -  代理获取的相关代码,目前抓取了[快代理](http://www.kuaidaili.com)、[代理66](http://www.66ip.cn/)、[有代理](http://www.youdaili.net/Daili/http/)、[西刺代理](http://api.xicidaili.com/free2016.txt)、[guobanjia](http://www.goubanjia.com/free/gngn/index.shtml)这个五个网站的免费代理,经测试这个5个网站每天更新的可用代理只有六七十个,当然也支持自己扩展代理接口; - -* Schedule: - -  定时任务相关代码,现在只是实现定时去刷新代码,并验证可用代理,采用多进程方式; - -* Util: - -  存放一些公共的模块方法或函数,包含`GetConfig`:读取配置文件config.ini的类,`ConfigParse`: 集成重写ConfigParser的类,使其对大小写敏感, `Singleton`:实现单例,`LazyProperty`:实现类属性惰性计算。等等; - -* 其他文件: - -  配置文件:Config.ini,数据库配置和代理获取接口配置,可以在GetFreeProxy中添加新的代理获取方法,并在Config.ini中注册即可使用; - -### 4、安装 - -下载代码: -``` +```shell git clone git@github.com:jhao104/proxy_pool.git 或者直接到https://github.com/jhao104/proxy_pool 下载zip文件 ``` -安装依赖: -``` +* 安装依赖: + +```shell pip install -r requirements.txt ``` -启动: +* 配置Config.ini: -``` -如果你的依赖已经安全完成并且具备运行条件,可以直接在Run下运行main.py -到Run目录下: ->>>python main.py - -如果运行成功你应该可以看到有4个main.py进程在 +```shell +# Config.ini 为项目配置文件 +# 配置DB +type = SSDB # 如果使用SSDB或redis数据库,均配置为SSDB +host = localhost # db host +port = 8888 # db port +name = proxy # 默认配置 +# 配置 ProxyGetter +freeProxyFirst = 1 # 这里是启动的抓取函数,可在ProxyGetter/getFreeProxy.py 扩展 +freeProxySecond = 1 +.... -你也可以分别运行他们,依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可 -``` +# 配置 HOST (api服务) +ip = 127.0.0.1 # 监听ip,0.0.0.0开启外网访问 +port = 5010 # 监听端口 +# 上面配置启动后,代理api地址为 http://127.0.0.1:5010 -docker: ``` -git clone git@github.com:jhao104/proxy_pool.git - -cd proxy_pool -docker build -t proxy:latest -f Dockerfile . +* 启动: -docker run -p 5000:5000 -d proxy:latest +```shell +# 如果你的依赖已经安全完成并且具备运行条件,可以直接在Run下运行main.py +# 到Run目录下: +>>>python main.py -# Wait a few minutes -curl localhost:5000/get/ -# result: xxx.xxx.xxx.xxx:xxxx +# 如果运行成功你应该看到有4个main.py进程 -curl localhost:5000/get_all/ +# 你也可以分别运行他们, +# 依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可. ``` -### 5、使用 -  定时任务启动后,会通过代理获取方法fetch所有代理放入数据库并验证。此后默认每20分钟会重复执行一次。定时任务启动大概一两分钟后,便可在[SSDB](https://github.com/jhao104/SSDBAdmin)中看到刷新出来的可用的代理: - -![useful_proxy](https://pic2.zhimg.com/v2-12f9b7eb72f60663212f317535a113d1_b.png) - -  启动ProxyApi.py后即可在浏览器中使用接口获取代理,一下是浏览器中的截图: - -  index页面: +### 使用 -![index](https://pic3.zhimg.com/v2-a867aa3db1d413fea8aeeb4c693f004a_b.png) - -  get: +启动过几分钟后就能看到抓取到的代理IP,你可以直接到数据库中查看,推荐一个[SSDB可视化工具](https://github.com/jhao104/SSDBAdmin),也可以通过api访问http://127.0.0.1:5010 查看。 -![get](https://pic1.zhimg.com/v2-f54b876b428893235533de20f2edbfe0_b.png) +* Api +| api | method | Description | arg| +| ----| ---- | ---- | ----| +| / | GET | api介绍 | None | +| /get | GET | 随机获取一个代理 | None| +| /get_all | GET | 获取所有代理 |None| +| /get_status | GET | 查看代理数量 |None| +| /delete | GET | 删除代理 |proxy=host:ip| -  get_all: +* 爬虫使用 -![get_all](https://pic3.zhimg.com/v2-5c79f8c07e04f9ef655b9bea406d0306_b.png) - +如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: -  爬虫中使用,如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: -``` +```python import requests def get_proxy(): @@ -162,14 +93,24 @@ def delete_proxy(proxy): # your spider code -def spider(): +def getHtml(): # .... - requests.get('https://www.example.com', proxies={"http": "http://{}".format(get_proxy())}) - # .... - + retry_count = 5 + proxy = get_proxy() + while retry_count > 0: + try: + html = requests.get('https://www.example.com', proxies={"http": "http://{}".format(proxy)}) + # 使用代理访问 + return html + except Exception: + retry_count -= 1 + # 出错5次, 删除代理池中代理 + delete_proxy(proxy) + return None ``` -  测试地址:http://123.207.35.36:5010 单机勿压测。谢谢 +### 问题反馈 + +任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,如果没有账号可以去 我的[博客](http://www.spiderpy.cn/blog/message)中留言。 -### 6、最后 -  时间仓促,功能和代码都比较简陋,以后有时间再改进。喜欢的在github上给个star。感谢! +你的反馈会让此项目变得更加完美。 \ No newline at end of file diff --git a/doc/introduce.md b/doc/introduce.md new file mode 100644 index 000000000..07c4f95ab --- /dev/null +++ b/doc/introduce.md @@ -0,0 +1,173 @@ + +代理池介绍 + +本项目通过爬虫方式持续抓取代理网站公布的免费代理IP,实时校验,维护部分可以使用的代理,并通过api的形式提供外部使用。 + +### 1、问题 + +构建一个代理IP池,可能有下面这些问题: + +* 代理IP从何而来? + +  许多刚接触爬虫的,都试过去西刺、快代理之类有免费代理的网站去抓些免费代理,还是有一些代理能用。 +当然,如果你有更好的代理接口也可以自己接入。 + +  免费代理的采集也很简单,无非就是:`访问页面`` —> `正则/xpath提取` —> `保存` + +* 如何保证代理质量? + +  可以肯定免费的代理IP大部分都是不能用的,不然别人还提供付费接口干嘛(不过事实上很多代理商的付费IP也不稳定,也有很多是不能用)。 +所以采集回来的代理IP不能直接使用,检测的办法也很简单:可以写个程序不断的用代理访问一个稳定的网站,看是否可以正常访问即可。 +这个过程可以使用多线/进程或异步的方式,因为检测代理是个很慢的过程。 + +* 采集回来的代理如何存储? + +  这里不得不推荐一个国人开发的高性能支持多种数据结构的NoSQL数据库[SSDB](http://ssdb.io/docs/zh_cn/),用于替代Redis。支持队列、hash、set、k-v对,支持T级别数据。是做分布式爬虫很好中间存储工具。 + +* 如何让爬虫更方便的用到这些代理? + +  答案肯定是做成服务咯,Python有这么多的web框架,随便拿一个来写个api供爬虫调用。这样代理和爬虫架构分离有很多好处, +比如:当爬虫完全不用考虑如何校验代理,如何保证拿到的代理可用,这些都由代理池来完成。这样只需要安静的码爬虫代码就行啦。 + +### 2、代理池设计 + +  代理池由四部分组成: + +* ProxyGetter: + +  代理获取接口,目前有5个免费代理源,每调用一次就会抓取这个5个网站的最新代理放入DB,支持自定义扩展额外的代理获取接口; + +* DB: + +  用于存放代理IP,目前支持SSDB和Redis(推荐SSDB)。至于为什么选择SSDB,大家可以参考这篇[文章](https://www.sdk.cn/news/2684),个人觉得SSDB是个不错的Redis替代方案,如果你没有用过SSDB,安装起来也很简单,可以参考[这里](https://github.com/jhao104/memory-notes/blob/master/SSDB/SSDB%E5%AE%89%E8%A3%85%E9%85%8D%E7%BD%AE%E8%AE%B0%E5%BD%95.md); + +* Schedule: + +  计划任务,定时去检测DB中的代理可用性,删除不可用的代理。同时也会主动通过ProxyGetter去获取最新代理放入DB; + +* ProxyApi: + +  代理池的外部接口,由[Flask](http://flask.pocoo.org/)实现,功能是给爬虫提供与代理池交互的接口。 + + +![设计](https://pic2.zhimg.com/v2-f2756da2986aa8a8cab1f9562a115b55_b.png) + +### 3、代码模块 + +  Python中高层次的数据结构,动态类型和动态绑定,使得它非常适合于快速应用开发,也适合于作为胶水语言连接已有的软件部件。用Python来搞这个代理IP池也很简单,代码分为6个模块: + +* Api: + +  api接口相关代码,目前api是由Flask实现,代码也非常简单。客户端请求传给Flask,Flask调用`ProxyManager`中的实现,包括`get/delete/refresh/get_all`; + +* DB: + +  数据库相关代码,目前数据库是支持SSDB/Redis。代码用工厂模式实现,方便日后扩展其他类型数据库; + +* Manager: + +  `get/delete/refresh/get_all`等接口的具体实现类,目前代理池只负责管理proxy,日后可能会有更多功能,比如代理和爬虫的绑定,代理和账号的绑定等等; + +* ProxyGetter: + +  代理获取的相关代码,目前抓取了[快代理](http://www.kuaidaili.com)、[代理66](http://www.66ip.cn/)、[有代理](http://www.youdaili.net/Daili/http/)、[西刺代理](http://api.xicidaili.com/free2016.txt)、[guobanjia](http://www.goubanjia.com/free/gngn/index.shtml)这个五个网站的免费代理,经测试这个5个网站每天更新的可用代理只有六七十个,当然也支持自己扩展代理接口; + +* Schedule: + +  定时任务相关代码,现在只是实现定时去刷新代理,并验证可用代理,采用多进程方式; + +* Util: + +  存放一些公共的模块方法或函数,包含`GetConfig`:读取配置文件config.ini的类,`ConfigParse`: 扩展ConfigParser的类,使其对大小写敏感, `Singleton`:实现单例,`LazyProperty`:实现类属性惰性计算。等等; + +* 其他文件: + +  配置文件:`Config.ini``,数据库配置和代理获取接口配置,可以在GetFreeProxy中添加新的代理获取方法,并在Config.ini中注册即可使用; + +### 4、安装 + +下载代码: +``` +git clone git@github.com:jhao104/proxy_pool.git + +或者直接到https://github.com/jhao104/proxy_pool 下载zip文件 +``` + +安装依赖: +``` +pip install -r requirements.txt +``` + +启动: + +``` +如果你的依赖已经安全完成并且具备运行条件,可以直接在Run下运行main.py +到Run目录下: +>>>python main.py + +如果运行成功你应该可以看到有4个main.py进程在 + + +你也可以分别运行他们,依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可 +``` + +docker: +``` +git clone git@github.com:jhao104/proxy_pool.git + +cd proxy_pool + +docker build -t proxy:latest -f Dockerfile . + +docker run -p 5000:5000 -d proxy:latest + +# Wait a few minutes +curl localhost:5000/get/ +# result: xxx.xxx.xxx.xxx:xxxx + +curl localhost:5000/get_all/ +``` + +### 5、使用 +  定时任务启动后,会通过GetFreeProxy中的方法抓取代理存入数据库并验证。此后默认每10分钟会重复执行一次。定时任务启动大概一两分钟后,便可在[SSDB](https://github.com/jhao104/SSDBAdmin)中看到刷新出来的可用的代理: + +![useful_proxy](https://pic2.zhimg.com/v2-12f9b7eb72f60663212f317535a113d1_b.png) + +  启动ProxyApi.py后即可在浏览器中使用接口获取代理,一下是浏览器中的截图: + +  index页面: + +![index](https://pic3.zhimg.com/v2-a867aa3db1d413fea8aeeb4c693f004a_b.png) + +  get: + +![get](https://pic1.zhimg.com/v2-f54b876b428893235533de20f2edbfe0_b.png) + +  get_all: + +![get_all](https://pic3.zhimg.com/v2-5c79f8c07e04f9ef655b9bea406d0306_b.png) + + +  爬虫中使用,如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: +``` +import requests + +def get_proxy(): + return requests.get("http://127.0.0.1:5010/get/").content + +def delete_proxy(proxy): + requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) + +# your spider code + +def spider(): + # .... + requests.get('https://www.example.com', proxies={"http": "http://{}".format(get_proxy())}) + # .... + +``` + +  测试地址:http://123.207.35.36:5010 单机勿压测。谢谢 + +### 6、最后 +  时间仓促,功能和代码都比较简陋,以后有时间再改进。喜欢的在github上给个star。感谢! From b9ccdfaada51b57cfb1bbd0c01d4258971bc8352 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Fri, 22 Sep 2017 18:01:31 +0800 Subject: [PATCH 040/399] =?UTF-8?q?[update]=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ad6eef0cf..a1b8c20a2 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ ##### [介绍文档](https://github.com/jhao104/proxy_pool/blob/master/doc/introduce.md) -* 支持版本 ![](https://img.shields.io/badge/Python-2.x.svg) ![](https://img.shields.io/badge/Python-3.x.svg) +* 支持版本 ![](https://img.shields.io/badge/Python-2.x-green.svg) ![](https://img.shields.io/badge/Python-3.x-blue.svg) * 测试地址: http://123.207.35.36:5010 单机勿压测。谢谢 @@ -67,9 +67,12 @@ port = 5010 # 监听端口 ### 使用 -启动过几分钟后就能看到抓取到的代理IP,你可以直接到数据库中查看,推荐一个[SSDB可视化工具](https://github.com/jhao104/SSDBAdmin),也可以通过api访问http://127.0.0.1:5010 查看。 +启动过几分钟后就能看到抓取到的代理IP,你可以直接到数据库中查看,推荐一个[SSDB可视化工具](https://github.com/jhao104/SSDBAdmin)。 + +也可以通过api访问http://127.0.0.1:5010 查看。 * Api + | api | method | Description | arg| | ----| ---- | ---- | ----| | / | GET | api介绍 | None | @@ -113,4 +116,18 @@ def getHtml(): 任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,如果没有账号可以去 我的[博客](http://www.spiderpy.cn/blog/message)中留言。 -你的反馈会让此项目变得更加完美。 \ No newline at end of file +你的反馈会让此项目变得更加完美。 + +### 贡献代码 + +本项目仅作为基本的通用的代理池架构,不接收特有功能(当然,不限于特别好的idea)。 + +本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,在确认后提交你的代码。 + +这里感谢以下contributor的无私奉献: + +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu) + + + + From e08d4b18e1eb1163cfca5c34a3b9853d58adf59c Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Fri, 22 Sep 2017 18:26:13 +0800 Subject: [PATCH 041/399] =?UTF-8?q?[update]=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a1b8c20a2..4ab608d31 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ ##### [介绍文档](https://github.com/jhao104/proxy_pool/blob/master/doc/introduce.md) -* 支持版本 ![](https://img.shields.io/badge/Python-2.x-green.svg) ![](https://img.shields.io/badge/Python-3.x-blue.svg) +* 支持版本: ![](https://img.shields.io/badge/Python-2.x-green.svg) ![](https://img.shields.io/badge/Python-3.x-blue.svg) -* 测试地址: http://123.207.35.36:5010 单机勿压测。谢谢 +* 测试地址: http://123.207.35.36:5010 (单机勿压。感谢) ### 下载安装 @@ -67,9 +67,9 @@ port = 5010 # 监听端口 ### 使用 -启动过几分钟后就能看到抓取到的代理IP,你可以直接到数据库中查看,推荐一个[SSDB可视化工具](https://github.com/jhao104/SSDBAdmin)。 +  启动过几分钟后就能看到抓取到的代理IP,你可以直接到数据库中查看,推荐一个[SSDB可视化工具](https://github.com/jhao104/SSDBAdmin)。 -也可以通过api访问http://127.0.0.1:5010 查看。 +  也可以通过api访问http://127.0.0.1:5010 查看。 * Api @@ -83,7 +83,7 @@ port = 5010 # 监听端口 * 爬虫使用 -如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: +  如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: ```python import requests @@ -112,19 +112,59 @@ def getHtml(): return None ``` +### 扩展代理 + +  项目默认包含几个免费的代理获取方法,但是免费的毕竟质量不好,所以如果直接运行可能拿到的代理质量不理想。所以,提供了代理获取的扩展方法。 + +  添加一个新的代理获取方法如下: + +* 1、首先在[GetFreeProxy](https://github.com/jhao104/proxy_pool/blob/b9ccdfaada51b57cfb1bbd0c01d4258971bc8352/ProxyGetter/getFreeProxy.py#L32)类中添加你的获取代理的静态方法, +该方法需要以生成器(yield)形式返回`host:ip`格式的代理,例如: + +```python + +class GetFreeProxy(object): + # .... + + # 你自己的方法 + @staticmethod + def freeProxyCustom(): # 命名不和已有重复即可 + + # 通过某网站或者某接口或某数据库获取代理 任意你喜欢的姿势都行 + # 假设你拿到了一个代理列表 + proxies = ["139.129.166.68:3128", "139.129.166.61:3128", ...] + for proxy in proxies: + yield proxy + # 确保每个proxy都是 host:ip正确的格式就行 +``` + +* 2、添加好方法后,修改Config.ini文件中的`[ProxyGetter]`项: + +  在`Config.ini`的`[ProxyGetter]`下添加自定义的方法的名字: + +```shell +[ProxyGetter] +;register the proxy getter function +freeProxyFirst = 0 # 如果要取消某个方法,将其删除或赋为0即可 +.... +freeProxyCustom = 1 # 确保名字和你添加方法名字一致 +```   + +  `ProxyRefreshSchedule`会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 + ### 问题反馈 -任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,如果没有账号可以去 我的[博客](http://www.spiderpy.cn/blog/message)中留言。 +  任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,如果没有账号可以去 我的[博客](http://www.spiderpy.cn/blog/message)中留言。 -你的反馈会让此项目变得更加完美。 +  你的反馈会让此项目变得更加完美。 ### 贡献代码 -本项目仅作为基本的通用的代理池架构,不接收特有功能(当然,不限于特别好的idea)。 +  本项目仅作为基本的通用的代理池架构,不接收特有功能(当然,不限于特别好的idea)。 -本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,在确认后提交你的代码。 +  本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,在确认后提交你的代码。 -这里感谢以下contributor的无私奉献: +  这里感谢以下contributor的无私奉献:   [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu) From 3375a7cfb673ef0b1255638c2f7f15e09d51af77 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Fri, 22 Sep 2017 18:29:05 +0800 Subject: [PATCH 042/399] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ab608d31..c714c91ef 100644 --- a/README.md +++ b/README.md @@ -143,12 +143,15 @@ class GetFreeProxy(object):   在`Config.ini`的`[ProxyGetter]`下添加自定义的方法的名字: ```shell + [ProxyGetter] ;register the proxy getter function freeProxyFirst = 0 # 如果要取消某个方法,将其删除或赋为0即可 .... freeProxyCustom = 1 # 确保名字和你添加方法名字一致 -```   + +``` +   `ProxyRefreshSchedule`会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 From 1fae776e9e9a2e8d86dacf2c30bda5a739ab22ee Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Sun, 24 Sep 2017 10:46:12 +0800 Subject: [PATCH 043/399] Update ProxyApi.py --- Api/ProxyApi.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 12e7d54df..5bf580801 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -15,10 +15,12 @@ import sys +sys.path.append('../') + from flask import Flask, jsonify, request from Util.GetConfig import GetConfig -sys.path.append('../') + from Manager.ProxyManager import ProxyManager From db310f9a8430847be85439f75601ef285316ab1a Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Tue, 26 Sep 2017 09:12:57 +0800 Subject: [PATCH 044/399] =?UTF-8?q?[update]=E4=BF=AE=E6=94=B9=E6=96=87?= =?UTF-8?q?=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 9 +++++++++ Util/utilFunction.py | 17 +++++++++++++++-- doc/introduce.md | 2 +- doc/release_notes.md | 6 ++++++ 4 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 doc/release_notes.md diff --git a/README.md b/README.md index 4ab608d31..1efa9de00 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,15 @@ [![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool/graphs/contributors) [![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool) + ______ ______ _ + | ___ \_ | ___ \ | | + | |_/ / \__ __ __ _ __ _ | |_/ /___ ___ | | + | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | + | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ + \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____\ + __ / / + /___ / + ##### [介绍文档](https://github.com/jhao104/proxy_pool/blob/master/doc/introduce.md) * 支持版本: ![](https://img.shields.io/badge/Python-2.x-green.svg) ![](https://img.shields.io/badge/Python-3.x-blue.svg) diff --git a/Util/utilFunction.py b/Util/utilFunction.py index 268bb67a5..1f3732e86 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -67,6 +67,19 @@ def getHtmlTree(url, **kwargs): return etree.HTML(html) +def tcpConnect(proxy): + """ + TCP 三次握手 + :param proxy: + :return: + """ + from socket import socket, AF_INET, SOCK_STREAM + s = socket(AF_INET, SOCK_STREAM) + ip, port = proxy.split(':') + result = s.connect_ex((ip, int(port))) + return True if result == 0 else False + + # noinspection PyPep8Naming def validUsefulProxy(proxy): """ @@ -76,10 +89,10 @@ def validUsefulProxy(proxy): """ if isinstance(proxy, bytes): proxy = proxy.decode('utf8') - proxies = {"https": "https://{proxy}".format(proxy=proxy)} + proxies = {"http": "http://{proxy}".format(proxy=proxy)} try: # 超过20秒的代理就不要了 - r = requests.get('https://www.baidu.com', proxies=proxies, timeout=20, verify=False) + r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=20, verify=False) if r.status_code == 200: logger.info('%s is ok' % proxy) return True diff --git a/doc/introduce.md b/doc/introduce.md index 07c4f95ab..ce3d9011c 100644 --- a/doc/introduce.md +++ b/doc/introduce.md @@ -1,5 +1,5 @@ -代理池介绍 +## 代理池介绍 本项目通过爬虫方式持续抓取代理网站公布的免费代理IP,实时校验,维护部分可以使用的代理,并通过api的形式提供外部使用。 diff --git a/doc/release_notes.md b/doc/release_notes.md new file mode 100644 index 000000000..6b7a91fe9 --- /dev/null +++ b/doc/release_notes.md @@ -0,0 +1,6 @@ +## Release Notes + +* 1.10 +  1. 第一版; +  2. 支持PY2/PY3; +  3. 代理池基本功能; \ No newline at end of file From 48cbac4a21f55543a69630c13fae78620281ffab Mon Sep 17 00:00:00 2001 From: netAir Date: Tue, 26 Sep 2017 16:04:37 +0800 Subject: [PATCH 045/399] =?UTF-8?q?=E5=AE=8C=E6=88=90=E5=AF=B9mongodb?= =?UTF-8?q?=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 12 +++++-- DB/DbClient.py | 2 ++ DB/MongodbClient.py | 77 ++++++++++++++++++++++----------------------- 3 files changed, 49 insertions(+), 42 deletions(-) diff --git a/Config.ini b/Config.ini index f430a94b7..9894be201 100644 --- a/Config.ini +++ b/Config.ini @@ -1,11 +1,17 @@ [DB] ;Configure the database information -;type: SSDB/REDIS if use redis, only modify the host port,the type should be SSDB -type = SSDB +;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB + +type = MONGODB host = localhost -port = 8888 +port = 27017 name = proxy +;type = SSDB +;host = localhost +;port = 8888 +;name = proxy + [ProxyGetter] ;register the proxy getter function freeProxyFirst = 1 diff --git a/DB/DbClient.py b/DB/DbClient.py index e730cc741..68c5db7a7 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -68,6 +68,8 @@ def __initDbClient(self): __type = "SsdbClient" elif "REDIS" == self.config.db_type: __type = "RedisClient" + elif "MONGODB" == self.config.db_type: + __type = "MongodbClient" else: pass assert __type, 'type error, Not support DB type: {}'.format(self.config.db_type) diff --git a/DB/MongodbClient.py b/DB/MongodbClient.py index 89b3ccbb2..acd4798ec 100644 --- a/DB/MongodbClient.py +++ b/DB/MongodbClient.py @@ -8,56 +8,55 @@ class MongodbClient(object): + def __init__(self, name, host, port): + self.name = name + self.client = MongoClient(host, port) + self.db = self.client.proxy - def __init__(self, name, host, port): - self.name = name - self.client = MongoClient(host, port) - self.db = self.client.proxy + def changeTable(self, name): + self.name = name + def get(self, proxy): + data = self.db[self.name].find_one({'proxy': proxy}) + return data['num'] if data != None else None - def changeTable(self, name): - self.name = name + def put(self, proxy, num=1): + if self.db[self.name].find_one({'proxy': proxy}): + return None + else: + self.db[self.name].insert({'proxy': proxy, 'num': num}) + def pop(self): + value = list(self.db[self.name].aggregate([{'$sample': {'size': 1}}]))[0]['proxy'] + if value: + self.delete(value) + return value - def get(self): - proxy = self.getAll() - return random.choice(proxy) if proxy else None + def delete(self, value): + self.db[self.name].remove({'proxy': value}) + def getAll(self): + return {p['proxy']: p['num'] for p in self.db[self.name].find()} - def put(self, value): - if self.db[self.name].find_one({'proxy': value}): - return None - else: - self.db[self.name].insert({'proxy': value}) + def clean(self): + self.client.drop_database('proxy') + def delete_all(self): + self.db[self.name].remove() - def pop(self): - value = self.get() - if value: - self.delete(value) - return value + def update(self, key, value): + self.db[self.name].update({'proxy': key}, {'$inc': {'num': value}}) + def exists(self, key): + return True if self.db[self.name].find({'proxy': key}) == None else False - def delete(self, value): - self.db[self.name].remove({'proxy': value}) - - - def getAll(self): - return [p['proxy'] for p in self.db[self.name].find()] - - - def clean(self): - self.client.drop_database('proxy') - - - def delete_all(self): - self.db[self.name].remove() + def getNumber(self): + return self.db[self.name].count() if __name__ == "__main__": - db = MongodbClient('first', 'localhost', 27017) - db.put('127.0.0.1:1') - db2 = MongodbClient('second', 'localhost', 27017) - db2.put('127.0.0.1:2') - db.clean() - + db = MongodbClient('first', 'localhost', 27017) + db.put('127.0.0.1:1') + db2 = MongodbClient('second', 'localhost', 27017) + db2.put('127.0.0.1:2') + db.clean() From 56dc2fe421147bcea31acd2f568820a637502f13 Mon Sep 17 00:00:00 2001 From: netAir Date: Tue, 26 Sep 2017 16:12:30 +0800 Subject: [PATCH 046/399] =?UTF-8?q?=E6=97=A0=E8=80=BB=E7=9A=84=E5=9C=A8?= =?UTF-8?q?=E4=BD=9C=E8=80=85=E7=89=88=E6=9D=83=E4=BF=A1=E6=81=AF=E4=B8=8A?= =?UTF-8?q?=E5=8A=A0=E4=B8=8A=E6=88=91=E7=9A=84=E5=90=8D,=E5=98=BF?= =?UTF-8?q?=E5=98=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DB/MongodbClient.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/DB/MongodbClient.py b/DB/MongodbClient.py index acd4798ec..db3db8696 100644 --- a/DB/MongodbClient.py +++ b/DB/MongodbClient.py @@ -1,10 +1,19 @@ # coding: utf-8 - -__author__ = 'Maps' +""" +------------------------------------------------- + File Name: MongodbClient.py + Description : 封装mongodb操作 + Author : JHao netAir + date: 2017/3/3 +------------------------------------------------- + Change Activity: + 2017/3/3: + 2017/9/26:完成对mongodb的支持 +------------------------------------------------- +""" +__author__ = 'Maps netAir' from pymongo import MongoClient -import random -import json class MongodbClient(object): From c326ca13fbd18f932515aca2953d6d6fa723d856 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Wed, 27 Sep 2017 16:20:06 +0800 Subject: [PATCH 047/399] =?UTF-8?q?[update]=20=E4=BF=AE=E6=94=B9pop?= =?UTF-8?q?=E6=96=B9=E6=B3=95=20=E8=BF=94=E5=9B=9E=E5=AD=97=E5=85=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DB/SsdbClient.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 6f5fb2dd2..08778fdb6 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -10,6 +10,7 @@ Change Activity: 2016/12/2: 2017/09/22: PY3中 redis-py返回的数据是bytes型 + 2017/09/27: 修改pop()方法 返回{proxy:value}字典 ------------------------------------------------- """ __author__ = 'JHao' @@ -77,11 +78,17 @@ def update(self, key, value): self.__conn.hincrby(self.name, key, value) def pop(self): + """ + 弹出一个代理 + :return: dict {proxy: value} + """ proxies = self.__conn.hkeys(self.name) if proxies: proxy = random.choice(proxies) + value = self.__conn.hget(self.name, proxy) self.delete(proxy) - return proxy + return {'proxy': proxy.decode('utf-8') if EnvUtil.PY3 else proxy, + 'value': value.decode('utf-8') if EnvUtil.PY3 and value else value} return None def exists(self, key): From e11020791cd91450b7021ec20f19da6099d3fb42 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Wed, 27 Sep 2017 16:21:07 +0800 Subject: [PATCH 048/399] =?UTF-8?q?[update]=E5=A4=9A=E7=BA=BF=E7=A8=8B?= =?UTF-8?q?=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyCheck.py | 62 ++++++++++++++++++++++++++++++++ Schedule/ProxyRefreshSchedule.py | 7 ++-- Schedule/ProxyValidSchedule.py | 44 +++++++---------------- log/test.log | 1 - 4 files changed, 79 insertions(+), 35 deletions(-) create mode 100644 Schedule/ProxyCheck.py delete mode 100644 log/test.log diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py new file mode 100644 index 000000000..0a6f2340c --- /dev/null +++ b/Schedule/ProxyCheck.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: ProxyCheck + Description : 多线程验证useful_proxy + Author : J_hao + date: 2017/9/26 +------------------------------------------------- + Change Activity: + 2017/9/26: 多线程验证useful_proxy +------------------------------------------------- +""" +__author__ = 'J_hao' + +import sys +from time import sleep +from threading import Thread + +sys.path.append('../') + +from Util.utilFunction import validUsefulProxy +from Manager.ProxyManager import ProxyManager +from Util.LogHandler import LogHandler + + +class ProxyCheck(ProxyManager, Thread): + + def __init__(self): + ProxyManager.__init__(self) + Thread.__init__(self) + self.log = LogHandler('proxy_check') + + def run(self): + self.db.changeTable(self.useful_proxy_queue) + while True: + proxy_item = self.db.pop() + while proxy_item: + proxy = proxy_item.get('proxy') + counter = proxy_item.get('value') + if validUsefulProxy(proxy): + # 验证通过计数器加1, 计数在-5到1之间 + if counter and int(counter) < 1: + self.db.put(proxy, num=int(counter) + 1) + else: + self.db.put(proxy) + self.log.info('ProxyCheck: {} validation pass'.format(proxy)) + else: + self.log.info('ProxyCheck: {} validation fail'.format(proxy)) + # 验证失败,计数器减1 + if counter and int(counter) < -5: + self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) + self.db.delete(proxy) + else: + self.db.put(proxy, num=int(counter) - 1) + + proxy_item = self.db.pop() + sleep(60 * 5) + + +if __name__ == '__main__': + p = ProxyCheck() + p.run() \ No newline at end of file diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index 5795e17a8..98d51bc29 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -46,11 +46,12 @@ def validProxy(self): :return: """ self.db.changeTable(self.raw_proxy_queue) - raw_proxy = self.db.pop() + raw_proxy_item = self.db.pop() self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime()) # 计算剩余代理,用来减少重复计算 remaining_proxies = self.getAll() - while raw_proxy: + while raw_proxy_item: + raw_proxy = raw_proxy_item.get('proxy') if isinstance(raw_proxy, bytes): # 兼容Py3 raw_proxy = raw_proxy.decode('utf8') @@ -62,7 +63,7 @@ def validProxy(self): else: self.log.info('ProxyRefreshSchedule: %s validation fail' % raw_proxy) self.db.changeTable(self.raw_proxy_queue) - raw_proxy = self.db.pop() + raw_proxy_item = self.db.pop() remaining_proxies = self.getAll() self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime()) diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index f6fa58893..3a7304b92 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -13,49 +13,31 @@ __author__ = 'JHao' import sys -from time import sleep sys.path.append('../') -from Util.utilFunction import validUsefulProxy -from Manager.ProxyManager import ProxyManager -from Util.LogHandler import LogHandler +from Schedule.ProxyCheck import ProxyCheck -class ProxyValidSchedule(ProxyManager): +class ProxyValidSchedule(object): def __init__(self): - ProxyManager.__init__(self) - self.log = LogHandler('valid_schedule') + pass - def __validProxy(self): + def __validProxy(self, threads=5): """ - 验证代理 + 验证useful_proxy代理 + :param threads: 线程数 :return: """ - while True: - self.db.changeTable(self.useful_proxy_queue) - for each_proxy in self.db.getAll(): - if isinstance(each_proxy, bytes): - # 兼容PY3 - each_proxy = each_proxy.decode('utf-8') + thread_list = list() + for index in range(threads): + thread_list.append(ProxyCheck()) - value = self.db.get(each_proxy) - if validUsefulProxy(each_proxy): - # 成功计数器加1 - if value and int(value) < 1: - self.db.update(each_proxy, 1) - self.log.info('ProxyValidSchedule: {} validation pass'.format(each_proxy)) - else: - # 失败计数器减一 - if value and int(value) < -5: - # 计数器小于-5删除该代理 - self.db.delete(each_proxy) - else: - self.db.update(each_proxy, -1) - self.log.info('ProxyValidSchedule: {} validation fail'.format(each_proxy)) + for thread in thread_list: + thread.start() - self.log.info('ProxyValidSchedule running normal') - sleep(60 * 1) + for thread in thread_list: + thread.join() def main(self): self.__validProxy() diff --git a/log/test.log b/log/test.log deleted file mode 100644 index 71158551f..000000000 --- a/log/test.log +++ /dev/null @@ -1 +0,0 @@ -2017-04-24 15:34:08,434 LogHandler.py[line:85] ERROR this is a test msg From e4685070b865d7748c2c802bd2970d81136dbe94 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Wed, 27 Sep 2017 16:22:08 +0800 Subject: [PATCH 049/399] =?UTF-8?q?[update]=E5=A4=9A=E7=BA=BF=E7=A8=8B?= =?UTF-8?q?=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/release_notes.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/release_notes.md b/doc/release_notes.md index 6b7a91fe9..c1e4eeefa 100644 --- a/doc/release_notes.md +++ b/doc/release_notes.md @@ -1,5 +1,8 @@ ## Release Notes +* newest +  1.使用多线程验证useful_pool + * 1.10   1. 第一版;   2. 支持PY2/PY3; From 159007610133d20670c6b2131a492b2cdf2f82a6 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Wed, 27 Sep 2017 16:48:29 +0800 Subject: [PATCH 050/399] =?UTF-8?q?[update]=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- log/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 log/__init__.py diff --git a/log/__init__.py b/log/__init__.py new file mode 100644 index 000000000..e69de29bb From a287960f6b3f8a43c075a7eac0957bfd682fbd93 Mon Sep 17 00:00:00 2001 From: netAir Date: Wed, 27 Sep 2017 20:18:18 +0800 Subject: [PATCH 051/399] =?UTF-8?q?bug=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DB/MongodbClient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DB/MongodbClient.py b/DB/MongodbClient.py index db3db8696..66bac25da 100644 --- a/DB/MongodbClient.py +++ b/DB/MongodbClient.py @@ -57,7 +57,7 @@ def update(self, key, value): self.db[self.name].update({'proxy': key}, {'$inc': {'num': value}}) def exists(self, key): - return True if self.db[self.name].find({'proxy': key}) == None else False + return True if self.db[self.name].find({'proxy': key}) != None else False def getNumber(self): return self.db[self.name].count() From f27a4bb6d9a8ffbd22612de108cbca55d1e225e8 Mon Sep 17 00:00:00 2001 From: netAir Date: Thu, 28 Sep 2017 10:07:16 +0800 Subject: [PATCH 052/399] =?UTF-8?q?=E6=94=B9=E4=B8=BA=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Config.ini b/Config.ini index 9894be201..a29b9b05b 100644 --- a/Config.ini +++ b/Config.ini @@ -1,17 +1,11 @@ [DB] ;Configure the database information ;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB - -type = MONGODB +type = SSDB host = localhost -port = 27017 +port = 8888 name = proxy -;type = SSDB -;host = localhost -;port = 8888 -;name = proxy - [ProxyGetter] ;register the proxy getter function freeProxyFirst = 1 From 8e2e92de6e7b19d972f100fb88f8dcd722e88357 Mon Sep 17 00:00:00 2001 From: netAir Date: Thu, 28 Sep 2017 10:45:33 +0800 Subject: [PATCH 053/399] =?UTF-8?q?=E4=BF=AE=E6=94=B9pop=E6=96=B9=E6=B3=95?= =?UTF-8?q?=E8=BF=94=E5=9B=9E=E5=AD=97=E5=85=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DB/MongodbClient.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/DB/MongodbClient.py b/DB/MongodbClient.py index 66bac25da..bd0647f51 100644 --- a/DB/MongodbClient.py +++ b/DB/MongodbClient.py @@ -36,10 +36,13 @@ def put(self, proxy, num=1): self.db[self.name].insert({'proxy': proxy, 'num': num}) def pop(self): - value = list(self.db[self.name].aggregate([{'$sample': {'size': 1}}]))[0]['proxy'] - if value: + data = list(self.db[self.name].aggregate([{'$sample': {'size': 1}}])) + if data: + data = data[0] + value = data['proxy'] self.delete(value) - return value + return {'proxy': value, 'value': data['num']} + return None def delete(self, value): self.db[self.name].remove({'proxy': value}) @@ -57,7 +60,7 @@ def update(self, key, value): self.db[self.name].update({'proxy': key}, {'$inc': {'num': value}}) def exists(self, key): - return True if self.db[self.name].find({'proxy': key}) != None else False + return True if self.db[self.name].find_one({'proxy': key}) != None else False def getNumber(self): return self.db[self.name].count() @@ -65,7 +68,7 @@ def getNumber(self): if __name__ == "__main__": db = MongodbClient('first', 'localhost', 27017) - db.put('127.0.0.1:1') - db2 = MongodbClient('second', 'localhost', 27017) - db2.put('127.0.0.1:2') - db.clean() + # db.put('127.0.0.1:1') + # db2 = MongodbClient('second', 'localhost', 27017) + # db2.put('127.0.0.1:2') + print(db.pop()) From cddc78cd45674c9767bf07c9499a32cf61287725 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Thu, 28 Sep 2017 11:43:00 +0800 Subject: [PATCH 054/399] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6861e3561..7133d24b7 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir) From 9e27c23d663d1e91c1ada9cac353a18c6bc09295 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Mon, 9 Oct 2017 15:50:48 +0800 Subject: [PATCH 055/399] =?UTF-8?q?[update]=20=E8=AE=BE=E7=BD=AE=E5=A4=B1?= =?UTF-8?q?=E8=B4=A5=E6=AC=A1=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyCheck.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 0a6f2340c..092f1ef61 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -22,9 +22,10 @@ from Manager.ProxyManager import ProxyManager from Util.LogHandler import LogHandler +FAIL_COUNT = 2 -class ProxyCheck(ProxyManager, Thread): +class ProxyCheck(ProxyManager, Thread): def __init__(self): ProxyManager.__init__(self) Thread.__init__(self) @@ -47,7 +48,7 @@ def run(self): else: self.log.info('ProxyCheck: {} validation fail'.format(proxy)) # 验证失败,计数器减1 - if counter and int(counter) < -5: + if counter and int(counter) < -FAIL_COUNT: self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) self.db.delete(proxy) else: @@ -59,4 +60,4 @@ def run(self): if __name__ == '__main__': p = ProxyCheck() - p.run() \ No newline at end of file + p.run() From bc07c051331e669ed8b2ad8e2221a23a299eb566 Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Tue, 10 Oct 2017 09:35:22 +0800 Subject: [PATCH 056/399] =?UTF-8?q?[update]=E4=BF=AE=E6=94=B9=E9=BB=98?= =?UTF-8?q?=E8=AE=A4=E5=A4=B1=E8=B4=A5=E6=AC=A1=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyCheck.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 092f1ef61..57df4f24f 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -39,7 +39,7 @@ def run(self): proxy = proxy_item.get('proxy') counter = proxy_item.get('value') if validUsefulProxy(proxy): - # 验证通过计数器加1, 计数在-5到1之间 + # 验证通过计数器加1 if counter and int(counter) < 1: self.db.put(proxy, num=int(counter) + 1) else: @@ -48,7 +48,7 @@ def run(self): else: self.log.info('ProxyCheck: {} validation fail'.format(proxy)) # 验证失败,计数器减1 - if counter and int(counter) < -FAIL_COUNT: + if counter and int(counter) <= -FAIL_COUNT: self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) self.db.delete(proxy) else: From 67b5fbba8e8a2517136f7beb318c2ed31b62ee9b Mon Sep 17 00:00:00 2001 From: jinghao_wb Date: Fri, 13 Oct 2017 09:46:06 +0800 Subject: [PATCH 057/399] =?UTF-8?q?[update]=20=E4=BB=A3=E7=90=86=E6=A0=A1?= =?UTF-8?q?=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyCheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 57df4f24f..e7a0bdf48 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -22,7 +22,7 @@ from Manager.ProxyManager import ProxyManager from Util.LogHandler import LogHandler -FAIL_COUNT = 2 +FAIL_COUNT = 1 # 校验失败次数, 超过次数删除代理 class ProxyCheck(ProxyManager, Thread): From 30feb1646badf15b6f7d60a4c89e698bf3222869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E5=BB=BA=E5=88=9A?= Date: Thu, 19 Oct 2017 23:21:54 +0800 Subject: [PATCH 058/399] Change docker expose port --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 34c49f836..46a72e47b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,6 +34,6 @@ RUN pip install --no-cache-dir -r requirements.txt && \ chmod 777 run.sh -EXPOSE 5000 +EXPOSE 5010 -CMD [ "sh", "run.sh" ] \ No newline at end of file +CMD [ "sh", "run.sh" ] From 4dc27691021188765296a84292dac4fbf93c6597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E5=BB=BA=E5=88=9A?= Date: Sun, 22 Oct 2017 22:32:05 +0800 Subject: [PATCH 059/399] upgrade python --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 46a72e47b..1b77a67f0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:2.7 +FROM python:3.6 WORKDIR /usr/src/app From 278dc60bf9d8fef54c8433ca3fec8dce5e78e242 Mon Sep 17 00:00:00 2001 From: gladmo Date: Sun, 22 Oct 2017 10:13:58 -0500 Subject: [PATCH 060/399] Update docker expose port --- doc/introduce.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/introduce.md b/doc/introduce.md index ce3d9011c..13f45317a 100644 --- a/doc/introduce.md +++ b/doc/introduce.md @@ -119,13 +119,13 @@ cd proxy_pool docker build -t proxy:latest -f Dockerfile . -docker run -p 5000:5000 -d proxy:latest +docker run -p 5010:5010 -d proxy:latest # Wait a few minutes -curl localhost:5000/get/ +curl localhost:5010/get/ # result: xxx.xxx.xxx.xxx:xxxx -curl localhost:5000/get_all/ +curl localhost:5010/get_all/ ``` ### 5、使用 From 728a282b9059d4fee8341a21d6f8069bfcfb21c4 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Mon, 23 Oct 2017 19:47:43 +0800 Subject: [PATCH 061/399] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7133d24b7..aa34995f5 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@gladmo](https://github.com/gladmo) From f7923896db9719ed7da4a3a95917fac9590bb274 Mon Sep 17 00:00:00 2001 From: bobobo80 Date: Wed, 1 Nov 2017 22:43:04 +0800 Subject: [PATCH 062/399] =?UTF-8?q?redis=E5=9C=A8py3=E4=B8=AD=E8=BF=94?= =?UTF-8?q?=E5=9B=9Ebytes=E7=B1=BB=E5=9E=8B=EF=BC=8C=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=B8=A4=E5=A4=84=E8=BF=94=E5=9B=9E=E6=97=B6=E7=9A=84=E8=A7=A3?= =?UTF-8?q?=E7=A0=81=E6=93=8D=E4=BD=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 15 ++++++++------- ProxyGetter/getFreeProxy.py | 22 ++++++++++++++++++++-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/Config.ini b/Config.ini index 81d6b6a1c..49506748d 100644 --- a/Config.ini +++ b/Config.ini @@ -1,15 +1,16 @@ [DB] ;Configure the database information ;type: SSDB/REDIS -type = SSDB +type = REDIS host = localhost -port = 8888 +port = 6379 name = proxy [ProxyGetter] ;register the proxy getter function -freeProxyFirst = 1 -freeProxySecond = 1 -freeProxyThird = 1 -freeProxyFourth = 1 -freeProxyFifth = 1 +;freeProxyFirst = 1 +;freeProxySecond = 1 +;freeProxyThird = 1 +;freeProxyFourth = 1 +;freeProxyFifth = 1 +freeProxySixth = 1 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index e30a8b749..6634924f5 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -13,6 +13,8 @@ """ import re import requests +import sys +sys.path.append('../') try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 @@ -115,6 +117,22 @@ def freeProxyFifth(): for each_proxy in proxy_list: yield ''.join(each_proxy.xpath('.//text()')) + @staticmethod + @robustCrawl + def freeProxySixth(): + """ + 抓取讯代理免费proxy http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10 + :return: + """ + url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' + request = WebRequest() + try: + res = request.get(url).json() + for row in res['RESULT']['rows']: + yield '{}:{}'.format(row['ip'], row['port']) + except Exception as e: + pass + if __name__ == '__main__': gg = GetFreeProxy() @@ -127,8 +145,8 @@ def freeProxyFifth(): # for e in gg.freeProxyThird(): # print e - for e in gg.freeProxyFourth(): - print e + for e in gg.freeProxySixth(): + print(e) # for e in gg.freeProxyFifth(): # print(e) From a5a8926d223f430de768c590139bf3cb9a569c8f Mon Sep 17 00:00:00 2001 From: bobobo80 Date: Wed, 1 Nov 2017 22:57:38 +0800 Subject: [PATCH 063/399] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=9B=9E=E5=8E=9F?= =?UTF-8?q?=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 12 ++++++------ ProxyGetter/getFreeProxy.py | 6 ++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/Config.ini b/Config.ini index f55522c3e..c7dd74cc0 100644 --- a/Config.ini +++ b/Config.ini @@ -3,14 +3,14 @@ ;type: SSDB/REDIS type = SSDB host = localhost -port = 6379 +port = 8888 name = proxy [ProxyGetter] ;register the proxy getter function -;freeProxyFirst = 1 -;freeProxySecond = 1 -;freeProxyThird = 1 -;freeProxyFourth = 1 -;freeProxyFifth = 1 +freeProxyFirst = 1 +freeProxySecond = 1 +freeProxyThird = 1 +freeProxyFourth = 1 +freeProxyFifth = 1 freeProxySixth = 1 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 5e74ebb2b..4a6552819 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -13,8 +13,6 @@ """ import re import requests -import sys -sys.path.append('../') try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 @@ -157,8 +155,8 @@ def freeProxySixth(): # for e in gg.freeProxyThird(): # print e - for e in gg.freeProxySixth(): - print(e) + # for e in gg.freeProxySixth(): + # print(e) # for e in gg.freeProxyFifth(): # print(e) From fd1455ad53dc095ac15ccbfac7e3e92e706326ca Mon Sep 17 00:00:00 2001 From: bobobo80 Date: Wed, 1 Nov 2017 23:02:09 +0800 Subject: [PATCH 064/399] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=9B=9E=E5=8E=9F?= =?UTF-8?q?=E8=AE=BE=E7=BD=AE=EF=BC=8C=E5=8F=AA=E5=A2=9E=E5=8A=A0=E6=96=B0?= =?UTF-8?q?=E7=9A=84=E4=BB=A3=E7=90=86=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Config.ini b/Config.ini index c7dd74cc0..81d4f4911 100644 --- a/Config.ini +++ b/Config.ini @@ -1,6 +1,6 @@ [DB] ;Configure the database information -;type: SSDB/REDIS +;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB type = SSDB host = localhost port = 8888 @@ -14,3 +14,8 @@ freeProxyThird = 1 freeProxyFourth = 1 freeProxyFifth = 1 freeProxySixth = 1 + +[HOST] +; API接口配置 http://127.0.0.1:5051 +ip = 0.0.0.0 +port = 5010 \ No newline at end of file From 5c8fd5760f2d699b9bb7cf77f28f7300c74726ef Mon Sep 17 00:00:00 2001 From: jhao104 <946150454@qq.com> Date: Sun, 5 Nov 2017 16:39:42 +0800 Subject: [PATCH 065/399] =?UTF-8?q?[update]=20=E5=A4=9A=E6=AC=A1=E8=AF=B7?= =?UTF-8?q?=E6=B1=82=E5=A4=B1=E8=B4=A5=E6=97=B6=EF=BC=8C=E8=BF=94=E5=9B=9E?= =?UTF-8?q?=E7=99=BE=E5=BA=A6=E9=A1=B5=E9=9D=A2=EF=BC=8C=E8=BF=94=E5=9B=9E?= =?UTF-8?q?None=E4=BC=9A=E6=9C=89=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Util/WebRequest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Util/WebRequest.py b/Util/WebRequest.py index 82ef20ef9..b9bf4a5e4 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -82,5 +82,6 @@ def get(self, url, header=None, retry_time=5, timeout=30, print(e) retry_time -= 1 if retry_time <= 0: - return + # 多次请求失败时,返回百度页面 + return requests.get("https://www.baidu.com/") time.sleep(retry_interval) From b1018a96aa00767b480a080aa8b25c70c2ca56cc Mon Sep 17 00:00:00 2001 From: jhao104 <946150454@qq.com> Date: Sun, 5 Nov 2017 18:54:14 +0800 Subject: [PATCH 066/399] =?UTF-8?q?[update]=E6=B2=A1=E6=9C=89=E4=BB=A3?= =?UTF-8?q?=E7=90=86=E6=97=B6=20=E4=B8=8D=E8=BF=94=E5=9B=9ENone?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Api/ProxyApi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 5bf580801..e62aa9fc1 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -44,7 +44,7 @@ def index(): @app.route('/get/') def get(): proxy = ProxyManager().get() - return proxy + return proxy if proxy else 'no proxy!' @app.route('/refresh/') From 200b2795dd51713831ea5ab3e162d9b7060a1030 Mon Sep 17 00:00:00 2001 From: jhao104 <946150454@qq.com> Date: Sun, 5 Nov 2017 19:32:02 +0800 Subject: [PATCH 067/399] =?UTF-8?q?[update]=20staticmethod=E4=B8=8B?= =?UTF-8?q?=E7=9A=84robustCrawl=E4=B8=8D=E8=B5=B7=E4=BD=9C=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 55 +++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 4a6552819..c780ea48a 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -38,7 +38,6 @@ def __init__(self): pass @staticmethod - @robustCrawl # decoration print error if exception happen def freeProxyFirst(page=10): """ 抓取无忧代理 http://www.data5u.com/ @@ -53,10 +52,12 @@ def freeProxyFirst(page=10): html_tree = getHtmlTree(url) ul_list = html_tree.xpath('//ul[@class="l2"]') for ul in ul_list: - yield ':'.join(ul.xpath('.//li/text()')[0:2]) + try: + yield ':'.join(ul.xpath('.//li/text()')[0:2]) + except Exception as e: + pass @staticmethod - @robustCrawl def freeProxySecond(proxy_number=100): """ 抓取代理66 http://www.66ip.cn/ @@ -73,7 +74,6 @@ def freeProxySecond(proxy_number=100): yield proxy @staticmethod - @robustCrawl def freeProxyThird(days=1): """ 抓取ip181 http://www.ip181.com/ @@ -82,12 +82,14 @@ def freeProxyThird(days=1): """ url = 'http://www.ip181.com/' html_tree = getHtmlTree(url) - tr_list = html_tree.xpath('//tr')[1:] - for tr in tr_list: - yield ':'.join(tr.xpath('./td/text()')[0:2]) + try: + tr_list = html_tree.xpath('//tr')[1:] + for tr in tr_list: + yield ':'.join(tr.xpath('./td/text()')[0:2]) + except Exception as e: + pass @staticmethod - @robustCrawl def freeProxyFourth(): """ 抓取西刺代理 http://api.xicidaili.com/free2016.txt @@ -100,10 +102,12 @@ def freeProxyFourth(): tree = getHtmlTree(each_url) proxy_list = tree.xpath('.//table[@id="ip_list"]//tr') for proxy in proxy_list: - yield ':'.join(proxy.xpath('./td/text()')[0:2]) + try: + yield ':'.join(proxy.xpath('./td/text()')[0:2]) + except Exception as e: + pass @staticmethod - @robustCrawl def freeProxyFifth(): """ 抓取guobanjia http://www.goubanjia.com/free/gngn/index.shtml @@ -122,13 +126,15 @@ def freeProxyFifth(): ]/text() """ for each_proxy in proxy_list: - # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port - ip_addr = ''.join(each_proxy.xpath(xpath_str)) - port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0] - yield '{}:{}'.format(ip_addr, port) + try: + # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port + ip_addr = ''.join(each_proxy.xpath(xpath_str)) + port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0] + yield '{}:{}'.format(ip_addr, port) + except Exception as e: + pass @staticmethod - @robustCrawl def freeProxySixth(): """ 抓取讯代理免费proxy http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10 @@ -147,16 +153,19 @@ def freeProxySixth(): if __name__ == '__main__': gg = GetFreeProxy() # for e in gg.freeProxyFirst(): - # print e - + # print(e) + # # for e in gg.freeProxySecond(): - # print e - + # print(e) + # # for e in gg.freeProxyThird(): - # print e + # print(e) - # for e in gg.freeProxySixth(): + # for e in gg.freeProxyFourth(): # print(e) - # for e in gg.freeProxyFifth(): - # print(e) + for e in gg.freeProxyFifth(): + print(e) + + # for e in gg.freeProxySixth(): + # print(e) From 640eba9eac284ba14ccbe84e260f6852da8acb25 Mon Sep 17 00:00:00 2001 From: jhao104 <946150454@qq.com> Date: Sun, 5 Nov 2017 20:01:02 +0800 Subject: [PATCH 068/399] =?UTF-8?q?[update]=20=E8=AE=BE=E7=BD=AEdaemon=20?= =?UTF-8?q?=3D=20True?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Run/main.py | 1 + Schedule/ProxyRefreshSchedule.py | 1 + Schedule/ProxyValidSchedule.py | 1 + 3 files changed, 3 insertions(+) diff --git a/Run/main.py b/Run/main.py index 7e31566dc..3a2e95fd5 100644 --- a/Run/main.py +++ b/Run/main.py @@ -33,6 +33,7 @@ def run(): p_list.append(p3) for p in p_list: + p.daemon = True p.start() for p in p_list: p.join() diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index 98d51bc29..c358acd8a 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -86,6 +86,7 @@ def main(process_num=30): pl.append(proc) for num in range(process_num): + pl[num].daemon = True pl[num].start() for num in range(process_num): diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 3a7304b92..8345a83a4 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -34,6 +34,7 @@ def __validProxy(self, threads=5): thread_list.append(ProxyCheck()) for thread in thread_list: + thread.daemon = True thread.start() for thread in thread_list: From 6b11946a8279ef879dedeb868be4f3ef6c290c17 Mon Sep 17 00:00:00 2001 From: bobobo80 Date: Sun, 12 Nov 2017 21:23:33 +0800 Subject: [PATCH 069/399] =?UTF-8?q?=E5=AF=B9=E6=AF=8F=E6=AC=A1=E8=AF=B7?= =?UTF-8?q?=E6=B1=82=E5=89=8D=E5=A2=9E=E5=8A=A0=E5=BB=B6=E8=BF=9F=EF=BC=8C?= =?UTF-8?q?=E9=98=B2=E6=AD=A2=E7=9F=AD=E6=9C=9F=E8=AE=BF=E9=97=AE=E9=87=8F?= =?UTF-8?q?=E8=BF=87=E9=AB=98=E8=A2=AB=E5=B0=81ip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Util/utilFunction.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Util/utilFunction.py b/Util/utilFunction.py index 1f3732e86..2227bfec4 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -12,6 +12,7 @@ ------------------------------------------------- """ import requests +import time from lxml import etree from Util.LogHandler import LogHandler @@ -63,6 +64,10 @@ def getHtmlTree(url, **kwargs): } # TODO 取代理服务器用代理服务器访问 wr = WebRequest() + + # delay 2s for per request + time.sleep(2) + html = wr.get(url=url, header=header).content return etree.HTML(html) From f4c8f27d260773be94f9a82b2621a587f4e4e116 Mon Sep 17 00:00:00 2001 From: scil Date: Wed, 22 Nov 2017 23:23:01 +0800 Subject: [PATCH 070/399] use any() to valid response content --- Util/WebRequest.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/Util/WebRequest.py b/Util/WebRequest.py index b9bf4a5e4..abbdb17be 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -70,13 +70,8 @@ def get(self, url, header=None, retry_time=5, timeout=30, while True: try: html = requests.get(url, headers=headers, timeout=timeout) - # if filter(lambda key: key in html.content, retry_flag): - # 原filter语句执行if判断所有情况均为True情况,python3与python2的区别? - # python3中filter返回filter对象,即使为空,if会判断为True - # python2中filter返回list对象,为空,if判断为False - for f in retry_flag: - if f in html.content: - raise Exception + if any(f in html.content for f in retry_flag): + raise Exception return html except Exception as e: print(e) From 0f03dd5aca95e0c99ec8d9c494f22db171f1c295 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Thu, 23 Nov 2017 09:27:46 +0800 Subject: [PATCH 071/399] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aa34995f5..b62d6ae5c 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@gladmo](https://github.com/gladmo) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@gladmo](https://github.com/gladmo)| [@scil](https://github.com/scil) From 2f17fe19862513bb32276cd44ce274262c2b8aee Mon Sep 17 00:00:00 2001 From: hymzhek Date: Mon, 11 Dec 2017 13:51:24 +0800 Subject: [PATCH 072/399] Update GetConfig.py Fixed TypeError: port must be an integer error --- Util/GetConfig.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Util/GetConfig.py b/Util/GetConfig.py index d7a734658..24b003f28 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -55,7 +55,7 @@ def host_ip(self): @LazyProperty def host_port(self): - return self.config_file.get('HOST', 'port') + return int(self.config_file.get('HOST', 'port')) if __name__ == '__main__': gg = GetConfig() From 3f381019fe81891c3015bc71d9bf9fc1d1fcc40f Mon Sep 17 00:00:00 2001 From: bigdata Date: Fri, 12 Jan 2018 12:54:38 +0800 Subject: [PATCH 073/399] add kuaidaili --- Config.ini | 4 +++- Dockerfile | 9 --------- ProxyGetter/getFreeProxy.py | 18 ++++++++++++++++-- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/Config.ini b/Config.ini index 81d4f4911..39eb6cdb6 100644 --- a/Config.ini +++ b/Config.ini @@ -3,6 +3,7 @@ ;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB type = SSDB host = localhost +;port = 6379 port = 8888 name = proxy @@ -14,8 +15,9 @@ freeProxyThird = 1 freeProxyFourth = 1 freeProxyFifth = 1 freeProxySixth = 1 +freeProxySeventh = 1 [HOST] ; API接口配置 http://127.0.0.1:5051 ip = 0.0.0.0 -port = 5010 \ No newline at end of file +port = 5010 diff --git a/Dockerfile b/Dockerfile index 1b77a67f0..7c815a4e7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,8 @@ FROM python:3.6 - WORKDIR /usr/src/app - COPY . . - ENV DEBIAN_FRONTEND noninteractive ENV TZ Asia/Shanghai - RUN pip install --no-cache-dir -r requirements.txt && \ apt-get update && \ apt-get install -y --force-yes git make gcc g++ autoconf && apt-get clean && \ @@ -16,7 +12,6 @@ RUN pip install --no-cache-dir -r requirements.txt && \ apt-get autoremove -y && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ cp ssdb.conf /etc && cd .. && yes | rm -r ssdb && \ - mkdir -p /var/lib/ssdb && \ sed \ -e 's@home.*@home /var/lib@' \ @@ -26,14 +21,10 @@ RUN pip install --no-cache-dir -r requirements.txt && \ -e 's@level:.*@level: info@' \ -e 's@ip:.*@ip: 0.0.0.0@' \ -i /etc/ssdb.conf && \ - echo "# ! /bin/sh " > /usr/src/app/run.sh && \ echo "cd Run" >> /usr/src/app/run.sh && \ echo "/usr/bin/ssdb-server /etc/ssdb.conf &" >> /usr/src/app/run.sh && \ echo "python main.py" >> /usr/src/app/run.sh && \ - chmod 777 run.sh - EXPOSE 5010 - CMD [ "sh", "run.sh" ] diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index c780ea48a..eb6485401 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -149,6 +149,18 @@ def freeProxySixth(): except Exception as e: pass + @staticmethod + def freeProxySeventh(): + """ + 快代理免费https://www.kuaidaili.com/free/inha/1/ + """ + url = 'https://www.kuaidaili.com/free/inha/{page}/' + for page in range(1, 10): + page_url = url.format(page=page) + tree = getHtmlTree(page_url) + proxy_list = tree.xpath('.//table//tr') + for tr in proxy_list[1:]: + yield ':'.join(tr.xpath('./td/text()')[0:2]) if __name__ == '__main__': gg = GetFreeProxy() @@ -164,8 +176,10 @@ def freeProxySixth(): # for e in gg.freeProxyFourth(): # print(e) - for e in gg.freeProxyFifth(): - print(e) + #for e in gg.freeProxyFifth(): + # print(e) # for e in gg.freeProxySixth(): # print(e) + for e in gg.freeProxySeventh(): + print(e) From 275288d07416eed9de102e18d7aa42de31838eb0 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Thu, 18 Jan 2018 14:30:40 +0800 Subject: [PATCH 074/399] =?UTF-8?q?[update]=20=E4=BB=A3=E7=90=86=E6=8A=93?= =?UTF-8?q?=E5=8F=96=E6=97=B6=E6=B7=BB=E5=8A=A0=E5=BC=82=E5=B8=B8=E6=8D=95?= =?UTF-8?q?=E8=8E=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Api/ProxyApi.py | 4 +--- ProxyGetter/getFreeProxy.py | 7 ++++--- Run/main.py | 4 ++-- Schedule/ProxyCheck.py | 4 ++-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index e62aa9fc1..45db4843a 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -20,13 +20,10 @@ from flask import Flask, jsonify, request from Util.GetConfig import GetConfig - - from Manager.ProxyManager import ProxyManager app = Flask(__name__) - api_list = { 'get': u'get an usable proxy', # 'refresh': u'refresh proxy pool', @@ -78,5 +75,6 @@ def run(): config = GetConfig() app.run(host=config.host_ip, port=config.host_port) + if __name__ == '__main__': run() diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index eb6485401..d3d3af83f 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -65,7 +65,7 @@ def freeProxySecond(proxy_number=100): :return: """ url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( - proxy_number) + proxy_number) request = WebRequest() # html = request.get(url).content # content为未解码,text为解码后的字符串 @@ -162,6 +162,7 @@ def freeProxySeventh(): for tr in proxy_list[1:]: yield ':'.join(tr.xpath('./td/text()')[0:2]) + if __name__ == '__main__': gg = GetFreeProxy() # for e in gg.freeProxyFirst(): @@ -171,12 +172,12 @@ def freeProxySeventh(): # print(e) # # for e in gg.freeProxyThird(): - # print(e) + # print(e) # for e in gg.freeProxyFourth(): # print(e) - #for e in gg.freeProxyFifth(): + # for e in gg.freeProxyFifth(): # print(e) # for e in gg.freeProxySixth(): diff --git a/Run/main.py b/Run/main.py index 3a2e95fd5..6b07654ee 100644 --- a/Run/main.py +++ b/Run/main.py @@ -15,7 +15,6 @@ import sys from multiprocessing import Process - sys.path.append('../') from Api.ProxyApi import run as ProxyApiRun @@ -38,5 +37,6 @@ def run(): for p in p_list: p.join() + if __name__ == '__main__': - run() + run() diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index e7a0bdf48..91db84142 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -37,7 +37,7 @@ def run(self): proxy_item = self.db.pop() while proxy_item: proxy = proxy_item.get('proxy') - counter = proxy_item.get('value') + counter = proxy_item.get('value', 1) if validUsefulProxy(proxy): # 验证通过计数器加1 if counter and int(counter) < 1: @@ -48,7 +48,7 @@ def run(self): else: self.log.info('ProxyCheck: {} validation fail'.format(proxy)) # 验证失败,计数器减1 - if counter and int(counter) <= -FAIL_COUNT: + if counter and int(counter) <= FAIL_COUNT: self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) self.db.delete(proxy) else: From de132a2550792d0e21bac729dba3c8b66ee2a5f5 Mon Sep 17 00:00:00 2001 From: tangrela Date: Wed, 31 Jan 2018 12:50:57 +0800 Subject: [PATCH 075/399] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BB=A3=E7=90=86?= =?UTF-8?q?=E7=BD=91=E7=AB=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加以下代理 cn-proxy.com www.mimiip.com proxy-list.org cz88.net ip181.com --- ProxyGetter/getFreeProxy.py | 75 ++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index d3d3af83f..d5ad08572 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -28,6 +28,21 @@ # for debug to disable insecureWarning requests.packages.urllib3.disable_warnings() +""" +66ip.cn +data5u.com +ip181.com +xicidaili.com +goubanjia.com +xdaili.cn +kuaidaili.com +cn-proxy.com +www.mimiip.com +proxy-list.org +cz88.net +ip181.com +""" + class GetFreeProxy(object): """ @@ -129,7 +144,8 @@ def freeProxyFifth(): try: # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port ip_addr = ''.join(each_proxy.xpath(xpath_str)) - port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0] + port = each_proxy.xpath( + ".//span[contains(@class, 'port')]/text()")[0] yield '{}:{}'.format(ip_addr, port) except Exception as e: pass @@ -162,6 +178,63 @@ def freeProxySeventh(): for tr in proxy_list[1:]: yield ':'.join(tr.xpath('./td/text()')[0:2]) + @staticmethod + def freeProxyEight(): + urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] + request = WebRequest() + for url in urls: + r = requests.get(url) + proxies = re.findall( + '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.content) + for proxy in proxies: + yield ':'.join(proxy) + + @staticmethod + def freeProxyNight(): + urls = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)] + request = WebRequest() + for url in urls: + r = requests.get(url) + proxies = re.findall( + '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.content) + for proxy in proxies: + yield ':'.join(proxy) + + @staticmethod + def freeProxyTenth(): + urls = ['https://proxy-list.org/english/index.php?p=%s' % + n for n in range(1, 10)] + request = WebRequest() + import base64 + for url in urls: + r = requests.get(url) + proxies = re.findall("Proxy\('(.*?)'\)", r.content) + for proxy in proxies: + yield base64.b64decode(proxy) + + @staticmethod + def freeProxyEleventh(): + urls = ['http://www.cz88.net/proxy/%s' % m for m in + ['index.shtml'] + ['http_%s.shtml' % n for n in range(2, 11)]] + request = WebRequest() + for url in urls: + r = requests.get(url) + proxies = re.findall( + '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})

(\d+)
', r.content) + for proxy in proxies: + yield ':'.join(proxy) + + @staticmethod + def freeProxy12th(): + urls = ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)] + request = WebRequest() + for url in urls: + r = requests.get(url) + proxies = re.findall( + '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W]*?(\d+)', r.content) + for proxy in proxies: + yield ':'.join(proxy) + if __name__ == '__main__': gg = GetFreeProxy() From 6fbba5b403adf34912be68f6c0991e0c954c074f Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 3 Apr 2018 11:40:02 +0800 Subject: [PATCH 076/399] =?UTF-8?q?[update]=20=E5=A2=9E=E5=8A=A0=E4=BB=A3?= =?UTF-8?q?=E7=90=86=E6=BA=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 10 +++-- ProxyGetter/getFreeProxy.py | 80 ++++++++++++++----------------------- 2 files changed, 36 insertions(+), 54 deletions(-) diff --git a/Config.ini b/Config.ini index 39eb6cdb6..43c10747c 100644 --- a/Config.ini +++ b/Config.ini @@ -2,7 +2,7 @@ ;Configure the database information ;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB type = SSDB -host = localhost +host = 123.207.35.36 ;port = 6379 port = 8888 name = proxy @@ -11,12 +11,16 @@ name = proxy ;register the proxy getter function freeProxyFirst = 1 freeProxySecond = 1 -freeProxyThird = 1 +;freeProxyThird = 1 freeProxyFourth = 1 -freeProxyFifth = 1 +;freeProxyFifth = 1 freeProxySixth = 1 freeProxySeventh = 1 +;foreign website, outside the wall +;freeProxyWallFirst = 1 +;freeProxyWallSecond = 1 + [HOST] ; API接口配置 http://127.0.0.1:5051 ip = 0.0.0.0 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index d5ad08572..7273d0233 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -12,16 +12,17 @@ ------------------------------------------------- """ import re +import sys import requests try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 except: - import sys # py2 - reload(sys) sys.setdefaultencoding('utf-8') +sys.path.append('../') + from Util.utilFunction import robustCrawl, getHtmlTree from Util.WebRequest import WebRequest @@ -82,8 +83,6 @@ def freeProxySecond(proxy_number=100): url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( proxy_number) request = WebRequest() - # html = request.get(url).content - # content为未解码,text为解码后的字符串 html = request.get(url).text for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): yield proxy @@ -179,61 +178,31 @@ def freeProxySeventh(): yield ':'.join(tr.xpath('./td/text()')[0:2]) @staticmethod - def freeProxyEight(): + def freeProxyWallFirst(): + """ + 墙外网站 cn-proxy + :return: + """ urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] request = WebRequest() for url in urls: - r = requests.get(url) + r = request.get(url) proxies = re.findall( - '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.content) + r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @staticmethod - def freeProxyNight(): - urls = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)] - request = WebRequest() - for url in urls: - r = requests.get(url) - proxies = re.findall( - '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.content) - for proxy in proxies: - yield ':'.join(proxy) - - @staticmethod - def freeProxyTenth(): + def freeProxyWallSecond(): urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 for url in urls: - r = requests.get(url) - proxies = re.findall("Proxy\('(.*?)'\)", r.content) + r = request.get(url) + proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) for proxy in proxies: - yield base64.b64decode(proxy) - - @staticmethod - def freeProxyEleventh(): - urls = ['http://www.cz88.net/proxy/%s' % m for m in - ['index.shtml'] + ['http_%s.shtml' % n for n in range(2, 11)]] - request = WebRequest() - for url in urls: - r = requests.get(url) - proxies = re.findall( - '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
(\d+)
', r.content) - for proxy in proxies: - yield ':'.join(proxy) - - @staticmethod - def freeProxy12th(): - urls = ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)] - request = WebRequest() - for url in urls: - r = requests.get(url) - proxies = re.findall( - '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W]*?(\d+)', r.content) - for proxy in proxies: - yield ':'.join(proxy) + yield base64.b64decode(proxy).decode() if __name__ == '__main__': @@ -245,15 +214,24 @@ def freeProxy12th(): # print(e) # # for e in gg.freeProxyThird(): - # print(e) - + # print(e) + # # for e in gg.freeProxyFourth(): # print(e) - + # # for e in gg.freeProxyFifth(): - # print(e) - + # print(e) + # # for e in gg.freeProxySixth(): # print(e) - for e in gg.freeProxySeventh(): + # + # for e in gg.freeProxySeventh(): + # print(e) + + # + # + # for e in gg.freeProxyWallFirst(): + # print(e) + # + for e in gg.freeProxyWallSecond(): print(e) From 24a9f09082f65af20ad3e543474865478687bbaa Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 3 Apr 2018 11:40:25 +0800 Subject: [PATCH 077/399] =?UTF-8?q?[update]=20=E4=BC=98=E5=8C=96=E4=BB=A3?= =?UTF-8?q?=E7=90=86=E9=AA=8C=E8=AF=81=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Manager/ProxyManager.py | 21 ++++++++++++++++----- Util/utilFunction.py | 3 ++- doc/release_notes.md | 16 +++++++++++++--- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index 90ca120c3..6131c089a 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -19,6 +19,7 @@ from DB.DbClient import DbClient from Util.GetConfig import GetConfig from Util.LogHandler import LogHandler +from Util.utilFunction import verifyProxyFormat from ProxyGetter.getFreeProxy import GetFreeProxy @@ -40,14 +41,23 @@ def refresh(self): :return: """ for proxyGetter in self.config.proxy_getter_functions: + # fetch proxy_set = set() - # fetch raw proxy - for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): - if proxy: + try: + self.log.info("{func}: fetch proxy start".format(func=proxyGetter)) + proxy_iter = [_ for _ in getattr(GetFreeProxy, proxyGetter.strip())()] + except Exception as e: + self.log.error("{func}: fetch proxy fail".format(func=proxyGetter)) + continue + for proxy in proxy_iter: + proxy = proxy.strip() + if proxy and verifyProxyFormat(proxy): self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) - proxy_set.add(proxy.strip()) + proxy_set.add(proxy) + else: + self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy)) - # store raw proxy + # store for proxy in proxy_set: self.db.changeTable(self.useful_proxy_queue) if self.db.exists(proxy): @@ -97,6 +107,7 @@ def getNumber(self): total_useful_queue = self.db.getNumber() return {'raw_proxy': total_raw_proxy, 'useful_proxy': total_useful_queue} + if __name__ == '__main__': pp = ProxyManager() pp.refresh() diff --git a/Util/utilFunction.py b/Util/utilFunction.py index 2227bfec4..2422658f9 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -42,7 +42,8 @@ def verifyProxyFormat(proxy): """ import re verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}" - return True if re.findall(verify_regex, proxy) else False + _proxy = re.findall(verify_regex, proxy) + return True if len(_proxy) == 1 and _proxy[0] == proxy else False # noinspection PyPep8Naming diff --git a/doc/release_notes.md b/doc/release_notes.md index c1e4eeefa..81977db94 100644 --- a/doc/release_notes.md +++ b/doc/release_notes.md @@ -1,9 +1,19 @@ ## Release Notes -* newest -  1.使用多线程验证useful_pool +* dev + + 1.优化代理格式检查; + + 2.增加代理源; + +* 1.11(2017.8) + +  1.使用多线程验证useful_pool; + +* 1.10(2016.11) -* 1.10   1. 第一版; +   2. 支持PY2/PY3; +   3. 代理池基本功能; \ No newline at end of file From ae09f0d0cbbfe6d7f734ad18bd66e6bc9443b1b5 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 3 Apr 2018 13:53:19 +0800 Subject: [PATCH 078/399] =?UTF-8?q?[update]=20=E4=BB=A3=E7=90=86=E6=A0=A1?= =?UTF-8?q?=E9=AA=8C=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DB/SsdbClient.py | 2 +- Schedule/ProxyCheck.py | 22 ++++++++-------------- Schedule/ProxyRefreshSchedule.py | 6 +++--- Util/utilFunction.py | 13 +++++++------ 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 08778fdb6..d9a4030f4 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -63,7 +63,7 @@ def put(self, proxy, num=1): :param num: :return: """ - data = self.__conn.hincrby(self.name, proxy, num) + data = self.__conn.hset(self.name, proxy, num) return data def delete(self, key): diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 91db84142..5f0c13d40 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -22,7 +22,7 @@ from Manager.ProxyManager import ProxyManager from Util.LogHandler import LogHandler -FAIL_COUNT = 1 # 校验失败次数, 超过次数删除代理 +FAIL_COUNT = 2 # 校验失败次数, 超过次数删除代理 class ProxyCheck(ProxyManager, Thread): @@ -34,27 +34,21 @@ def __init__(self): def run(self): self.db.changeTable(self.useful_proxy_queue) while True: - proxy_item = self.db.pop() - while proxy_item: - proxy = proxy_item.get('proxy') - counter = proxy_item.get('value', 1) + for proxy, count in self.db.getAll().items(): if validUsefulProxy(proxy): - # 验证通过计数器加1 - if counter and int(counter) < 1: - self.db.put(proxy, num=int(counter) + 1) + # 验证通过计数器减1 + if count and int(count) > 0: + self.db.put(proxy, num=int(count) - 1) else: - self.db.put(proxy) + pass self.log.info('ProxyCheck: {} validation pass'.format(proxy)) else: self.log.info('ProxyCheck: {} validation fail'.format(proxy)) - # 验证失败,计数器减1 - if counter and int(counter) <= FAIL_COUNT: + if count and int(count) > FAIL_COUNT: self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) self.db.delete(proxy) else: - self.db.put(proxy, num=int(counter) - 1) - - proxy_item = self.db.pop() + self.db.put(proxy, num=int(count) + 1) sleep(60 * 5) diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index c358acd8a..7dac2aa34 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -95,9 +95,9 @@ def main(process_num=30): def run(): main() - sched = BlockingScheduler() - sched.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次 - sched.start() + sch = BlockingScheduler() + sch.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次 + sch.start() if __name__ == '__main__': diff --git a/Util/utilFunction.py b/Util/utilFunction.py index 2422658f9..fc26a59b1 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -18,7 +18,7 @@ from Util.LogHandler import LogHandler from Util.WebRequest import WebRequest -logger = LogHandler(__name__, stream=False) +# logger = LogHandler(__name__, stream=False) # noinspection PyPep8Naming @@ -27,8 +27,9 @@ def decorate(*args, **kwargs): try: return func(*args, **kwargs) except Exception as e: - logger.info(u"sorry, 抓取出错。错误原因:") - logger.info(e) + pass + # logger.info(u"sorry, 抓取出错。错误原因:") + # logger.info(e) return decorate @@ -98,10 +99,10 @@ def validUsefulProxy(proxy): proxies = {"http": "http://{proxy}".format(proxy=proxy)} try: # 超过20秒的代理就不要了 - r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=20, verify=False) + r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10, verify=False) if r.status_code == 200: - logger.info('%s is ok' % proxy) + # logger.info('%s is ok' % proxy) return True except Exception as e: - logger.debug(e) + # logger.error(str(e)) return False From 2ea516f19dba441506498ba6fd7de01f42ffd3a7 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 3 Apr 2018 14:09:50 +0800 Subject: [PATCH 079/399] [update] release notes --- doc/release_notes.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/release_notes.md b/doc/release_notes.md index 81977db94..940b4773a 100644 --- a/doc/release_notes.md +++ b/doc/release_notes.md @@ -6,6 +6,8 @@ 2.增加代理源; + 3. fix bug [#122](https://github.com/jhao104/proxy_pool/issues/122) + * 1.11(2017.8)   1.使用多线程验证useful_pool; From ec7f79d15dd25e2619a4ebb5712a2420c15216ce Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 3 Apr 2018 14:14:18 +0800 Subject: [PATCH 080/399] [update] read me --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b62d6ae5c..09c625e66 100644 --- a/README.md +++ b/README.md @@ -178,8 +178,10 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@gladmo](https://github.com/gladmo)| [@scil](https://github.com/scil) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela) +### Release Notes + [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) From f77e9f6a42970cd61e8c4ac43cf893c9892c4526 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 3 Apr 2018 14:17:02 +0800 Subject: [PATCH 081/399] [update] config --- Config.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Config.ini b/Config.ini index 43c10747c..1c5026aec 100644 --- a/Config.ini +++ b/Config.ini @@ -2,7 +2,7 @@ ;Configure the database information ;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB type = SSDB -host = 123.207.35.36 +host = 127.0.0.1 ;port = 6379 port = 8888 name = proxy From 031f4a268e04afd6dac593795d66d0b394d041b9 Mon Sep 17 00:00:00 2001 From: "lei.wu" Date: Tue, 3 Apr 2018 15:13:02 +0800 Subject: [PATCH 082/399] =?UTF-8?q?=E6=9B=B4=E6=94=B9freeProxyFifth?= =?UTF-8?q?=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 79 ++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 7273d0233..b229a2725 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -120,34 +120,59 @@ def freeProxyFourth(): yield ':'.join(proxy.xpath('./td/text()')[0:2]) except Exception as e: pass + # 网站改版 + # @staticmethod + # def freeProxyFifth(): + # """ + # 抓取guobanjia http://www.goubanjia.com/free/gngn/index.shtml + # :return: + # """ + # url = "http://www.goubanjia.com/free/gngn/index{page}.shtml" + # for page in range(1, 10): + # page_url = url.format(page=page) + # tree = getHtmlTree(page_url) + # proxy_list = tree.xpath('//td[@class="ip"]') + # # 此网站有隐藏的数字干扰,或抓取到多余的数字或.符号 + # # 需要过滤掉

的内容 + # xpath_str = """.//*[not(contains(@style, 'display: none')) + # and not(contains(@style, 'display:none')) + # and not(contains(@class, 'port')) + # ]/text() + # """ + # for each_proxy in proxy_list: + # try: + # # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port + # ip_addr = ''.join(each_proxy.xpath(xpath_str)) + # port = each_proxy.xpath( + # ".//span[contains(@class, 'port')]/text()")[0] + # yield '{}:{}'.format(ip_addr, port) + # except Exception as e: + # pass @staticmethod def freeProxyFifth(): """ - 抓取guobanjia http://www.goubanjia.com/free/gngn/index.shtml + 抓取guobanjia http://www.goubanjia.com/ :return: """ - url = "http://www.goubanjia.com/free/gngn/index{page}.shtml" - for page in range(1, 10): - page_url = url.format(page=page) - tree = getHtmlTree(page_url) - proxy_list = tree.xpath('//td[@class="ip"]') - # 此网站有隐藏的数字干扰,或抓取到多余的数字或.符号 - # 需要过滤掉

的内容 - xpath_str = """.//*[not(contains(@style, 'display: none')) - and not(contains(@style, 'display:none')) - and not(contains(@class, 'port')) - ]/text() - """ - for each_proxy in proxy_list: - try: - # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port - ip_addr = ''.join(each_proxy.xpath(xpath_str)) - port = each_proxy.xpath( - ".//span[contains(@class, 'port')]/text()")[0] - yield '{}:{}'.format(ip_addr, port) - except Exception as e: - pass + url = "http://www.goubanjia.com/" + tree = getHtmlTree(url) + proxy_list = tree.xpath('//td[@class="ip"]') + # 此网站有隐藏的数字干扰,或抓取到多余的数字或.符号 + # 需要过滤掉

的内容 + xpath_str = """.//*[not(contains(@style, 'display: none')) + and not(contains(@style, 'display:none')) + and not(contains(@class, 'port')) + ]/text() + """ + for each_proxy in proxy_list: + try: + # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port + ip_addr = ''.join(each_proxy.xpath(xpath_str)) + port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0] + yield '{}:{}'.format(ip_addr, port) + except Exception as e: + pass @staticmethod def freeProxySixth(): @@ -219,19 +244,17 @@ def freeProxyWallSecond(): # for e in gg.freeProxyFourth(): # print(e) # - # for e in gg.freeProxyFifth(): - # print(e) + for e in gg.freeProxyFifth(): + print(e) # # for e in gg.freeProxySixth(): # print(e) # # for e in gg.freeProxySeventh(): # print(e) - - # # # for e in gg.freeProxyWallFirst(): # print(e) # - for e in gg.freeProxyWallSecond(): - print(e) + # for e in gg.freeProxyWallSecond(): + # print(e) From 0c75b40a4e12bdf725f8e5ed8d1daf8cfc7ea8a8 Mon Sep 17 00:00:00 2001 From: "lei.wu" Date: Tue, 3 Apr 2018 15:14:45 +0800 Subject: [PATCH 083/399] =?UTF-8?q?=E6=94=BE=E5=BC=80freeProxyFifth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Config.ini b/Config.ini index 1c5026aec..e735aed17 100644 --- a/Config.ini +++ b/Config.ini @@ -13,7 +13,7 @@ freeProxyFirst = 1 freeProxySecond = 1 ;freeProxyThird = 1 freeProxyFourth = 1 -;freeProxyFifth = 1 +freeProxyFifth = 1 freeProxySixth = 1 freeProxySeventh = 1 @@ -22,6 +22,6 @@ freeProxySeventh = 1 ;freeProxyWallSecond = 1 [HOST] -; API接口配置 http://127.0.0.1:5051 +; API接口配置 http://127.0.0.1:5010 ip = 0.0.0.0 port = 5010 From 2e49d1a94cf516156ac429a57583da0cb57db538 Mon Sep 17 00:00:00 2001 From: "lei.wu" Date: Tue, 3 Apr 2018 15:22:27 +0800 Subject: [PATCH 084/399] =?UTF-8?q?=E6=94=BE=E5=BC=80freeProxyFifth?= =?UTF-8?q?=E6=B3=A8=E9=87=8A=EF=BC=8C=E4=BF=AE=E6=94=B9API=E6=8E=A5?= =?UTF-8?q?=E5=8F=A3=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Config.ini b/Config.ini index e735aed17..f0b23f45f 100644 --- a/Config.ini +++ b/Config.ini @@ -3,8 +3,8 @@ ;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB type = SSDB host = 127.0.0.1 -;port = 6379 -port = 8888 +port = 6379 +;port = 8888 name = proxy [ProxyGetter] From 62cde1d4af7fbb9d1bc101dba9094fce858dedda Mon Sep 17 00:00:00 2001 From: "lei.wu" Date: Tue, 3 Apr 2018 18:55:59 +0800 Subject: [PATCH 085/399] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=88=AC=E5=8F=96?= =?UTF-8?q?=E7=A7=98=E5=AF=86=E4=BB=A3=E7=90=86IP=E7=BD=91=E7=AB=99http:/w?= =?UTF-8?q?ww.mimiip.com?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 81 +++++++++++++++---------------------- 1 file changed, 33 insertions(+), 48 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index b229a2725..be3b9f37f 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -30,18 +30,16 @@ requests.packages.urllib3.disable_warnings() """ -66ip.cn -data5u.com -ip181.com -xicidaili.com -goubanjia.com -xdaili.cn -kuaidaili.com -cn-proxy.com -www.mimiip.com -proxy-list.org -cz88.net -ip181.com + 66ip.cn + data5u.com + ip181.com + xicidaili.com + goubanjia.com + xdaili.cn + kuaidaili.com + cn-proxy.com + proxy-list.org + www.mimiip.com """ @@ -120,34 +118,6 @@ def freeProxyFourth(): yield ':'.join(proxy.xpath('./td/text()')[0:2]) except Exception as e: pass - # 网站改版 - # @staticmethod - # def freeProxyFifth(): - # """ - # 抓取guobanjia http://www.goubanjia.com/free/gngn/index.shtml - # :return: - # """ - # url = "http://www.goubanjia.com/free/gngn/index{page}.shtml" - # for page in range(1, 10): - # page_url = url.format(page=page) - # tree = getHtmlTree(page_url) - # proxy_list = tree.xpath('//td[@class="ip"]') - # # 此网站有隐藏的数字干扰,或抓取到多余的数字或.符号 - # # 需要过滤掉

的内容 - # xpath_str = """.//*[not(contains(@style, 'display: none')) - # and not(contains(@style, 'display:none')) - # and not(contains(@class, 'port')) - # ]/text() - # """ - # for each_proxy in proxy_list: - # try: - # # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port - # ip_addr = ''.join(each_proxy.xpath(xpath_str)) - # port = each_proxy.xpath( - # ".//span[contains(@class, 'port')]/text()")[0] - # yield '{}:{}'.format(ip_addr, port) - # except Exception as e: - # pass @staticmethod def freeProxyFifth(): @@ -202,6 +172,23 @@ def freeProxySeventh(): for tr in proxy_list[1:]: yield ':'.join(tr.xpath('./td/text()')[0:2]) + @staticmethod + def freeProxyEight(): + """ + 秘密代理IP网站http://www.mimiip.com + """ + url_gngao = ['http://www.mimiip.com/gngao/%s'%n for n in range(1, 10)] #国内高匿 + url_gnpu = ['http://www.mimiip.com/gnpu/%s' %n for n in range(1, 10)] #国内普匿 + url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)] #国内透明 + url_list = url_gngao + url_gnpu + url_gntou + + request = WebRequest() + for url in url_list: + r = requests.get(url) + proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text) + for proxy in proxies: + yield ':'.join(proxy) + @staticmethod def freeProxyWallFirst(): """ @@ -212,15 +199,13 @@ def freeProxyWallFirst(): request = WebRequest() for url in urls: r = request.get(url) - proxies = re.findall( - r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) + proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @staticmethod def freeProxyWallSecond(): - urls = ['https://proxy-list.org/english/index.php?p=%s' % - n for n in range(1, 10)] + urls = ['https://proxy-list.org/english/index.php?p=%s'%n for n in range(1, 10)] request = WebRequest() import base64 for url in urls: @@ -244,17 +229,17 @@ def freeProxyWallSecond(): # for e in gg.freeProxyFourth(): # print(e) # - for e in gg.freeProxyFifth(): - print(e) + # for e in gg.freeProxyFifth(): + # print(e) # # for e in gg.freeProxySixth(): # print(e) # # for e in gg.freeProxySeventh(): # print(e) - # + # for e in gg.freeProxyEight(): + # print(e) # for e in gg.freeProxyWallFirst(): # print(e) - # # for e in gg.freeProxyWallSecond(): # print(e) From c86937cb35b5111cb3798c738dfbc4d370b0ba07 Mon Sep 17 00:00:00 2001 From: "lei.wu" Date: Tue, 3 Apr 2018 18:56:49 +0800 Subject: [PATCH 086/399] =?UTF-8?q?=E5=87=BD=E6=95=B0=E6=B3=A8=E5=86=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Config.ini b/Config.ini index f0b23f45f..14e785427 100644 --- a/Config.ini +++ b/Config.ini @@ -16,7 +16,7 @@ freeProxyFourth = 1 freeProxyFifth = 1 freeProxySixth = 1 freeProxySeventh = 1 - +freeProxyEight = 1 ;foreign website, outside the wall ;freeProxyWallFirst = 1 ;freeProxyWallSecond = 1 From 663788cccf3758a5d637d642256ce6a39f87f958 Mon Sep 17 00:00:00 2001 From: highroom <827148@163.com> Date: Thu, 5 Apr 2018 14:32:44 +0800 Subject: [PATCH 087/399] modify proxy valid use queue --- Schedule/ProxyCheck.py | 43 ++++++++++++++++++---------------- Schedule/ProxyValidSchedule.py | 25 ++++++++++++++++---- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 5f0c13d40..78289aeca 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -26,32 +26,35 @@ class ProxyCheck(ProxyManager, Thread): - def __init__(self): + def __init__(self, queue, item_dict): ProxyManager.__init__(self) Thread.__init__(self) self.log = LogHandler('proxy_check') + self.queue = queue + self.item_dict = item_dict def run(self): - self.db.changeTable(self.useful_proxy_queue) - while True: - for proxy, count in self.db.getAll().items(): - if validUsefulProxy(proxy): - # 验证通过计数器减1 - if count and int(count) > 0: - self.db.put(proxy, num=int(count) - 1) - else: - pass - self.log.info('ProxyCheck: {} validation pass'.format(proxy)) + if self.queue.qsize(): + proxy = self.queue.get() + count = self.item_dict[proxy] + if validUsefulProxy(proxy): + # 验证通过计数器减1 + if count and int(count) > 0: + self.db.put(proxy, num=int(count) - 1) else: - self.log.info('ProxyCheck: {} validation fail'.format(proxy)) - if count and int(count) > FAIL_COUNT: - self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) - self.db.delete(proxy) - else: - self.db.put(proxy, num=int(count) + 1) - sleep(60 * 5) + pass + self.log.info('ProxyCheck: {} validation pass'.format(proxy)) + else: + self.log.info('ProxyCheck: {} validation fail'.format(proxy)) + if count and int(count) > FAIL_COUNT: + self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) + self.db.delete(proxy) + else: + self.db.put(proxy, num=int(count) + 1) + self.queue.task_done() if __name__ == '__main__': - p = ProxyCheck() - p.run() + # p = ProxyCheck() + # p.run() + pass diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 8345a83a4..2d8f1f926 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -17,11 +17,15 @@ sys.path.append('../') from Schedule.ProxyCheck import ProxyCheck +from Manager.ProxyManager import ProxyManager +from queue import Queue +import time -class ProxyValidSchedule(object): +class ProxyValidSchedule(ProxyManager, object): def __init__(self): - pass + ProxyManager.__init__(self) + self.queue = Queue() def __validProxy(self, threads=5): """ @@ -31,7 +35,7 @@ def __validProxy(self, threads=5): """ thread_list = list() for index in range(threads): - thread_list.append(ProxyCheck()) + thread_list.append(ProxyCheck(self.queue, self.item_dict)) for thread in thread_list: thread.daemon = True @@ -41,7 +45,20 @@ def __validProxy(self, threads=5): thread.join() def main(self): - self.__validProxy() + self.put_queue() + while True: + if self.queue.qsize(): + self.__validProxy() + else: + print('Time sleep 5 minutes.') + time.sleep(60 * 5) + self.put_queue() + + def put_queue(self): + self.db.changeTable(self.useful_proxy_queue) + self.item_dict = self.db.getAll() + for item in self.item_dict: + self.queue.put(item) def run(): From e46cba10c08cd35f6e1602bfa2e09adc2bf22a05 Mon Sep 17 00:00:00 2001 From: highroom <827148@163.com> Date: Thu, 5 Apr 2018 22:39:23 +0800 Subject: [PATCH 088/399] modify while cycle problem --- Schedule/ProxyCheck.py | 13 +++++++------ Schedule/ProxyValidSchedule.py | 6 +++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 78289aeca..c4d53ee84 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -13,7 +13,7 @@ __author__ = 'J_hao' import sys -from time import sleep +import threading from threading import Thread sys.path.append('../') @@ -34,7 +34,8 @@ def __init__(self, queue, item_dict): self.item_dict = item_dict def run(self): - if self.queue.qsize(): + while self.queue.qsize(): + print('%s active threads, %s queue size' % (threading.active_count(), self.queue.qsize())) proxy = self.queue.get() count = self.item_dict[proxy] if validUsefulProxy(proxy): @@ -43,11 +44,11 @@ def run(self): self.db.put(proxy, num=int(count) - 1) else: pass - self.log.info('ProxyCheck: {} validation pass'.format(proxy)) + print('ProxyCheck: {} validation pass'.format(proxy)) else: - self.log.info('ProxyCheck: {} validation fail'.format(proxy)) + print('ProxyCheck: {} validation fail'.format(proxy)) if count and int(count) > FAIL_COUNT: - self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) + print('ProxyCheck: {} fail too many, delete!'.format(proxy)) self.db.delete(proxy) else: self.db.put(proxy, num=int(count) + 1) @@ -57,4 +58,4 @@ def run(self): if __name__ == '__main__': # p = ProxyCheck() # p.run() - pass + pass \ No newline at end of file diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 2d8f1f926..2258ab63c 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -27,7 +27,7 @@ def __init__(self): ProxyManager.__init__(self) self.queue = Queue() - def __validProxy(self, threads=5): + def __validProxy(self, threads=50): """ 验证useful_proxy代理 :param threads: 线程数 @@ -51,7 +51,7 @@ def main(self): self.__validProxy() else: print('Time sleep 5 minutes.') - time.sleep(60 * 5) + time.sleep(60 * 1) self.put_queue() def put_queue(self): @@ -68,4 +68,4 @@ def run(): if __name__ == '__main__': p = ProxyValidSchedule() - p.main() + p.main() \ No newline at end of file From 5ba4ed3e0409e23f20a21e629ff51f288841fed4 Mon Sep 17 00:00:00 2001 From: highroom <827148@163.com> Date: Thu, 5 Apr 2018 22:44:17 +0800 Subject: [PATCH 089/399] del print debug info --- Schedule/ProxyCheck.py | 1 - Schedule/ProxyValidSchedule.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index c4d53ee84..15f49df6d 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -35,7 +35,6 @@ def __init__(self, queue, item_dict): def run(self): while self.queue.qsize(): - print('%s active threads, %s queue size' % (threading.active_count(), self.queue.qsize())) proxy = self.queue.get() count = self.item_dict[proxy] if validUsefulProxy(proxy): diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 2258ab63c..f5a605aad 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -27,7 +27,7 @@ def __init__(self): ProxyManager.__init__(self) self.queue = Queue() - def __validProxy(self, threads=50): + def __validProxy(self, threads=10): """ 验证useful_proxy代理 :param threads: 线程数 From 0c386902a6bd5a1b0239ec32fe0b8254fc1619b9 Mon Sep 17 00:00:00 2001 From: highroom <827148@qq.com> Date: Thu, 5 Apr 2018 23:46:46 +0800 Subject: [PATCH 090/399] Update ProxyCheck.py add self.db.changeTable(self.useful_proxy_queue) --- Schedule/ProxyCheck.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 15f49df6d..10d62aa98 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -34,6 +34,7 @@ def __init__(self, queue, item_dict): self.item_dict = item_dict def run(self): + self.db.changeTable(self.useful_proxy_queue) while self.queue.qsize(): proxy = self.queue.get() count = self.item_dict[proxy] @@ -57,4 +58,4 @@ def run(self): if __name__ == '__main__': # p = ProxyCheck() # p.run() - pass \ No newline at end of file + pass From b82118f4bdc6be93420132ab91898c8256ca8f51 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Wed, 11 Apr 2018 11:00:30 +0800 Subject: [PATCH 091/399] [update] fetch proxy --- ProxyGetter/getFreeProxy.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index be3b9f37f..ff9ee8197 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -177,14 +177,14 @@ def freeProxyEight(): """ 秘密代理IP网站http://www.mimiip.com """ - url_gngao = ['http://www.mimiip.com/gngao/%s'%n for n in range(1, 10)] #国内高匿 - url_gnpu = ['http://www.mimiip.com/gnpu/%s' %n for n in range(1, 10)] #国内普匿 - url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)] #国内透明 + url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)] # 国内高匿 + url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)] # 国内普匿 + url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)] # 国内透明 url_list = url_gngao + url_gnpu + url_gntou request = WebRequest() for url in url_list: - r = requests.get(url) + r = request.get(url) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -205,7 +205,7 @@ def freeProxyWallFirst(): @staticmethod def freeProxyWallSecond(): - urls = ['https://proxy-list.org/english/index.php?p=%s'%n for n in range(1, 10)] + urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 for url in urls: @@ -237,9 +237,12 @@ def freeProxyWallSecond(): # # for e in gg.freeProxySeventh(): # print(e) + # # for e in gg.freeProxyEight(): # print(e) + # # for e in gg.freeProxyWallFirst(): # print(e) + # # for e in gg.freeProxyWallSecond(): # print(e) From d3c44af27c4876cc2c342be921a74cff7645b587 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Wed, 11 Apr 2018 11:44:36 +0800 Subject: [PATCH 092/399] [update] valid proxy --- Schedule/ProxyCheck.py | 13 ++++++------- Schedule/ProxyValidSchedule.py | 30 ++++++++++++++++++------------ 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 10d62aa98..4300f7bf7 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -13,7 +13,6 @@ __author__ = 'J_hao' import sys -import threading from threading import Thread sys.path.append('../') @@ -22,14 +21,14 @@ from Manager.ProxyManager import ProxyManager from Util.LogHandler import LogHandler -FAIL_COUNT = 2 # 校验失败次数, 超过次数删除代理 +FAIL_COUNT = 1 # 校验失败次数, 超过次数删除代理 class ProxyCheck(ProxyManager, Thread): def __init__(self, queue, item_dict): ProxyManager.__init__(self) Thread.__init__(self) - self.log = LogHandler('proxy_check') + self.log = LogHandler('proxy_check', file=False) # 多线程同时写一个日志文件会有问题 self.queue = queue self.item_dict = item_dict @@ -44,11 +43,11 @@ def run(self): self.db.put(proxy, num=int(count) - 1) else: pass - print('ProxyCheck: {} validation pass'.format(proxy)) + self.log.info('ProxyCheck: {} validation pass'.format(proxy)) else: - print('ProxyCheck: {} validation fail'.format(proxy)) - if count and int(count) > FAIL_COUNT: - print('ProxyCheck: {} fail too many, delete!'.format(proxy)) + self.log.info('ProxyCheck: {} validation fail'.format(proxy)) + if count and int(count) + 1 >= FAIL_COUNT: + self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy)) self.db.delete(proxy) else: self.db.put(proxy, num=int(count) + 1) diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index f5a605aad..9b075cf90 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -13,19 +13,24 @@ __author__ = 'JHao' import sys +import time + +try: + from Queue import Queue # py3 +except: + from queue import Queue # py2 sys.path.append('../') from Schedule.ProxyCheck import ProxyCheck from Manager.ProxyManager import ProxyManager -from queue import Queue -import time class ProxyValidSchedule(ProxyManager, object): def __init__(self): ProxyManager.__init__(self) self.queue = Queue() + self.proxy_item = dict() def __validProxy(self, threads=10): """ @@ -35,7 +40,7 @@ def __validProxy(self, threads=10): """ thread_list = list() for index in range(threads): - thread_list.append(ProxyCheck(self.queue, self.item_dict)) + thread_list.append(ProxyCheck(self.queue, self.proxy_item)) for thread in thread_list: thread.daemon = True @@ -45,19 +50,20 @@ def __validProxy(self, threads=10): thread.join() def main(self): - self.put_queue() + self.putQueue() while True: - if self.queue.qsize(): + if not self.queue.empty(): + self.log.info("Start valid useful proxy") self.__validProxy() else: - print('Time sleep 5 minutes.') - time.sleep(60 * 1) - self.put_queue() + self.log.info('Valid Complete! sleep 5 minutes.') + time.sleep(60 * 5) + self.putQueue() - def put_queue(self): + def putQueue(self): self.db.changeTable(self.useful_proxy_queue) - self.item_dict = self.db.getAll() - for item in self.item_dict: + self.proxy_item = self.db.getAll() + for item in self.proxy_item: self.queue.put(item) @@ -68,4 +74,4 @@ def run(): if __name__ == '__main__': p = ProxyValidSchedule() - p.main() \ No newline at end of file + p.main() From ba302cdf7ca0d71c2c32f7f144d3ed53fc18a4d7 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Wed, 11 Apr 2018 13:53:11 +0800 Subject: [PATCH 093/399] [update] doc --- README.md | 2 +- doc/release_notes.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 09c625e66..32a0ac968 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom) ### Release Notes diff --git a/doc/release_notes.md b/doc/release_notes.md index 940b4773a..d74dbdbf7 100644 --- a/doc/release_notes.md +++ b/doc/release_notes.md @@ -6,7 +6,7 @@ 2.增加代理源; - 3. fix bug [#122](https://github.com/jhao104/proxy_pool/issues/122) + 3.fix bug [#122](https://github.com/jhao104/proxy_pool/issues/122) [#126](https://github.com/jhao104/proxy_pool/issues/126) * 1.11(2017.8) From c44f3e8150d6c8b38e3baa150cad82b5b2bc8112 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Wed, 11 Apr 2018 13:56:45 +0800 Subject: [PATCH 094/399] [update]release notes --- doc/release_notes.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/release_notes.md b/doc/release_notes.md index d74dbdbf7..0871a2db5 100644 --- a/doc/release_notes.md +++ b/doc/release_notes.md @@ -1,6 +1,6 @@ ## Release Notes -* dev +* 1.12 (2018.4) 1.优化代理格式检查; @@ -8,11 +8,11 @@ 3.fix bug [#122](https://github.com/jhao104/proxy_pool/issues/122) [#126](https://github.com/jhao104/proxy_pool/issues/126) -* 1.11(2017.8) +* 1.11 (2017.8)   1.使用多线程验证useful_pool; -* 1.10(2016.11) +* 1.10 (2016.11)   1. 第一版; From ac8842504f488d6139e10370ecbbf3806be399e8 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Mon, 16 Apr 2018 15:56:44 +0800 Subject: [PATCH 095/399] [update]jsonify --- Api/ProxyApi.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 45db4843a..724dc35e6 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -14,16 +14,29 @@ __author__ = 'JHao' import sys +from werkzeug.wrappers import Response +from flask import Flask, jsonify, request sys.path.append('../') -from flask import Flask, jsonify, request from Util.GetConfig import GetConfig - from Manager.ProxyManager import ProxyManager app = Flask(__name__) + +class JsonResponse(Response): + + @classmethod + def force_type(cls, response, environ=None): + if isinstance(response, (dict, list)): + response = jsonify(response) + + return super(JsonResponse, cls).force_type(response, environ) + + +app.response_class = JsonResponse + api_list = { 'get': u'get an usable proxy', # 'refresh': u'refresh proxy pool', @@ -35,7 +48,7 @@ @app.route('/') def index(): - return jsonify(api_list) + return api_list @app.route('/get/') @@ -55,7 +68,7 @@ def refresh(): @app.route('/get_all/') def getAll(): proxies = ProxyManager().getAll() - return jsonify(proxies) + return proxies @app.route('/delete/', methods=['GET']) @@ -68,7 +81,7 @@ def delete(): @app.route('/get_status/') def getStatus(): status = ProxyManager().getNumber() - return jsonify(status) + return status def run(): From 34ef9cedb989feeb3ba1019596706c897c95a965 Mon Sep 17 00:00:00 2001 From: luocaodan Date: Fri, 27 Apr 2018 10:21:51 +0800 Subject: [PATCH 096/399] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B8=A4=E4=B8=AA?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E7=BD=91=E7=AB=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 5 +- ProxyGetter/getFreeProxy.py | 235 +++++++++++++++++++++++++++--------- Util/WebRequest.py | 20 ++- requirements.txt | 10 +- 4 files changed, 203 insertions(+), 67 deletions(-) diff --git a/Config.ini b/Config.ini index 14e785427..763007e23 100644 --- a/Config.ini +++ b/Config.ini @@ -17,9 +17,10 @@ freeProxyFifth = 1 freeProxySixth = 1 freeProxySeventh = 1 freeProxyEight = 1 +freeProxyNinth = 1 ;foreign website, outside the wall -;freeProxyWallFirst = 1 -;freeProxyWallSecond = 1 +freeProxyWallFirst = 1 +freeProxyWallSecond = 1 [HOST] ; API接口配置 http://127.0.0.1:5010 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index ff9ee8197..4cd08ced5 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -21,7 +21,7 @@ reload(sys) sys.setdefaultencoding('utf-8') -sys.path.append('../') +sys.path.append('..') from Util.utilFunction import robustCrawl, getHtmlTree from Util.WebRequest import WebRequest @@ -32,14 +32,13 @@ """ 66ip.cn data5u.com - ip181.com xicidaili.com goubanjia.com xdaili.cn kuaidaili.com cn-proxy.com proxy-list.org - www.mimiip.com + www.mimiip.com to do """ @@ -55,13 +54,15 @@ def __init__(self): def freeProxyFirst(page=10): """ 抓取无忧代理 http://www.data5u.com/ + 几乎没有能用的 :param page: 页数 :return: """ - url_list = ['http://www.data5u.com/', - 'http://www.data5u.com/free/', - 'http://www.data5u.com/free/gngn/index.shtml', - 'http://www.data5u.com/free/gnpt/index.shtml'] + url_list = [ + 'http://www.data5u.com/', + 'http://www.data5u.com/free/gngn/index.shtml', + 'http://www.data5u.com/free/gnpt/index.shtml' + ] for url in url_list: html_tree = getHtmlTree(url) ul_list = html_tree.xpath('//ul[@class="l2"]') @@ -69,10 +70,10 @@ def freeProxyFirst(page=10): try: yield ':'.join(ul.xpath('.//li/text()')[0:2]) except Exception as e: - pass + print(e) @staticmethod - def freeProxySecond(proxy_number=100): + def deprecatedFreeProxySecond(proxy_number=100): """ 抓取代理66 http://www.66ip.cn/ :param proxy_number: 代理数量 @@ -85,6 +86,29 @@ def freeProxySecond(proxy_number=100): for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): yield proxy + @staticmethod + def freeProxySecond(area=33): + """ + 修改抓取代理66 http://www.66ip.cn/ + :param page:抓取代理页数,page=1北京代理页,page=2上海代理页...... + :return: + """ + if area > 33: + page = 33 + for area_index in range(1, area + 1): + page_count = 5 + for i in range(1, page_count + 1): + url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i) + html_tree = getHtmlTree(url) + tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]") + if len(tr_list) == 0: + continue + for tr in tr_list: + yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0] + break + + ''' + 不能用了 @staticmethod def freeProxyThird(days=1): """ @@ -100,24 +124,28 @@ def freeProxyThird(days=1): yield ':'.join(tr.xpath('./td/text()')[0:2]) except Exception as e: pass + ''' @staticmethod - def freeProxyFourth(): + def freeProxyFourth(page_count=2): """ 抓取西刺代理 http://api.xicidaili.com/free2016.txt :return: """ - url_list = ['http://www.xicidaili.com/nn', # 高匿 - 'http://www.xicidaili.com/nt', # 透明 - ] + url_list = [ + 'http://www.xicidaili.com/nn/', # 高匿 + 'http://www.xicidaili.com/nt/', # 透明 + ] for each_url in url_list: - tree = getHtmlTree(each_url) - proxy_list = tree.xpath('.//table[@id="ip_list"]//tr') - for proxy in proxy_list: - try: - yield ':'.join(proxy.xpath('./td/text()')[0:2]) - except Exception as e: - pass + for i in range(1, page_count + 1): + page_url = each_url + str(i) + tree = getHtmlTree(page_url) + proxy_list = tree.xpath('.//table[@id="ip_list"]//tr[position()>1]') + for proxy in proxy_list: + try: + yield ':'.join(proxy.xpath('./td/text()')[0:2]) + except Exception as e: + pass @staticmethod def freeProxyFifth(): @@ -164,13 +192,17 @@ def freeProxySeventh(): """ 快代理免费https://www.kuaidaili.com/free/inha/1/ """ - url = 'https://www.kuaidaili.com/free/inha/{page}/' - for page in range(1, 10): - page_url = url.format(page=page) - tree = getHtmlTree(page_url) - proxy_list = tree.xpath('.//table//tr') - for tr in proxy_list[1:]: - yield ':'.join(tr.xpath('./td/text()')[0:2]) + url_list = [ + 'https://www.kuaidaili.com/free/inha/{page}/', + 'https://www.kuaidaili.com/free/intr/{page}/' + ] + for url in url_list: + for page in range(1, 5): + page_url = url.format(page=page) + tree = getHtmlTree(page_url) + proxy_list = tree.xpath('.//table//tr') + for tr in proxy_list[1:]: + yield ':'.join(tr.xpath('./td/text()')[0:2]) @staticmethod def freeProxyEight(): @@ -184,15 +216,33 @@ def freeProxyEight(): request = WebRequest() for url in url_list: - r = request.get(url) + r = request.get(url, use_proxy=True) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) + + @staticmethod + def freeProxyNinth(): + """ + coderBusy + https://proxy.coderbusy.com/ + :return: + """ + urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] + request = WebRequest() + for url in urls: + r = request.get(url) + proxies = re.findall('data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)', r.text) + for proxy in proxies: + yield ':'.join(proxy) + + @staticmethod def freeProxyWallFirst(): """ 墙外网站 cn-proxy + 并没有被墙 :return: """ urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] @@ -205,6 +255,10 @@ def freeProxyWallFirst(): @staticmethod def freeProxyWallSecond(): + ''' + 并没有被墙 + :return: + ''' urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 @@ -215,34 +269,101 @@ def freeProxyWallSecond(): yield base64.b64decode(proxy).decode() + @staticmethod + def freeProxyWallThird(): + urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] + request = WebRequest() + for url in urls: + r = request.get(url) + proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) + for proxy in proxies: + yield ':'.join(proxy) + + +import threading + +lock = threading.Lock() +success = 0 +total = 0 + + +def test_once(proxy): + ip_port = proxy.split(":") + ip = ip_port[0] + port = ip_port[1] + import requests + + req_url = "http://www.baidu.com" + proxies = { + "http": "http://%s:%s" % (ip, port), + "https": "https://%s:%s" % (ip, port) + } + + global total + + try: + response = requests.get(req_url, proxies=proxies, timeout=4) + if response.status_code != 200: + print("unknow error, status code:" + str(response.status_code)) + lock.acquire() + total += 1 + lock.release() + return 0 + print("success") + global success + lock.acquire() + success += 1 + total += 1 + lock.release() + return 1 + except requests.exceptions.Timeout: + print("timeout") + except requests.exceptions.ConnectionError: + print("poxy unusable") + except Exception: + print("request error") + + lock.acquire() + total += 1 + lock.release() + return 0 + + +def test_batch(iterator): + global success + global total + + for proxy in iterator: + t = threading.Thread(target=test_once, args=(proxy,)) + t.start() + t.join() + + print("success:" + str(success) + "\ttotal:" + str(total)) + + if __name__ == '__main__': gg = GetFreeProxy() - # for e in gg.freeProxyFirst(): - # print(e) - # - # for e in gg.freeProxySecond(): - # print(e) - # - # for e in gg.freeProxyThird(): - # print(e) - # - # for e in gg.freeProxyFourth(): - # print(e) - # - # for e in gg.freeProxyFifth(): - # print(e) - # - # for e in gg.freeProxySixth(): - # print(e) - # - # for e in gg.freeProxySeventh(): - # print(e) - # - # for e in gg.freeProxyEight(): - # print(e) - # - # for e in gg.freeProxyWallFirst(): - # print(e) - # - # for e in gg.freeProxyWallSecond(): - # print(e) + + # test_batch(gg.freeProxyFirst()) + + # test_batch(gg.freeProxySecond()) + + # test_batch(gg.freeProxyFourth()) + + # test_batch(gg.freeProxyFifth()) + + # test_batch(gg.freeProxySixth()) + + # test_batch(gg.freeProxySeventh()) + + # to do + test_batch(gg.freeProxyEight()) + # gg.freeProxyEight() + + # test_batch(gg.freeProxyNinth()) + + # test_batch(gg.freeProxyWallFirst()) + + # test_batch(gg.freeProxyWallSecond()) + + # test_batch(gg.freeProxyWallThird()) diff --git a/Util/WebRequest.py b/Util/WebRequest.py index abbdb17be..7f5011724 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -15,6 +15,7 @@ import requests import random import time +from requests.models import Response class WebRequest(object): @@ -51,7 +52,7 @@ def header(self): 'Accept-Language': 'zh-CN,zh;q=0.8'} def get(self, url, header=None, retry_time=5, timeout=30, - retry_flag=list(), retry_interval=5, *args, **kwargs): + retry_flag=list(), retry_interval=5, use_proxy=False, *args, **kwargs): """ get method :param url: target url @@ -60,6 +61,7 @@ def get(self, url, header=None, retry_time=5, timeout=30, :param timeout: network timeout :param retry_flag: if retry_flag in content. do retry :param retry_interval: retry interval(second) + :param use_proxy: 是否使用代理 :param args: :param kwargs: :return: @@ -69,7 +71,16 @@ def get(self, url, header=None, retry_time=5, timeout=30, headers.update(header) while True: try: - html = requests.get(url, headers=headers, timeout=timeout) + if use_proxy: + proxy_url = "http://127.0.0.1:5010/get" + ip_proxy = requests.get(proxy_url).text + proxies = { + "http": "http://" + ip_proxy, + "https": "https://" + ip_proxy + } + html = requests.get(url, headers=headers, timeout=timeout, proxies=proxies) + else: + html = requests.get(url, headers=headers, timeout=timeout) if any(f in html.content for f in retry_flag): raise Exception return html @@ -78,5 +89,8 @@ def get(self, url, header=None, retry_time=5, timeout=30, retry_time -= 1 if retry_time <= 0: # 多次请求失败时,返回百度页面 - return requests.get("https://www.baidu.com/") + resp = Response() + resp.status_code = 200 + return resp time.sleep(retry_interval) + diff --git a/requirements.txt b/requirements.txt index 698cc8197..c0db31b10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ APScheduler==3.2.0 -Flask==0.11.1 -requests==2.11.0 -lxml==3.7.1 +Flask +requests +lxml -pymongo==3.2.2 -redis==2.10.5 +pymongo +redis From de58a7e81fc9f7f33b8437f674c4bbd7245b2391 Mon Sep 17 00:00:00 2001 From: luocaodan Date: Fri, 27 Apr 2018 11:43:07 +0800 Subject: [PATCH 097/399] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B8=80=E4=B8=AA?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E7=BD=91=E7=AB=99=20http://www.ip3366.net/fr?= =?UTF-8?q?ee/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 1 + ProxyGetter/getFreeProxy.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Config.ini b/Config.ini index 763007e23..322aaddd8 100644 --- a/Config.ini +++ b/Config.ini @@ -18,6 +18,7 @@ freeProxySixth = 1 freeProxySeventh = 1 freeProxyEight = 1 freeProxyNinth = 1 +freeProxyTen = 1 ;foreign website, outside the wall freeProxyWallFirst = 1 freeProxyWallSecond = 1 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 4cd08ced5..1330d88fc 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -238,6 +238,17 @@ def freeProxyNinth(): yield ':'.join(proxy) + @staticmethod + def freeProxyTen(): + urls = ['http://www.ip3366.net/free/'] + request = WebRequest() + for url in urls: + r = request.get(url) + proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) + for proxy in proxies: + yield ":".join(proxy) + + @staticmethod def freeProxyWallFirst(): """ @@ -356,12 +367,12 @@ def test_batch(iterator): # test_batch(gg.freeProxySeventh()) - # to do - test_batch(gg.freeProxyEight()) - # gg.freeProxyEight() + # test_batch(gg.freeProxyEight()) # test_batch(gg.freeProxyNinth()) + # test_batch(gg.freeProxyTen()) + # test_batch(gg.freeProxyWallFirst()) # test_batch(gg.freeProxyWallSecond()) From e2433d39fc8f5802fedfbc6d6e8e9fc9e1ff7b15 Mon Sep 17 00:00:00 2001 From: luocaodan Date: Fri, 27 Apr 2018 12:41:15 +0800 Subject: [PATCH 098/399] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B8=80=E4=B8=AA?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E7=BD=91=E7=AB=99=20http://www.iphai.com?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 2 ++ ProxyGetter/getFreeProxy.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/Config.ini b/Config.ini index 322aaddd8..5f417badc 100644 --- a/Config.ini +++ b/Config.ini @@ -19,9 +19,11 @@ freeProxySeventh = 1 freeProxyEight = 1 freeProxyNinth = 1 freeProxyTen = 1 +freeProxyEleven = 1 ;foreign website, outside the wall freeProxyWallFirst = 1 freeProxyWallSecond = 1 +freeProxyWallThird = 1 [HOST] ; API接口配置 http://127.0.0.1:5010 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 1330d88fc..b9f542b01 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -249,6 +249,22 @@ def freeProxyTen(): yield ":".join(proxy) + @staticmethod + def freeProxyEleven(): + urls = [ + 'http://www.iphai.com/free/ng', + 'http://www.iphai.com/free/np', + 'http://www.iphai.com/free/wg', + 'http://www.iphai.com/free/wp' + ] + request = WebRequest() + for url in urls: + r = request.get(url) + proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?', r.text) + for proxy in proxies: + yield ":".join(proxy) + + @staticmethod def freeProxyWallFirst(): """ @@ -373,6 +389,8 @@ def test_batch(iterator): # test_batch(gg.freeProxyTen()) + # test_batch(gg.freeProxyEleven()) + # test_batch(gg.freeProxyWallFirst()) # test_batch(gg.freeProxyWallSecond()) From 73afc225356d0577c35b6f2d127bf53f1d4ec3b1 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Fri, 27 Apr 2018 16:08:59 +0800 Subject: [PATCH 099/399] merge luocandan's code --- DB/SsdbClient.py | 4 + ProxyGetter/getFreeProxy.py | 131 ++++++-------------------- Test/.pytest_cache/v/cache/lastfailed | 3 + Test/.pytest_cache/v/cache/nodeids | 3 + Util/WebRequest.py | 21 +---- requirements.txt | 7 +- 6 files changed, 46 insertions(+), 123 deletions(-) create mode 100644 Test/.pytest_cache/v/cache/lastfailed create mode 100644 Test/.pytest_cache/v/cache/nodeids diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index d9a4030f4..2522e0071 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -110,3 +110,7 @@ def getNumber(self): def changeTable(self, name): self.name = name + +if __name__ == '__main__': + c = SsdbClient('useful_proxy', '118.24.52.95', 8899) + print(c.getAll()) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index b9f542b01..78837d50a 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -53,7 +53,7 @@ def __init__(self): @staticmethod def freeProxyFirst(page=10): """ - 抓取无忧代理 http://www.data5u.com/ + 无忧代理 http://www.data5u.com/ 几乎没有能用的 :param page: 页数 :return: @@ -73,31 +73,16 @@ def freeProxyFirst(page=10): print(e) @staticmethod - def deprecatedFreeProxySecond(proxy_number=100): + def freeProxySecond(area=33, page=1): """ - 抓取代理66 http://www.66ip.cn/ - :param proxy_number: 代理数量 + 代理66 http://www.66ip.cn/ + :param area: 抓取代理页数,page=1北京代理页,page=2上海代理页...... + :param page: 翻页 :return: """ - url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( - proxy_number) - request = WebRequest() - html = request.get(url).text - for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): - yield proxy - - @staticmethod - def freeProxySecond(area=33): - """ - 修改抓取代理66 http://www.66ip.cn/ - :param page:抓取代理页数,page=1北京代理页,page=2上海代理页...... - :return: - """ - if area > 33: - page = 33 + area = 33 if area > 33 else area for area_index in range(1, area + 1): - page_count = 5 - for i in range(1, page_count + 1): + for i in range(1, page + 1): url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i) html_tree = getHtmlTree(url) tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]") @@ -107,12 +92,10 @@ def freeProxySecond(area=33): yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0] break - ''' - 不能用了 @staticmethod def freeProxyThird(days=1): """ - 抓取ip181 http://www.ip181.com/ + ip181 http://www.ip181.com/ 不能用了 :param days: :return: """ @@ -124,12 +107,11 @@ def freeProxyThird(days=1): yield ':'.join(tr.xpath('./td/text()')[0:2]) except Exception as e: pass - ''' @staticmethod def freeProxyFourth(page_count=2): """ - 抓取西刺代理 http://api.xicidaili.com/free2016.txt + 西刺代理 http://www.xicidaili.com :return: """ url_list = [ @@ -150,7 +132,7 @@ def freeProxyFourth(page_count=2): @staticmethod def freeProxyFifth(): """ - 抓取guobanjia http://www.goubanjia.com/ + guobanjia http://www.goubanjia.com/ :return: """ url = "http://www.goubanjia.com/" @@ -175,7 +157,7 @@ def freeProxyFifth(): @staticmethod def freeProxySixth(): """ - 抓取讯代理免费proxy http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10 + 讯代理 http://www.xdaili.cn/ :return: """ url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' @@ -190,7 +172,7 @@ def freeProxySixth(): @staticmethod def freeProxySeventh(): """ - 快代理免费https://www.kuaidaili.com/free/inha/1/ + 快代理 https://www.kuaidaili.com """ url_list = [ 'https://www.kuaidaili.com/free/inha/{page}/', @@ -207,7 +189,7 @@ def freeProxySeventh(): @staticmethod def freeProxyEight(): """ - 秘密代理IP网站http://www.mimiip.com + 秘密代理 http://www.mimiip.com """ url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)] # 国内高匿 url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)] # 国内普匿 @@ -221,12 +203,10 @@ def freeProxyEight(): for proxy in proxies: yield ':'.join(proxy) - @staticmethod def freeProxyNinth(): """ - coderBusy - https://proxy.coderbusy.com/ + 码农代理 https://proxy.coderbusy.com/ :return: """ urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] @@ -237,9 +217,12 @@ def freeProxyNinth(): for proxy in proxies: yield ':'.join(proxy) - @staticmethod def freeProxyTen(): + """ + 云代理 http://www.ip3366.net/free/ + :return: + """ urls = ['http://www.ip3366.net/free/'] request = WebRequest() for url in urls: @@ -248,9 +231,12 @@ def freeProxyTen(): for proxy in proxies: yield ":".join(proxy) - @staticmethod def freeProxyEleven(): + """ + IP海 http://www.iphai.com/free/ng + :return: + """ urls = [ 'http://www.iphai.com/free/ng', 'http://www.iphai.com/free/np', @@ -260,16 +246,15 @@ def freeProxyEleven(): request = WebRequest() for url in urls: r = request.get(url) - proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?', r.text) + proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?', + r.text) for proxy in proxies: yield ":".join(proxy) - @staticmethod def freeProxyWallFirst(): """ 墙外网站 cn-proxy - 并没有被墙 :return: """ urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] @@ -282,10 +267,10 @@ def freeProxyWallFirst(): @staticmethod def freeProxyWallSecond(): - ''' - 并没有被墙 + """ + https://proxy-list.org/english/index.php :return: - ''' + """ urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 @@ -295,7 +280,6 @@ def freeProxyWallSecond(): for proxy in proxies: yield base64.b64decode(proxy).decode() - @staticmethod def freeProxyWallThird(): urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] @@ -307,67 +291,6 @@ def freeProxyWallThird(): yield ':'.join(proxy) -import threading - -lock = threading.Lock() -success = 0 -total = 0 - - -def test_once(proxy): - ip_port = proxy.split(":") - ip = ip_port[0] - port = ip_port[1] - import requests - - req_url = "http://www.baidu.com" - proxies = { - "http": "http://%s:%s" % (ip, port), - "https": "https://%s:%s" % (ip, port) - } - - global total - - try: - response = requests.get(req_url, proxies=proxies, timeout=4) - if response.status_code != 200: - print("unknow error, status code:" + str(response.status_code)) - lock.acquire() - total += 1 - lock.release() - return 0 - print("success") - global success - lock.acquire() - success += 1 - total += 1 - lock.release() - return 1 - except requests.exceptions.Timeout: - print("timeout") - except requests.exceptions.ConnectionError: - print("poxy unusable") - except Exception: - print("request error") - - lock.acquire() - total += 1 - lock.release() - return 0 - - -def test_batch(iterator): - global success - global total - - for proxy in iterator: - t = threading.Thread(target=test_once, args=(proxy,)) - t.start() - t.join() - - print("success:" + str(success) + "\ttotal:" + str(total)) - - if __name__ == '__main__': gg = GetFreeProxy() diff --git a/Test/.pytest_cache/v/cache/lastfailed b/Test/.pytest_cache/v/cache/lastfailed new file mode 100644 index 000000000..65c9a06d6 --- /dev/null +++ b/Test/.pytest_cache/v/cache/lastfailed @@ -0,0 +1,3 @@ +{ + "testGetFreeProxy.py::testGetFreeProxy": true +} \ No newline at end of file diff --git a/Test/.pytest_cache/v/cache/nodeids b/Test/.pytest_cache/v/cache/nodeids new file mode 100644 index 000000000..0ce3684ce --- /dev/null +++ b/Test/.pytest_cache/v/cache/nodeids @@ -0,0 +1,3 @@ +[ + "testGetFreeProxy.py::testGetFreeProxy" +] \ No newline at end of file diff --git a/Util/WebRequest.py b/Util/WebRequest.py index 7f5011724..68db87500 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -12,10 +12,10 @@ """ __author__ = 'J_hao' +from requests.models import Response import requests import random import time -from requests.models import Response class WebRequest(object): @@ -37,7 +37,7 @@ def user_agent(self): 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', - ] + ] return random.choice(ua_list) @property @@ -52,7 +52,7 @@ def header(self): 'Accept-Language': 'zh-CN,zh;q=0.8'} def get(self, url, header=None, retry_time=5, timeout=30, - retry_flag=list(), retry_interval=5, use_proxy=False, *args, **kwargs): + retry_flag=list(), retry_interval=5, *args, **kwargs): """ get method :param url: target url @@ -61,7 +61,6 @@ def get(self, url, header=None, retry_time=5, timeout=30, :param timeout: network timeout :param retry_flag: if retry_flag in content. do retry :param retry_interval: retry interval(second) - :param use_proxy: 是否使用代理 :param args: :param kwargs: :return: @@ -71,16 +70,7 @@ def get(self, url, header=None, retry_time=5, timeout=30, headers.update(header) while True: try: - if use_proxy: - proxy_url = "http://127.0.0.1:5010/get" - ip_proxy = requests.get(proxy_url).text - proxies = { - "http": "http://" + ip_proxy, - "https": "https://" + ip_proxy - } - html = requests.get(url, headers=headers, timeout=timeout, proxies=proxies) - else: - html = requests.get(url, headers=headers, timeout=timeout) + html = requests.get(url, headers=headers, timeout=timeout) if any(f in html.content for f in retry_flag): raise Exception return html @@ -88,9 +78,8 @@ def get(self, url, header=None, retry_time=5, timeout=30, print(e) retry_time -= 1 if retry_time <= 0: - # 多次请求失败时,返回百度页面 + # 多次请求失败 resp = Response() resp.status_code = 200 return resp time.sleep(retry_interval) - diff --git a/requirements.txt b/requirements.txt index c0db31b10..5d00da69a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ APScheduler==3.2.0 -Flask -requests -lxml +werkzeug==0.11.15 +Flask==0.12 +requests==2.12.4 +lxml==3.7.2 pymongo redis From b9f3e148c64d9a50b7c23768c1007b4d993eead8 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Fri, 27 Apr 2018 16:14:40 +0800 Subject: [PATCH 100/399] [update] add contributor --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 32a0ac968..8c59cd631 100644 --- a/README.md +++ b/README.md @@ -183,5 +183,5 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致 ### Release Notes - [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) + [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) [@luocaodan](https://github.com/luocaodan) From f9a8bf54054f0e7435c7e9718da9033988e0958c Mon Sep 17 00:00:00 2001 From: highroom <827148@163.com> Date: Mon, 14 May 2018 23:39:30 +0800 Subject: [PATCH 101/399] =?UTF-8?q?=E5=A2=9E=E5=8A=A0fq=E4=BB=A3=E7=90=86?= =?UTF-8?q?=E7=9A=84=E9=85=8D=E7=BD=AE=EF=BC=8C=E9=85=8D=E7=BD=AE=E5=90=8E?= =?UTF-8?q?=E8=AF=B7=E8=B0=83=E7=94=A8=E4=BB=A3=E7=90=86=E8=AE=BF=E9=97=AE?= =?UTF-8?q?wallproxy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 4 ++++ ProxyGetter/getFreeProxy.py | 44 ++++++++++++++++++++++++++++++++++--- Util/WebRequest.py | 2 +- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/Config.ini b/Config.ini index 5f417badc..dae17c1a2 100644 --- a/Config.ini +++ b/Config.ini @@ -29,3 +29,7 @@ freeProxyWallThird = 1 ; API接口配置 http://127.0.0.1:5010 ip = 0.0.0.0 port = 5010 + +[WallProxy] +; fq代理配置 +; proxy = 127.0.0.1:1080 \ No newline at end of file diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 78837d50a..edb71fa8b 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -14,6 +14,12 @@ import re import sys import requests +import os + +try: + from configparser import ConfigParser # py3 +except: + from ConfigParser import ConfigParser # py2 try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 @@ -46,6 +52,15 @@ class GetFreeProxy(object): """ proxy getter """ + pwd = os.path.split(os.path.realpath(__file__))[0] + config_path = os.path.join(os.path.split(pwd)[0], 'Config.ini') + config_file = ConfigParser() + config_file.read(config_path) + if config_file.has_option('WallProxy', 'proxy'): + WallProxy = config_file.get('WallProxy', 'proxy') + wall_proxies = {"http": "http://{}".format(WallProxy), "https": "https://{}".format(WallProxy)} + else: + wall_proxies = None def __init__(self): pass @@ -257,10 +272,17 @@ def freeProxyWallFirst(): 墙外网站 cn-proxy :return: """ + kwargs = {} + if GetFreeProxy.wall_proxies: + kwargs['proxies'] = GetFreeProxy.wall_proxies + else: + return + urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] request = WebRequest() for url in urls: - r = request.get(url) + kwargs['url'] = url + r = request.get(**kwargs) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -271,21 +293,35 @@ def freeProxyWallSecond(): https://proxy-list.org/english/index.php :return: """ + kwargs = {} + if GetFreeProxy.wall_proxies: + kwargs['proxies'] = GetFreeProxy.wall_proxies + else: + return urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 for url in urls: - r = request.get(url) + kwargs['url'] = url + r = request.get(**kwargs) proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) for proxy in proxies: yield base64.b64decode(proxy).decode() @staticmethod def freeProxyWallThird(): + + kwargs = {} + if GetFreeProxy.wall_proxies: + kwargs['proxies'] = GetFreeProxy.wall_proxies + else: + return + urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] request = WebRequest() for url in urls: - r = request.get(url) + kwargs['url'] = url + r = request.get(**kwargs) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -319,3 +355,5 @@ def freeProxyWallThird(): # test_batch(gg.freeProxyWallSecond()) # test_batch(gg.freeProxyWallThird()) + for e in gg.freeProxyWallThird(): + print(e) diff --git a/Util/WebRequest.py b/Util/WebRequest.py index 68db87500..47286a225 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -70,7 +70,7 @@ def get(self, url, header=None, retry_time=5, timeout=30, headers.update(header) while True: try: - html = requests.get(url, headers=headers, timeout=timeout) + html = requests.get(url, headers=headers, timeout=timeout, **kwargs) if any(f in html.content for f in retry_flag): raise Exception return html From 876b1ec4de23924f8e940772a8690d2373c1a402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8D=A3=E9=A3=9E=20=E5=BE=90?= Date: Wed, 23 May 2018 15:23:41 +0800 Subject: [PATCH 102/399] add http://ip.jiangxianli.com/ --- Config.ini | 1 + ProxyGetter/getFreeProxy.py | 37 +++++++++++++++++++++++++++++++++++++ Test/testGetFreeProxy.py | 16 ++++++++++++++-- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/Config.ini b/Config.ini index 5f417badc..ca011a01f 100644 --- a/Config.ini +++ b/Config.ini @@ -20,6 +20,7 @@ freeProxyEight = 1 freeProxyNinth = 1 freeProxyTen = 1 freeProxyEleven = 1 +freeProxyTwelve = 1 ;foreign website, outside the wall freeProxyWallFirst = 1 freeProxyWallSecond = 1 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 78837d50a..54d09c1b0 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -15,6 +15,7 @@ import sys import requests + try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 except: @@ -25,6 +26,7 @@ from Util.utilFunction import robustCrawl, getHtmlTree from Util.WebRequest import WebRequest +from Util.utilFunction import verifyProxyFormat # for debug to disable insecureWarning requests.packages.urllib3.disable_warnings() @@ -251,6 +253,24 @@ def freeProxyEleven(): for proxy in proxies: yield ":".join(proxy) + @staticmethod + def freeProxyTwelve(page_count=8): + """ + guobanjia http://ip.jiangxianli.com/?page= + 免费代理库 + 超多量 + :return: + """ + for i in range(1, page_count + 1): + url = 'http://ip.jiangxianli.com/?page={}'.format(i) + # print(url) + html_tree = getHtmlTree(url) + tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr") + if len(tr_list) == 0: + continue + for tr in tr_list: + yield tr.xpath("./td[2]/text()")[0] + ":" + tr.xpath("./td[3]/text()")[0] + @staticmethod def freeProxyWallFirst(): """ @@ -314,6 +334,23 @@ def freeProxyWallThird(): # test_batch(gg.freeProxyEleven()) + proxy_iter = gg.freeProxyTwelve() + proxy_set = set() + for proxy in proxy_iter: + proxy = proxy.strip() + if proxy and verifyProxyFormat(proxy): + #self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) + proxy_set.add(proxy) + #else: + #self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy)) + + # store + for proxy in proxy_set: + print(proxy) + + + # test_batch(gg.freeProxyTwelve()) + # test_batch(gg.freeProxyWallFirst()) # test_batch(gg.freeProxyWallSecond()) diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py index df99c79f3..33c3f9e46 100644 --- a/Test/testGetFreeProxy.py +++ b/Test/testGetFreeProxy.py @@ -12,6 +12,18 @@ """ __author__ = 'J_hao' +import re +import sys +import requests + + +try: + from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 +except: + reload(sys) + sys.setdefaultencoding('utf-8') + +sys.path.append('..') from ProxyGetter.getFreeProxy import GetFreeProxy from Util.GetConfig import GetConfig @@ -28,9 +40,9 @@ def testGetFreeProxy(): proxy_count = 0 for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): if proxy: - print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) + print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy,proxy_count=proxy_count)) proxy_count += 1 - assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) + #assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) if __name__ == '__main__': From 413e41b2973e41742e55ab7bb7a1d642fe6ada8d Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 10 Jul 2018 16:50:31 +0800 Subject: [PATCH 103/399] =?UTF-8?q?[update]=20=E4=BF=AE=E6=94=B9ProxyGette?= =?UTF-8?q?r=E6=A3=80=E6=9F=A5=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 4 -- ProxyGetter/CheckProxy.py | 72 ++++++++++++++++++++++++++++ ProxyGetter/getFreeProxy.py | 96 ++++--------------------------------- 3 files changed, 81 insertions(+), 91 deletions(-) create mode 100644 ProxyGetter/CheckProxy.py diff --git a/Config.ini b/Config.ini index 95e33400d..ca011a01f 100644 --- a/Config.ini +++ b/Config.ini @@ -30,7 +30,3 @@ freeProxyWallThird = 1 ; API接口配置 http://127.0.0.1:5010 ip = 0.0.0.0 port = 5010 - -[WallProxy] -; fq代理配置 -; proxy = 127.0.0.1:1080 \ No newline at end of file diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py new file mode 100644 index 000000000..f6ba9b66a --- /dev/null +++ b/ProxyGetter/CheckProxy.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: CheckProxy + Description : used for check getFreeProxy.py + Author : JHao + date: 2018/7/10 +------------------------------------------------- + Change Activity: + 2018/7/10: CheckProxy +------------------------------------------------- +""" +__author__ = 'JHao' + +import sys +from getFreeProxy import GetFreeProxy +from Util.utilFunction import verifyProxyFormat + +sys.path.append('../') + +from Util.LogHandler import LogHandler + +log = LogHandler('check_proxy', file=False) + + +class CheckProxy(object): + + @staticmethod + def checkAllGetProxyFunc(): + """ + 检查getFreeProxy所有代理获取函数运行情况 + Returns: + None + """ + import inspect + member_list = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction) + proxy_count_dict = dict() + for func_name, func in member_list: + log.info(u"开始运行 {}".format(func_name)) + try: + proxy_list = [_ for _ in func() if verifyProxyFormat(_)] + proxy_count_dict[func_name] = len(proxy_list) + except Exception as e: + log.info(u"代理获取函数 {} 运行出错!".format(func_name)) + log.error(str(e)) + log.info(u"所有函数运行完毕 " + "***" * 5) + for func_name, func in member_list: + log.info(u"函数 {n}, 获取到代理数: {c}".format(n=func_name, c=proxy_count_dict.get(func_name, 0))) + + @staticmethod + def checkGetProxyFunc(func): + """ + 检查指定的getFreeProxy某个function运行情况 + Args: + func: getFreeProxy中某个可调用方法 + + Returns: + None + """ + func_name = getattr(func, '__name__', "None") + log.info("start running func: {}".format(func_name)) + count = 0 + for proxy in func(): + if verifyProxyFormat(proxy): + log.info("fetch proxy: {}".format(proxy)) + count += 1 + log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count)) + + +if __name__ == '__main__': + CheckProxy.checkAllGetProxyFunc() + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 23542a5b7..bf2e03f61 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -14,13 +14,6 @@ import re import sys import requests -import os - -try: - from configparser import ConfigParser # py3 -except: - from ConfigParser import ConfigParser # py2 - try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 @@ -30,8 +23,8 @@ sys.path.append('..') -from Util.utilFunction import robustCrawl, getHtmlTree from Util.WebRequest import WebRequest +from Util.utilFunction import getHtmlTree from Util.utilFunction import verifyProxyFormat # for debug to disable insecureWarning @@ -54,15 +47,6 @@ class GetFreeProxy(object): """ proxy getter """ - pwd = os.path.split(os.path.realpath(__file__))[0] - config_path = os.path.join(os.path.split(pwd)[0], 'Config.ini') - config_file = ConfigParser() - config_file.read(config_path) - if config_file.has_option('WallProxy', 'proxy'): - WallProxy = config_file.get('WallProxy', 'proxy') - wall_proxies = {"http": "http://{}".format(WallProxy), "https": "https://{}".format(WallProxy)} - else: - wall_proxies = None def __init__(self): pass @@ -215,7 +199,7 @@ def freeProxyEight(): request = WebRequest() for url in url_list: - r = request.get(url, use_proxy=True) + r = request.get(url) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -278,7 +262,6 @@ def freeProxyTwelve(page_count=8): """ for i in range(1, page_count + 1): url = 'http://ip.jiangxianli.com/?page={}'.format(i) - # print(url) html_tree = getHtmlTree(url) tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr") if len(tr_list) == 0: @@ -292,17 +275,10 @@ def freeProxyWallFirst(): 墙外网站 cn-proxy :return: """ - kwargs = {} - if GetFreeProxy.wall_proxies: - kwargs['proxies'] = GetFreeProxy.wall_proxies - else: - return - urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] request = WebRequest() for url in urls: - kwargs['url'] = url - r = request.get(**kwargs) + r = request.get(url) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -313,84 +289,30 @@ def freeProxyWallSecond(): https://proxy-list.org/english/index.php :return: """ - kwargs = {} - if GetFreeProxy.wall_proxies: - kwargs['proxies'] = GetFreeProxy.wall_proxies - else: - return urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 for url in urls: - kwargs['url'] = url - r = request.get(**kwargs) + r = request.get(url) proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) for proxy in proxies: yield base64.b64decode(proxy).decode() @staticmethod def freeProxyWallThird(): - - kwargs = {} - if GetFreeProxy.wall_proxies: - kwargs['proxies'] = GetFreeProxy.wall_proxies - else: - return - urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] request = WebRequest() for url in urls: - kwargs['url'] = url - r = request.get(**kwargs) + r = request.get(url) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) if __name__ == '__main__': - gg = GetFreeProxy() - - # test_batch(gg.freeProxyFirst()) - - # test_batch(gg.freeProxySecond()) - - # test_batch(gg.freeProxyFourth()) - - # test_batch(gg.freeProxyFifth()) - - # test_batch(gg.freeProxySixth()) - - # test_batch(gg.freeProxySeventh()) - - # test_batch(gg.freeProxyEight()) - - # test_batch(gg.freeProxyNinth()) - - # test_batch(gg.freeProxyTen()) - - # test_batch(gg.freeProxyEleven()) - - proxy_iter = gg.freeProxyTwelve() - proxy_set = set() - for proxy in proxy_iter: - proxy = proxy.strip() - if proxy and verifyProxyFormat(proxy): - #self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) - proxy_set.add(proxy) - #else: - #self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy)) - - # store - for proxy in proxy_set: - print(proxy) - - - # test_batch(gg.freeProxyTwelve()) - - # test_batch(gg.freeProxyWallFirst()) + from CheckProxy import CheckProxy - # test_batch(gg.freeProxyWallSecond()) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) - # test_batch(gg.freeProxyWallThird()) - for e in gg.freeProxyWallThird(): - print(e) + CheckProxy.checkAllGetProxyFunc() From edac60ce8ea6340834e1e4afa53d37f3e1a783a8 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 10 Jul 2018 16:54:13 +0800 Subject: [PATCH 104/399] [update] readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8c59cd631..8dc30eee9 100644 --- a/README.md +++ b/README.md @@ -178,10 +178,10 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan) ### Release Notes - [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) [@luocaodan](https://github.com/luocaodan) + [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) From 62b05856fbed3f104842010defe0f46fd5e5c242 Mon Sep 17 00:00:00 2001 From: YeClimEric Date: Wed, 10 Oct 2018 17:50:34 +0800 Subject: [PATCH 105/399] =?UTF-8?q?1.flask=E6=94=AF=E6=8C=81=E5=A4=9A?= =?UTF-8?q?=E8=BF=9B=E7=A8=8B=E5=A4=84=E7=90=86=E4=BB=BB=E5=8A=A1=202.?= =?UTF-8?q?=E4=BC=98=E5=8C=96=20proxy=20=E9=87=87=E9=9B=86=E3=80=81?= =?UTF-8?q?=E6=A0=A1=E9=AA=8C=E6=B5=81=E7=A8=8B=EF=BC=8C=E5=8A=A0=E5=BF=AB?= =?UTF-8?q?=20userfull=20proxy=20=E6=A0=A1=E9=AA=8C=E9=80=9F=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Api/ProxyApi.py | 9 ++++----- Config.ini | 8 +++++--- Manager/ProxyManager.py | 32 ++++++++++++-------------------- Schedule/ProxyRefreshSchedule.py | 23 +++++++++++++---------- Util/GetConfig.py | 11 ++++++++--- 5 files changed, 42 insertions(+), 41 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 724dc35e6..2e3733013 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -2,13 +2,13 @@ # !/usr/bin/env python """ ------------------------------------------------- - File Name: ProxyApi.py - Description : + File Name: ProxyApi.py + Description : Author : JHao date: 2016/12/4 ------------------------------------------------- Change Activity: - 2016/12/4: + 2016/12/4: ------------------------------------------------- """ __author__ = 'JHao' @@ -26,7 +26,6 @@ class JsonResponse(Response): - @classmethod def force_type(cls, response, environ=None): if isinstance(response, (dict, list)): @@ -86,7 +85,7 @@ def getStatus(): def run(): config = GetConfig() - app.run(host=config.host_ip, port=config.host_port) + app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes) if __name__ == '__main__': diff --git a/Config.ini b/Config.ini index ca011a01f..d1ab07bb4 100644 --- a/Config.ini +++ b/Config.ini @@ -9,11 +9,11 @@ name = proxy [ProxyGetter] ;register the proxy getter function -freeProxyFirst = 1 +freeProxyFirst = 1 freeProxySecond = 1 ;freeProxyThird = 1 freeProxyFourth = 1 -freeProxyFifth = 1 +freeProxyFifth = 1 freeProxySixth = 1 freeProxySeventh = 1 freeProxyEight = 1 @@ -26,7 +26,9 @@ freeProxyWallFirst = 1 freeProxyWallSecond = 1 freeProxyWallThird = 1 -[HOST] +[API] ; API接口配置 http://127.0.0.1:5010 ip = 0.0.0.0 port = 5010 +; flask多进程处理请求 +processes = 10 diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index 6131c089a..33aa76b39 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -2,13 +2,13 @@ # !/usr/bin/env python """ ------------------------------------------------- - File Name: ProxyManager.py - Description : + File Name: ProxyManager.py + Description : Author : JHao date: 2016/12/3 ------------------------------------------------- Change Activity: - 2016/12/3: + 2016/12/3: ------------------------------------------------- """ __author__ = 'JHao' @@ -40,30 +40,22 @@ def refresh(self): fetch proxy into Db by ProxyGetter :return: """ + self.db.changeTable(self.raw_proxy_queue) for proxyGetter in self.config.proxy_getter_functions: # fetch - proxy_set = set() try: self.log.info("{func}: fetch proxy start".format(func=proxyGetter)) - proxy_iter = [_ for _ in getattr(GetFreeProxy, proxyGetter.strip())()] + for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): + # 挨个存储 proxy,优化raw 队列的 push 速度,进而加快 check proxy 的速度 + proxy = proxy.strip() + if proxy and verifyProxyFormat(proxy): + self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) + self.db.put(proxy) + else: + self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy)) except Exception as e: self.log.error("{func}: fetch proxy fail".format(func=proxyGetter)) continue - for proxy in proxy_iter: - proxy = proxy.strip() - if proxy and verifyProxyFormat(proxy): - self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) - proxy_set.add(proxy) - else: - self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy)) - - # store - for proxy in proxy_set: - self.db.changeTable(self.useful_proxy_queue) - if self.db.exists(proxy): - continue - self.db.changeTable(self.raw_proxy_queue) - self.db.put(proxy) def get(self): """ diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index 7dac2aa34..6088fcb0a 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -18,7 +18,8 @@ import time import logging from threading import Thread -from apscheduler.schedulers.blocking import BlockingScheduler +# 使用后台调度,不使用阻塞式~ +from apscheduler.schedulers.background import BackgroundScheduler as Sch sys.path.append('../') @@ -73,12 +74,7 @@ def refreshPool(): pp.validProxy() -def main(process_num=30): - p = ProxyRefreshSchedule() - - # 获取新代理 - p.refresh() - +def batch_refresh(process_num=30): # 检验新代理 pl = [] for num in range(process_num): @@ -93,11 +89,18 @@ def main(process_num=30): pl[num].join() +def fetch_all(): + p = ProxyRefreshSchedule() + # 获取新代理 + p.refresh() + + def run(): - main() - sch = BlockingScheduler() - sch.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次 + sch = Sch() + sch.add_job(fetch_all, 'interval', minutes=5) # 每5分钟抓取一次 + sch.add_job(batch_refresh, "interval", minutes=1) # 每分钟检查一次 sch.start() + fetch_all() if __name__ == '__main__': diff --git a/Util/GetConfig.py b/Util/GetConfig.py index 24b003f28..8ea57be56 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -2,7 +2,7 @@ # !/usr/bin/env python """ ------------------------------------------------- - File Name: GetConfig.py + File Name: GetConfig.py Description : fetch config from config.ini Author : JHao date: 2016/12/3 @@ -51,11 +51,15 @@ def proxy_getter_functions(self): @LazyProperty def host_ip(self): - return self.config_file.get('HOST','ip') + return self.config_file.get('API','ip') @LazyProperty def host_port(self): - return int(self.config_file.get('HOST', 'port')) + return int(self.config_file.get('API', 'port')) + + @LazyProperty + def processes(self): + return int(self.config_file.get('API', 'processes')) if __name__ == '__main__': gg = GetConfig() @@ -66,3 +70,4 @@ def host_port(self): print(gg.proxy_getter_functions) print(gg.host_ip) print(gg.host_port) + print(gg.processes) From a0b152a968e073c0c35f8dc03d862f783ba4ee86 Mon Sep 17 00:00:00 2001 From: YeClimEric Date: Wed, 10 Oct 2018 18:15:47 +0800 Subject: [PATCH 106/399] =?UTF-8?q?=E4=BF=AE=E6=94=B9=20dockerfile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7c815a4e7..d97495489 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,28 +3,25 @@ WORKDIR /usr/src/app COPY . . ENV DEBIAN_FRONTEND noninteractive ENV TZ Asia/Shanghai -RUN pip install --no-cache-dir -r requirements.txt && \ - apt-get update && \ - apt-get install -y --force-yes git make gcc g++ autoconf && apt-get clean && \ - git clone --depth 1 https://github.com/ideawu/ssdb.git ssdb && \ - cd ssdb && make && make install && cp ssdb-server /usr/bin && \ - apt-get remove -y --force-yes git make gcc g++ autoconf && \ - apt-get autoremove -y && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - cp ssdb.conf /etc && cd .. && yes | rm -r ssdb && \ - mkdir -p /var/lib/ssdb && \ - sed \ - -e 's@home.*@home /var/lib@' \ - -e 's/loglevel.*/loglevel info/' \ - -e 's@work_dir = .*@work_dir = /var/lib/ssdb@' \ - -e 's@pidfile = .*@pidfile = /run/ssdb.pid@' \ - -e 's@level:.*@level: info@' \ - -e 's@ip:.*@ip: 0.0.0.0@' \ - -i /etc/ssdb.conf && \ - echo "# ! /bin/sh " > /usr/src/app/run.sh && \ - echo "cd Run" >> /usr/src/app/run.sh && \ - echo "/usr/bin/ssdb-server /etc/ssdb.conf &" >> /usr/src/app/run.sh && \ - echo "python main.py" >> /usr/src/app/run.sh && \ - chmod 777 run.sh + +RUN apt-get update +RUN apt-get install vim -y + +RUN apt-get install -y redis-server +RUN sed -i 's/^\(bind .*\)$/# \1/' /etc/redis/redis.conf \ + && sed -i 's/^\(databases .*\)$/databases 1/' /etc/redis/redis.conf \ + && sed -i 's/^\(daemonize .*\)$/daemonize yes/' /etc/redis/redis.conf +# && sed -i 's/^\(dir .*\)$/# \1\ndir \/data/' /etc/redis/redis.conf \ +# && sed -i 's/^\(logfile .*\)$/# \1/' /etc/redis/redis.conf + +RUN pip install --no-cache-dir -r requirements.txt + + +RUN echo "# ! /bin/sh " > run.sh \ + && echo "redis-server /etc/redis/redis.conf&" >> run.sh \ + && echo "cd Run" >> run.sh \ + && echo "python main.py" >> run.sh \ + && chmod 777 run.sh + EXPOSE 5010 CMD [ "sh", "run.sh" ] From 5de6b7d3793337f7c5aa05dd3539c7db3b31fc9e Mon Sep 17 00:00:00 2001 From: YeClimEric Date: Wed, 10 Oct 2018 19:29:58 +0800 Subject: [PATCH 107/399] =?UTF-8?q?=E4=BF=AE=E6=94=B9=20dockerfile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyRefreshSchedule.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index 6088fcb0a..38668072d 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -102,6 +102,9 @@ def run(): sch.start() fetch_all() + while True: + time.sleep(1) + if __name__ == '__main__': run() From 2086a52ecc21c3099c328fa0df40281399feebaf Mon Sep 17 00:00:00 2001 From: jhao104 Date: Wed, 17 Oct 2018 14:21:09 +0800 Subject: [PATCH 108/399] [fix] fix198 --- Api/ProxyApi.py | 5 ++++- Config.ini | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 2e3733013..b8977f9ca 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -85,7 +85,10 @@ def getStatus(): def run(): config = GetConfig() - app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes) + if sys.platform.startswith("win"): + app.run(host=config.host_ip, port=config.host_port) + else: + app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes) if __name__ == '__main__': diff --git a/Config.ini b/Config.ini index d1ab07bb4..9394f744e 100644 --- a/Config.ini +++ b/Config.ini @@ -27,8 +27,7 @@ freeProxyWallSecond = 1 freeProxyWallThird = 1 [API] -; API接口配置 http://127.0.0.1:5010 +; API config http://127.0.0.1:5010 ip = 0.0.0.0 port = 5010 -; flask多进程处理请求 processes = 10 From 7449f7dabb9449a6eedf67f2ff4d20df39a9e5ae Mon Sep 17 00:00:00 2001 From: vc5 Date: Thu, 25 Oct 2018 00:01:36 +0800 Subject: [PATCH 109/399] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AF=86=E7=A0=81?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 1 + DB/DbClient.py | 3 ++- DB/SsdbClient.py | 4 ++-- Util/GetConfig.py | 9 +++++++++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Config.ini b/Config.ini index 9394f744e..24f570f01 100644 --- a/Config.ini +++ b/Config.ini @@ -6,6 +6,7 @@ host = 127.0.0.1 port = 6379 ;port = 8888 name = proxy +#password = yourpassword [ProxyGetter] ;register the proxy getter function diff --git a/DB/DbClient.py b/DB/DbClient.py index 68c5db7a7..0036434ae 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -75,7 +75,8 @@ def __initDbClient(self): assert __type, 'type error, Not support DB type: {}'.format(self.config.db_type) self.client = getattr(__import__(__type), __type)(name=self.config.db_name, host=self.config.db_host, - port=self.config.db_port) + port=self.config.db_port, + password=self.config.db_password) def get(self, key, **kwargs): return self.client.get(key, **kwargs) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 2522e0071..2249fdcc1 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -32,7 +32,7 @@ class SsdbClient(object): """ - def __init__(self, name, host, port): + def __init__(self, name, **kwargs): """ init :param name: hash name @@ -41,7 +41,7 @@ def __init__(self, name, host, port): :return: """ self.name = name - self.__conn = Redis(connection_pool=BlockingConnectionPool(host=host, port=port)) + self.__conn = Redis(connection_pool=BlockingConnectionPool(**kwargs)) def get(self, proxy): """ diff --git a/Util/GetConfig.py b/Util/GetConfig.py index 8ea57be56..c4c31ab0e 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -45,6 +45,15 @@ def db_host(self): def db_port(self): return int(self.config_file.get('DB', 'port')) + @LazyProperty + def db_password(self): + try: + password = self.config_file.get('DB', 'password') + except Exception: + password = None + return password + + @LazyProperty def proxy_getter_functions(self): return self.config_file.options('ProxyGetter') From 0238d9f931425736c9d72e4ea3e429ff4f03ef64 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Mon, 29 Oct 2018 09:44:48 +0800 Subject: [PATCH 110/399] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8dc30eee9..47480fb92 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5) ### Release Notes From 8ac170e981fb08a892c27552782b4528d67f64eb Mon Sep 17 00:00:00 2001 From: Jacob Date: Thu, 8 Nov 2018 21:35:31 +0800 Subject: [PATCH 111/399] =?UTF-8?q?=E5=AE=8C=E5=96=84Redis=E5=92=8CMongodb?= =?UTF-8?q?=E9=AA=8C=E8=AF=81=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加Config.ini的用户和密码 为username参数做兼容处理 --- Config.ini | 3 ++- DB/DbClient.py | 1 + DB/MongodbClient.py | 4 ++-- DB/RedisClient.py | 8 ++++++-- DB/SsdbClient.py | 7 +++++-- Util/GetConfig.py | 7 +++++++ 6 files changed, 23 insertions(+), 7 deletions(-) diff --git a/Config.ini b/Config.ini index 24f570f01..cf3f8ded2 100644 --- a/Config.ini +++ b/Config.ini @@ -6,7 +6,8 @@ host = 127.0.0.1 port = 6379 ;port = 8888 name = proxy -#password = yourpassword +;username = your_username (Only Mongodb) +;password = your_password [ProxyGetter] ;register the proxy getter function diff --git a/DB/DbClient.py b/DB/DbClient.py index 0036434ae..40127cc11 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -76,6 +76,7 @@ def __initDbClient(self): self.client = getattr(__import__(__type), __type)(name=self.config.db_name, host=self.config.db_host, port=self.config.db_port, + username=self.config.db_username, password=self.config.db_password) def get(self, key, **kwargs): diff --git a/DB/MongodbClient.py b/DB/MongodbClient.py index bd0647f51..a30ef6cf1 100644 --- a/DB/MongodbClient.py +++ b/DB/MongodbClient.py @@ -17,9 +17,9 @@ class MongodbClient(object): - def __init__(self, name, host, port): + def __init__(self, name, host, port, **kwargs): self.name = name - self.client = MongoClient(host, port) + self.client = MongoClient(host, port, **kwargs) self.db = self.client.proxy def changeTable(self, name): diff --git a/DB/RedisClient.py b/DB/RedisClient.py index 7d9af4386..1983d855e 100644 --- a/DB/RedisClient.py +++ b/DB/RedisClient.py @@ -22,7 +22,11 @@ class RedisClient(object): Reids client """ - def __init__(self, name, host, port): + # 为了保持DbClient的标准 + # 在RedisClient里面接受username参数, 但不进行使用. + # 因为不能将username通过kwargs传进redis.Redis里面, 会报错: + # TypeError: __init__() got an unexpected keyword argument 'username' + def __init__(self, name, host, port, username, **kwargs): """ init :param name: @@ -31,7 +35,7 @@ def __init__(self, name, host, port): :return: """ self.name = name - self.__conn = redis.Redis(host=host, port=port, db=0) + self.__conn = redis.Redis(host=host, port=port, db=0, **kwargs) def get(self): """ diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 2249fdcc1..202ddaa8f 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -31,8 +31,11 @@ class SsdbClient(object): 验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1; """ - - def __init__(self, name, **kwargs): + # 为了保持DbClient的标准 + # 在SsdbClient里面接受username参数, 但不进行使用. + # 因为不能将username通过kwargs传进redis.Redis里面, 会报错: + # TypeError: __init__() got an unexpected keyword argument 'username' + def __init__(self, name, username, **kwargs): """ init :param name: hash name diff --git a/Util/GetConfig.py b/Util/GetConfig.py index c4c31ab0e..c26b00f1e 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -53,6 +53,13 @@ def db_password(self): password = None return password + @LazyProperty + def db_username(self): + try: + username = self.config_file.get('DB', 'username') + except Exception: + username = None + return username @LazyProperty def proxy_getter_functions(self): From 4eaaa7dc12a5e318368f8eb4f1bb08ef8ee7ca48 Mon Sep 17 00:00:00 2001 From: Jacob Date: Thu, 8 Nov 2018 22:22:43 +0800 Subject: [PATCH 112/399] =?UTF-8?q?=E4=BC=98=E5=8C=96Docker=E4=BD=BF?= =?UTF-8?q?=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 标准化Dockerfile 2. 添加Docker-compose的部署方式 3. 整理Docker相关的文件 --- Docker/Dockerfile | 13 +++++++++++++ Dockerfile => Docker/Dockerfile.develop | 0 Docker/docker-compose.yml | 14 ++++++++++++++ README.md | 17 +++++++++++++++++ Run/main.py | 3 ++- 5 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 Docker/Dockerfile rename Dockerfile => Docker/Dockerfile.develop (100%) create mode 100644 Docker/docker-compose.yml diff --git a/Docker/Dockerfile b/Docker/Dockerfile new file mode 100644 index 000000000..6ad6f5f53 --- /dev/null +++ b/Docker/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.6 +WORKDIR /usr/src/app +COPY . . + +ENV DEBIAN_FRONTEND noninteractive +ENV TZ Asia/Shanghai + +RUN pip install --no-cache-dir -r requirements.txt + +EXPOSE 5010 + +WORKDIR /usr/src/app/ +CMD [ "python", "Run/main.py" ] diff --git a/Dockerfile b/Docker/Dockerfile.develop similarity index 100% rename from Dockerfile rename to Docker/Dockerfile.develop diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml new file mode 100644 index 000000000..9529745d5 --- /dev/null +++ b/Docker/docker-compose.yml @@ -0,0 +1,14 @@ +version: '2' +services: + proxy_pool: + volumes: + - ..:/usr/src/app + ports: + - "5010:5010" + links: + - proxy_redis + image: "proxy_pool" + proxy_redis: + ports: + - "6379:6379" + image: "redis" \ No newline at end of file diff --git a/README.md b/README.md index 47480fb92..e5cece52a 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,23 @@ port = 5010 # 监听端口 # 依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可. ``` +* 生产环境 Docker/docker-compose + +```shell +# Workdir proxy_pool +docker build -t proxy_pool . +pip install docker-compose +docker-compose -f Docker/docker-compose.yml up -d +``` + +* 开发环境 Docker + +```shell +# Workdir proxy_pool +docker build -t proxy_pool . +docker run -it --rm -v $(pwd):/usr/src/app -p 5010:5010 proxy_pool +``` + ### 使用   启动过几分钟后就能看到抓取到的代理IP,你可以直接到数据库中查看,推荐一个[SSDB可视化工具](https://github.com/jhao104/SSDBAdmin)。 diff --git a/Run/main.py b/Run/main.py index 6b07654ee..fcd84f6f4 100644 --- a/Run/main.py +++ b/Run/main.py @@ -15,7 +15,8 @@ import sys from multiprocessing import Process -sys.path.append('../') +sys.path.append('.') +sys.path.append('..') from Api.ProxyApi import run as ProxyApiRun from Schedule.ProxyValidSchedule import run as ValidRun From 935929db18effd7cd319a7de1dc0871419ba3267 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 9 Nov 2018 15:49:08 +0800 Subject: [PATCH 113/399] [fix] The Requests package through 2.19.1 before 2018-09-14 for Python sends an HTTP Authorization header to an http URI upon receiving a same-hostname https-to-http redirect, which makes it easier for remote attackers to discover credentials by sniffing the network. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5d00da69a..bc3581ff5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ APScheduler==3.2.0 werkzeug==0.11.15 Flask==0.12 -requests==2.12.4 +requests==2.20.0 lxml==3.7.2 pymongo From dcfa0e03777ee833ba06967c33b6cd39e0371384 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 9 Nov 2018 16:29:29 +0800 Subject: [PATCH 114/399] =?UTF-8?q?[update]=20=E4=BC=98=E5=8C=96=E6=8A=93?= =?UTF-8?q?=E5=8E=BB=E5=87=BD=E6=95=B0=EF=BC=8C=E6=AF=8F=E6=AC=A1=E5=B0=91?= =?UTF-8?q?=E6=8A=93=E4=B8=80=E4=BA=9B=20=E5=87=8F=E5=B0=91=E8=80=97?= =?UTF-8?q?=E6=97=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 52 ++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index bf2e03f61..a560dc700 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -15,17 +15,10 @@ import sys import requests -try: - from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 -except: - reload(sys) - sys.setdefaultencoding('utf-8') - sys.path.append('..') from Util.WebRequest import WebRequest from Util.utilFunction import getHtmlTree -from Util.utilFunction import verifyProxyFormat # for debug to disable insecureWarning requests.packages.urllib3.disable_warnings() @@ -48,9 +41,6 @@ class GetFreeProxy(object): proxy getter """ - def __init__(self): - pass - @staticmethod def freeProxyFirst(page=10): """ @@ -164,7 +154,7 @@ def freeProxySixth(): url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' request = WebRequest() try: - res = request.get(url).json() + res = request.get(url, timeout=10).json() for row in res['RESULT']['rows']: yield '{}:{}'.format(row['ip'], row['port']) except Exception as e: @@ -180,7 +170,7 @@ def freeProxySeventh(): 'https://www.kuaidaili.com/free/intr/{page}/' ] for url in url_list: - for page in range(1, 5): + for page in range(1, 2): page_url = url.format(page=page) tree = getHtmlTree(page_url) proxy_list = tree.xpath('.//table//tr') @@ -192,14 +182,14 @@ def freeProxyEight(): """ 秘密代理 http://www.mimiip.com """ - url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)] # 国内高匿 - url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)] # 国内普匿 - url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)] # 国内透明 + url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 2)] # 国内高匿 + url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 2)] # 国内普匿 + url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 2)] # 国内透明 url_list = url_gngao + url_gnpu + url_gntou request = WebRequest() for url in url_list: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -213,7 +203,7 @@ def freeProxyNinth(): urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall('data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -227,7 +217,7 @@ def freeProxyTen(): urls = ['http://www.ip3366.net/free/'] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ":".join(proxy) @@ -246,14 +236,14 @@ def freeProxyEleven(): ] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?', r.text) for proxy in proxies: yield ":".join(proxy) @staticmethod - def freeProxyTwelve(page_count=8): + def freeProxyTwelve(page_count=2): """ guobanjia http://ip.jiangxianli.com/?page= 免费代理库 @@ -278,7 +268,7 @@ def freeProxyWallFirst(): urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -293,7 +283,7 @@ def freeProxyWallSecond(): request = WebRequest() import base64 for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) for proxy in proxies: yield base64.b64decode(proxy).decode() @@ -303,7 +293,7 @@ def freeProxyWallThird(): urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -312,7 +302,17 @@ def freeProxyWallThird(): if __name__ == '__main__': from CheckProxy import CheckProxy - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySixth) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySeventh) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEight) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) - CheckProxy.checkAllGetProxyFunc() + # CheckProxy.checkAllGetProxyFunc() From f203ae19b6436b88d84d181a8f392c4044e04e09 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 9 Nov 2018 16:30:02 +0800 Subject: [PATCH 115/399] =?UTF-8?q?[update]=20=E6=A3=80=E6=9F=A5=20getter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/CheckProxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py index f6ba9b66a..f29824723 100644 --- a/ProxyGetter/CheckProxy.py +++ b/ProxyGetter/CheckProxy.py @@ -62,7 +62,7 @@ def checkGetProxyFunc(func): count = 0 for proxy in func(): if verifyProxyFormat(proxy): - log.info("fetch proxy: {}".format(proxy)) + log.info("{} fetch proxy: {}".format(func_name, proxy)) count += 1 log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count)) From 69eafeabdd11451adf2b6f42dac1620e729dcba3 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 9 Nov 2018 16:30:37 +0800 Subject: [PATCH 116/399] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0=E5=8F=AF?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=E4=BB=A3=E7=90=86=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Config.ini b/Config.ini index 24f570f01..1d46fc857 100644 --- a/Config.ini +++ b/Config.ini @@ -1,10 +1,9 @@ [DB] ;Configure the database information -;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB +;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB type = SSDB host = 127.0.0.1 port = 6379 -;port = 8888 name = proxy #password = yourpassword @@ -15,17 +14,17 @@ freeProxySecond = 1 ;freeProxyThird = 1 freeProxyFourth = 1 freeProxyFifth = 1 -freeProxySixth = 1 +;freeProxySixth = 1 freeProxySeventh = 1 -freeProxyEight = 1 -freeProxyNinth = 1 +;freeProxyEight = 1 +;freeProxyNinth = 1 freeProxyTen = 1 freeProxyEleven = 1 freeProxyTwelve = 1 ;foreign website, outside the wall -freeProxyWallFirst = 1 -freeProxyWallSecond = 1 -freeProxyWallThird = 1 +;freeProxyWallFirst = 1 +;freeProxyWallSecond = 1 +;freeProxyWallThird = 1 [API] ; API config http://127.0.0.1:5010 From d77e1110e99c49bbe0d81a2beb3beb3f0bbe3205 Mon Sep 17 00:00:00 2001 From: 1again Date: Fri, 9 Nov 2018 20:34:35 +0800 Subject: [PATCH 117/399] =?UTF-8?q?[refine]=20Refine=20GetConfig=20?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 基于配置化的管理思想, 假设项目的任何地方都需要使用GetConfig 于是可以在GetConfig模块里生成一个config对象. 任何地方需要只要import即可. --- Api/ProxyApi.py | 3 +-- DB/DbClient.py | 19 +++++++++---------- Manager/ProxyManager.py | 5 ++--- Util/GetConfig.py | 2 ++ 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index b8977f9ca..99a0953a0 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -19,7 +19,7 @@ sys.path.append('../') -from Util.GetConfig import GetConfig +from Util.GetConfig import config from Manager.ProxyManager import ProxyManager app = Flask(__name__) @@ -84,7 +84,6 @@ def getStatus(): def run(): - config = GetConfig() if sys.platform.startswith("win"): app.run(host=config.host_ip, port=config.host_port) else: diff --git a/DB/DbClient.py b/DB/DbClient.py index 0036434ae..869c93af1 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -16,7 +16,7 @@ import os import sys -from Util.GetConfig import GetConfig +from Util.GetConfig import config from Util.utilClass import Singleton sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -55,7 +55,6 @@ def __init__(self): init :return: """ - self.config = GetConfig() self.__initDbClient() def __initDbClient(self): @@ -64,19 +63,19 @@ def __initDbClient(self): :return: """ __type = None - if "SSDB" == self.config.db_type: + if "SSDB" == config.db_type: __type = "SsdbClient" - elif "REDIS" == self.config.db_type: + elif "REDIS" == config.db_type: __type = "RedisClient" - elif "MONGODB" == self.config.db_type: + elif "MONGODB" == config.db_type: __type = "MongodbClient" else: pass - assert __type, 'type error, Not support DB type: {}'.format(self.config.db_type) - self.client = getattr(__import__(__type), __type)(name=self.config.db_name, - host=self.config.db_host, - port=self.config.db_port, - password=self.config.db_password) + assert __type, 'type error, Not support DB type: {}'.format(config.db_type) + self.client = getattr(__import__(__type), __type)(name=config.db_name, + host=config.db_host, + port=config.db_port, + password=config.db_password) def get(self, key, **kwargs): return self.client.get(key, **kwargs) diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index 33aa76b39..a2f39b3c5 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -17,7 +17,7 @@ from Util import EnvUtil from DB.DbClient import DbClient -from Util.GetConfig import GetConfig +from Util.GetConfig import config from Util.LogHandler import LogHandler from Util.utilFunction import verifyProxyFormat from ProxyGetter.getFreeProxy import GetFreeProxy @@ -30,7 +30,6 @@ class ProxyManager(object): def __init__(self): self.db = DbClient() - self.config = GetConfig() self.raw_proxy_queue = 'raw_proxy' self.log = LogHandler('proxy_manager') self.useful_proxy_queue = 'useful_proxy' @@ -41,7 +40,7 @@ def refresh(self): :return: """ self.db.changeTable(self.raw_proxy_queue) - for proxyGetter in self.config.proxy_getter_functions: + for proxyGetter in config.proxy_getter_functions: # fetch try: self.log.info("{func}: fetch proxy start".format(func=proxyGetter)) diff --git a/Util/GetConfig.py b/Util/GetConfig.py index c4c31ab0e..efbbe5077 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -70,6 +70,8 @@ def host_port(self): def processes(self): return int(self.config_file.get('API', 'processes')) +config = GetConfig() + if __name__ == '__main__': gg = GetConfig() print(gg.db_type) From 40861f429011c53e25693e62daede4b47c253dd2 Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 12 Nov 2018 10:00:38 +0800 Subject: [PATCH 118/399] [update] config annotation --- Config.ini | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Config.ini b/Config.ini index 1d46fc857..54f690397 100644 --- a/Config.ini +++ b/Config.ini @@ -27,7 +27,10 @@ freeProxyTwelve = 1 ;freeProxyWallThird = 1 [API] -; API config http://127.0.0.1:5010 +# API config http://127.0.0.1:5010 +# The ip specified when starting the web API ip = 0.0.0.0 +# he port on which to run the web API port = 5010 +# Flask processes option processes = 10 From 2591918c874a001435b3ff0af8604e5070b8ff58 Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 12 Nov 2018 10:32:01 +0800 Subject: [PATCH 119/399] [update] formatting code --- Util/GetConfig.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Util/GetConfig.py b/Util/GetConfig.py index efbbe5077..5dfae9912 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -53,14 +53,13 @@ def db_password(self): password = None return password - @LazyProperty def proxy_getter_functions(self): return self.config_file.options('ProxyGetter') @LazyProperty def host_ip(self): - return self.config_file.get('API','ip') + return self.config_file.get('API', 'ip') @LazyProperty def host_port(self): @@ -70,6 +69,7 @@ def host_port(self): def processes(self): return int(self.config_file.get('API', 'processes')) + config = GetConfig() if __name__ == '__main__': From 8a0404521ddcf17031a5975f83c7b6b5a8e3b662 Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 12 Nov 2018 11:27:13 +0800 Subject: [PATCH 120/399] [update] set default pwd option --- Util/GetConfig.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Util/GetConfig.py b/Util/GetConfig.py index 5dfae9912..0f60fcd2f 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -47,11 +47,7 @@ def db_port(self): @LazyProperty def db_password(self): - try: - password = self.config_file.get('DB', 'password') - except Exception: - password = None - return password + return self.config_file.get('DB', 'password', fallback="default pwd") @LazyProperty def proxy_getter_functions(self): @@ -82,3 +78,4 @@ def processes(self): print(gg.host_ip) print(gg.host_port) print(gg.processes) + print(gg.db_password) From 6525ea8e09f3a128f0e2652d5d333005b41196c2 Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 13 Nov 2018 10:28:31 +0800 Subject: [PATCH 121/399] =?UTF-8?q?[update]=20=E8=B0=83=E6=95=B4=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E9=80=9F=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyRefreshSchedule.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index 38668072d..a61cc5a25 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -18,8 +18,7 @@ import time import logging from threading import Thread -# 使用后台调度,不使用阻塞式~ -from apscheduler.schedulers.background import BackgroundScheduler as Sch +from apscheduler.schedulers.background import BackgroundScheduler sys.path.append('../') @@ -74,7 +73,7 @@ def refreshPool(): pp.validProxy() -def batch_refresh(process_num=30): +def batchRefresh(process_num=30): # 检验新代理 pl = [] for num in range(process_num): @@ -89,21 +88,23 @@ def batch_refresh(process_num=30): pl[num].join() -def fetch_all(): +def fetchAll(): p = ProxyRefreshSchedule() # 获取新代理 p.refresh() def run(): - sch = Sch() - sch.add_job(fetch_all, 'interval', minutes=5) # 每5分钟抓取一次 - sch.add_job(batch_refresh, "interval", minutes=1) # 每分钟检查一次 - sch.start() - fetch_all() + scheduler = BackgroundScheduler() + # 不用太快, 网站更新速度比较慢, 太快会加大验证压力, 导致raw_proxy积压 + scheduler.add_job(fetchAll, 'interval', minutes=10, id="fetch_proxy") + scheduler.add_job(batchRefresh, "interval", minutes=1) # 每分钟检查一次 + scheduler.start() + + fetchAll() while True: - time.sleep(1) + time.sleep(3) if __name__ == '__main__': From c1e74b4237971caf9dfefede1405e0516c27fe7a Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 13 Nov 2018 10:29:14 +0800 Subject: [PATCH 122/399] =?UTF-8?q?[update]=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Manager/ProxyManager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index a2f39b3c5..c770b6224 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -36,7 +36,7 @@ def __init__(self): def refresh(self): """ - fetch proxy into Db by ProxyGetter + fetch proxy into Db by ProxyGetter/getFreeProxy.py :return: """ self.db.changeTable(self.raw_proxy_queue) @@ -45,7 +45,7 @@ def refresh(self): try: self.log.info("{func}: fetch proxy start".format(func=proxyGetter)) for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): - # 挨个存储 proxy,优化raw 队列的 push 速度,进而加快 check proxy 的速度 + # 直接存储代理, 不用在代码中排重, hash 结构本身具有排重功能 proxy = proxy.strip() if proxy and verifyProxyFormat(proxy): self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) From e41a9cbe796744f91e395e4064ebb5c9ef82e39c Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 13 Nov 2018 10:30:01 +0800 Subject: [PATCH 123/399] [update] dbclient --- DB/DbClient.py | 10 ++++------ DB/SsdbClient.py | 14 ++++++-------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/DB/DbClient.py b/DB/DbClient.py index 869c93af1..f79fc8511 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -44,7 +44,7 @@ class DbClient(object): 所有方法需要相应类去具体实现: SSDB:SsdbClient.py - REDIS:RedisClient.py + REDIS:RedisClient.py 停用 统一使用SsdbClient.py """ @@ -66,7 +66,7 @@ def __initDbClient(self): if "SSDB" == config.db_type: __type = "SsdbClient" elif "REDIS" == config.db_type: - __type = "RedisClient" + __type = "SsdbClient" elif "MONGODB" == config.db_type: __type = "MongodbClient" else: @@ -107,7 +107,5 @@ def getNumber(self): if __name__ == "__main__": account = DbClient() - print(account.get()) - account.changeTable('use') - account.put('ac') - print(account.get()) + account.changeTable('useful_proxy') + print(account.pop()) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 202ddaa8f..4ceedd1df 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -31,16 +31,13 @@ class SsdbClient(object): 验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1; """ - # 为了保持DbClient的标准 - # 在SsdbClient里面接受username参数, 但不进行使用. - # 因为不能将username通过kwargs传进redis.Redis里面, 会报错: - # TypeError: __init__() got an unexpected keyword argument 'username' - def __init__(self, name, username, **kwargs): + def __init__(self, name, **kwargs): """ init :param name: hash name - :param host: ssdb host - :param port: ssdb port + :param host: host + :param port: port + :param password: password :return: """ self.name = name @@ -114,6 +111,7 @@ def getNumber(self): def changeTable(self, name): self.name = name + if __name__ == '__main__': - c = SsdbClient('useful_proxy', '118.24.52.95', 8899) + c = SsdbClient(name='useful_proxy', host='127.0.0.1', port=8899, password=None) print(c.getAll()) From 428359c8dada998481f038dbdc8d3923e5850c0e Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 13 Nov 2018 14:02:03 +0800 Subject: [PATCH 124/399] Merge branch 'jhao104/master' of https://github.com/1again/proxy_pool into 1again-jhao104/master # Conflicts: # DB/DbClient.py # Util/GetConfig.py --- Config.ini | 2 +- README.md | 2 +- Util/GetConfig.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Config.ini b/Config.ini index 44ef085d2..c8a9cc266 100644 --- a/Config.ini +++ b/Config.ini @@ -3,7 +3,7 @@ ;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB type = SSDB host = 127.0.0.1 -port = 6379 +port = 8888 name = proxy ;username = your_username (Only Mongodb) ;password = your_password diff --git a/README.md b/README.md index e5cece52a..8bdca40c5 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5)| [@1again](https://github.com/1again) ### Release Notes diff --git a/Util/GetConfig.py b/Util/GetConfig.py index 0f60fcd2f..cd354e20f 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -47,7 +47,7 @@ def db_port(self): @LazyProperty def db_password(self): - return self.config_file.get('DB', 'password', fallback="default pwd") + return self.config_file.get('DB', 'password', fallback=None) @LazyProperty def proxy_getter_functions(self): From 3c3ddaff09a346680c4bcfceb52fb5db0e690d1b Mon Sep 17 00:00:00 2001 From: incoding Date: Wed, 14 Nov 2018 13:17:00 +0800 Subject: [PATCH 125/399] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=9B=B4=E5=8A=A0?= =?UTF-8?q?=E4=B8=A5=E8=B0=A8=E7=9A=84=E4=BB=A3=E7=90=86=E6=A0=A1=E9=AA=8C?= =?UTF-8?q?=E8=A7=84=E5=88=99=EF=BC=88=E4=B8=80=E4=BA=9B=E9=9D=9E=E6=B3=95?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E4=B9=9F=E4=BC=9A=E8=BF=94=E5=9B=9E200?= =?UTF-8?q?=E7=8A=B6=E6=80=81=E7=A0=81=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Util/utilFunction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Util/utilFunction.py b/Util/utilFunction.py index fc26a59b1..ec86c1fe3 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -100,7 +100,7 @@ def validUsefulProxy(proxy): try: # 超过20秒的代理就不要了 r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10, verify=False) - if r.status_code == 200: + if r.status_code == 200 and r.headers['content-type'].lower().find('application/json') != -1 and r.json()['origin']: # logger.info('%s is ok' % proxy) return True except Exception as e: From e5c1b89c919bae95fcb14e715d7b2e91115dfbe3 Mon Sep 17 00:00:00 2001 From: incoding Date: Wed, 14 Nov 2018 13:33:47 +0800 Subject: [PATCH 126/399] =?UTF-8?q?=E6=B7=BB=E5=8A=A0my=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E5=A4=B9=EF=BC=8C=E4=BF=9D=E5=AD=98=E5=AE=9A=E5=88=B6=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- my/Config.ini | 37 +++++++++++++++++++++++++++++++++++++ my/Dockerfile | 16 ++++++++++++++++ my/build.sh | 1 + my/run.sh | 14 ++++++++++++++ 4 files changed, 68 insertions(+) create mode 100644 my/Config.ini create mode 100644 my/Dockerfile create mode 100755 my/build.sh create mode 100755 my/run.sh diff --git a/my/Config.ini b/my/Config.ini new file mode 100644 index 000000000..627092c1a --- /dev/null +++ b/my/Config.ini @@ -0,0 +1,37 @@ +[DB] +;Configure the database information +;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB +type = PROXY_POOL_DB_TYPE +host = PROXY_POOL_DB_HOST +port = PROXY_POOL_DB_PORT +name = proxy +;username = your_username (Only Mongodb) +;password = your_password + +[ProxyGetter] +;register the proxy getter function +freeProxyFirst = 1 +freeProxySecond = 1 +;freeProxyThird = 1 +freeProxyFourth = 1 +freeProxyFifth = 1 +;freeProxySixth = 1 +freeProxySeventh = 1 +;freeProxyEight = 1 +;freeProxyNinth = 1 +freeProxyTen = 1 +freeProxyEleven = 1 +freeProxyTwelve = 1 +;foreign website, outside the wall +;freeProxyWallFirst = 1 +;freeProxyWallSecond = 1 +;freeProxyWallThird = 1 + +[API] +# API config http://127.0.0.1:5010 +# The ip specified when starting the web API +ip = 0.0.0.0 +# he port on which to run the web API +port = 8080 +# Flask processes option +processes = 10 diff --git a/my/Dockerfile b/my/Dockerfile new file mode 100644 index 000000000..cb042627c --- /dev/null +++ b/my/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.6 + +WORKDIR /usr/src/app + +ENV TZ=Asia/Shanghai \ + PROXY_POOL_DB_TYPE=SSDB \ + PROXY_POOL_DB_HOST=redis \ + PROXY_POOL_DB_PORT=6379 + +COPY . . + +RUN pip install --no-cache-dir -r requirements.txt && cp my/Config.ini ./ + +CMD [ "my/run.sh" ] + +EXPOSE 8080 diff --git a/my/build.sh b/my/build.sh new file mode 100755 index 000000000..328e9449d --- /dev/null +++ b/my/build.sh @@ -0,0 +1 @@ +docker build -t registry.cn-beijing.aliyuncs.com/ryttech/proxy_pool:1.12.20181114 -f my/Dockerfile . \ No newline at end of file diff --git a/my/run.sh b/my/run.sh new file mode 100755 index 000000000..441ace853 --- /dev/null +++ b/my/run.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for var in \ + PROXY_POOL_DB_TYPE \ + PROXY_POOL_DB_HOST \ + PROXY_POOL_DB_PORT \ +; do + val="${!var}" + if [ "$val" ]; then + sed -ri "s/$var/$val/" Config.ini + fi +done + +python Run/main.py \ No newline at end of file From 110b0df1e29529346314378155890d740064ca0b Mon Sep 17 00:00:00 2001 From: jhao Date: Wed, 14 Nov 2018 16:51:41 +0800 Subject: [PATCH 127/399] Merge branch 'jhao104/master' of https://github.com/1again/proxy_pool into 1again-jhao104/master # Conflicts: # DB/DbClient.py # Util/GetConfig.py --- Api/ProxyApi.py | 5 +---- Config.ini | 4 +--- Schedule/ProxyValidSchedule.py | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 99a0953a0..fc759a363 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -84,10 +84,7 @@ def getStatus(): def run(): - if sys.platform.startswith("win"): - app.run(host=config.host_ip, port=config.host_port) - else: - app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes) + app.run(host=config.host_ip, port=config.host_port) if __name__ == '__main__': diff --git a/Config.ini b/Config.ini index c8a9cc266..5bdf095a1 100644 --- a/Config.ini +++ b/Config.ini @@ -32,6 +32,4 @@ freeProxyTwelve = 1 # The ip specified when starting the web API ip = 0.0.0.0 # he port on which to run the web API -port = 5010 -# Flask processes option -processes = 10 +port = 8080 diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 9b075cf90..098c8a336 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -32,7 +32,7 @@ def __init__(self): self.queue = Queue() self.proxy_item = dict() - def __validProxy(self, threads=10): + def __validProxy(self, threads=20): """ 验证useful_proxy代理 :param threads: 线程数 From a3ba910f391fd0220f357f926ef2b5ab6e0a973f Mon Sep 17 00:00:00 2001 From: windhw Date: Thu, 6 Dec 2018 12:48:10 +0800 Subject: [PATCH 128/399] Update main.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加对SIGTERM的处理,这样在后台运行的时候,如果kill掉主进程,子进程也能kill --- Run/main.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Run/main.py b/Run/main.py index fcd84f6f4..cce7b6142 100644 --- a/Run/main.py +++ b/Run/main.py @@ -12,7 +12,7 @@ """ __author__ = 'JHao' -import sys +import sys,signal from multiprocessing import Process sys.path.append('.') @@ -31,6 +31,14 @@ def run(): p_list.append(p2) p3 = Process(target=RefreshRun, name='RefreshRun') p_list.append(p3) + + def kill_child_processes(signum,frame): + for p in p_list: + p.terminate() + sys.exit(1) + + signal.signal(signal.SIGTERM, kill_child_processes) + for p in p_list: p.daemon = True From 2260c6d02f2374d7b4952787cac964f648ffd2b2 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 7 Dec 2018 14:21:51 +0800 Subject: [PATCH 129/399] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0httpbin?= =?UTF-8?q?=E6=A3=80=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Util/utilFunction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Util/utilFunction.py b/Util/utilFunction.py index ec86c1fe3..f4e802263 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -100,7 +100,7 @@ def validUsefulProxy(proxy): try: # 超过20秒的代理就不要了 r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10, verify=False) - if r.status_code == 200 and r.headers['content-type'].lower().find('application/json') != -1 and r.json()['origin']: + if r.status_code == 200 and r.json().get("origin"): # logger.info('%s is ok' % proxy) return True except Exception as e: From 26aaf1851a5b9bf4bc84ab344835d37d857ab6d7 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 7 Dec 2018 14:23:49 +0800 Subject: [PATCH 130/399] =?UTF-8?q?=E3=80=90del=E3=80=91delete=20un=20use?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- my/Config.ini | 37 ------------------------------------- my/Dockerfile | 16 ---------------- my/build.sh | 1 - my/run.sh | 14 -------------- 4 files changed, 68 deletions(-) delete mode 100644 my/Config.ini delete mode 100644 my/Dockerfile delete mode 100755 my/build.sh delete mode 100755 my/run.sh diff --git a/my/Config.ini b/my/Config.ini deleted file mode 100644 index 627092c1a..000000000 --- a/my/Config.ini +++ /dev/null @@ -1,37 +0,0 @@ -[DB] -;Configure the database information -;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB -type = PROXY_POOL_DB_TYPE -host = PROXY_POOL_DB_HOST -port = PROXY_POOL_DB_PORT -name = proxy -;username = your_username (Only Mongodb) -;password = your_password - -[ProxyGetter] -;register the proxy getter function -freeProxyFirst = 1 -freeProxySecond = 1 -;freeProxyThird = 1 -freeProxyFourth = 1 -freeProxyFifth = 1 -;freeProxySixth = 1 -freeProxySeventh = 1 -;freeProxyEight = 1 -;freeProxyNinth = 1 -freeProxyTen = 1 -freeProxyEleven = 1 -freeProxyTwelve = 1 -;foreign website, outside the wall -;freeProxyWallFirst = 1 -;freeProxyWallSecond = 1 -;freeProxyWallThird = 1 - -[API] -# API config http://127.0.0.1:5010 -# The ip specified when starting the web API -ip = 0.0.0.0 -# he port on which to run the web API -port = 8080 -# Flask processes option -processes = 10 diff --git a/my/Dockerfile b/my/Dockerfile deleted file mode 100644 index cb042627c..000000000 --- a/my/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -FROM python:3.6 - -WORKDIR /usr/src/app - -ENV TZ=Asia/Shanghai \ - PROXY_POOL_DB_TYPE=SSDB \ - PROXY_POOL_DB_HOST=redis \ - PROXY_POOL_DB_PORT=6379 - -COPY . . - -RUN pip install --no-cache-dir -r requirements.txt && cp my/Config.ini ./ - -CMD [ "my/run.sh" ] - -EXPOSE 8080 diff --git a/my/build.sh b/my/build.sh deleted file mode 100755 index 328e9449d..000000000 --- a/my/build.sh +++ /dev/null @@ -1 +0,0 @@ -docker build -t registry.cn-beijing.aliyuncs.com/ryttech/proxy_pool:1.12.20181114 -f my/Dockerfile . \ No newline at end of file diff --git a/my/run.sh b/my/run.sh deleted file mode 100755 index 441ace853..000000000 --- a/my/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -for var in \ - PROXY_POOL_DB_TYPE \ - PROXY_POOL_DB_HOST \ - PROXY_POOL_DB_PORT \ -; do - val="${!var}" - if [ "$val" ]; then - sed -ri "s/$var/$val/" Config.ini - fi -done - -python Run/main.py \ No newline at end of file From 223f57d1eb8d243b1d69e28b90a39f0529ec4407 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 7 Dec 2018 15:29:55 +0800 Subject: [PATCH 131/399] [fix] fix password --- Util/GetConfig.py | 9 ++------- Util/utilClass.py | 4 ++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/Util/GetConfig.py b/Util/GetConfig.py index cd354e20f..c25035504 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -26,7 +26,7 @@ class GetConfig(object): def __init__(self): self.pwd = os.path.split(os.path.realpath(__file__))[0] self.config_path = os.path.join(os.path.split(self.pwd)[0], 'Config.ini') - self.config_file = ConfigParse() + self.config_file = ConfigParse(defaults={"password": None}) self.config_file.read(self.config_path) @LazyProperty @@ -47,7 +47,7 @@ def db_port(self): @LazyProperty def db_password(self): - return self.config_file.get('DB', 'password', fallback=None) + return self.config_file.get('DB', 'password') @LazyProperty def proxy_getter_functions(self): @@ -61,10 +61,6 @@ def host_ip(self): def host_port(self): return int(self.config_file.get('API', 'port')) - @LazyProperty - def processes(self): - return int(self.config_file.get('API', 'processes')) - config = GetConfig() @@ -77,5 +73,4 @@ def processes(self): print(gg.proxy_getter_functions) print(gg.host_ip) print(gg.host_port) - print(gg.processes) print(gg.db_password) diff --git a/Util/utilClass.py b/Util/utilClass.py index 89112ffd8..b3a35f141 100644 --- a/Util/utilClass.py +++ b/Util/utilClass.py @@ -44,8 +44,8 @@ class ConfigParse(ConfigParser): rewrite ConfigParser, for support upper option """ - def __init__(self): - ConfigParser.__init__(self) + def __init__(self, *args, **kwargs): + ConfigParser.__init__(self, *args, **kwargs) def optionxform(self, optionstr): return optionstr From d49a66a6a1051e2eb86231e03a6a0ab3875dee1e Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 15 Feb 2019 16:02:02 +0800 Subject: [PATCH 132/399] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Api/ProxyApi.py | 2 +- Config.ini | 2 +- Config/ConfigGetter.py | 71 ++++++++++++++++++++++++ {Test => Config}/__init__.py | 11 ++-- Config/setting.py | 54 ++++++++++++++++++ DB/DbClient.py | 2 +- DB/SsdbClient.py | 4 +- Manager/ProxyManager.py | 2 +- Schedule/ProxyValidSchedule.py | 4 +- Test/.pytest_cache/v/cache/lastfailed | 3 - Test/.pytest_cache/v/cache/nodeids | 3 - Test/{testGetConfig.py => testConfig.py} | 22 ++++---- Util/GetConfig.py | 7 +-- Util/utilClass.py | 19 ------- test.py | 5 +- 15 files changed, 154 insertions(+), 57 deletions(-) create mode 100644 Config/ConfigGetter.py rename {Test => Config}/__init__.py (56%) create mode 100644 Config/setting.py delete mode 100644 Test/.pytest_cache/v/cache/lastfailed delete mode 100644 Test/.pytest_cache/v/cache/nodeids rename Test/{testGetConfig.py => testConfig.py} (60%) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index fc759a363..91df76f88 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -19,7 +19,7 @@ sys.path.append('../') -from Util.GetConfig import config +from Config.ConfigGetter import config from Manager.ProxyManager import ProxyManager app = Flask(__name__) diff --git a/Config.ini b/Config.ini index 5bdf095a1..ee13eaf2c 100644 --- a/Config.ini +++ b/Config.ini @@ -1,6 +1,6 @@ [DB] ;Configure the database information -;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB +;type: SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB type = SSDB host = 127.0.0.1 port = 8888 diff --git a/Config/ConfigGetter.py b/Config/ConfigGetter.py new file mode 100644 index 000000000..56c766c0d --- /dev/null +++ b/Config/ConfigGetter.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: ConfigGetter + Description : 读取配置 + Author : JHao + date: 2019/2/15 +------------------------------------------------- + Change Activity: + 2019/2/15: +------------------------------------------------- +""" +__author__ = 'JHao' + + +from Util.utilClass import LazyProperty +from Config.setting import * + + +class ConfigGetter(object): + """ + get config + """ + + def __init__(self): + pass + + @LazyProperty + def db_type(self): + return DATABASES.get("default", {}).get("TYPE", "SSDB") + + @LazyProperty + def db_name(self): + return DATABASES.get("default", {}).get("NAME", "proxy") + + @LazyProperty + def db_host(self): + return DATABASES.get("default", {}).get("HOST", "127.0.0.1") + + @LazyProperty + def db_port(self): + return DATABASES.get("default", {}).get("PORT", 8080) + + @LazyProperty + def db_password(self): + return DATABASES.get("default", {}).get("PASSWORD", "") + + @LazyProperty + def proxy_getter_functions(self): + return PROXY_GETTER + + @LazyProperty + def host_ip(self): + return SERVER_API.get("HOST", "127.0.0.1") + + @LazyProperty + def host_port(self): + return SERVER_API.get("PORT", 5010) + + +config = ConfigGetter() + +if __name__ == '__main__': + print(config.db_type) + print(config.db_name) + print(config.db_host) + print(config.db_port) + print(config.proxy_getter_functions) + print(config.host_ip) + print(config.host_port) + print(config.db_password) diff --git a/Test/__init__.py b/Config/__init__.py similarity index 56% rename from Test/__init__.py rename to Config/__init__.py index 898942953..9a7d547ee 100644 --- a/Test/__init__.py +++ b/Config/__init__.py @@ -1,13 +1,12 @@ # -*- coding: utf-8 -*- """ ------------------------------------------------- - File Name: __init__.py - Description : - Author : J_hao - date: 2017/7/31 + File Name: __init__ + Description : + Author : JHao + date: 2019/2/15 ------------------------------------------------- Change Activity: - 2017/7/31: + 2019/2/15: ------------------------------------------------- """ -__author__ = 'J_hao' diff --git a/Config/setting.py b/Config/setting.py new file mode 100644 index 000000000..39ae36748 --- /dev/null +++ b/Config/setting.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: setting.py + Description : 配置文件 + Author : JHao + date: 2019/2/15 +------------------------------------------------- + Change Activity: + 2019/2/15: +------------------------------------------------- +""" + +# database config + +DATABASES = { + "default": { + "TYPE": "SSDB", # TYPE SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB + "HOST": "127.0.0.1", + "PORT": 8888, + "NAME": "proxy", + "PASSWORD": "" + + } +} + +# register the proxy getter function + +PROXY_GETTER = [ + "freeProxyFirst", + "freeProxySecond", + # "freeProxyThird", + "freeProxyFourth", + "freeProxyFifth", + # "freeProxySixth" + "freeProxySeventh", + # "freeProxyEight", + # "freeProxyNinth", + "freeProxyTen", + "freeProxyEleven", + "freeProxyTwelve", + # foreign website, outside the wall + "freeProxyWallFirst", + "freeProxyWallSecond", + "freeProxyWallThird" +] + + +# # API config http://127.0.0.1:5010 + +SERVER_API = { + "HOST": "0.0.0.0", # The ip specified which starting the web API + "PORT": 5010 # port number to which the server listens to +} \ No newline at end of file diff --git a/DB/DbClient.py b/DB/DbClient.py index f79fc8511..baa1f79fc 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -16,7 +16,7 @@ import os import sys -from Util.GetConfig import config +from Config.ConfigGetter import config from Util.utilClass import Singleton sys.path.append(os.path.dirname(os.path.abspath(__file__))) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 4ceedd1df..85545b355 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -3,7 +3,7 @@ """ ------------------------------------------------- File Name: SsdbClient.py - Description : 封装SSDB操作 + Description : 封装SSDB/Redis操作 Author : JHao date: 2016/12/2 ------------------------------------------------- @@ -27,7 +27,7 @@ class SsdbClient(object): SSDB client SSDB中代理存放的容器为hash: - 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为为None,以后扩展可能会加入代理属性; + 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为None,以后扩展可能会加入代理属性; 验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1; """ diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index c770b6224..fd007773b 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -17,7 +17,7 @@ from Util import EnvUtil from DB.DbClient import DbClient -from Util.GetConfig import config +from Config.ConfigGetter import config from Util.LogHandler import LogHandler from Util.utilFunction import verifyProxyFormat from ProxyGetter.getFreeProxy import GetFreeProxy diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 098c8a336..6b1fa6485 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -56,8 +56,8 @@ def main(self): self.log.info("Start valid useful proxy") self.__validProxy() else: - self.log.info('Valid Complete! sleep 5 minutes.') - time.sleep(60 * 5) + self.log.info('Valid Complete! sleep 5 sec.') + time.sleep(5) self.putQueue() def putQueue(self): diff --git a/Test/.pytest_cache/v/cache/lastfailed b/Test/.pytest_cache/v/cache/lastfailed deleted file mode 100644 index 65c9a06d6..000000000 --- a/Test/.pytest_cache/v/cache/lastfailed +++ /dev/null @@ -1,3 +0,0 @@ -{ - "testGetFreeProxy.py::testGetFreeProxy": true -} \ No newline at end of file diff --git a/Test/.pytest_cache/v/cache/nodeids b/Test/.pytest_cache/v/cache/nodeids deleted file mode 100644 index 0ce3684ce..000000000 --- a/Test/.pytest_cache/v/cache/nodeids +++ /dev/null @@ -1,3 +0,0 @@ -[ - "testGetFreeProxy.py::testGetFreeProxy" -] \ No newline at end of file diff --git a/Test/testGetConfig.py b/Test/testConfig.py similarity index 60% rename from Test/testGetConfig.py rename to Test/testConfig.py index 7f44fa6b4..7ed759387 100644 --- a/Test/testGetConfig.py +++ b/Test/testConfig.py @@ -12,22 +12,22 @@ """ __author__ = 'J_hao' -from Util.GetConfig import GetConfig +from Config.ConfigGetter import config # noinspection PyPep8Naming -def testGetConfig(): +def testConfig(): """ - test class GetConfig in Util/GetConfig :return: """ - gg = GetConfig() - print(gg.db_type) - print(gg.db_name) - print(gg.db_host) - print(gg.db_port) - assert isinstance(gg.proxy_getter_functions, list) - print(gg.proxy_getter_functions) + print(config.db_type) + print(config.db_name) + print(config.db_host) + print(config.db_port) + print(config.db_password) + assert isinstance(config.proxy_getter_functions, list) + print(config.proxy_getter_functions) + if __name__ == '__main__': - testGetConfig() + testConfig() diff --git a/Util/GetConfig.py b/Util/GetConfig.py index c25035504..65554b317 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -13,8 +13,6 @@ """ __author__ = 'JHao' -import os -from Util.utilClass import ConfigParse from Util.utilClass import LazyProperty @@ -24,10 +22,7 @@ class GetConfig(object): """ def __init__(self): - self.pwd = os.path.split(os.path.realpath(__file__))[0] - self.config_path = os.path.join(os.path.split(self.pwd)[0], 'Config.ini') - self.config_file = ConfigParse(defaults={"password": None}) - self.config_file.read(self.config_path) + pass @LazyProperty def db_type(self): diff --git a/Util/utilClass.py b/Util/utilClass.py index b3a35f141..cffe72443 100644 --- a/Util/utilClass.py +++ b/Util/utilClass.py @@ -9,7 +9,6 @@ ------------------------------------------------- Change Activity: 2016/12/3: Class LazyProperty - 2016/12/4: rewrite ConfigParser ------------------------------------------------- """ __author__ = 'JHao' @@ -33,24 +32,6 @@ def __get__(self, instance, owner): return value -try: - from configparser import ConfigParser # py3 -except: - from ConfigParser import ConfigParser # py2 - - -class ConfigParse(ConfigParser): - """ - rewrite ConfigParser, for support upper option - """ - - def __init__(self, *args, **kwargs): - ConfigParser.__init__(self, *args, **kwargs) - - def optionxform(self, optionstr): - return optionstr - - class Singleton(type): """ Singleton Metaclass diff --git a/test.py b/test.py index 518710d3b..d636535a9 100644 --- a/test.py +++ b/test.py @@ -12,4 +12,7 @@ """ __author__ = 'JHao' -from Schedule import ProxyRefreshSchedule \ No newline at end of file +from Test import testConfig + +if __name__ == '__main__': + testConfig.testConfig() From 2b54d4af03c96515198fada0ee630cf98ea52cf9 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 15 Feb 2019 16:06:33 +0800 Subject: [PATCH 133/399] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Test/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 Test/__init__.py diff --git a/Test/__init__.py b/Test/__init__.py new file mode 100644 index 000000000..9b16c75ff --- /dev/null +++ b/Test/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: __init__ + Description : + Author : JHao + date: 2019/2/15 +------------------------------------------------- + Change Activity: + 2019/2/15: +------------------------------------------------- +""" +__author__ = 'JHao' \ No newline at end of file From f00a4569d26ef963656cf9b7617cec9f8780e666 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 15 Feb 2019 16:24:37 +0800 Subject: [PATCH 134/399] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 61 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 8bdca40c5..4f253af81 100644 --- a/README.md +++ b/README.md @@ -39,25 +39,41 @@ git clone git@github.com:jhao104/proxy_pool.git pip install -r requirements.txt ``` -* 配置Config.ini: +* 配置Config/setting.py: ```shell -# Config.ini 为项目配置文件 -# 配置DB -type = SSDB # 如果使用SSDB或redis数据库,均配置为SSDB -host = localhost # db host -port = 8888 # db port -name = proxy # 默认配置 +# Config/setting.py 为项目配置文件 + +# 配置DB +DATABASES = { + "default": { + "TYPE": "SSDB", # 如果使用SSDB或redis数据库,均配置为SSDB + "HOST": "127.0.0.1", # db host + "PORT": 8888, # db port + "NAME": "proxy", # 默认配置 + "PASSWORD": "" # db password + + } +} + # 配置 ProxyGetter -freeProxyFirst = 1 # 这里是启动的抓取函数,可在ProxyGetter/getFreeProxy.py 扩展 -freeProxySecond = 1 -.... -# 配置 HOST (api服务) -ip = 127.0.0.1 # 监听ip,0.0.0.0开启外网访问 -port = 5010 # 监听端口 -# 上面配置启动后,代理api地址为 http://127.0.0.1:5010 +PROXY_GETTER = [ + "freeProxyFirst", # 这里是启用的代理抓取函数名,可在ProxyGetter/getFreeProxy.py 扩展 + "freeProxySecond", + .... +] + + +# 配置 API服务 + +SERVER_API = { + "HOST": "0.0.0.0", # 监听ip, 0.0.0.0 监听所有IP + "PORT": 5010 # 监听端口 +} + +# 上面配置启动后,代理池访问地址为 http://127.0.0.1:5010 ``` @@ -164,18 +180,17 @@ class GetFreeProxy(object): # 确保每个proxy都是 host:ip正确的格式就行 ``` -* 2、添加好方法后,修改Config.ini文件中的`[ProxyGetter]`项: +* 2、添加好方法后,修改Config/setting.py文件中的`PROXY_GETTER`项: -  在`Config.ini`的`[ProxyGetter]`下添加自定义的方法的名字: +  在`PROXY_GETTER`下添加自定义的方法的名字: ```shell - -[ProxyGetter] -;register the proxy getter function -freeProxyFirst = 0 # 如果要取消某个方法,将其删除或赋为0即可 -.... -freeProxyCustom = 1 # 确保名字和你添加方法名字一致 - +PROXY_GETTER = [ + "freeProxyFirst", + "freeProxySecond", + .... + "freeProxyCustom" # # 确保名字和你添加方法名字一致 +] ``` From 16c5a04ba43c05608261581a6affeee1a9d1728f Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 15 Feb 2019 16:29:33 +0800 Subject: [PATCH 135/399] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 35 -------------------- Config/setting.py | 2 +- Test/testConfig.py | 2 +- Test/testGetFreeProxy.py | 11 +++---- Util/GetConfig.py | 71 ---------------------------------------- 5 files changed, 7 insertions(+), 114 deletions(-) delete mode 100644 Config.ini delete mode 100644 Util/GetConfig.py diff --git a/Config.ini b/Config.ini deleted file mode 100644 index ee13eaf2c..000000000 --- a/Config.ini +++ /dev/null @@ -1,35 +0,0 @@ -[DB] -;Configure the database information -;type: SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB -type = SSDB -host = 127.0.0.1 -port = 8888 -name = proxy -;username = your_username (Only Mongodb) -;password = your_password - -[ProxyGetter] -;register the proxy getter function -freeProxyFirst = 1 -freeProxySecond = 1 -;freeProxyThird = 1 -freeProxyFourth = 1 -freeProxyFifth = 1 -;freeProxySixth = 1 -freeProxySeventh = 1 -;freeProxyEight = 1 -;freeProxyNinth = 1 -freeProxyTen = 1 -freeProxyEleven = 1 -freeProxyTwelve = 1 -;foreign website, outside the wall -;freeProxyWallFirst = 1 -;freeProxyWallSecond = 1 -;freeProxyWallThird = 1 - -[API] -# API config http://127.0.0.1:5010 -# The ip specified when starting the web API -ip = 0.0.0.0 -# he port on which to run the web API -port = 8080 diff --git a/Config/setting.py b/Config/setting.py index 39ae36748..8b87191fa 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -51,4 +51,4 @@ SERVER_API = { "HOST": "0.0.0.0", # The ip specified which starting the web API "PORT": 5010 # port number to which the server listens to -} \ No newline at end of file +} diff --git a/Test/testConfig.py b/Test/testConfig.py index 7ed759387..ebfd1171f 100644 --- a/Test/testConfig.py +++ b/Test/testConfig.py @@ -2,7 +2,7 @@ """ ------------------------------------------------- File Name: testGetConfig - Description : test all function in GetConfig.py + Description : testGetConfig Author : J_hao date: 2017/7/31 ------------------------------------------------- diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py index 33c3f9e46..854172773 100644 --- a/Test/testGetFreeProxy.py +++ b/Test/testGetFreeProxy.py @@ -16,7 +16,6 @@ import sys import requests - try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 except: @@ -25,7 +24,7 @@ sys.path.append('..') from ProxyGetter.getFreeProxy import GetFreeProxy -from Util.GetConfig import GetConfig +from Config.ConfigGetter import config # noinspection PyPep8Naming @@ -34,15 +33,15 @@ def testGetFreeProxy(): test class GetFreeProxy in ProxyGetter/GetFreeProxy :return: """ - gc = GetConfig() - proxy_getter_functions = gc.proxy_getter_functions + proxy_getter_functions = config.proxy_getter_functions for proxyGetter in proxy_getter_functions: proxy_count = 0 for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): if proxy: - print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy,proxy_count=proxy_count)) + print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy, + proxy_count=proxy_count)) proxy_count += 1 - #assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) + # assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) if __name__ == '__main__': diff --git a/Util/GetConfig.py b/Util/GetConfig.py deleted file mode 100644 index 65554b317..000000000 --- a/Util/GetConfig.py +++ /dev/null @@ -1,71 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: GetConfig.py - Description : fetch config from config.ini - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: get db property func -------------------------------------------------- -""" -__author__ = 'JHao' - -from Util.utilClass import LazyProperty - - -class GetConfig(object): - """ - to get config from config.ini - """ - - def __init__(self): - pass - - @LazyProperty - def db_type(self): - return self.config_file.get('DB', 'type') - - @LazyProperty - def db_name(self): - return self.config_file.get('DB', 'name') - - @LazyProperty - def db_host(self): - return self.config_file.get('DB', 'host') - - @LazyProperty - def db_port(self): - return int(self.config_file.get('DB', 'port')) - - @LazyProperty - def db_password(self): - return self.config_file.get('DB', 'password') - - @LazyProperty - def proxy_getter_functions(self): - return self.config_file.options('ProxyGetter') - - @LazyProperty - def host_ip(self): - return self.config_file.get('API', 'ip') - - @LazyProperty - def host_port(self): - return int(self.config_file.get('API', 'port')) - - -config = GetConfig() - -if __name__ == '__main__': - gg = GetConfig() - print(gg.db_type) - print(gg.db_name) - print(gg.db_host) - print(gg.db_port) - print(gg.proxy_getter_functions) - print(gg.host_ip) - print(gg.host_port) - print(gg.db_password) From 55e71981168e57658371e27f7b9517011cca653f Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 18 Feb 2019 10:53:03 +0800 Subject: [PATCH 136/399] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 4f253af81..b62864f2d 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,27 @@ PROXY_GETTER = [   `ProxyRefreshSchedule`会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 +### 代理采集 + + 目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)): + + | 厂商名称 | 状态 | 更新速度 | 可用率 | 是否被墙 | 地址 | + | ----- | ---- | -------- | ------ | --------- | ----- | + | 无忧代理 | 可用 | 几分钟一次 | * | 否 | [地址](http://www.data5u.com/free/index.html) | + | 66代理 | 可用 | 更新很慢 | * | 否 | [地址](http://www.66ip.cn/) | + | 西刺代理 | 可用 | 几分钟一次 | * | 否 | [地址](http://www.xicidaili.com)| + | 全网代理 | 可用 | 几分钟一次 | * | 否 | [地址](http://www.goubanjia.com/)| + | 训代理 | 已关闭免费代理 | * | * | 否 | [地址](http://www.xdaili.cn/)| + | 快代理 | 可用 |几分钟一次| * | 否 | [地址](https://www.kuaidaili.com/)| + | 云代理 | 可用 |几分钟一次| * | 否 | [地址](http://www.ip3366.net/)| + | IP海 | 可用 |几小时一次| * | 否 | [地址](http://www.iphai.com/)| + | 免费IP代理库 | 可用 |快| * | 否 | [地址](http://ip.jiangxianli.com/)| + | 中国IP地址 | 可用 |几分钟一次| * | 是 | [地址](http://cn-proxy.com/)| + | Proxy List | 可用 |几分钟一次| * | 是 | [地址](https://proxy-list.org/chinese/index.php)| + | ProxyList+ | 可用 |几分钟一次| * | 是 | [地址](https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1)| + + 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。 + ### 问题反馈   任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,如果没有账号可以去 我的[博客](http://www.spiderpy.cn/blog/message)中留言。 From 086074c4288167871a3c23b34346ab59db01f29c Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 18 Feb 2019 11:17:38 +0800 Subject: [PATCH 137/399] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B066?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E9=87=87=E9=9B=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 48 ++++++++++++++----------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index a560dc700..caa5b6e9c 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -23,18 +23,6 @@ # for debug to disable insecureWarning requests.packages.urllib3.disable_warnings() -""" - 66ip.cn - data5u.com - xicidaili.com - goubanjia.com - xdaili.cn - kuaidaili.com - cn-proxy.com - proxy-list.org - www.mimiip.com to do -""" - class GetFreeProxy(object): """ @@ -64,24 +52,24 @@ def freeProxyFirst(page=10): print(e) @staticmethod - def freeProxySecond(area=33, page=1): + def freeProxySecond(count=20): """ 代理66 http://www.66ip.cn/ - :param area: 抓取代理页数,page=1北京代理页,page=2上海代理页...... - :param page: 翻页 + :param count: 提取数量 :return: """ - area = 33 if area > 33 else area - for area_index in range(1, area + 1): - for i in range(1, page + 1): - url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i) - html_tree = getHtmlTree(url) - tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]") - if len(tr_list) == 0: - continue - for tr in tr_list: - yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0] - break + urls = [ + "http://www.66ip.cn/mo.php?sxb=&tqsl={count}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=", + "http://www.66ip.cn/nmtq.php?getnum={count}" + "&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip", + ] + request = WebRequest() + for _ in urls: + url = _.format(count=count) + html = request.get(url).content + ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}", html) + for ip in ips: + yield ip.strip() @staticmethod def freeProxyThird(days=1): @@ -180,7 +168,7 @@ def freeProxySeventh(): @staticmethod def freeProxyEight(): """ - 秘密代理 http://www.mimiip.com + 秘密代理 http://www.mimiip.com 不能用 """ url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 2)] # 国内高匿 url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 2)] # 国内普匿 @@ -197,7 +185,7 @@ def freeProxyEight(): @staticmethod def freeProxyNinth(): """ - 码农代理 https://proxy.coderbusy.com/ + 码农代理 https://proxy.coderbusy.com/ 不能用 :return: """ urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] @@ -303,7 +291,7 @@ def freeProxyWallThird(): from CheckProxy import CheckProxy # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) - # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) @@ -313,6 +301,6 @@ def freeProxyWallThird(): # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) # CheckProxy.checkAllGetProxyFunc() From 792fd13e780205823e872d1370daa46a8b088e97 Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 18 Feb 2019 14:54:44 +0800 Subject: [PATCH 138/399] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0=E4=BB=A3?= =?UTF-8?q?=E7=90=86IP=E6=8A=93=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config/setting.py | 10 +++++----- ProxyGetter/CheckProxy.py | 2 -- ProxyGetter/getFreeProxy.py | 28 +++++++++++++--------------- Test/testGetFreeProxy.py | 11 ----------- 4 files changed, 18 insertions(+), 33 deletions(-) diff --git a/Config/setting.py b/Config/setting.py index 8b87191fa..63b4f6153 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -29,10 +29,10 @@ PROXY_GETTER = [ "freeProxyFirst", "freeProxySecond", - # "freeProxyThird", + # "freeProxyThird", # 网站已不能访问 "freeProxyFourth", "freeProxyFifth", - # "freeProxySixth" + # "freeProxySixth" # 不再提供免费代理 "freeProxySeventh", # "freeProxyEight", # "freeProxyNinth", @@ -40,9 +40,9 @@ "freeProxyEleven", "freeProxyTwelve", # foreign website, outside the wall - "freeProxyWallFirst", - "freeProxyWallSecond", - "freeProxyWallThird" + # "freeProxyWallFirst", + # "freeProxyWallSecond", + # "freeProxyWallThird" ] diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py index f29824723..2b3fc6a29 100644 --- a/ProxyGetter/CheckProxy.py +++ b/ProxyGetter/CheckProxy.py @@ -12,11 +12,9 @@ """ __author__ = 'JHao' -import sys from getFreeProxy import GetFreeProxy from Util.utilFunction import verifyProxyFormat -sys.path.append('../') from Util.LogHandler import LogHandler diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index caa5b6e9c..cdfa843a0 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -88,7 +88,7 @@ def freeProxyThird(days=1): pass @staticmethod - def freeProxyFourth(page_count=2): + def freeProxyFourth(page_count=1): """ 西刺代理 http://www.xicidaili.com :return: @@ -136,7 +136,7 @@ def freeProxyFifth(): @staticmethod def freeProxySixth(): """ - 讯代理 http://www.xdaili.cn/ + 讯代理 http://www.xdaili.cn/ 已停用 :return: """ url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' @@ -154,21 +154,19 @@ def freeProxySeventh(): 快代理 https://www.kuaidaili.com """ url_list = [ - 'https://www.kuaidaili.com/free/inha/{page}/', - 'https://www.kuaidaili.com/free/intr/{page}/' + 'https://www.kuaidaili.com/free/inha/', + 'https://www.kuaidaili.com/free/intr/' ] for url in url_list: - for page in range(1, 2): - page_url = url.format(page=page) - tree = getHtmlTree(page_url) - proxy_list = tree.xpath('.//table//tr') - for tr in proxy_list[1:]: - yield ':'.join(tr.xpath('./td/text()')[0:2]) + tree = getHtmlTree(url) + proxy_list = tree.xpath('.//table//tr') + for tr in proxy_list[1:]: + yield ':'.join(tr.xpath('./td/text()')[0:2]) @staticmethod def freeProxyEight(): """ - 秘密代理 http://www.mimiip.com 不能用 + 秘密代理 http://www.mimiip.com 已停用 """ url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 2)] # 国内高匿 url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 2)] # 国内普匿 @@ -185,7 +183,7 @@ def freeProxyEight(): @staticmethod def freeProxyNinth(): """ - 码农代理 https://proxy.coderbusy.com/ 不能用 + 码农代理 https://proxy.coderbusy.com/ 已停用 :return: """ urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] @@ -233,7 +231,7 @@ def freeProxyEleven(): @staticmethod def freeProxyTwelve(page_count=2): """ - guobanjia http://ip.jiangxianli.com/?page= + http://ip.jiangxianli.com/?page= 免费代理库 超多量 :return: @@ -291,7 +289,7 @@ def freeProxyWallThird(): from CheckProxy import CheckProxy # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) @@ -300,7 +298,7 @@ def freeProxyWallThird(): # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEight) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen) - # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) # CheckProxy.checkAllGetProxyFunc() diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py index 854172773..5074945b4 100644 --- a/Test/testGetFreeProxy.py +++ b/Test/testGetFreeProxy.py @@ -12,22 +12,11 @@ """ __author__ = 'J_hao' -import re -import sys -import requests -try: - from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 -except: - reload(sys) - sys.setdefaultencoding('utf-8') - -sys.path.append('..') from ProxyGetter.getFreeProxy import GetFreeProxy from Config.ConfigGetter import config -# noinspection PyPep8Naming def testGetFreeProxy(): """ test class GetFreeProxy in ProxyGetter/GetFreeProxy From 07f9845017836d2776272e87551b55fb4a677f1a Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 19 Feb 2019 15:24:23 +0800 Subject: [PATCH 139/399] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0=E4=BB=A3?= =?UTF-8?q?=E7=90=86IP=E6=8A=93=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/release_notes.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/release_notes.md b/doc/release_notes.md index 0871a2db5..36e097726 100644 --- a/doc/release_notes.md +++ b/doc/release_notes.md @@ -1,5 +1,11 @@ ## Release Notes +* 1.13 (2019.02) + + 1.使用.py文件替换.ini作为配置文件; + + 2.更新代理采集部分; + * 1.12 (2018.4) 1.优化代理格式检查; From 0c48d9dc1a0e3dcb2f166882ea29ed7ad3213a21 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Tue, 5 Mar 2019 10:05:06 +0800 Subject: [PATCH 140/399] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b62864f2d..48edb4c98 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ * 支持版本: ![](https://img.shields.io/badge/Python-2.x-green.svg) ![](https://img.shields.io/badge/Python-3.x-blue.svg) -* 测试地址: http://123.207.35.36:5010 (单机勿压。感谢) +* 测试地址: http://118.24.52.95:5010 (单机勿压。感谢) ### 下载安装 From b568bd2092fc4aa405314968ead1102b1216f18d Mon Sep 17 00:00:00 2001 From: weak_ptr Date: Sun, 10 Mar 2019 17:21:54 +0800 Subject: [PATCH 141/399] =?UTF-8?q?[refine]=20=E5=85=81=E8=AE=B8=20docker-?= =?UTF-8?q?compose=20up=20=E7=9B=B4=E6=8E=A5=E8=BF=90=E8=A1=8C=E6=9C=8D?= =?UTF-8?q?=E5=8A=A1=E8=80=8C=E6=97=A0=E9=9C=80=E4=BF=AE=E6=94=B9=E9=85=8D?= =?UTF-8?q?=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 以下为修改内容。 - 移除现在看起来无用的 Dockerfile.develop - 将 Dockerfile 和 docker-compose.yml 移动到项目根目录下,删除 Docker 目录 - 修改 docker-compose.yml 内容,令 docker-compose 自行构建 proxy_pool,通过环境变量传递数据库类型和域名、端口等配置信息,不再暴露 redis 端口到 host 主机 - 修改 Dockerfile 内容,先复制 requirements.txt,完成依赖安装后,再复制代码文件,避免开发迭代时每次都要等 pip install - 修改 Config.setting 模块,先尝试通过环境变量获取配置信息,并提供未配置环境变量时的默认值。 --- Config/setting.py | 24 ++++++++++++++++++++---- Docker/Dockerfile.develop | 27 --------------------------- Docker/docker-compose.yml | 14 -------------- Docker/Dockerfile => Dockerfile | 9 +++------ docker-compose.yml | 14 ++++++++++++++ requirements.txt | 3 --- 6 files changed, 37 insertions(+), 54 deletions(-) delete mode 100644 Docker/Dockerfile.develop delete mode 100644 Docker/docker-compose.yml rename Docker/Dockerfile => Dockerfile (89%) create mode 100644 docker-compose.yml diff --git a/Config/setting.py b/Config/setting.py index 63b4f6153..a74e69a32 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -12,12 +12,29 @@ """ # database config +from os import getenv + + +class ConfigError(BaseException): + pass + + +DB_TYPE = getenv('db_type', 'SSDB') + +if DB_TYPE == 'SSDB': + DB_HOST = getenv('ssdb_host', '127.0.0.1') + DB_PORT = getenv('ssdb_port', '6379') +elif DB_TYPE == 'MONGODB': + DB_HOST = getenv('mongodb_host', '127.0.0.1') + DB_PORT = getenv('mongodb_host', '27017') +else: + raise ConfigError('Unknown database type, your environment variable `db_type` should be one of SSDB/MONGODB.') DATABASES = { "default": { - "TYPE": "SSDB", # TYPE SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB - "HOST": "127.0.0.1", - "PORT": 8888, + "TYPE": DB_TYPE, # TYPE SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB + "HOST": DB_HOST, + "PORT": DB_PORT, "NAME": "proxy", "PASSWORD": "" @@ -45,7 +62,6 @@ # "freeProxyWallThird" ] - # # API config http://127.0.0.1:5010 SERVER_API = { diff --git a/Docker/Dockerfile.develop b/Docker/Dockerfile.develop deleted file mode 100644 index d97495489..000000000 --- a/Docker/Dockerfile.develop +++ /dev/null @@ -1,27 +0,0 @@ -FROM python:3.6 -WORKDIR /usr/src/app -COPY . . -ENV DEBIAN_FRONTEND noninteractive -ENV TZ Asia/Shanghai - -RUN apt-get update -RUN apt-get install vim -y - -RUN apt-get install -y redis-server -RUN sed -i 's/^\(bind .*\)$/# \1/' /etc/redis/redis.conf \ - && sed -i 's/^\(databases .*\)$/databases 1/' /etc/redis/redis.conf \ - && sed -i 's/^\(daemonize .*\)$/daemonize yes/' /etc/redis/redis.conf -# && sed -i 's/^\(dir .*\)$/# \1\ndir \/data/' /etc/redis/redis.conf \ -# && sed -i 's/^\(logfile .*\)$/# \1/' /etc/redis/redis.conf - -RUN pip install --no-cache-dir -r requirements.txt - - -RUN echo "# ! /bin/sh " > run.sh \ - && echo "redis-server /etc/redis/redis.conf&" >> run.sh \ - && echo "cd Run" >> run.sh \ - && echo "python main.py" >> run.sh \ - && chmod 777 run.sh - -EXPOSE 5010 -CMD [ "sh", "run.sh" ] diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml deleted file mode 100644 index 9529745d5..000000000 --- a/Docker/docker-compose.yml +++ /dev/null @@ -1,14 +0,0 @@ -version: '2' -services: - proxy_pool: - volumes: - - ..:/usr/src/app - ports: - - "5010:5010" - links: - - proxy_redis - image: "proxy_pool" - proxy_redis: - ports: - - "6379:6379" - image: "redis" \ No newline at end of file diff --git a/Docker/Dockerfile b/Dockerfile similarity index 89% rename from Docker/Dockerfile rename to Dockerfile index 6ad6f5f53..abe8ddb07 100644 --- a/Docker/Dockerfile +++ b/Dockerfile @@ -1,13 +1,10 @@ FROM python:3.6 -WORKDIR /usr/src/app -COPY . . - ENV DEBIAN_FRONTEND noninteractive ENV TZ Asia/Shanghai - +WORKDIR /usr/src/app +COPY ./requirements.txt . RUN pip install --no-cache-dir -r requirements.txt - +COPY . . EXPOSE 5010 - WORKDIR /usr/src/app/ CMD [ "python", "Run/main.py" ] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 000000000..1c7f24659 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +version: '2' +services: + proxy_pool: + build: . + ports: + - "5010:5010" + links: + - proxy_redis + environment: + db_type: SSDB + ssdb_host: proxy_redis + ssdb_port: 6379 + proxy_redis: + image: "redis" diff --git a/requirements.txt b/requirements.txt index bc3581ff5..3da935240 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,8 +3,5 @@ werkzeug==0.11.15 Flask==0.12 requests==2.20.0 lxml==3.7.2 - pymongo redis - - From 595b08861abfa0e3a4e8dfa16132686292a5815c Mon Sep 17 00:00:00 2001 From: baiyan Date: Sun, 24 Mar 2019 00:42:18 +0800 Subject: [PATCH 142/399] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=97=A0=E5=BF=A7?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E8=A7=A3=E6=9E=90=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index cdfa843a0..470cbb3c2 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -33,7 +33,10 @@ class GetFreeProxy(object): def freeProxyFirst(page=10): """ 无忧代理 http://www.data5u.com/ - 几乎没有能用的 + 无忧代理有反爬虫机制。 + 需要获得元素的 classname。 + 匹配classname中每个字符在key中的位置,组合得到一个整数。 + 最后将整数右移3位得到的才是正确的端口号。 :param page: 页数 :return: """ @@ -42,12 +45,21 @@ def freeProxyFirst(page=10): 'http://www.data5u.com/free/gngn/index.shtml', 'http://www.data5u.com/free/gnpt/index.shtml' ] + key = 'ABCDEFGHIZ' for url in url_list: html_tree = getHtmlTree(url) ul_list = html_tree.xpath('//ul[@class="l2"]') for ul in ul_list: try: - yield ':'.join(ul.xpath('.//li/text()')[0:2]) + ip = ul.xpath('./span[1]/li/text()')[0] + classnames = ul.xpath('./span[2]/li/attribute::class')[0] + classname = classnames.split(' ')[1] + port_sum = 0 + for c in classname: + port_sum *= 10 + port_sum += key.index(c) + port = port_sum >> 3 + yield '{}:{}'.format(ip, port) except Exception as e: print(e) From 35467fb3bc8ac5c63b6939df84aa027f820f3421 Mon Sep 17 00:00:00 2001 From: Oddcc Date: Fri, 29 Mar 2019 14:19:19 +0800 Subject: [PATCH 143/399] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 更新文档中生产环境部署命令 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 48edb4c98..2bee689f4 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ SERVER_API = { # Workdir proxy_pool docker build -t proxy_pool . pip install docker-compose -docker-compose -f Docker/docker-compose.yml up -d +docker-compose -f docker-compose.yml up -d ``` * 开发环境 Docker From f8d039e61e0dc88ebfee43f96f9a584f07c9ca90 Mon Sep 17 00:00:00 2001 From: houbaron Date: Wed, 8 May 2019 21:40:11 +0800 Subject: [PATCH 144/399] =?UTF-8?q?[refine]=E5=85=81=E8=AE=B8=20docker-com?= =?UTF-8?q?pose.yml=20=E5=AE=9A=E4=B9=89=E5=AF=86=E7=A0=81=E8=80=8C?= =?UTF-8?q?=E6=97=A0=E9=A1=BB=E4=BF=AE=E6=94=B9=20setting.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config/setting.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Config/setting.py b/Config/setting.py index a74e69a32..66b8f0866 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -24,9 +24,11 @@ class ConfigError(BaseException): if DB_TYPE == 'SSDB': DB_HOST = getenv('ssdb_host', '127.0.0.1') DB_PORT = getenv('ssdb_port', '6379') + DB_PASSWORD = getenv('ssdb_password', '6379') elif DB_TYPE == 'MONGODB': DB_HOST = getenv('mongodb_host', '127.0.0.1') DB_PORT = getenv('mongodb_host', '27017') + DB_PASSWORD = getenv('mongodb_password', '6379') else: raise ConfigError('Unknown database type, your environment variable `db_type` should be one of SSDB/MONGODB.') @@ -36,7 +38,7 @@ class ConfigError(BaseException): "HOST": DB_HOST, "PORT": DB_PORT, "NAME": "proxy", - "PASSWORD": "" + "PASSWORD": DB_PASSWORD } } From bb4a7b9367a74645d1bfecbf92299260ef4bde0f Mon Sep 17 00:00:00 2001 From: houbaron Date: Wed, 8 May 2019 21:44:56 +0800 Subject: [PATCH 145/399] =?UTF-8?q?[refine]=E8=AE=BE=E7=BD=AE=E9=BB=98?= =?UTF-8?q?=E8=AE=A4=E5=AF=86=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config/setting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Config/setting.py b/Config/setting.py index 66b8f0866..358b0bfbc 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -24,11 +24,11 @@ class ConfigError(BaseException): if DB_TYPE == 'SSDB': DB_HOST = getenv('ssdb_host', '127.0.0.1') DB_PORT = getenv('ssdb_port', '6379') - DB_PASSWORD = getenv('ssdb_password', '6379') + DB_PASSWORD = getenv('ssdb_password', '') elif DB_TYPE == 'MONGODB': DB_HOST = getenv('mongodb_host', '127.0.0.1') DB_PORT = getenv('mongodb_host', '27017') - DB_PASSWORD = getenv('mongodb_password', '6379') + DB_PASSWORD = getenv('mongodb_password', '') else: raise ConfigError('Unknown database type, your environment variable `db_type` should be one of SSDB/MONGODB.') From f5a4317bbc96f6396d85337bba735545c437fecd Mon Sep 17 00:00:00 2001 From: hero Date: Sat, 11 May 2019 20:09:56 +0800 Subject: [PATCH 146/399] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=85=A8=E7=BD=91?= =?UTF-8?q?=E4=BB=A3=E7=90=86port=E9=94=99=E8=AF=AF=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index cdfa843a0..330bf090a 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -128,8 +128,20 @@ def freeProxyFifth(): try: # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port ip_addr = ''.join(each_proxy.xpath(xpath_str)) - port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0] - yield '{}:{}'.format(ip_addr, port) + + # HTML中的port是随机数,真正的端口编码在class后面的字母中。 + # 比如这个: + # 9054 + # CFACE解码后对应的是3128。 + port = 0 + for _ in each_proxy.xpath(".//span[contains(@class, 'port')]" + "/attribute::class")[0]. \ + replace("port ", ""): + port *= 10 + port += (ord(_) - ord('A')) + port /= 8 + + yield '{}:{}'.format(ip_addr, int(port)) except Exception as e: pass From 35f43ecbe67ba869fcb3b7f044185f79a7452699 Mon Sep 17 00:00:00 2001 From: jhao Date: Wed, 10 Jul 2019 17:17:32 +0800 Subject: [PATCH 147/399] [update] fix 272 --- Schedule/ProxyCheck.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 4300f7bf7..782d993d1 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -15,6 +15,12 @@ import sys from threading import Thread + +try: + from Queue import Empty # py3 +except: + from queue import Empty # py2 + sys.path.append('../') from Util.utilFunction import validUsefulProxy @@ -35,7 +41,10 @@ def __init__(self, queue, item_dict): def run(self): self.db.changeTable(self.useful_proxy_queue) while self.queue.qsize(): - proxy = self.queue.get() + try: + proxy = self.queue.get() + except Empty: + break count = self.item_dict[proxy] if validUsefulProxy(proxy): # 验证通过计数器减1 @@ -53,8 +62,3 @@ def run(self): self.db.put(proxy, num=int(count) + 1) self.queue.task_done() - -if __name__ == '__main__': - # p = ProxyCheck() - # p.run() - pass From 2f39dedbf36c3838233f452323f18ddad25f9e7b Mon Sep 17 00:00:00 2001 From: jhao Date: Thu, 11 Jul 2019 16:39:23 +0800 Subject: [PATCH 148/399] =?UTF-8?q?[update]=20=E4=BB=A3=E7=90=86=E5=AF=B9?= =?UTF-8?q?=E8=B1=A1=E7=B1=BB=E5=9E=8B=E5=B0=81=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyHelper/Proxy.py | 104 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 ProxyHelper/Proxy.py diff --git a/ProxyHelper/Proxy.py b/ProxyHelper/Proxy.py new file mode 100644 index 000000000..dce009e96 --- /dev/null +++ b/ProxyHelper/Proxy.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: Proxy + Description : 代理对象类型封装 + Author : JHao + date: 2019/7/11 +------------------------------------------------- + Change Activity: + 2019/7/11: 代理对象类型封装 +------------------------------------------------- +""" +__author__ = 'JHao' + + +class Proxy(object): + + def __init__(self, proxy): + if isinstance(proxy, basestring): + self._proxy = proxy + self._fail_count = 0 + self._region = "" + self._type = "" + self._last_status = "" + self._last_time = "" + + elif isinstance(proxy, dict): + self._proxy = proxy.get("proxy") + self._fail_count = proxy.get("fail_count") + self._region = proxy.get("region") + self._type = proxy.get("type") + self._last_status = proxy.get("last_status") + self._last_time = proxy.get("last_time") + + else: + raise TypeError("proxy arg invalid") + + @property + def proxy(self): + """ 代理 ip:port """ + return self._proxy + + @property + def fail_count(self): + """ 检测失败次数 """ + return self._fail_count + + @property + def region(self): + """ 地理位置(国家/城市) """ + return self._region + + @property + def type(self): + """ 透明/匿名/高匿 """ + return self._type + + @property + def last_status(self): + """ 最后一次检测结果 """ + return self._last_status + + @property + def last_time(self): + """ 最后一次检测时间 """ + return self._last_time + + # --- proxy method --- + @fail_count.setter + def fail_count(self, value): + self._fail_count = value + + @region.setter + def region(self, value): + self._region = value + + @type.setter + def type(self, value): + self._type = value + + @last_status.setter + def last_status(self, value): + self._last_status = value + + @last_time.setter + def last_time(self, value): + self._last_time = value + + +def proxy2Json(proxy): + return {"proxy": proxy.proxy, + "fail_count": proxy.fail_count, + "region": proxy.region, + "type": proxy.type, + "last_status": proxy.last_status, + "last_time": proxy.last_time} + + +if __name__ == '__main__': + p = Proxy("127.0.0.1:8080") + + import json + + print json.dumps(p, default=proxy2Json) From 964061e8e80baf2534652e290385f7131f880447 Mon Sep 17 00:00:00 2001 From: jhao Date: Thu, 11 Jul 2019 17:03:31 +0800 Subject: [PATCH 149/399] =?UTF-8?q?[update]=20=E6=97=A0=E5=BF=A7=E4=BB=A3?= =?UTF-8?q?=E7=90=86=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 330bf090a..60dd884c1 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -30,17 +30,14 @@ class GetFreeProxy(object): """ @staticmethod - def freeProxyFirst(page=10): + def freeProxy01(): """ 无忧代理 http://www.data5u.com/ 几乎没有能用的 - :param page: 页数 :return: """ url_list = [ 'http://www.data5u.com/', - 'http://www.data5u.com/free/gngn/index.shtml', - 'http://www.data5u.com/free/gnpt/index.shtml' ] for url in url_list: html_tree = getHtmlTree(url) @@ -300,7 +297,7 @@ def freeProxyWallThird(): if __name__ == '__main__': from CheckProxy import CheckProxy - # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy01()) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth) @@ -310,7 +307,7 @@ def freeProxyWallThird(): # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEight) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen) - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) # CheckProxy.checkAllGetProxyFunc() From f0c7a0f918cad270a508bed8296cd644b5dc1722 Mon Sep 17 00:00:00 2001 From: jhao Date: Thu, 18 Jul 2019 10:00:04 +0800 Subject: [PATCH 150/399] =?UTF-8?q?[update]=20=E7=A0=B4=E8=A7=A3=E4=BB=A3?= =?UTF-8?q?=E7=90=8666=20=E5=8A=A0=E9=80=9F=E4=B9=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config/setting.py | 4 +-- ProxyGetter/CheckProxy.py | 2 +- ProxyGetter/getFreeProxy.py | 56 +++++++++++++++++++++++++++---------- Util/utilFunction.py | 12 -------- requirements.txt | 1 + 5 files changed, 46 insertions(+), 29 deletions(-) diff --git a/Config/setting.py b/Config/setting.py index 358b0bfbc..4ef1b76eb 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -46,8 +46,8 @@ class ConfigError(BaseException): # register the proxy getter function PROXY_GETTER = [ - "freeProxyFirst", - "freeProxySecond", + "freeProxy01", + "freeProxy02", # "freeProxyThird", # 网站已不能访问 "freeProxyFourth", "freeProxyFifth", diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py index 2b3fc6a29..d15be49c9 100644 --- a/ProxyGetter/CheckProxy.py +++ b/ProxyGetter/CheckProxy.py @@ -67,4 +67,4 @@ def checkGetProxyFunc(func): if __name__ == '__main__': CheckProxy.checkAllGetProxyFunc() - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy01) \ No newline at end of file diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 60dd884c1..1b1277af4 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -49,24 +49,52 @@ def freeProxy01(): print(e) @staticmethod - def freeProxySecond(count=20): + def freeProxy02(count=20): """ 代理66 http://www.66ip.cn/ :param count: 提取数量 :return: """ urls = [ - "http://www.66ip.cn/mo.php?sxb=&tqsl={count}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=", - "http://www.66ip.cn/nmtq.php?getnum={count}" - "&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip", - ] - request = WebRequest() - for _ in urls: - url = _.format(count=count) - html = request.get(url).content - ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}", html) - for ip in ips: - yield ip.strip() + "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=", + "http://www.66ip.cn/nmtq.php?getnum={}&isp=0&anonymoustype=0&s" + "tart=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip" + ] + + try: + import execjs + import requests + + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', + 'Accept': '*/*', + 'Connection': 'keep-alive', + 'Accept-Language': 'zh-CN,zh;q=0.8'} + session = requests.session() + src = session.get("http://www.66ip.cn/", headers=headers).text + src = src.split("")[0] + '}' + src = src.replace("")[0] + '}' + src = src.replace("")[0] + '}' + src = src.replace("")[0] + '}' - src = src.replace("")[0] + '}' +# src = src.replace("")[0] + '}' -# src = src.replace("")[0] + '}' - src = src.replace("