From f9a8bf54054f0e7435c7e9718da9033988e0958c Mon Sep 17 00:00:00 2001
From: highroom <827148@163.com>
Date: Mon, 14 May 2018 23:39:30 +0800
Subject: [PATCH 001/298] =?UTF-8?q?=E5=A2=9E=E5=8A=A0fq=E4=BB=A3=E7=90=86?=
=?UTF-8?q?=E7=9A=84=E9=85=8D=E7=BD=AE=EF=BC=8C=E9=85=8D=E7=BD=AE=E5=90=8E?=
=?UTF-8?q?=E8=AF=B7=E8=B0=83=E7=94=A8=E4=BB=A3=E7=90=86=E8=AE=BF=E9=97=AE?=
=?UTF-8?q?wallproxy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Config.ini | 4 ++++
ProxyGetter/getFreeProxy.py | 44 ++++++++++++++++++++++++++++++++++---
Util/WebRequest.py | 2 +-
3 files changed, 46 insertions(+), 4 deletions(-)
diff --git a/Config.ini b/Config.ini
index 5f417badc..dae17c1a2 100644
--- a/Config.ini
+++ b/Config.ini
@@ -29,3 +29,7 @@ freeProxyWallThird = 1
; API接口配置 http://127.0.0.1:5010
ip = 0.0.0.0
port = 5010
+
+[WallProxy]
+; fq代理配置
+; proxy = 127.0.0.1:1080
\ No newline at end of file
diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py
index 78837d50a..edb71fa8b 100644
--- a/ProxyGetter/getFreeProxy.py
+++ b/ProxyGetter/getFreeProxy.py
@@ -14,6 +14,12 @@
import re
import sys
import requests
+import os
+
+try:
+ from configparser import ConfigParser # py3
+except:
+ from ConfigParser import ConfigParser # py2
try:
from importlib import reload # py3 实际不会实用,只是为了不显示语法错误
@@ -46,6 +52,15 @@ class GetFreeProxy(object):
"""
proxy getter
"""
+ pwd = os.path.split(os.path.realpath(__file__))[0]
+ config_path = os.path.join(os.path.split(pwd)[0], 'Config.ini')
+ config_file = ConfigParser()
+ config_file.read(config_path)
+ if config_file.has_option('WallProxy', 'proxy'):
+ WallProxy = config_file.get('WallProxy', 'proxy')
+ wall_proxies = {"http": "http://{}".format(WallProxy), "https": "https://{}".format(WallProxy)}
+ else:
+ wall_proxies = None
def __init__(self):
pass
@@ -257,10 +272,17 @@ def freeProxyWallFirst():
墙外网站 cn-proxy
:return:
"""
+ kwargs = {}
+ if GetFreeProxy.wall_proxies:
+ kwargs['proxies'] = GetFreeProxy.wall_proxies
+ else:
+ return
+
urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
request = WebRequest()
for url in urls:
- r = request.get(url)
+ kwargs['url'] = url
+ r = request.get(**kwargs)
proxies = re.findall(r'
(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\w\W](\d+) | ', r.text)
for proxy in proxies:
yield ':'.join(proxy)
@@ -271,21 +293,35 @@ def freeProxyWallSecond():
https://proxy-list.org/english/index.php
:return:
"""
+ kwargs = {}
+ if GetFreeProxy.wall_proxies:
+ kwargs['proxies'] = GetFreeProxy.wall_proxies
+ else:
+ return
urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
request = WebRequest()
import base64
for url in urls:
- r = request.get(url)
+ kwargs['url'] = url
+ r = request.get(**kwargs)
proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
for proxy in proxies:
yield base64.b64decode(proxy).decode()
@staticmethod
def freeProxyWallThird():
+
+ kwargs = {}
+ if GetFreeProxy.wall_proxies:
+ kwargs['proxies'] = GetFreeProxy.wall_proxies
+ else:
+ return
+
urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
request = WebRequest()
for url in urls:
- r = request.get(url)
+ kwargs['url'] = url
+ r = request.get(**kwargs)
proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\s\S]*?(\d+) | ', r.text)
for proxy in proxies:
yield ':'.join(proxy)
@@ -319,3 +355,5 @@ def freeProxyWallThird():
# test_batch(gg.freeProxyWallSecond())
# test_batch(gg.freeProxyWallThird())
+ for e in gg.freeProxyWallThird():
+ print(e)
diff --git a/Util/WebRequest.py b/Util/WebRequest.py
index 68db87500..47286a225 100644
--- a/Util/WebRequest.py
+++ b/Util/WebRequest.py
@@ -70,7 +70,7 @@ def get(self, url, header=None, retry_time=5, timeout=30,
headers.update(header)
while True:
try:
- html = requests.get(url, headers=headers, timeout=timeout)
+ html = requests.get(url, headers=headers, timeout=timeout, **kwargs)
if any(f in html.content for f in retry_flag):
raise Exception
return html
From 413e41b2973e41742e55ab7bb7a1d642fe6ada8d Mon Sep 17 00:00:00 2001
From: jhao104
Date: Tue, 10 Jul 2018 16:50:31 +0800
Subject: [PATCH 002/298] =?UTF-8?q?[update]=20=E4=BF=AE=E6=94=B9ProxyGette?=
=?UTF-8?q?r=E6=A3=80=E6=9F=A5=E6=96=B9=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Config.ini | 4 --
ProxyGetter/CheckProxy.py | 72 ++++++++++++++++++++++++++++
ProxyGetter/getFreeProxy.py | 96 ++++---------------------------------
3 files changed, 81 insertions(+), 91 deletions(-)
create mode 100644 ProxyGetter/CheckProxy.py
diff --git a/Config.ini b/Config.ini
index 95e33400d..ca011a01f 100644
--- a/Config.ini
+++ b/Config.ini
@@ -30,7 +30,3 @@ freeProxyWallThird = 1
; API接口配置 http://127.0.0.1:5010
ip = 0.0.0.0
port = 5010
-
-[WallProxy]
-; fq代理配置
-; proxy = 127.0.0.1:1080
\ No newline at end of file
diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py
new file mode 100644
index 000000000..f6ba9b66a
--- /dev/null
+++ b/ProxyGetter/CheckProxy.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+"""
+-------------------------------------------------
+ File Name: CheckProxy
+ Description : used for check getFreeProxy.py
+ Author : JHao
+ date: 2018/7/10
+-------------------------------------------------
+ Change Activity:
+ 2018/7/10: CheckProxy
+-------------------------------------------------
+"""
+__author__ = 'JHao'
+
+import sys
+from getFreeProxy import GetFreeProxy
+from Util.utilFunction import verifyProxyFormat
+
+sys.path.append('../')
+
+from Util.LogHandler import LogHandler
+
+log = LogHandler('check_proxy', file=False)
+
+
+class CheckProxy(object):
+
+ @staticmethod
+ def checkAllGetProxyFunc():
+ """
+ 检查getFreeProxy所有代理获取函数运行情况
+ Returns:
+ None
+ """
+ import inspect
+ member_list = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction)
+ proxy_count_dict = dict()
+ for func_name, func in member_list:
+ log.info(u"开始运行 {}".format(func_name))
+ try:
+ proxy_list = [_ for _ in func() if verifyProxyFormat(_)]
+ proxy_count_dict[func_name] = len(proxy_list)
+ except Exception as e:
+ log.info(u"代理获取函数 {} 运行出错!".format(func_name))
+ log.error(str(e))
+ log.info(u"所有函数运行完毕 " + "***" * 5)
+ for func_name, func in member_list:
+ log.info(u"函数 {n}, 获取到代理数: {c}".format(n=func_name, c=proxy_count_dict.get(func_name, 0)))
+
+ @staticmethod
+ def checkGetProxyFunc(func):
+ """
+ 检查指定的getFreeProxy某个function运行情况
+ Args:
+ func: getFreeProxy中某个可调用方法
+
+ Returns:
+ None
+ """
+ func_name = getattr(func, '__name__', "None")
+ log.info("start running func: {}".format(func_name))
+ count = 0
+ for proxy in func():
+ if verifyProxyFormat(proxy):
+ log.info("fetch proxy: {}".format(proxy))
+ count += 1
+ log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count))
+
+
+if __name__ == '__main__':
+ CheckProxy.checkAllGetProxyFunc()
+ CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst)
diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py
index 23542a5b7..bf2e03f61 100644
--- a/ProxyGetter/getFreeProxy.py
+++ b/ProxyGetter/getFreeProxy.py
@@ -14,13 +14,6 @@
import re
import sys
import requests
-import os
-
-try:
- from configparser import ConfigParser # py3
-except:
- from ConfigParser import ConfigParser # py2
-
try:
from importlib import reload # py3 实际不会实用,只是为了不显示语法错误
@@ -30,8 +23,8 @@
sys.path.append('..')
-from Util.utilFunction import robustCrawl, getHtmlTree
from Util.WebRequest import WebRequest
+from Util.utilFunction import getHtmlTree
from Util.utilFunction import verifyProxyFormat
# for debug to disable insecureWarning
@@ -54,15 +47,6 @@ class GetFreeProxy(object):
"""
proxy getter
"""
- pwd = os.path.split(os.path.realpath(__file__))[0]
- config_path = os.path.join(os.path.split(pwd)[0], 'Config.ini')
- config_file = ConfigParser()
- config_file.read(config_path)
- if config_file.has_option('WallProxy', 'proxy'):
- WallProxy = config_file.get('WallProxy', 'proxy')
- wall_proxies = {"http": "http://{}".format(WallProxy), "https": "https://{}".format(WallProxy)}
- else:
- wall_proxies = None
def __init__(self):
pass
@@ -215,7 +199,7 @@ def freeProxyEight():
request = WebRequest()
for url in url_list:
- r = request.get(url, use_proxy=True)
+ r = request.get(url)
proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\w\W].*(\d+) | ', r.text)
for proxy in proxies:
yield ':'.join(proxy)
@@ -278,7 +262,6 @@ def freeProxyTwelve(page_count=8):
"""
for i in range(1, page_count + 1):
url = 'http://ip.jiangxianli.com/?page={}'.format(i)
- # print(url)
html_tree = getHtmlTree(url)
tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr")
if len(tr_list) == 0:
@@ -292,17 +275,10 @@ def freeProxyWallFirst():
墙外网站 cn-proxy
:return:
"""
- kwargs = {}
- if GetFreeProxy.wall_proxies:
- kwargs['proxies'] = GetFreeProxy.wall_proxies
- else:
- return
-
urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
request = WebRequest()
for url in urls:
- kwargs['url'] = url
- r = request.get(**kwargs)
+ r = request.get(url)
proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\w\W](\d+) | ', r.text)
for proxy in proxies:
yield ':'.join(proxy)
@@ -313,84 +289,30 @@ def freeProxyWallSecond():
https://proxy-list.org/english/index.php
:return:
"""
- kwargs = {}
- if GetFreeProxy.wall_proxies:
- kwargs['proxies'] = GetFreeProxy.wall_proxies
- else:
- return
urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
request = WebRequest()
import base64
for url in urls:
- kwargs['url'] = url
- r = request.get(**kwargs)
+ r = request.get(url)
proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
for proxy in proxies:
yield base64.b64decode(proxy).decode()
@staticmethod
def freeProxyWallThird():
-
- kwargs = {}
- if GetFreeProxy.wall_proxies:
- kwargs['proxies'] = GetFreeProxy.wall_proxies
- else:
- return
-
urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
request = WebRequest()
for url in urls:
- kwargs['url'] = url
- r = request.get(**kwargs)
+ r = request.get(url)
proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\s\S]*?(\d+) | ', r.text)
for proxy in proxies:
yield ':'.join(proxy)
if __name__ == '__main__':
- gg = GetFreeProxy()
-
- # test_batch(gg.freeProxyFirst())
-
- # test_batch(gg.freeProxySecond())
-
- # test_batch(gg.freeProxyFourth())
-
- # test_batch(gg.freeProxyFifth())
-
- # test_batch(gg.freeProxySixth())
-
- # test_batch(gg.freeProxySeventh())
-
- # test_batch(gg.freeProxyEight())
-
- # test_batch(gg.freeProxyNinth())
-
- # test_batch(gg.freeProxyTen())
-
- # test_batch(gg.freeProxyEleven())
-
- proxy_iter = gg.freeProxyTwelve()
- proxy_set = set()
- for proxy in proxy_iter:
- proxy = proxy.strip()
- if proxy and verifyProxyFormat(proxy):
- #self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
- proxy_set.add(proxy)
- #else:
- #self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy))
-
- # store
- for proxy in proxy_set:
- print(proxy)
-
-
- # test_batch(gg.freeProxyTwelve())
-
- # test_batch(gg.freeProxyWallFirst())
+ from CheckProxy import CheckProxy
- # test_batch(gg.freeProxyWallSecond())
+ CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth)
+ CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond)
- # test_batch(gg.freeProxyWallThird())
- for e in gg.freeProxyWallThird():
- print(e)
+ CheckProxy.checkAllGetProxyFunc()
From edac60ce8ea6340834e1e4afa53d37f3e1a783a8 Mon Sep 17 00:00:00 2001
From: jhao104
Date: Tue, 10 Jul 2018 16:54:13 +0800
Subject: [PATCH 003/298] [update] readme
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 8c59cd631..8dc30eee9 100644
--- a/README.md
+++ b/README.md
@@ -178,10 +178,10 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致
这里感谢以下contributor的无私奉献:
- [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)
+ [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)
### Release Notes
- [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) [@luocaodan](https://github.com/luocaodan)
+ [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md)
From 62b05856fbed3f104842010defe0f46fd5e5c242 Mon Sep 17 00:00:00 2001
From: YeClimEric
Date: Wed, 10 Oct 2018 17:50:34 +0800
Subject: [PATCH 004/298] =?UTF-8?q?1.flask=E6=94=AF=E6=8C=81=E5=A4=9A?=
=?UTF-8?q?=E8=BF=9B=E7=A8=8B=E5=A4=84=E7=90=86=E4=BB=BB=E5=8A=A1=202.?=
=?UTF-8?q?=E4=BC=98=E5=8C=96=20proxy=20=E9=87=87=E9=9B=86=E3=80=81?=
=?UTF-8?q?=E6=A0=A1=E9=AA=8C=E6=B5=81=E7=A8=8B=EF=BC=8C=E5=8A=A0=E5=BF=AB?=
=?UTF-8?q?=20userfull=20proxy=20=E6=A0=A1=E9=AA=8C=E9=80=9F=E5=BA=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Api/ProxyApi.py | 9 ++++-----
Config.ini | 8 +++++---
Manager/ProxyManager.py | 32 ++++++++++++--------------------
Schedule/ProxyRefreshSchedule.py | 23 +++++++++++++----------
Util/GetConfig.py | 11 ++++++++---
5 files changed, 42 insertions(+), 41 deletions(-)
diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py
index 724dc35e6..2e3733013 100644
--- a/Api/ProxyApi.py
+++ b/Api/ProxyApi.py
@@ -2,13 +2,13 @@
# !/usr/bin/env python
"""
-------------------------------------------------
- File Name: ProxyApi.py
- Description :
+ File Name: ProxyApi.py
+ Description :
Author : JHao
date: 2016/12/4
-------------------------------------------------
Change Activity:
- 2016/12/4:
+ 2016/12/4:
-------------------------------------------------
"""
__author__ = 'JHao'
@@ -26,7 +26,6 @@
class JsonResponse(Response):
-
@classmethod
def force_type(cls, response, environ=None):
if isinstance(response, (dict, list)):
@@ -86,7 +85,7 @@ def getStatus():
def run():
config = GetConfig()
- app.run(host=config.host_ip, port=config.host_port)
+ app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes)
if __name__ == '__main__':
diff --git a/Config.ini b/Config.ini
index ca011a01f..d1ab07bb4 100644
--- a/Config.ini
+++ b/Config.ini
@@ -9,11 +9,11 @@ name = proxy
[ProxyGetter]
;register the proxy getter function
-freeProxyFirst = 1
+freeProxyFirst = 1
freeProxySecond = 1
;freeProxyThird = 1
freeProxyFourth = 1
-freeProxyFifth = 1
+freeProxyFifth = 1
freeProxySixth = 1
freeProxySeventh = 1
freeProxyEight = 1
@@ -26,7 +26,9 @@ freeProxyWallFirst = 1
freeProxyWallSecond = 1
freeProxyWallThird = 1
-[HOST]
+[API]
; API接口配置 http://127.0.0.1:5010
ip = 0.0.0.0
port = 5010
+; flask多进程处理请求
+processes = 10
diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py
index 6131c089a..33aa76b39 100644
--- a/Manager/ProxyManager.py
+++ b/Manager/ProxyManager.py
@@ -2,13 +2,13 @@
# !/usr/bin/env python
"""
-------------------------------------------------
- File Name: ProxyManager.py
- Description :
+ File Name: ProxyManager.py
+ Description :
Author : JHao
date: 2016/12/3
-------------------------------------------------
Change Activity:
- 2016/12/3:
+ 2016/12/3:
-------------------------------------------------
"""
__author__ = 'JHao'
@@ -40,30 +40,22 @@ def refresh(self):
fetch proxy into Db by ProxyGetter
:return:
"""
+ self.db.changeTable(self.raw_proxy_queue)
for proxyGetter in self.config.proxy_getter_functions:
# fetch
- proxy_set = set()
try:
self.log.info("{func}: fetch proxy start".format(func=proxyGetter))
- proxy_iter = [_ for _ in getattr(GetFreeProxy, proxyGetter.strip())()]
+ for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
+ # 挨个存储 proxy,优化raw 队列的 push 速度,进而加快 check proxy 的速度
+ proxy = proxy.strip()
+ if proxy and verifyProxyFormat(proxy):
+ self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
+ self.db.put(proxy)
+ else:
+ self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy))
except Exception as e:
self.log.error("{func}: fetch proxy fail".format(func=proxyGetter))
continue
- for proxy in proxy_iter:
- proxy = proxy.strip()
- if proxy and verifyProxyFormat(proxy):
- self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
- proxy_set.add(proxy)
- else:
- self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy))
-
- # store
- for proxy in proxy_set:
- self.db.changeTable(self.useful_proxy_queue)
- if self.db.exists(proxy):
- continue
- self.db.changeTable(self.raw_proxy_queue)
- self.db.put(proxy)
def get(self):
"""
diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py
index 7dac2aa34..6088fcb0a 100644
--- a/Schedule/ProxyRefreshSchedule.py
+++ b/Schedule/ProxyRefreshSchedule.py
@@ -18,7 +18,8 @@
import time
import logging
from threading import Thread
-from apscheduler.schedulers.blocking import BlockingScheduler
+# 使用后台调度,不使用阻塞式~
+from apscheduler.schedulers.background import BackgroundScheduler as Sch
sys.path.append('../')
@@ -73,12 +74,7 @@ def refreshPool():
pp.validProxy()
-def main(process_num=30):
- p = ProxyRefreshSchedule()
-
- # 获取新代理
- p.refresh()
-
+def batch_refresh(process_num=30):
# 检验新代理
pl = []
for num in range(process_num):
@@ -93,11 +89,18 @@ def main(process_num=30):
pl[num].join()
+def fetch_all():
+ p = ProxyRefreshSchedule()
+ # 获取新代理
+ p.refresh()
+
+
def run():
- main()
- sch = BlockingScheduler()
- sch.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次
+ sch = Sch()
+ sch.add_job(fetch_all, 'interval', minutes=5) # 每5分钟抓取一次
+ sch.add_job(batch_refresh, "interval", minutes=1) # 每分钟检查一次
sch.start()
+ fetch_all()
if __name__ == '__main__':
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
index 24b003f28..8ea57be56 100644
--- a/Util/GetConfig.py
+++ b/Util/GetConfig.py
@@ -2,7 +2,7 @@
# !/usr/bin/env python
"""
-------------------------------------------------
- File Name: GetConfig.py
+ File Name: GetConfig.py
Description : fetch config from config.ini
Author : JHao
date: 2016/12/3
@@ -51,11 +51,15 @@ def proxy_getter_functions(self):
@LazyProperty
def host_ip(self):
- return self.config_file.get('HOST','ip')
+ return self.config_file.get('API','ip')
@LazyProperty
def host_port(self):
- return int(self.config_file.get('HOST', 'port'))
+ return int(self.config_file.get('API', 'port'))
+
+ @LazyProperty
+ def processes(self):
+ return int(self.config_file.get('API', 'processes'))
if __name__ == '__main__':
gg = GetConfig()
@@ -66,3 +70,4 @@ def host_port(self):
print(gg.proxy_getter_functions)
print(gg.host_ip)
print(gg.host_port)
+ print(gg.processes)
From a0b152a968e073c0c35f8dc03d862f783ba4ee86 Mon Sep 17 00:00:00 2001
From: YeClimEric
Date: Wed, 10 Oct 2018 18:15:47 +0800
Subject: [PATCH 005/298] =?UTF-8?q?=E4=BF=AE=E6=94=B9=20dockerfile?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Dockerfile | 43 ++++++++++++++++++++-----------------------
1 file changed, 20 insertions(+), 23 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index 7c815a4e7..d97495489 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,28 +3,25 @@ WORKDIR /usr/src/app
COPY . .
ENV DEBIAN_FRONTEND noninteractive
ENV TZ Asia/Shanghai
-RUN pip install --no-cache-dir -r requirements.txt && \
- apt-get update && \
- apt-get install -y --force-yes git make gcc g++ autoconf && apt-get clean && \
- git clone --depth 1 https://github.com/ideawu/ssdb.git ssdb && \
- cd ssdb && make && make install && cp ssdb-server /usr/bin && \
- apt-get remove -y --force-yes git make gcc g++ autoconf && \
- apt-get autoremove -y && \
- rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
- cp ssdb.conf /etc && cd .. && yes | rm -r ssdb && \
- mkdir -p /var/lib/ssdb && \
- sed \
- -e 's@home.*@home /var/lib@' \
- -e 's/loglevel.*/loglevel info/' \
- -e 's@work_dir = .*@work_dir = /var/lib/ssdb@' \
- -e 's@pidfile = .*@pidfile = /run/ssdb.pid@' \
- -e 's@level:.*@level: info@' \
- -e 's@ip:.*@ip: 0.0.0.0@' \
- -i /etc/ssdb.conf && \
- echo "# ! /bin/sh " > /usr/src/app/run.sh && \
- echo "cd Run" >> /usr/src/app/run.sh && \
- echo "/usr/bin/ssdb-server /etc/ssdb.conf &" >> /usr/src/app/run.sh && \
- echo "python main.py" >> /usr/src/app/run.sh && \
- chmod 777 run.sh
+
+RUN apt-get update
+RUN apt-get install vim -y
+
+RUN apt-get install -y redis-server
+RUN sed -i 's/^\(bind .*\)$/# \1/' /etc/redis/redis.conf \
+ && sed -i 's/^\(databases .*\)$/databases 1/' /etc/redis/redis.conf \
+ && sed -i 's/^\(daemonize .*\)$/daemonize yes/' /etc/redis/redis.conf
+# && sed -i 's/^\(dir .*\)$/# \1\ndir \/data/' /etc/redis/redis.conf \
+# && sed -i 's/^\(logfile .*\)$/# \1/' /etc/redis/redis.conf
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+
+RUN echo "# ! /bin/sh " > run.sh \
+ && echo "redis-server /etc/redis/redis.conf&" >> run.sh \
+ && echo "cd Run" >> run.sh \
+ && echo "python main.py" >> run.sh \
+ && chmod 777 run.sh
+
EXPOSE 5010
CMD [ "sh", "run.sh" ]
From 5de6b7d3793337f7c5aa05dd3539c7db3b31fc9e Mon Sep 17 00:00:00 2001
From: YeClimEric
Date: Wed, 10 Oct 2018 19:29:58 +0800
Subject: [PATCH 006/298] =?UTF-8?q?=E4=BF=AE=E6=94=B9=20dockerfile?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Schedule/ProxyRefreshSchedule.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py
index 6088fcb0a..38668072d 100644
--- a/Schedule/ProxyRefreshSchedule.py
+++ b/Schedule/ProxyRefreshSchedule.py
@@ -102,6 +102,9 @@ def run():
sch.start()
fetch_all()
+ while True:
+ time.sleep(1)
+
if __name__ == '__main__':
run()
From 2086a52ecc21c3099c328fa0df40281399feebaf Mon Sep 17 00:00:00 2001
From: jhao104
Date: Wed, 17 Oct 2018 14:21:09 +0800
Subject: [PATCH 007/298] [fix] fix198
---
Api/ProxyApi.py | 5 ++++-
Config.ini | 3 +--
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py
index 2e3733013..b8977f9ca 100644
--- a/Api/ProxyApi.py
+++ b/Api/ProxyApi.py
@@ -85,7 +85,10 @@ def getStatus():
def run():
config = GetConfig()
- app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes)
+ if sys.platform.startswith("win"):
+ app.run(host=config.host_ip, port=config.host_port)
+ else:
+ app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes)
if __name__ == '__main__':
diff --git a/Config.ini b/Config.ini
index d1ab07bb4..9394f744e 100644
--- a/Config.ini
+++ b/Config.ini
@@ -27,8 +27,7 @@ freeProxyWallSecond = 1
freeProxyWallThird = 1
[API]
-; API接口配置 http://127.0.0.1:5010
+; API config http://127.0.0.1:5010
ip = 0.0.0.0
port = 5010
-; flask多进程处理请求
processes = 10
From 7449f7dabb9449a6eedf67f2ff4d20df39a9e5ae Mon Sep 17 00:00:00 2001
From: vc5
Date: Thu, 25 Oct 2018 00:01:36 +0800
Subject: [PATCH 008/298] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AF=86=E7=A0=81?=
=?UTF-8?q?=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Config.ini | 1 +
DB/DbClient.py | 3 ++-
DB/SsdbClient.py | 4 ++--
Util/GetConfig.py | 9 +++++++++
4 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/Config.ini b/Config.ini
index 9394f744e..24f570f01 100644
--- a/Config.ini
+++ b/Config.ini
@@ -6,6 +6,7 @@ host = 127.0.0.1
port = 6379
;port = 8888
name = proxy
+#password = yourpassword
[ProxyGetter]
;register the proxy getter function
diff --git a/DB/DbClient.py b/DB/DbClient.py
index 68c5db7a7..0036434ae 100644
--- a/DB/DbClient.py
+++ b/DB/DbClient.py
@@ -75,7 +75,8 @@ def __initDbClient(self):
assert __type, 'type error, Not support DB type: {}'.format(self.config.db_type)
self.client = getattr(__import__(__type), __type)(name=self.config.db_name,
host=self.config.db_host,
- port=self.config.db_port)
+ port=self.config.db_port,
+ password=self.config.db_password)
def get(self, key, **kwargs):
return self.client.get(key, **kwargs)
diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py
index 2522e0071..2249fdcc1 100644
--- a/DB/SsdbClient.py
+++ b/DB/SsdbClient.py
@@ -32,7 +32,7 @@ class SsdbClient(object):
"""
- def __init__(self, name, host, port):
+ def __init__(self, name, **kwargs):
"""
init
:param name: hash name
@@ -41,7 +41,7 @@ def __init__(self, name, host, port):
:return:
"""
self.name = name
- self.__conn = Redis(connection_pool=BlockingConnectionPool(host=host, port=port))
+ self.__conn = Redis(connection_pool=BlockingConnectionPool(**kwargs))
def get(self, proxy):
"""
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
index 8ea57be56..c4c31ab0e 100644
--- a/Util/GetConfig.py
+++ b/Util/GetConfig.py
@@ -45,6 +45,15 @@ def db_host(self):
def db_port(self):
return int(self.config_file.get('DB', 'port'))
+ @LazyProperty
+ def db_password(self):
+ try:
+ password = self.config_file.get('DB', 'password')
+ except Exception:
+ password = None
+ return password
+
+
@LazyProperty
def proxy_getter_functions(self):
return self.config_file.options('ProxyGetter')
From 0238d9f931425736c9d72e4ea3e429ff4f03ef64 Mon Sep 17 00:00:00 2001
From: J_hao104
Date: Mon, 29 Oct 2018 09:44:48 +0800
Subject: [PATCH 009/298] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 8dc30eee9..47480fb92 100644
--- a/README.md
+++ b/README.md
@@ -178,7 +178,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致
这里感谢以下contributor的无私奉献:
- [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)
+ [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5)
### Release Notes
From 8ac170e981fb08a892c27552782b4528d67f64eb Mon Sep 17 00:00:00 2001
From: Jacob
Date: Thu, 8 Nov 2018 21:35:31 +0800
Subject: [PATCH 010/298] =?UTF-8?q?=E5=AE=8C=E5=96=84Redis=E5=92=8CMongodb?=
=?UTF-8?q?=E9=AA=8C=E8=AF=81=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
添加Config.ini的用户和密码
为username参数做兼容处理
---
Config.ini | 3 ++-
DB/DbClient.py | 1 +
DB/MongodbClient.py | 4 ++--
DB/RedisClient.py | 8 ++++++--
DB/SsdbClient.py | 7 +++++--
Util/GetConfig.py | 7 +++++++
6 files changed, 23 insertions(+), 7 deletions(-)
diff --git a/Config.ini b/Config.ini
index 24f570f01..cf3f8ded2 100644
--- a/Config.ini
+++ b/Config.ini
@@ -6,7 +6,8 @@ host = 127.0.0.1
port = 6379
;port = 8888
name = proxy
-#password = yourpassword
+;username = your_username (Only Mongodb)
+;password = your_password
[ProxyGetter]
;register the proxy getter function
diff --git a/DB/DbClient.py b/DB/DbClient.py
index 0036434ae..40127cc11 100644
--- a/DB/DbClient.py
+++ b/DB/DbClient.py
@@ -76,6 +76,7 @@ def __initDbClient(self):
self.client = getattr(__import__(__type), __type)(name=self.config.db_name,
host=self.config.db_host,
port=self.config.db_port,
+ username=self.config.db_username,
password=self.config.db_password)
def get(self, key, **kwargs):
diff --git a/DB/MongodbClient.py b/DB/MongodbClient.py
index bd0647f51..a30ef6cf1 100644
--- a/DB/MongodbClient.py
+++ b/DB/MongodbClient.py
@@ -17,9 +17,9 @@
class MongodbClient(object):
- def __init__(self, name, host, port):
+ def __init__(self, name, host, port, **kwargs):
self.name = name
- self.client = MongoClient(host, port)
+ self.client = MongoClient(host, port, **kwargs)
self.db = self.client.proxy
def changeTable(self, name):
diff --git a/DB/RedisClient.py b/DB/RedisClient.py
index 7d9af4386..1983d855e 100644
--- a/DB/RedisClient.py
+++ b/DB/RedisClient.py
@@ -22,7 +22,11 @@ class RedisClient(object):
Reids client
"""
- def __init__(self, name, host, port):
+ # 为了保持DbClient的标准
+ # 在RedisClient里面接受username参数, 但不进行使用.
+ # 因为不能将username通过kwargs传进redis.Redis里面, 会报错:
+ # TypeError: __init__() got an unexpected keyword argument 'username'
+ def __init__(self, name, host, port, username, **kwargs):
"""
init
:param name:
@@ -31,7 +35,7 @@ def __init__(self, name, host, port):
:return:
"""
self.name = name
- self.__conn = redis.Redis(host=host, port=port, db=0)
+ self.__conn = redis.Redis(host=host, port=port, db=0, **kwargs)
def get(self):
"""
diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py
index 2249fdcc1..202ddaa8f 100644
--- a/DB/SsdbClient.py
+++ b/DB/SsdbClient.py
@@ -31,8 +31,11 @@ class SsdbClient(object):
验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1;
"""
-
- def __init__(self, name, **kwargs):
+ # 为了保持DbClient的标准
+ # 在SsdbClient里面接受username参数, 但不进行使用.
+ # 因为不能将username通过kwargs传进redis.Redis里面, 会报错:
+ # TypeError: __init__() got an unexpected keyword argument 'username'
+ def __init__(self, name, username, **kwargs):
"""
init
:param name: hash name
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
index c4c31ab0e..c26b00f1e 100644
--- a/Util/GetConfig.py
+++ b/Util/GetConfig.py
@@ -53,6 +53,13 @@ def db_password(self):
password = None
return password
+ @LazyProperty
+ def db_username(self):
+ try:
+ username = self.config_file.get('DB', 'username')
+ except Exception:
+ username = None
+ return username
@LazyProperty
def proxy_getter_functions(self):
From 4eaaa7dc12a5e318368f8eb4f1bb08ef8ee7ca48 Mon Sep 17 00:00:00 2001
From: Jacob
Date: Thu, 8 Nov 2018 22:22:43 +0800
Subject: [PATCH 011/298] =?UTF-8?q?=E4=BC=98=E5=8C=96Docker=E4=BD=BF?=
=?UTF-8?q?=E7=94=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
1. 标准化Dockerfile
2. 添加Docker-compose的部署方式
3. 整理Docker相关的文件
---
Docker/Dockerfile | 13 +++++++++++++
Dockerfile => Docker/Dockerfile.develop | 0
Docker/docker-compose.yml | 14 ++++++++++++++
README.md | 17 +++++++++++++++++
Run/main.py | 3 ++-
5 files changed, 46 insertions(+), 1 deletion(-)
create mode 100644 Docker/Dockerfile
rename Dockerfile => Docker/Dockerfile.develop (100%)
create mode 100644 Docker/docker-compose.yml
diff --git a/Docker/Dockerfile b/Docker/Dockerfile
new file mode 100644
index 000000000..6ad6f5f53
--- /dev/null
+++ b/Docker/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.6
+WORKDIR /usr/src/app
+COPY . .
+
+ENV DEBIAN_FRONTEND noninteractive
+ENV TZ Asia/Shanghai
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+EXPOSE 5010
+
+WORKDIR /usr/src/app/
+CMD [ "python", "Run/main.py" ]
diff --git a/Dockerfile b/Docker/Dockerfile.develop
similarity index 100%
rename from Dockerfile
rename to Docker/Dockerfile.develop
diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml
new file mode 100644
index 000000000..9529745d5
--- /dev/null
+++ b/Docker/docker-compose.yml
@@ -0,0 +1,14 @@
+version: '2'
+services:
+ proxy_pool:
+ volumes:
+ - ..:/usr/src/app
+ ports:
+ - "5010:5010"
+ links:
+ - proxy_redis
+ image: "proxy_pool"
+ proxy_redis:
+ ports:
+ - "6379:6379"
+ image: "redis"
\ No newline at end of file
diff --git a/README.md b/README.md
index 47480fb92..e5cece52a 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,23 @@ port = 5010 # 监听端口
# 依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可.
```
+* 生产环境 Docker/docker-compose
+
+```shell
+# Workdir proxy_pool
+docker build -t proxy_pool -f Docker/Dockerfile .
+pip install docker-compose
+docker-compose -f Docker/docker-compose.yml up -d
+```
+
+* 开发环境 Docker
+
+```shell
+# Workdir proxy_pool
+docker build -t proxy_pool -f Docker/Dockerfile.develop .
+docker run -it --rm -v $(pwd):/usr/src/app -p 5010:5010 proxy_pool
+```
+
### 使用
启动过几分钟后就能看到抓取到的代理IP,你可以直接到数据库中查看,推荐一个[SSDB可视化工具](https://github.com/jhao104/SSDBAdmin)。
diff --git a/Run/main.py b/Run/main.py
index 6b07654ee..fcd84f6f4 100644
--- a/Run/main.py
+++ b/Run/main.py
@@ -15,7 +15,8 @@
import sys
from multiprocessing import Process
-sys.path.append('../')
+sys.path.append('.')
+sys.path.append('..')
from Api.ProxyApi import run as ProxyApiRun
from Schedule.ProxyValidSchedule import run as ValidRun
From 935929db18effd7cd319a7de1dc0871419ba3267 Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 9 Nov 2018 15:49:08 +0800
Subject: [PATCH 012/298] [fix] The Requests package through 2.19.1 before
2018-09-14 for Python sends an HTTP Authorization header to an http URI upon
receiving a same-hostname https-to-http redirect, which makes it easier for
remote attackers to discover credentials by sniffing the network.
---
requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index 5d00da69a..bc3581ff5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
APScheduler==3.2.0
werkzeug==0.11.15
Flask==0.12
-requests==2.12.4
+requests==2.20.0
lxml==3.7.2
pymongo
From dcfa0e03777ee833ba06967c33b6cd39e0371384 Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 9 Nov 2018 16:29:29 +0800
Subject: [PATCH 013/298] =?UTF-8?q?[update]=20=E4=BC=98=E5=8C=96=E6=8A=93?=
=?UTF-8?q?=E5=8E=BB=E5=87=BD=E6=95=B0=EF=BC=8C=E6=AF=8F=E6=AC=A1=E5=B0=91?=
=?UTF-8?q?=E6=8A=93=E4=B8=80=E4=BA=9B=20=E5=87=8F=E5=B0=91=E8=80=97?=
=?UTF-8?q?=E6=97=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
ProxyGetter/getFreeProxy.py | 52 ++++++++++++++++++-------------------
1 file changed, 26 insertions(+), 26 deletions(-)
diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py
index bf2e03f61..a560dc700 100644
--- a/ProxyGetter/getFreeProxy.py
+++ b/ProxyGetter/getFreeProxy.py
@@ -15,17 +15,10 @@
import sys
import requests
-try:
- from importlib import reload # py3 实际不会实用,只是为了不显示语法错误
-except:
- reload(sys)
- sys.setdefaultencoding('utf-8')
-
sys.path.append('..')
from Util.WebRequest import WebRequest
from Util.utilFunction import getHtmlTree
-from Util.utilFunction import verifyProxyFormat
# for debug to disable insecureWarning
requests.packages.urllib3.disable_warnings()
@@ -48,9 +41,6 @@ class GetFreeProxy(object):
proxy getter
"""
- def __init__(self):
- pass
-
@staticmethod
def freeProxyFirst(page=10):
"""
@@ -164,7 +154,7 @@ def freeProxySixth():
url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
request = WebRequest()
try:
- res = request.get(url).json()
+ res = request.get(url, timeout=10).json()
for row in res['RESULT']['rows']:
yield '{}:{}'.format(row['ip'], row['port'])
except Exception as e:
@@ -180,7 +170,7 @@ def freeProxySeventh():
'https://www.kuaidaili.com/free/intr/{page}/'
]
for url in url_list:
- for page in range(1, 5):
+ for page in range(1, 2):
page_url = url.format(page=page)
tree = getHtmlTree(page_url)
proxy_list = tree.xpath('.//table//tr')
@@ -192,14 +182,14 @@ def freeProxyEight():
"""
秘密代理 http://www.mimiip.com
"""
- url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)] # 国内高匿
- url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)] # 国内普匿
- url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)] # 国内透明
+ url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 2)] # 国内高匿
+ url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 2)] # 国内普匿
+ url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 2)] # 国内透明
url_list = url_gngao + url_gnpu + url_gntou
request = WebRequest()
for url in url_list:
- r = request.get(url)
+ r = request.get(url, timeout=10)
proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\w\W].*(\d+) | ', r.text)
for proxy in proxies:
yield ':'.join(proxy)
@@ -213,7 +203,7 @@ def freeProxyNinth():
urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
request = WebRequest()
for url in urls:
- r = request.get(url)
+ r = request.get(url, timeout=10)
proxies = re.findall('data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)', r.text)
for proxy in proxies:
yield ':'.join(proxy)
@@ -227,7 +217,7 @@ def freeProxyTen():
urls = ['http://www.ip3366.net/free/']
request = WebRequest()
for url in urls:
- r = request.get(url)
+ r = request.get(url, timeout=10)
proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\s\S]*?(\d+) | ', r.text)
for proxy in proxies:
yield ":".join(proxy)
@@ -246,14 +236,14 @@ def freeProxyEleven():
]
request = WebRequest()
for url in urls:
- r = request.get(url)
+ r = request.get(url, timeout=10)
proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*? | [\s\S]*?\s*?(\d+)\s*? | ',
r.text)
for proxy in proxies:
yield ":".join(proxy)
@staticmethod
- def freeProxyTwelve(page_count=8):
+ def freeProxyTwelve(page_count=2):
"""
guobanjia http://ip.jiangxianli.com/?page=
免费代理库
@@ -278,7 +268,7 @@ def freeProxyWallFirst():
urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
request = WebRequest()
for url in urls:
- r = request.get(url)
+ r = request.get(url, timeout=10)
proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\w\W](\d+) | ', r.text)
for proxy in proxies:
yield ':'.join(proxy)
@@ -293,7 +283,7 @@ def freeProxyWallSecond():
request = WebRequest()
import base64
for url in urls:
- r = request.get(url)
+ r = request.get(url, timeout=10)
proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
for proxy in proxies:
yield base64.b64decode(proxy).decode()
@@ -303,7 +293,7 @@ def freeProxyWallThird():
urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
request = WebRequest()
for url in urls:
- r = request.get(url)
+ r = request.get(url, timeout=10)
proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\s\S]*?(\d+) | ', r.text)
for proxy in proxies:
yield ':'.join(proxy)
@@ -312,7 +302,17 @@ def freeProxyWallThird():
if __name__ == '__main__':
from CheckProxy import CheckProxy
- CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth)
- CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySixth)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySeventh)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEight)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven)
+ CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve)
- CheckProxy.checkAllGetProxyFunc()
+ # CheckProxy.checkAllGetProxyFunc()
From f203ae19b6436b88d84d181a8f392c4044e04e09 Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 9 Nov 2018 16:30:02 +0800
Subject: [PATCH 014/298] =?UTF-8?q?[update]=20=E6=A3=80=E6=9F=A5=20getter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
ProxyGetter/CheckProxy.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py
index f6ba9b66a..f29824723 100644
--- a/ProxyGetter/CheckProxy.py
+++ b/ProxyGetter/CheckProxy.py
@@ -62,7 +62,7 @@ def checkGetProxyFunc(func):
count = 0
for proxy in func():
if verifyProxyFormat(proxy):
- log.info("fetch proxy: {}".format(proxy))
+ log.info("{} fetch proxy: {}".format(func_name, proxy))
count += 1
log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count))
From 69eafeabdd11451adf2b6f42dac1620e729dcba3 Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 9 Nov 2018 16:30:37 +0800
Subject: [PATCH 015/298] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0=E5=8F=AF?=
=?UTF-8?q?=E4=BD=BF=E7=94=A8=E4=BB=A3=E7=90=86=E9=85=8D=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Config.ini | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/Config.ini b/Config.ini
index 24f570f01..1d46fc857 100644
--- a/Config.ini
+++ b/Config.ini
@@ -1,10 +1,9 @@
[DB]
;Configure the database information
-;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB
+;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB
type = SSDB
host = 127.0.0.1
port = 6379
-;port = 8888
name = proxy
#password = yourpassword
@@ -15,17 +14,17 @@ freeProxySecond = 1
;freeProxyThird = 1
freeProxyFourth = 1
freeProxyFifth = 1
-freeProxySixth = 1
+;freeProxySixth = 1
freeProxySeventh = 1
-freeProxyEight = 1
-freeProxyNinth = 1
+;freeProxyEight = 1
+;freeProxyNinth = 1
freeProxyTen = 1
freeProxyEleven = 1
freeProxyTwelve = 1
;foreign website, outside the wall
-freeProxyWallFirst = 1
-freeProxyWallSecond = 1
-freeProxyWallThird = 1
+;freeProxyWallFirst = 1
+;freeProxyWallSecond = 1
+;freeProxyWallThird = 1
[API]
; API config http://127.0.0.1:5010
From d77e1110e99c49bbe0d81a2beb3beb3f0bbe3205 Mon Sep 17 00:00:00 2001
From: 1again
Date: Fri, 9 Nov 2018 20:34:35 +0800
Subject: [PATCH 016/298] =?UTF-8?q?[refine]=20Refine=20GetConfig=20?=
=?UTF-8?q?=E4=BD=BF=E7=94=A8=E6=96=B9=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
基于配置化的管理思想,
假设项目的任何地方都需要使用GetConfig
于是可以在GetConfig模块里生成一个config对象.
任何地方需要只要import即可.
---
Api/ProxyApi.py | 3 +--
DB/DbClient.py | 19 +++++++++----------
Manager/ProxyManager.py | 5 ++---
Util/GetConfig.py | 2 ++
4 files changed, 14 insertions(+), 15 deletions(-)
diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py
index b8977f9ca..99a0953a0 100644
--- a/Api/ProxyApi.py
+++ b/Api/ProxyApi.py
@@ -19,7 +19,7 @@
sys.path.append('../')
-from Util.GetConfig import GetConfig
+from Util.GetConfig import config
from Manager.ProxyManager import ProxyManager
app = Flask(__name__)
@@ -84,7 +84,6 @@ def getStatus():
def run():
- config = GetConfig()
if sys.platform.startswith("win"):
app.run(host=config.host_ip, port=config.host_port)
else:
diff --git a/DB/DbClient.py b/DB/DbClient.py
index 0036434ae..869c93af1 100644
--- a/DB/DbClient.py
+++ b/DB/DbClient.py
@@ -16,7 +16,7 @@
import os
import sys
-from Util.GetConfig import GetConfig
+from Util.GetConfig import config
from Util.utilClass import Singleton
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
@@ -55,7 +55,6 @@ def __init__(self):
init
:return:
"""
- self.config = GetConfig()
self.__initDbClient()
def __initDbClient(self):
@@ -64,19 +63,19 @@ def __initDbClient(self):
:return:
"""
__type = None
- if "SSDB" == self.config.db_type:
+ if "SSDB" == config.db_type:
__type = "SsdbClient"
- elif "REDIS" == self.config.db_type:
+ elif "REDIS" == config.db_type:
__type = "RedisClient"
- elif "MONGODB" == self.config.db_type:
+ elif "MONGODB" == config.db_type:
__type = "MongodbClient"
else:
pass
- assert __type, 'type error, Not support DB type: {}'.format(self.config.db_type)
- self.client = getattr(__import__(__type), __type)(name=self.config.db_name,
- host=self.config.db_host,
- port=self.config.db_port,
- password=self.config.db_password)
+ assert __type, 'type error, Not support DB type: {}'.format(config.db_type)
+ self.client = getattr(__import__(__type), __type)(name=config.db_name,
+ host=config.db_host,
+ port=config.db_port,
+ password=config.db_password)
def get(self, key, **kwargs):
return self.client.get(key, **kwargs)
diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py
index 33aa76b39..a2f39b3c5 100644
--- a/Manager/ProxyManager.py
+++ b/Manager/ProxyManager.py
@@ -17,7 +17,7 @@
from Util import EnvUtil
from DB.DbClient import DbClient
-from Util.GetConfig import GetConfig
+from Util.GetConfig import config
from Util.LogHandler import LogHandler
from Util.utilFunction import verifyProxyFormat
from ProxyGetter.getFreeProxy import GetFreeProxy
@@ -30,7 +30,6 @@ class ProxyManager(object):
def __init__(self):
self.db = DbClient()
- self.config = GetConfig()
self.raw_proxy_queue = 'raw_proxy'
self.log = LogHandler('proxy_manager')
self.useful_proxy_queue = 'useful_proxy'
@@ -41,7 +40,7 @@ def refresh(self):
:return:
"""
self.db.changeTable(self.raw_proxy_queue)
- for proxyGetter in self.config.proxy_getter_functions:
+ for proxyGetter in config.proxy_getter_functions:
# fetch
try:
self.log.info("{func}: fetch proxy start".format(func=proxyGetter))
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
index c4c31ab0e..efbbe5077 100644
--- a/Util/GetConfig.py
+++ b/Util/GetConfig.py
@@ -70,6 +70,8 @@ def host_port(self):
def processes(self):
return int(self.config_file.get('API', 'processes'))
+config = GetConfig()
+
if __name__ == '__main__':
gg = GetConfig()
print(gg.db_type)
From 40861f429011c53e25693e62daede4b47c253dd2 Mon Sep 17 00:00:00 2001
From: jhao
Date: Mon, 12 Nov 2018 10:00:38 +0800
Subject: [PATCH 017/298] [update] config annotation
---
Config.ini | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/Config.ini b/Config.ini
index 1d46fc857..54f690397 100644
--- a/Config.ini
+++ b/Config.ini
@@ -27,7 +27,10 @@ freeProxyTwelve = 1
;freeProxyWallThird = 1
[API]
-; API config http://127.0.0.1:5010
+# API config http://127.0.0.1:5010
+# The ip specified when starting the web API
ip = 0.0.0.0
+# he port on which to run the web API
port = 5010
+# Flask processes option
processes = 10
From 2591918c874a001435b3ff0af8604e5070b8ff58 Mon Sep 17 00:00:00 2001
From: jhao
Date: Mon, 12 Nov 2018 10:32:01 +0800
Subject: [PATCH 018/298] [update] formatting code
---
Util/GetConfig.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
index efbbe5077..5dfae9912 100644
--- a/Util/GetConfig.py
+++ b/Util/GetConfig.py
@@ -53,14 +53,13 @@ def db_password(self):
password = None
return password
-
@LazyProperty
def proxy_getter_functions(self):
return self.config_file.options('ProxyGetter')
@LazyProperty
def host_ip(self):
- return self.config_file.get('API','ip')
+ return self.config_file.get('API', 'ip')
@LazyProperty
def host_port(self):
@@ -70,6 +69,7 @@ def host_port(self):
def processes(self):
return int(self.config_file.get('API', 'processes'))
+
config = GetConfig()
if __name__ == '__main__':
From 8a0404521ddcf17031a5975f83c7b6b5a8e3b662 Mon Sep 17 00:00:00 2001
From: jhao
Date: Mon, 12 Nov 2018 11:27:13 +0800
Subject: [PATCH 019/298] [update] set default pwd option
---
Util/GetConfig.py | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
index 5dfae9912..0f60fcd2f 100644
--- a/Util/GetConfig.py
+++ b/Util/GetConfig.py
@@ -47,11 +47,7 @@ def db_port(self):
@LazyProperty
def db_password(self):
- try:
- password = self.config_file.get('DB', 'password')
- except Exception:
- password = None
- return password
+ return self.config_file.get('DB', 'password', fallback="default pwd")
@LazyProperty
def proxy_getter_functions(self):
@@ -82,3 +78,4 @@ def processes(self):
print(gg.host_ip)
print(gg.host_port)
print(gg.processes)
+ print(gg.db_password)
From 6525ea8e09f3a128f0e2652d5d333005b41196c2 Mon Sep 17 00:00:00 2001
From: jhao
Date: Tue, 13 Nov 2018 10:28:31 +0800
Subject: [PATCH 020/298] =?UTF-8?q?[update]=20=E8=B0=83=E6=95=B4=E6=9B=B4?=
=?UTF-8?q?=E6=96=B0=E9=80=9F=E5=BA=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Schedule/ProxyRefreshSchedule.py | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py
index 38668072d..a61cc5a25 100644
--- a/Schedule/ProxyRefreshSchedule.py
+++ b/Schedule/ProxyRefreshSchedule.py
@@ -18,8 +18,7 @@
import time
import logging
from threading import Thread
-# 使用后台调度,不使用阻塞式~
-from apscheduler.schedulers.background import BackgroundScheduler as Sch
+from apscheduler.schedulers.background import BackgroundScheduler
sys.path.append('../')
@@ -74,7 +73,7 @@ def refreshPool():
pp.validProxy()
-def batch_refresh(process_num=30):
+def batchRefresh(process_num=30):
# 检验新代理
pl = []
for num in range(process_num):
@@ -89,21 +88,23 @@ def batch_refresh(process_num=30):
pl[num].join()
-def fetch_all():
+def fetchAll():
p = ProxyRefreshSchedule()
# 获取新代理
p.refresh()
def run():
- sch = Sch()
- sch.add_job(fetch_all, 'interval', minutes=5) # 每5分钟抓取一次
- sch.add_job(batch_refresh, "interval", minutes=1) # 每分钟检查一次
- sch.start()
- fetch_all()
+ scheduler = BackgroundScheduler()
+ # 不用太快, 网站更新速度比较慢, 太快会加大验证压力, 导致raw_proxy积压
+ scheduler.add_job(fetchAll, 'interval', minutes=10, id="fetch_proxy")
+ scheduler.add_job(batchRefresh, "interval", minutes=1) # 每分钟检查一次
+ scheduler.start()
+
+ fetchAll()
while True:
- time.sleep(1)
+ time.sleep(3)
if __name__ == '__main__':
From c1e74b4237971caf9dfefede1405e0516c27fe7a Mon Sep 17 00:00:00 2001
From: jhao
Date: Tue, 13 Nov 2018 10:29:14 +0800
Subject: [PATCH 021/298] =?UTF-8?q?[update]=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Manager/ProxyManager.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py
index a2f39b3c5..c770b6224 100644
--- a/Manager/ProxyManager.py
+++ b/Manager/ProxyManager.py
@@ -36,7 +36,7 @@ def __init__(self):
def refresh(self):
"""
- fetch proxy into Db by ProxyGetter
+ fetch proxy into Db by ProxyGetter/getFreeProxy.py
:return:
"""
self.db.changeTable(self.raw_proxy_queue)
@@ -45,7 +45,7 @@ def refresh(self):
try:
self.log.info("{func}: fetch proxy start".format(func=proxyGetter))
for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
- # 挨个存储 proxy,优化raw 队列的 push 速度,进而加快 check proxy 的速度
+ # 直接存储代理, 不用在代码中排重, hash 结构本身具有排重功能
proxy = proxy.strip()
if proxy and verifyProxyFormat(proxy):
self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
From e41a9cbe796744f91e395e4064ebb5c9ef82e39c Mon Sep 17 00:00:00 2001
From: jhao
Date: Tue, 13 Nov 2018 10:30:01 +0800
Subject: [PATCH 022/298] [update] dbclient
---
DB/DbClient.py | 10 ++++------
DB/SsdbClient.py | 14 ++++++--------
2 files changed, 10 insertions(+), 14 deletions(-)
diff --git a/DB/DbClient.py b/DB/DbClient.py
index 869c93af1..f79fc8511 100644
--- a/DB/DbClient.py
+++ b/DB/DbClient.py
@@ -44,7 +44,7 @@ class DbClient(object):
所有方法需要相应类去具体实现:
SSDB:SsdbClient.py
- REDIS:RedisClient.py
+ REDIS:RedisClient.py 停用 统一使用SsdbClient.py
"""
@@ -66,7 +66,7 @@ def __initDbClient(self):
if "SSDB" == config.db_type:
__type = "SsdbClient"
elif "REDIS" == config.db_type:
- __type = "RedisClient"
+ __type = "SsdbClient"
elif "MONGODB" == config.db_type:
__type = "MongodbClient"
else:
@@ -107,7 +107,5 @@ def getNumber(self):
if __name__ == "__main__":
account = DbClient()
- print(account.get())
- account.changeTable('use')
- account.put('ac')
- print(account.get())
+ account.changeTable('useful_proxy')
+ print(account.pop())
diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py
index 202ddaa8f..4ceedd1df 100644
--- a/DB/SsdbClient.py
+++ b/DB/SsdbClient.py
@@ -31,16 +31,13 @@ class SsdbClient(object):
验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1;
"""
- # 为了保持DbClient的标准
- # 在SsdbClient里面接受username参数, 但不进行使用.
- # 因为不能将username通过kwargs传进redis.Redis里面, 会报错:
- # TypeError: __init__() got an unexpected keyword argument 'username'
- def __init__(self, name, username, **kwargs):
+ def __init__(self, name, **kwargs):
"""
init
:param name: hash name
- :param host: ssdb host
- :param port: ssdb port
+ :param host: host
+ :param port: port
+ :param password: password
:return:
"""
self.name = name
@@ -114,6 +111,7 @@ def getNumber(self):
def changeTable(self, name):
self.name = name
+
if __name__ == '__main__':
- c = SsdbClient('useful_proxy', '118.24.52.95', 8899)
+ c = SsdbClient(name='useful_proxy', host='127.0.0.1', port=8899, password=None)
print(c.getAll())
From 428359c8dada998481f038dbdc8d3923e5850c0e Mon Sep 17 00:00:00 2001
From: jhao
Date: Tue, 13 Nov 2018 14:02:03 +0800
Subject: [PATCH 023/298] Merge branch 'jhao104/master' of
https://github.com/1again/proxy_pool into 1again-jhao104/master
# Conflicts:
# DB/DbClient.py
# Util/GetConfig.py
---
Config.ini | 2 +-
README.md | 2 +-
Util/GetConfig.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/Config.ini b/Config.ini
index 44ef085d2..c8a9cc266 100644
--- a/Config.ini
+++ b/Config.ini
@@ -3,7 +3,7 @@
;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB
type = SSDB
host = 127.0.0.1
-port = 6379
+port = 8888
name = proxy
;username = your_username (Only Mongodb)
;password = your_password
diff --git a/README.md b/README.md
index e5cece52a..8bdca40c5 100644
--- a/README.md
+++ b/README.md
@@ -195,7 +195,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致
这里感谢以下contributor的无私奉献:
- [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5)
+ [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5)| [@1again](https://github.com/1again)
### Release Notes
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
index 0f60fcd2f..cd354e20f 100644
--- a/Util/GetConfig.py
+++ b/Util/GetConfig.py
@@ -47,7 +47,7 @@ def db_port(self):
@LazyProperty
def db_password(self):
- return self.config_file.get('DB', 'password', fallback="default pwd")
+ return self.config_file.get('DB', 'password', fallback=None)
@LazyProperty
def proxy_getter_functions(self):
From 3c3ddaff09a346680c4bcfceb52fb5db0e690d1b Mon Sep 17 00:00:00 2001
From: incoding
Date: Wed, 14 Nov 2018 13:17:00 +0800
Subject: [PATCH 024/298] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=9B=B4=E5=8A=A0?=
=?UTF-8?q?=E4=B8=A5=E8=B0=A8=E7=9A=84=E4=BB=A3=E7=90=86=E6=A0=A1=E9=AA=8C?=
=?UTF-8?q?=E8=A7=84=E5=88=99=EF=BC=88=E4=B8=80=E4=BA=9B=E9=9D=9E=E6=B3=95?=
=?UTF-8?q?=E4=BB=A3=E7=90=86=E4=B9=9F=E4=BC=9A=E8=BF=94=E5=9B=9E200?=
=?UTF-8?q?=E7=8A=B6=E6=80=81=E7=A0=81=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Util/utilFunction.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Util/utilFunction.py b/Util/utilFunction.py
index fc26a59b1..ec86c1fe3 100644
--- a/Util/utilFunction.py
+++ b/Util/utilFunction.py
@@ -100,7 +100,7 @@ def validUsefulProxy(proxy):
try:
# 超过20秒的代理就不要了
r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10, verify=False)
- if r.status_code == 200:
+ if r.status_code == 200 and r.headers['content-type'].lower().find('application/json') != -1 and r.json()['origin']:
# logger.info('%s is ok' % proxy)
return True
except Exception as e:
From e5c1b89c919bae95fcb14e715d7b2e91115dfbe3 Mon Sep 17 00:00:00 2001
From: incoding
Date: Wed, 14 Nov 2018 13:33:47 +0800
Subject: [PATCH 025/298] =?UTF-8?q?=E6=B7=BB=E5=8A=A0my=E6=96=87=E4=BB=B6?=
=?UTF-8?q?=E5=A4=B9=EF=BC=8C=E4=BF=9D=E5=AD=98=E5=AE=9A=E5=88=B6=E4=BF=AE?=
=?UTF-8?q?=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
my/Config.ini | 37 +++++++++++++++++++++++++++++++++++++
my/Dockerfile | 16 ++++++++++++++++
my/build.sh | 1 +
my/run.sh | 14 ++++++++++++++
4 files changed, 68 insertions(+)
create mode 100644 my/Config.ini
create mode 100644 my/Dockerfile
create mode 100755 my/build.sh
create mode 100755 my/run.sh
diff --git a/my/Config.ini b/my/Config.ini
new file mode 100644
index 000000000..627092c1a
--- /dev/null
+++ b/my/Config.ini
@@ -0,0 +1,37 @@
+[DB]
+;Configure the database information
+;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB
+type = PROXY_POOL_DB_TYPE
+host = PROXY_POOL_DB_HOST
+port = PROXY_POOL_DB_PORT
+name = proxy
+;username = your_username (Only Mongodb)
+;password = your_password
+
+[ProxyGetter]
+;register the proxy getter function
+freeProxyFirst = 1
+freeProxySecond = 1
+;freeProxyThird = 1
+freeProxyFourth = 1
+freeProxyFifth = 1
+;freeProxySixth = 1
+freeProxySeventh = 1
+;freeProxyEight = 1
+;freeProxyNinth = 1
+freeProxyTen = 1
+freeProxyEleven = 1
+freeProxyTwelve = 1
+;foreign website, outside the wall
+;freeProxyWallFirst = 1
+;freeProxyWallSecond = 1
+;freeProxyWallThird = 1
+
+[API]
+# API config http://127.0.0.1:5010
+# The ip specified when starting the web API
+ip = 0.0.0.0
+# he port on which to run the web API
+port = 8080
+# Flask processes option
+processes = 10
diff --git a/my/Dockerfile b/my/Dockerfile
new file mode 100644
index 000000000..cb042627c
--- /dev/null
+++ b/my/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.6
+
+WORKDIR /usr/src/app
+
+ENV TZ=Asia/Shanghai \
+ PROXY_POOL_DB_TYPE=SSDB \
+ PROXY_POOL_DB_HOST=redis \
+ PROXY_POOL_DB_PORT=6379
+
+COPY . .
+
+RUN pip install --no-cache-dir -r requirements.txt && cp my/Config.ini ./
+
+CMD [ "my/run.sh" ]
+
+EXPOSE 8080
diff --git a/my/build.sh b/my/build.sh
new file mode 100755
index 000000000..328e9449d
--- /dev/null
+++ b/my/build.sh
@@ -0,0 +1 @@
+docker build -t registry.cn-beijing.aliyuncs.com/ryttech/proxy_pool:1.12.20181114 -f my/Dockerfile .
\ No newline at end of file
diff --git a/my/run.sh b/my/run.sh
new file mode 100755
index 000000000..441ace853
--- /dev/null
+++ b/my/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+for var in \
+ PROXY_POOL_DB_TYPE \
+ PROXY_POOL_DB_HOST \
+ PROXY_POOL_DB_PORT \
+; do
+ val="${!var}"
+ if [ "$val" ]; then
+ sed -ri "s/$var/$val/" Config.ini
+ fi
+done
+
+python Run/main.py
\ No newline at end of file
From 110b0df1e29529346314378155890d740064ca0b Mon Sep 17 00:00:00 2001
From: jhao
Date: Wed, 14 Nov 2018 16:51:41 +0800
Subject: [PATCH 026/298] Merge branch 'jhao104/master' of
https://github.com/1again/proxy_pool into 1again-jhao104/master
# Conflicts:
# DB/DbClient.py
# Util/GetConfig.py
---
Api/ProxyApi.py | 5 +----
Config.ini | 4 +---
Schedule/ProxyValidSchedule.py | 2 +-
3 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py
index 99a0953a0..fc759a363 100644
--- a/Api/ProxyApi.py
+++ b/Api/ProxyApi.py
@@ -84,10 +84,7 @@ def getStatus():
def run():
- if sys.platform.startswith("win"):
- app.run(host=config.host_ip, port=config.host_port)
- else:
- app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes)
+ app.run(host=config.host_ip, port=config.host_port)
if __name__ == '__main__':
diff --git a/Config.ini b/Config.ini
index c8a9cc266..5bdf095a1 100644
--- a/Config.ini
+++ b/Config.ini
@@ -32,6 +32,4 @@ freeProxyTwelve = 1
# The ip specified when starting the web API
ip = 0.0.0.0
# he port on which to run the web API
-port = 5010
-# Flask processes option
-processes = 10
+port = 8080
diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py
index 9b075cf90..098c8a336 100644
--- a/Schedule/ProxyValidSchedule.py
+++ b/Schedule/ProxyValidSchedule.py
@@ -32,7 +32,7 @@ def __init__(self):
self.queue = Queue()
self.proxy_item = dict()
- def __validProxy(self, threads=10):
+ def __validProxy(self, threads=20):
"""
验证useful_proxy代理
:param threads: 线程数
From a3ba910f391fd0220f357f926ef2b5ab6e0a973f Mon Sep 17 00:00:00 2001
From: windhw
Date: Thu, 6 Dec 2018 12:48:10 +0800
Subject: [PATCH 027/298] Update main.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
增加对SIGTERM的处理,这样在后台运行的时候,如果kill掉主进程,子进程也能kill
---
Run/main.py | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/Run/main.py b/Run/main.py
index fcd84f6f4..cce7b6142 100644
--- a/Run/main.py
+++ b/Run/main.py
@@ -12,7 +12,7 @@
"""
__author__ = 'JHao'
-import sys
+import sys,signal
from multiprocessing import Process
sys.path.append('.')
@@ -31,6 +31,14 @@ def run():
p_list.append(p2)
p3 = Process(target=RefreshRun, name='RefreshRun')
p_list.append(p3)
+
+ def kill_child_processes(signum,frame):
+ for p in p_list:
+ p.terminate()
+ sys.exit(1)
+
+ signal.signal(signal.SIGTERM, kill_child_processes)
+
for p in p_list:
p.daemon = True
From 2260c6d02f2374d7b4952787cac964f648ffd2b2 Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 7 Dec 2018 14:21:51 +0800
Subject: [PATCH 028/298] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0httpbin?=
=?UTF-8?q?=E6=A3=80=E9=AA=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Util/utilFunction.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Util/utilFunction.py b/Util/utilFunction.py
index ec86c1fe3..f4e802263 100644
--- a/Util/utilFunction.py
+++ b/Util/utilFunction.py
@@ -100,7 +100,7 @@ def validUsefulProxy(proxy):
try:
# 超过20秒的代理就不要了
r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10, verify=False)
- if r.status_code == 200 and r.headers['content-type'].lower().find('application/json') != -1 and r.json()['origin']:
+ if r.status_code == 200 and r.json().get("origin"):
# logger.info('%s is ok' % proxy)
return True
except Exception as e:
From 26aaf1851a5b9bf4bc84ab344835d37d857ab6d7 Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 7 Dec 2018 14:23:49 +0800
Subject: [PATCH 029/298] =?UTF-8?q?=E3=80=90del=E3=80=91delete=20un=20use?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
my/Config.ini | 37 -------------------------------------
my/Dockerfile | 16 ----------------
my/build.sh | 1 -
my/run.sh | 14 --------------
4 files changed, 68 deletions(-)
delete mode 100644 my/Config.ini
delete mode 100644 my/Dockerfile
delete mode 100755 my/build.sh
delete mode 100755 my/run.sh
diff --git a/my/Config.ini b/my/Config.ini
deleted file mode 100644
index 627092c1a..000000000
--- a/my/Config.ini
+++ /dev/null
@@ -1,37 +0,0 @@
-[DB]
-;Configure the database information
-;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB
-type = PROXY_POOL_DB_TYPE
-host = PROXY_POOL_DB_HOST
-port = PROXY_POOL_DB_PORT
-name = proxy
-;username = your_username (Only Mongodb)
-;password = your_password
-
-[ProxyGetter]
-;register the proxy getter function
-freeProxyFirst = 1
-freeProxySecond = 1
-;freeProxyThird = 1
-freeProxyFourth = 1
-freeProxyFifth = 1
-;freeProxySixth = 1
-freeProxySeventh = 1
-;freeProxyEight = 1
-;freeProxyNinth = 1
-freeProxyTen = 1
-freeProxyEleven = 1
-freeProxyTwelve = 1
-;foreign website, outside the wall
-;freeProxyWallFirst = 1
-;freeProxyWallSecond = 1
-;freeProxyWallThird = 1
-
-[API]
-# API config http://127.0.0.1:5010
-# The ip specified when starting the web API
-ip = 0.0.0.0
-# he port on which to run the web API
-port = 8080
-# Flask processes option
-processes = 10
diff --git a/my/Dockerfile b/my/Dockerfile
deleted file mode 100644
index cb042627c..000000000
--- a/my/Dockerfile
+++ /dev/null
@@ -1,16 +0,0 @@
-FROM python:3.6
-
-WORKDIR /usr/src/app
-
-ENV TZ=Asia/Shanghai \
- PROXY_POOL_DB_TYPE=SSDB \
- PROXY_POOL_DB_HOST=redis \
- PROXY_POOL_DB_PORT=6379
-
-COPY . .
-
-RUN pip install --no-cache-dir -r requirements.txt && cp my/Config.ini ./
-
-CMD [ "my/run.sh" ]
-
-EXPOSE 8080
diff --git a/my/build.sh b/my/build.sh
deleted file mode 100755
index 328e9449d..000000000
--- a/my/build.sh
+++ /dev/null
@@ -1 +0,0 @@
-docker build -t registry.cn-beijing.aliyuncs.com/ryttech/proxy_pool:1.12.20181114 -f my/Dockerfile .
\ No newline at end of file
diff --git a/my/run.sh b/my/run.sh
deleted file mode 100755
index 441ace853..000000000
--- a/my/run.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-for var in \
- PROXY_POOL_DB_TYPE \
- PROXY_POOL_DB_HOST \
- PROXY_POOL_DB_PORT \
-; do
- val="${!var}"
- if [ "$val" ]; then
- sed -ri "s/$var/$val/" Config.ini
- fi
-done
-
-python Run/main.py
\ No newline at end of file
From 223f57d1eb8d243b1d69e28b90a39f0529ec4407 Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 7 Dec 2018 15:29:55 +0800
Subject: [PATCH 030/298] [fix] fix password
---
Util/GetConfig.py | 9 ++-------
Util/utilClass.py | 4 ++--
2 files changed, 4 insertions(+), 9 deletions(-)
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
index cd354e20f..c25035504 100644
--- a/Util/GetConfig.py
+++ b/Util/GetConfig.py
@@ -26,7 +26,7 @@ class GetConfig(object):
def __init__(self):
self.pwd = os.path.split(os.path.realpath(__file__))[0]
self.config_path = os.path.join(os.path.split(self.pwd)[0], 'Config.ini')
- self.config_file = ConfigParse()
+ self.config_file = ConfigParse(defaults={"password": None})
self.config_file.read(self.config_path)
@LazyProperty
@@ -47,7 +47,7 @@ def db_port(self):
@LazyProperty
def db_password(self):
- return self.config_file.get('DB', 'password', fallback=None)
+ return self.config_file.get('DB', 'password')
@LazyProperty
def proxy_getter_functions(self):
@@ -61,10 +61,6 @@ def host_ip(self):
def host_port(self):
return int(self.config_file.get('API', 'port'))
- @LazyProperty
- def processes(self):
- return int(self.config_file.get('API', 'processes'))
-
config = GetConfig()
@@ -77,5 +73,4 @@ def processes(self):
print(gg.proxy_getter_functions)
print(gg.host_ip)
print(gg.host_port)
- print(gg.processes)
print(gg.db_password)
diff --git a/Util/utilClass.py b/Util/utilClass.py
index 89112ffd8..b3a35f141 100644
--- a/Util/utilClass.py
+++ b/Util/utilClass.py
@@ -44,8 +44,8 @@ class ConfigParse(ConfigParser):
rewrite ConfigParser, for support upper option
"""
- def __init__(self):
- ConfigParser.__init__(self)
+ def __init__(self, *args, **kwargs):
+ ConfigParser.__init__(self, *args, **kwargs)
def optionxform(self, optionstr):
return optionstr
From d49a66a6a1051e2eb86231e03a6a0ab3875dee1e Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 15 Feb 2019 16:02:02 +0800
Subject: [PATCH 031/298] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?=
=?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?=
=?UTF-8?q?=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Api/ProxyApi.py | 2 +-
Config.ini | 2 +-
Config/ConfigGetter.py | 71 ++++++++++++++++++++++++
{Test => Config}/__init__.py | 11 ++--
Config/setting.py | 54 ++++++++++++++++++
DB/DbClient.py | 2 +-
DB/SsdbClient.py | 4 +-
Manager/ProxyManager.py | 2 +-
Schedule/ProxyValidSchedule.py | 4 +-
Test/.pytest_cache/v/cache/lastfailed | 3 -
Test/.pytest_cache/v/cache/nodeids | 3 -
Test/{testGetConfig.py => testConfig.py} | 22 ++++----
Util/GetConfig.py | 7 +--
Util/utilClass.py | 19 -------
test.py | 5 +-
15 files changed, 154 insertions(+), 57 deletions(-)
create mode 100644 Config/ConfigGetter.py
rename {Test => Config}/__init__.py (56%)
create mode 100644 Config/setting.py
delete mode 100644 Test/.pytest_cache/v/cache/lastfailed
delete mode 100644 Test/.pytest_cache/v/cache/nodeids
rename Test/{testGetConfig.py => testConfig.py} (60%)
diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py
index fc759a363..91df76f88 100644
--- a/Api/ProxyApi.py
+++ b/Api/ProxyApi.py
@@ -19,7 +19,7 @@
sys.path.append('../')
-from Util.GetConfig import config
+from Config.ConfigGetter import config
from Manager.ProxyManager import ProxyManager
app = Flask(__name__)
diff --git a/Config.ini b/Config.ini
index 5bdf095a1..ee13eaf2c 100644
--- a/Config.ini
+++ b/Config.ini
@@ -1,6 +1,6 @@
[DB]
;Configure the database information
-;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB
+;type: SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB
type = SSDB
host = 127.0.0.1
port = 8888
diff --git a/Config/ConfigGetter.py b/Config/ConfigGetter.py
new file mode 100644
index 000000000..56c766c0d
--- /dev/null
+++ b/Config/ConfigGetter.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+"""
+-------------------------------------------------
+ File Name: ConfigGetter
+ Description : 读取配置
+ Author : JHao
+ date: 2019/2/15
+-------------------------------------------------
+ Change Activity:
+ 2019/2/15:
+-------------------------------------------------
+"""
+__author__ = 'JHao'
+
+
+from Util.utilClass import LazyProperty
+from Config.setting import *
+
+
+class ConfigGetter(object):
+ """
+ get config
+ """
+
+ def __init__(self):
+ pass
+
+ @LazyProperty
+ def db_type(self):
+ return DATABASES.get("default", {}).get("TYPE", "SSDB")
+
+ @LazyProperty
+ def db_name(self):
+ return DATABASES.get("default", {}).get("NAME", "proxy")
+
+ @LazyProperty
+ def db_host(self):
+ return DATABASES.get("default", {}).get("HOST", "127.0.0.1")
+
+ @LazyProperty
+ def db_port(self):
+ return DATABASES.get("default", {}).get("PORT", 8080)
+
+ @LazyProperty
+ def db_password(self):
+ return DATABASES.get("default", {}).get("PASSWORD", "")
+
+ @LazyProperty
+ def proxy_getter_functions(self):
+ return PROXY_GETTER
+
+ @LazyProperty
+ def host_ip(self):
+ return SERVER_API.get("HOST", "127.0.0.1")
+
+ @LazyProperty
+ def host_port(self):
+ return SERVER_API.get("PORT", 5010)
+
+
+config = ConfigGetter()
+
+if __name__ == '__main__':
+ print(config.db_type)
+ print(config.db_name)
+ print(config.db_host)
+ print(config.db_port)
+ print(config.proxy_getter_functions)
+ print(config.host_ip)
+ print(config.host_port)
+ print(config.db_password)
diff --git a/Test/__init__.py b/Config/__init__.py
similarity index 56%
rename from Test/__init__.py
rename to Config/__init__.py
index 898942953..9a7d547ee 100644
--- a/Test/__init__.py
+++ b/Config/__init__.py
@@ -1,13 +1,12 @@
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
- File Name: __init__.py
- Description :
- Author : J_hao
- date: 2017/7/31
+ File Name: __init__
+ Description :
+ Author : JHao
+ date: 2019/2/15
-------------------------------------------------
Change Activity:
- 2017/7/31:
+ 2019/2/15:
-------------------------------------------------
"""
-__author__ = 'J_hao'
diff --git a/Config/setting.py b/Config/setting.py
new file mode 100644
index 000000000..39ae36748
--- /dev/null
+++ b/Config/setting.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+"""
+-------------------------------------------------
+ File Name: setting.py
+ Description : 配置文件
+ Author : JHao
+ date: 2019/2/15
+-------------------------------------------------
+ Change Activity:
+ 2019/2/15:
+-------------------------------------------------
+"""
+
+# database config
+
+DATABASES = {
+ "default": {
+ "TYPE": "SSDB", # TYPE SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB
+ "HOST": "127.0.0.1",
+ "PORT": 8888,
+ "NAME": "proxy",
+ "PASSWORD": ""
+
+ }
+}
+
+# register the proxy getter function
+
+PROXY_GETTER = [
+ "freeProxyFirst",
+ "freeProxySecond",
+ # "freeProxyThird",
+ "freeProxyFourth",
+ "freeProxyFifth",
+ # "freeProxySixth"
+ "freeProxySeventh",
+ # "freeProxyEight",
+ # "freeProxyNinth",
+ "freeProxyTen",
+ "freeProxyEleven",
+ "freeProxyTwelve",
+ # foreign website, outside the wall
+ "freeProxyWallFirst",
+ "freeProxyWallSecond",
+ "freeProxyWallThird"
+]
+
+
+# # API config http://127.0.0.1:5010
+
+SERVER_API = {
+ "HOST": "0.0.0.0", # The ip specified which starting the web API
+ "PORT": 5010 # port number to which the server listens to
+}
\ No newline at end of file
diff --git a/DB/DbClient.py b/DB/DbClient.py
index f79fc8511..baa1f79fc 100644
--- a/DB/DbClient.py
+++ b/DB/DbClient.py
@@ -16,7 +16,7 @@
import os
import sys
-from Util.GetConfig import config
+from Config.ConfigGetter import config
from Util.utilClass import Singleton
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py
index 4ceedd1df..85545b355 100644
--- a/DB/SsdbClient.py
+++ b/DB/SsdbClient.py
@@ -3,7 +3,7 @@
"""
-------------------------------------------------
File Name: SsdbClient.py
- Description : 封装SSDB操作
+ Description : 封装SSDB/Redis操作
Author : JHao
date: 2016/12/2
-------------------------------------------------
@@ -27,7 +27,7 @@ class SsdbClient(object):
SSDB client
SSDB中代理存放的容器为hash:
- 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为为None,以后扩展可能会加入代理属性;
+ 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为None,以后扩展可能会加入代理属性;
验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1;
"""
diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py
index c770b6224..fd007773b 100644
--- a/Manager/ProxyManager.py
+++ b/Manager/ProxyManager.py
@@ -17,7 +17,7 @@
from Util import EnvUtil
from DB.DbClient import DbClient
-from Util.GetConfig import config
+from Config.ConfigGetter import config
from Util.LogHandler import LogHandler
from Util.utilFunction import verifyProxyFormat
from ProxyGetter.getFreeProxy import GetFreeProxy
diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py
index 098c8a336..6b1fa6485 100644
--- a/Schedule/ProxyValidSchedule.py
+++ b/Schedule/ProxyValidSchedule.py
@@ -56,8 +56,8 @@ def main(self):
self.log.info("Start valid useful proxy")
self.__validProxy()
else:
- self.log.info('Valid Complete! sleep 5 minutes.')
- time.sleep(60 * 5)
+ self.log.info('Valid Complete! sleep 5 sec.')
+ time.sleep(5)
self.putQueue()
def putQueue(self):
diff --git a/Test/.pytest_cache/v/cache/lastfailed b/Test/.pytest_cache/v/cache/lastfailed
deleted file mode 100644
index 65c9a06d6..000000000
--- a/Test/.pytest_cache/v/cache/lastfailed
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "testGetFreeProxy.py::testGetFreeProxy": true
-}
\ No newline at end of file
diff --git a/Test/.pytest_cache/v/cache/nodeids b/Test/.pytest_cache/v/cache/nodeids
deleted file mode 100644
index 0ce3684ce..000000000
--- a/Test/.pytest_cache/v/cache/nodeids
+++ /dev/null
@@ -1,3 +0,0 @@
-[
- "testGetFreeProxy.py::testGetFreeProxy"
-]
\ No newline at end of file
diff --git a/Test/testGetConfig.py b/Test/testConfig.py
similarity index 60%
rename from Test/testGetConfig.py
rename to Test/testConfig.py
index 7f44fa6b4..7ed759387 100644
--- a/Test/testGetConfig.py
+++ b/Test/testConfig.py
@@ -12,22 +12,22 @@
"""
__author__ = 'J_hao'
-from Util.GetConfig import GetConfig
+from Config.ConfigGetter import config
# noinspection PyPep8Naming
-def testGetConfig():
+def testConfig():
"""
- test class GetConfig in Util/GetConfig
:return:
"""
- gg = GetConfig()
- print(gg.db_type)
- print(gg.db_name)
- print(gg.db_host)
- print(gg.db_port)
- assert isinstance(gg.proxy_getter_functions, list)
- print(gg.proxy_getter_functions)
+ print(config.db_type)
+ print(config.db_name)
+ print(config.db_host)
+ print(config.db_port)
+ print(config.db_password)
+ assert isinstance(config.proxy_getter_functions, list)
+ print(config.proxy_getter_functions)
+
if __name__ == '__main__':
- testGetConfig()
+ testConfig()
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
index c25035504..65554b317 100644
--- a/Util/GetConfig.py
+++ b/Util/GetConfig.py
@@ -13,8 +13,6 @@
"""
__author__ = 'JHao'
-import os
-from Util.utilClass import ConfigParse
from Util.utilClass import LazyProperty
@@ -24,10 +22,7 @@ class GetConfig(object):
"""
def __init__(self):
- self.pwd = os.path.split(os.path.realpath(__file__))[0]
- self.config_path = os.path.join(os.path.split(self.pwd)[0], 'Config.ini')
- self.config_file = ConfigParse(defaults={"password": None})
- self.config_file.read(self.config_path)
+ pass
@LazyProperty
def db_type(self):
diff --git a/Util/utilClass.py b/Util/utilClass.py
index b3a35f141..cffe72443 100644
--- a/Util/utilClass.py
+++ b/Util/utilClass.py
@@ -9,7 +9,6 @@
-------------------------------------------------
Change Activity:
2016/12/3: Class LazyProperty
- 2016/12/4: rewrite ConfigParser
-------------------------------------------------
"""
__author__ = 'JHao'
@@ -33,24 +32,6 @@ def __get__(self, instance, owner):
return value
-try:
- from configparser import ConfigParser # py3
-except:
- from ConfigParser import ConfigParser # py2
-
-
-class ConfigParse(ConfigParser):
- """
- rewrite ConfigParser, for support upper option
- """
-
- def __init__(self, *args, **kwargs):
- ConfigParser.__init__(self, *args, **kwargs)
-
- def optionxform(self, optionstr):
- return optionstr
-
-
class Singleton(type):
"""
Singleton Metaclass
diff --git a/test.py b/test.py
index 518710d3b..d636535a9 100644
--- a/test.py
+++ b/test.py
@@ -12,4 +12,7 @@
"""
__author__ = 'JHao'
-from Schedule import ProxyRefreshSchedule
\ No newline at end of file
+from Test import testConfig
+
+if __name__ == '__main__':
+ testConfig.testConfig()
From 2b54d4af03c96515198fada0ee630cf98ea52cf9 Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 15 Feb 2019 16:06:33 +0800
Subject: [PATCH 032/298] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?=
=?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?=
=?UTF-8?q?=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Test/__init__.py | 13 +++++++++++++
1 file changed, 13 insertions(+)
create mode 100644 Test/__init__.py
diff --git a/Test/__init__.py b/Test/__init__.py
new file mode 100644
index 000000000..9b16c75ff
--- /dev/null
+++ b/Test/__init__.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+"""
+-------------------------------------------------
+ File Name: __init__
+ Description :
+ Author : JHao
+ date: 2019/2/15
+-------------------------------------------------
+ Change Activity:
+ 2019/2/15:
+-------------------------------------------------
+"""
+__author__ = 'JHao'
\ No newline at end of file
From f00a4569d26ef963656cf9b7617cec9f8780e666 Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 15 Feb 2019 16:24:37 +0800
Subject: [PATCH 033/298] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?=
=?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?=
=?UTF-8?q?=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
README.md | 61 ++++++++++++++++++++++++++++++++++---------------------
1 file changed, 38 insertions(+), 23 deletions(-)
diff --git a/README.md b/README.md
index 8bdca40c5..4f253af81 100644
--- a/README.md
+++ b/README.md
@@ -39,25 +39,41 @@ git clone git@github.com:jhao104/proxy_pool.git
pip install -r requirements.txt
```
-* 配置Config.ini:
+* 配置Config/setting.py:
```shell
-# Config.ini 为项目配置文件
-# 配置DB
-type = SSDB # 如果使用SSDB或redis数据库,均配置为SSDB
-host = localhost # db host
-port = 8888 # db port
-name = proxy # 默认配置
+# Config/setting.py 为项目配置文件
+
+# 配置DB
+DATABASES = {
+ "default": {
+ "TYPE": "SSDB", # 如果使用SSDB或redis数据库,均配置为SSDB
+ "HOST": "127.0.0.1", # db host
+ "PORT": 8888, # db port
+ "NAME": "proxy", # 默认配置
+ "PASSWORD": "" # db password
+
+ }
+}
+
# 配置 ProxyGetter
-freeProxyFirst = 1 # 这里是启动的抓取函数,可在ProxyGetter/getFreeProxy.py 扩展
-freeProxySecond = 1
-....
-# 配置 HOST (api服务)
-ip = 127.0.0.1 # 监听ip,0.0.0.0开启外网访问
-port = 5010 # 监听端口
-# 上面配置启动后,代理api地址为 http://127.0.0.1:5010
+PROXY_GETTER = [
+ "freeProxyFirst", # 这里是启用的代理抓取函数名,可在ProxyGetter/getFreeProxy.py 扩展
+ "freeProxySecond",
+ ....
+]
+
+
+# 配置 API服务
+
+SERVER_API = {
+ "HOST": "0.0.0.0", # 监听ip, 0.0.0.0 监听所有IP
+ "PORT": 5010 # 监听端口
+}
+
+# 上面配置启动后,代理池访问地址为 http://127.0.0.1:5010
```
@@ -164,18 +180,17 @@ class GetFreeProxy(object):
# 确保每个proxy都是 host:ip正确的格式就行
```
-* 2、添加好方法后,修改Config.ini文件中的`[ProxyGetter]`项:
+* 2、添加好方法后,修改Config/setting.py文件中的`PROXY_GETTER`项:
- 在`Config.ini`的`[ProxyGetter]`下添加自定义的方法的名字:
+ 在`PROXY_GETTER`下添加自定义的方法的名字:
```shell
-
-[ProxyGetter]
-;register the proxy getter function
-freeProxyFirst = 0 # 如果要取消某个方法,将其删除或赋为0即可
-....
-freeProxyCustom = 1 # 确保名字和你添加方法名字一致
-
+PROXY_GETTER = [
+ "freeProxyFirst",
+ "freeProxySecond",
+ ....
+ "freeProxyCustom" # # 确保名字和你添加方法名字一致
+]
```
From 16c5a04ba43c05608261581a6affeee1a9d1728f Mon Sep 17 00:00:00 2001
From: jhao
Date: Fri, 15 Feb 2019 16:29:33 +0800
Subject: [PATCH 034/298] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?=
=?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?=
=?UTF-8?q?=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Config.ini | 35 --------------------
Config/setting.py | 2 +-
Test/testConfig.py | 2 +-
Test/testGetFreeProxy.py | 11 +++----
Util/GetConfig.py | 71 ----------------------------------------
5 files changed, 7 insertions(+), 114 deletions(-)
delete mode 100644 Config.ini
delete mode 100644 Util/GetConfig.py
diff --git a/Config.ini b/Config.ini
deleted file mode 100644
index ee13eaf2c..000000000
--- a/Config.ini
+++ /dev/null
@@ -1,35 +0,0 @@
-[DB]
-;Configure the database information
-;type: SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB
-type = SSDB
-host = 127.0.0.1
-port = 8888
-name = proxy
-;username = your_username (Only Mongodb)
-;password = your_password
-
-[ProxyGetter]
-;register the proxy getter function
-freeProxyFirst = 1
-freeProxySecond = 1
-;freeProxyThird = 1
-freeProxyFourth = 1
-freeProxyFifth = 1
-;freeProxySixth = 1
-freeProxySeventh = 1
-;freeProxyEight = 1
-;freeProxyNinth = 1
-freeProxyTen = 1
-freeProxyEleven = 1
-freeProxyTwelve = 1
-;foreign website, outside the wall
-;freeProxyWallFirst = 1
-;freeProxyWallSecond = 1
-;freeProxyWallThird = 1
-
-[API]
-# API config http://127.0.0.1:5010
-# The ip specified when starting the web API
-ip = 0.0.0.0
-# he port on which to run the web API
-port = 8080
diff --git a/Config/setting.py b/Config/setting.py
index 39ae36748..8b87191fa 100644
--- a/Config/setting.py
+++ b/Config/setting.py
@@ -51,4 +51,4 @@
SERVER_API = {
"HOST": "0.0.0.0", # The ip specified which starting the web API
"PORT": 5010 # port number to which the server listens to
-}
\ No newline at end of file
+}
diff --git a/Test/testConfig.py b/Test/testConfig.py
index 7ed759387..ebfd1171f 100644
--- a/Test/testConfig.py
+++ b/Test/testConfig.py
@@ -2,7 +2,7 @@
"""
-------------------------------------------------
File Name: testGetConfig
- Description : test all function in GetConfig.py
+ Description : testGetConfig
Author : J_hao
date: 2017/7/31
-------------------------------------------------
diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py
index 33c3f9e46..854172773 100644
--- a/Test/testGetFreeProxy.py
+++ b/Test/testGetFreeProxy.py
@@ -16,7 +16,6 @@
import sys
import requests
-
try:
from importlib import reload # py3 实际不会实用,只是为了不显示语法错误
except:
@@ -25,7 +24,7 @@
sys.path.append('..')
from ProxyGetter.getFreeProxy import GetFreeProxy
-from Util.GetConfig import GetConfig
+from Config.ConfigGetter import config
# noinspection PyPep8Naming
@@ -34,15 +33,15 @@ def testGetFreeProxy():
test class GetFreeProxy in ProxyGetter/GetFreeProxy
:return:
"""
- gc = GetConfig()
- proxy_getter_functions = gc.proxy_getter_functions
+ proxy_getter_functions = config.proxy_getter_functions
for proxyGetter in proxy_getter_functions:
proxy_count = 0
for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
if proxy:
- print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy,proxy_count=proxy_count))
+ print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy,
+ proxy_count=proxy_count))
proxy_count += 1
- #assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter)
+ # assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter)
if __name__ == '__main__':
diff --git a/Util/GetConfig.py b/Util/GetConfig.py
deleted file mode 100644
index 65554b317..000000000
--- a/Util/GetConfig.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# -*- coding: utf-8 -*-
-# !/usr/bin/env python
-"""
--------------------------------------------------
- File Name: GetConfig.py
- Description : fetch config from config.ini
- Author : JHao
- date: 2016/12/3
--------------------------------------------------
- Change Activity:
- 2016/12/3: get db property func
--------------------------------------------------
-"""
-__author__ = 'JHao'
-
-from Util.utilClass import LazyProperty
-
-
-class GetConfig(object):
- """
- to get config from config.ini
- """
-
- def __init__(self):
- pass
-
- @LazyProperty
- def db_type(self):
- return self.config_file.get('DB', 'type')
-
- @LazyProperty
- def db_name(self):
- return self.config_file.get('DB', 'name')
-
- @LazyProperty
- def db_host(self):
- return self.config_file.get('DB', 'host')
-
- @LazyProperty
- def db_port(self):
- return int(self.config_file.get('DB', 'port'))
-
- @LazyProperty
- def db_password(self):
- return self.config_file.get('DB', 'password')
-
- @LazyProperty
- def proxy_getter_functions(self):
- return self.config_file.options('ProxyGetter')
-
- @LazyProperty
- def host_ip(self):
- return self.config_file.get('API', 'ip')
-
- @LazyProperty
- def host_port(self):
- return int(self.config_file.get('API', 'port'))
-
-
-config = GetConfig()
-
-if __name__ == '__main__':
- gg = GetConfig()
- print(gg.db_type)
- print(gg.db_name)
- print(gg.db_host)
- print(gg.db_port)
- print(gg.proxy_getter_functions)
- print(gg.host_ip)
- print(gg.host_port)
- print(gg.db_password)
From 55e71981168e57658371e27f7b9517011cca653f Mon Sep 17 00:00:00 2001
From: jhao
Date: Mon, 18 Feb 2019 10:53:03 +0800
Subject: [PATCH 035/298] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?=
=?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?=
=?UTF-8?q?=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
README.md | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/README.md b/README.md
index 4f253af81..b62864f2d 100644
--- a/README.md
+++ b/README.md
@@ -196,6 +196,27 @@ PROXY_GETTER = [
`ProxyRefreshSchedule`会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。
+### 代理采集
+
+ 目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)):
+
+ | 厂商名称 | 状态 | 更新速度 | 可用率 | 是否被墙 | 地址 |
+ | ----- | ---- | -------- | ------ | --------- | ----- |
+ | 无忧代理 | 可用 | 几分钟一次 | * | 否 | [地址](http://www.data5u.com/free/index.html) |
+ | 66代理 | 可用 | 更新很慢 | * | 否 | [地址](http://www.66ip.cn/) |
+ | 西刺代理 | 可用 | 几分钟一次 | * | 否 | [地址](http://www.xicidaili.com)|
+ | 全网代理 | 可用 | 几分钟一次 | * | 否 | [地址](http://www.goubanjia.com/)|
+ | 训代理 | 已关闭免费代理 | * | * | 否 | [地址](http://www.xdaili.cn/)|
+ | 快代理 | 可用 |几分钟一次| * | 否 | [地址](https://www.kuaidaili.com/)|
+ | 云代理 | 可用 |几分钟一次| * | 否 | [地址](http://www.ip3366.net/)|
+ | IP海 | 可用 |几小时一次| * | 否 | [地址](http://www.iphai.com/)|
+ | 免费IP代理库 | 可用 |快| * | 否 | [地址](http://ip.jiangxianli.com/)|
+ | 中国IP地址 | 可用 |几分钟一次| * | 是 | [地址](http://cn-proxy.com/)|
+ | Proxy List | 可用 |几分钟一次| * | 是 | [地址](https://proxy-list.org/chinese/index.php)|
+ | ProxyList+ | 可用 |几分钟一次| * | 是 | [地址](https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1)|
+
+ 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。
+
### 问题反馈
任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,如果没有账号可以去 我的[博客](http://www.spiderpy.cn/blog/message)中留言。
From 086074c4288167871a3c23b34346ab59db01f29c Mon Sep 17 00:00:00 2001
From: jhao
Date: Mon, 18 Feb 2019 11:17:38 +0800
Subject: [PATCH 036/298] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B066?=
=?UTF-8?q?=E4=BB=A3=E7=90=86=E9=87=87=E9=9B=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
ProxyGetter/getFreeProxy.py | 48 ++++++++++++++-----------------------
1 file changed, 18 insertions(+), 30 deletions(-)
diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py
index a560dc700..caa5b6e9c 100644
--- a/ProxyGetter/getFreeProxy.py
+++ b/ProxyGetter/getFreeProxy.py
@@ -23,18 +23,6 @@
# for debug to disable insecureWarning
requests.packages.urllib3.disable_warnings()
-"""
- 66ip.cn
- data5u.com
- xicidaili.com
- goubanjia.com
- xdaili.cn
- kuaidaili.com
- cn-proxy.com
- proxy-list.org
- www.mimiip.com to do
-"""
-
class GetFreeProxy(object):
"""
@@ -64,24 +52,24 @@ def freeProxyFirst(page=10):
print(e)
@staticmethod
- def freeProxySecond(area=33, page=1):
+ def freeProxySecond(count=20):
"""
代理66 http://www.66ip.cn/
- :param area: 抓取代理页数,page=1北京代理页,page=2上海代理页......
- :param page: 翻页
+ :param count: 提取数量
:return:
"""
- area = 33 if area > 33 else area
- for area_index in range(1, area + 1):
- for i in range(1, page + 1):
- url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i)
- html_tree = getHtmlTree(url)
- tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]")
- if len(tr_list) == 0:
- continue
- for tr in tr_list:
- yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0]
- break
+ urls = [
+ "http://www.66ip.cn/mo.php?sxb=&tqsl={count}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=",
+ "http://www.66ip.cn/nmtq.php?getnum={count}"
+ "&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip",
+ ]
+ request = WebRequest()
+ for _ in urls:
+ url = _.format(count=count)
+ html = request.get(url).content
+ ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}", html)
+ for ip in ips:
+ yield ip.strip()
@staticmethod
def freeProxyThird(days=1):
@@ -180,7 +168,7 @@ def freeProxySeventh():
@staticmethod
def freeProxyEight():
"""
- 秘密代理 http://www.mimiip.com
+ 秘密代理 http://www.mimiip.com 不能用
"""
url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 2)] # 国内高匿
url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 2)] # 国内普匿
@@ -197,7 +185,7 @@ def freeProxyEight():
@staticmethod
def freeProxyNinth():
"""
- 码农代理 https://proxy.coderbusy.com/
+ 码农代理 https://proxy.coderbusy.com/ 不能用
:return:
"""
urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
@@ -303,7 +291,7 @@ def freeProxyWallThird():
from CheckProxy import CheckProxy
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst)
- # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond)
+ CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth)
@@ -313,6 +301,6 @@ def freeProxyWallThird():
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven)
- CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve)
# CheckProxy.checkAllGetProxyFunc()
From 792fd13e780205823e872d1370daa46a8b088e97 Mon Sep 17 00:00:00 2001
From: jhao
Date: Mon, 18 Feb 2019 14:54:44 +0800
Subject: [PATCH 037/298] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0=E4=BB=A3?=
=?UTF-8?q?=E7=90=86IP=E6=8A=93=E5=8F=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Config/setting.py | 10 +++++-----
ProxyGetter/CheckProxy.py | 2 --
ProxyGetter/getFreeProxy.py | 28 +++++++++++++---------------
Test/testGetFreeProxy.py | 11 -----------
4 files changed, 18 insertions(+), 33 deletions(-)
diff --git a/Config/setting.py b/Config/setting.py
index 8b87191fa..63b4f6153 100644
--- a/Config/setting.py
+++ b/Config/setting.py
@@ -29,10 +29,10 @@
PROXY_GETTER = [
"freeProxyFirst",
"freeProxySecond",
- # "freeProxyThird",
+ # "freeProxyThird", # 网站已不能访问
"freeProxyFourth",
"freeProxyFifth",
- # "freeProxySixth"
+ # "freeProxySixth" # 不再提供免费代理
"freeProxySeventh",
# "freeProxyEight",
# "freeProxyNinth",
@@ -40,9 +40,9 @@
"freeProxyEleven",
"freeProxyTwelve",
# foreign website, outside the wall
- "freeProxyWallFirst",
- "freeProxyWallSecond",
- "freeProxyWallThird"
+ # "freeProxyWallFirst",
+ # "freeProxyWallSecond",
+ # "freeProxyWallThird"
]
diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py
index f29824723..2b3fc6a29 100644
--- a/ProxyGetter/CheckProxy.py
+++ b/ProxyGetter/CheckProxy.py
@@ -12,11 +12,9 @@
"""
__author__ = 'JHao'
-import sys
from getFreeProxy import GetFreeProxy
from Util.utilFunction import verifyProxyFormat
-sys.path.append('../')
from Util.LogHandler import LogHandler
diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py
index caa5b6e9c..cdfa843a0 100644
--- a/ProxyGetter/getFreeProxy.py
+++ b/ProxyGetter/getFreeProxy.py
@@ -88,7 +88,7 @@ def freeProxyThird(days=1):
pass
@staticmethod
- def freeProxyFourth(page_count=2):
+ def freeProxyFourth(page_count=1):
"""
西刺代理 http://www.xicidaili.com
:return:
@@ -136,7 +136,7 @@ def freeProxyFifth():
@staticmethod
def freeProxySixth():
"""
- 讯代理 http://www.xdaili.cn/
+ 讯代理 http://www.xdaili.cn/ 已停用
:return:
"""
url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
@@ -154,21 +154,19 @@ def freeProxySeventh():
快代理 https://www.kuaidaili.com
"""
url_list = [
- 'https://www.kuaidaili.com/free/inha/{page}/',
- 'https://www.kuaidaili.com/free/intr/{page}/'
+ 'https://www.kuaidaili.com/free/inha/',
+ 'https://www.kuaidaili.com/free/intr/'
]
for url in url_list:
- for page in range(1, 2):
- page_url = url.format(page=page)
- tree = getHtmlTree(page_url)
- proxy_list = tree.xpath('.//table//tr')
- for tr in proxy_list[1:]:
- yield ':'.join(tr.xpath('./td/text()')[0:2])
+ tree = getHtmlTree(url)
+ proxy_list = tree.xpath('.//table//tr')
+ for tr in proxy_list[1:]:
+ yield ':'.join(tr.xpath('./td/text()')[0:2])
@staticmethod
def freeProxyEight():
"""
- 秘密代理 http://www.mimiip.com 不能用
+ 秘密代理 http://www.mimiip.com 已停用
"""
url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 2)] # 国内高匿
url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 2)] # 国内普匿
@@ -185,7 +183,7 @@ def freeProxyEight():
@staticmethod
def freeProxyNinth():
"""
- 码农代理 https://proxy.coderbusy.com/ 不能用
+ 码农代理 https://proxy.coderbusy.com/ 已停用
:return:
"""
urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
@@ -233,7 +231,7 @@ def freeProxyEleven():
@staticmethod
def freeProxyTwelve(page_count=2):
"""
- guobanjia http://ip.jiangxianli.com/?page=
+ http://ip.jiangxianli.com/?page=
免费代理库
超多量
:return:
@@ -291,7 +289,7 @@ def freeProxyWallThird():
from CheckProxy import CheckProxy
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst)
- CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth)
@@ -300,7 +298,7 @@ def freeProxyWallThird():
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEight)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen)
- # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven)
+ CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve)
# CheckProxy.checkAllGetProxyFunc()
diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py
index 854172773..5074945b4 100644
--- a/Test/testGetFreeProxy.py
+++ b/Test/testGetFreeProxy.py
@@ -12,22 +12,11 @@
"""
__author__ = 'J_hao'
-import re
-import sys
-import requests
-try:
- from importlib import reload # py3 实际不会实用,只是为了不显示语法错误
-except:
- reload(sys)
- sys.setdefaultencoding('utf-8')
-
-sys.path.append('..')
from ProxyGetter.getFreeProxy import GetFreeProxy
from Config.ConfigGetter import config
-# noinspection PyPep8Naming
def testGetFreeProxy():
"""
test class GetFreeProxy in ProxyGetter/GetFreeProxy
From 07f9845017836d2776272e87551b55fb4a677f1a Mon Sep 17 00:00:00 2001
From: jhao
Date: Tue, 19 Feb 2019 15:24:23 +0800
Subject: [PATCH 038/298] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0=E4=BB=A3?=
=?UTF-8?q?=E7=90=86IP=E6=8A=93=E5=8F=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
doc/release_notes.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/doc/release_notes.md b/doc/release_notes.md
index 0871a2db5..36e097726 100644
--- a/doc/release_notes.md
+++ b/doc/release_notes.md
@@ -1,5 +1,11 @@
## Release Notes
+* 1.13 (2019.02)
+
+ 1.使用.py文件替换.ini作为配置文件;
+
+ 2.更新代理采集部分;
+
* 1.12 (2018.4)
1.优化代理格式检查;
From 0c48d9dc1a0e3dcb2f166882ea29ed7ad3213a21 Mon Sep 17 00:00:00 2001
From: J_hao104
Date: Tue, 5 Mar 2019 10:05:06 +0800
Subject: [PATCH 039/298] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index b62864f2d..48edb4c98 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@
* 支持版本:  
-* 测试地址: http://123.207.35.36:5010 (单机勿压。感谢)
+* 测试地址: http://118.24.52.95:5010 (单机勿压。感谢)
### 下载安装
From b568bd2092fc4aa405314968ead1102b1216f18d Mon Sep 17 00:00:00 2001
From: weak_ptr
Date: Sun, 10 Mar 2019 17:21:54 +0800
Subject: [PATCH 040/298] =?UTF-8?q?[refine]=20=E5=85=81=E8=AE=B8=20docker-?=
=?UTF-8?q?compose=20up=20=E7=9B=B4=E6=8E=A5=E8=BF=90=E8=A1=8C=E6=9C=8D?=
=?UTF-8?q?=E5=8A=A1=E8=80=8C=E6=97=A0=E9=9C=80=E4=BF=AE=E6=94=B9=E9=85=8D?=
=?UTF-8?q?=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
以下为修改内容。
- 移除现在看起来无用的 Dockerfile.develop
- 将 Dockerfile 和 docker-compose.yml 移动到项目根目录下,删除 Docker 目录
- 修改 docker-compose.yml 内容,令 docker-compose 自行构建 proxy_pool,通过环境变量传递数据库类型和域名、端口等配置信息,不再暴露 redis 端口到 host 主机
- 修改 Dockerfile 内容,先复制 requirements.txt,完成依赖安装后,再复制代码文件,避免开发迭代时每次都要等 pip install
- 修改 Config.setting 模块,先尝试通过环境变量获取配置信息,并提供未配置环境变量时的默认值。
---
Config/setting.py | 24 ++++++++++++++++++++----
Docker/Dockerfile.develop | 27 ---------------------------
Docker/docker-compose.yml | 14 --------------
Docker/Dockerfile => Dockerfile | 9 +++------
docker-compose.yml | 14 ++++++++++++++
requirements.txt | 3 ---
6 files changed, 37 insertions(+), 54 deletions(-)
delete mode 100644 Docker/Dockerfile.develop
delete mode 100644 Docker/docker-compose.yml
rename Docker/Dockerfile => Dockerfile (89%)
create mode 100644 docker-compose.yml
diff --git a/Config/setting.py b/Config/setting.py
index 63b4f6153..a74e69a32 100644
--- a/Config/setting.py
+++ b/Config/setting.py
@@ -12,12 +12,29 @@
"""
# database config
+from os import getenv
+
+
+class ConfigError(BaseException):
+ pass
+
+
+DB_TYPE = getenv('db_type', 'SSDB')
+
+if DB_TYPE == 'SSDB':
+ DB_HOST = getenv('ssdb_host', '127.0.0.1')
+ DB_PORT = getenv('ssdb_port', '6379')
+elif DB_TYPE == 'MONGODB':
+ DB_HOST = getenv('mongodb_host', '127.0.0.1')
+ DB_PORT = getenv('mongodb_host', '27017')
+else:
+ raise ConfigError('Unknown database type, your environment variable `db_type` should be one of SSDB/MONGODB.')
DATABASES = {
"default": {
- "TYPE": "SSDB", # TYPE SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB
- "HOST": "127.0.0.1",
- "PORT": 8888,
+ "TYPE": DB_TYPE, # TYPE SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB
+ "HOST": DB_HOST,
+ "PORT": DB_PORT,
"NAME": "proxy",
"PASSWORD": ""
@@ -45,7 +62,6 @@
# "freeProxyWallThird"
]
-
# # API config http://127.0.0.1:5010
SERVER_API = {
diff --git a/Docker/Dockerfile.develop b/Docker/Dockerfile.develop
deleted file mode 100644
index d97495489..000000000
--- a/Docker/Dockerfile.develop
+++ /dev/null
@@ -1,27 +0,0 @@
-FROM python:3.6
-WORKDIR /usr/src/app
-COPY . .
-ENV DEBIAN_FRONTEND noninteractive
-ENV TZ Asia/Shanghai
-
-RUN apt-get update
-RUN apt-get install vim -y
-
-RUN apt-get install -y redis-server
-RUN sed -i 's/^\(bind .*\)$/# \1/' /etc/redis/redis.conf \
- && sed -i 's/^\(databases .*\)$/databases 1/' /etc/redis/redis.conf \
- && sed -i 's/^\(daemonize .*\)$/daemonize yes/' /etc/redis/redis.conf
-# && sed -i 's/^\(dir .*\)$/# \1\ndir \/data/' /etc/redis/redis.conf \
-# && sed -i 's/^\(logfile .*\)$/# \1/' /etc/redis/redis.conf
-
-RUN pip install --no-cache-dir -r requirements.txt
-
-
-RUN echo "# ! /bin/sh " > run.sh \
- && echo "redis-server /etc/redis/redis.conf&" >> run.sh \
- && echo "cd Run" >> run.sh \
- && echo "python main.py" >> run.sh \
- && chmod 777 run.sh
-
-EXPOSE 5010
-CMD [ "sh", "run.sh" ]
diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml
deleted file mode 100644
index 9529745d5..000000000
--- a/Docker/docker-compose.yml
+++ /dev/null
@@ -1,14 +0,0 @@
-version: '2'
-services:
- proxy_pool:
- volumes:
- - ..:/usr/src/app
- ports:
- - "5010:5010"
- links:
- - proxy_redis
- image: "proxy_pool"
- proxy_redis:
- ports:
- - "6379:6379"
- image: "redis"
\ No newline at end of file
diff --git a/Docker/Dockerfile b/Dockerfile
similarity index 89%
rename from Docker/Dockerfile
rename to Dockerfile
index 6ad6f5f53..abe8ddb07 100644
--- a/Docker/Dockerfile
+++ b/Dockerfile
@@ -1,13 +1,10 @@
FROM python:3.6
-WORKDIR /usr/src/app
-COPY . .
-
ENV DEBIAN_FRONTEND noninteractive
ENV TZ Asia/Shanghai
-
+WORKDIR /usr/src/app
+COPY ./requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
-
+COPY . .
EXPOSE 5010
-
WORKDIR /usr/src/app/
CMD [ "python", "Run/main.py" ]
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 000000000..1c7f24659
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,14 @@
+version: '2'
+services:
+ proxy_pool:
+ build: .
+ ports:
+ - "5010:5010"
+ links:
+ - proxy_redis
+ environment:
+ db_type: SSDB
+ ssdb_host: proxy_redis
+ ssdb_port: 6379
+ proxy_redis:
+ image: "redis"
diff --git a/requirements.txt b/requirements.txt
index bc3581ff5..3da935240 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,8 +3,5 @@ werkzeug==0.11.15
Flask==0.12
requests==2.20.0
lxml==3.7.2
-
pymongo
redis
-
-
From 595b08861abfa0e3a4e8dfa16132686292a5815c Mon Sep 17 00:00:00 2001
From: baiyan
Date: Sun, 24 Mar 2019 00:42:18 +0800
Subject: [PATCH 041/298] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=97=A0=E5=BF=A7?=
=?UTF-8?q?=E4=BB=A3=E7=90=86=E8=A7=A3=E6=9E=90=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
ProxyGetter/getFreeProxy.py | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py
index cdfa843a0..470cbb3c2 100644
--- a/ProxyGetter/getFreeProxy.py
+++ b/ProxyGetter/getFreeProxy.py
@@ -33,7 +33,10 @@ class GetFreeProxy(object):
def freeProxyFirst(page=10):
"""
无忧代理 http://www.data5u.com/
- 几乎没有能用的
+ 无忧代理有反爬虫机制。
+ 需要获得元素的 classname。
+ 匹配classname中每个字符在key中的位置,组合得到一个整数。
+ 最后将整数右移3位得到的才是正确的端口号。
:param page: 页数
:return:
"""
@@ -42,12 +45,21 @@ def freeProxyFirst(page=10):
'http://www.data5u.com/free/gngn/index.shtml',
'http://www.data5u.com/free/gnpt/index.shtml'
]
+ key = 'ABCDEFGHIZ'
for url in url_list:
html_tree = getHtmlTree(url)
ul_list = html_tree.xpath('//ul[@class="l2"]')
for ul in ul_list:
try:
- yield ':'.join(ul.xpath('.//li/text()')[0:2])
+ ip = ul.xpath('./span[1]/li/text()')[0]
+ classnames = ul.xpath('./span[2]/li/attribute::class')[0]
+ classname = classnames.split(' ')[1]
+ port_sum = 0
+ for c in classname:
+ port_sum *= 10
+ port_sum += key.index(c)
+ port = port_sum >> 3
+ yield '{}:{}'.format(ip, port)
except Exception as e:
print(e)
From 35467fb3bc8ac5c63b6939df84aa027f820f3421 Mon Sep 17 00:00:00 2001
From: Oddcc
Date: Fri, 29 Mar 2019 14:19:19 +0800
Subject: [PATCH 042/298] Update README.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
更新文档中生产环境部署命令
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 48edb4c98..2bee689f4 100644
--- a/README.md
+++ b/README.md
@@ -96,7 +96,7 @@ SERVER_API = {
# Workdir proxy_pool
docker build -t proxy_pool .
pip install docker-compose
-docker-compose -f Docker/docker-compose.yml up -d
+docker-compose -f docker-compose.yml up -d
```
* 开发环境 Docker
From f8d039e61e0dc88ebfee43f96f9a584f07c9ca90 Mon Sep 17 00:00:00 2001
From: houbaron
Date: Wed, 8 May 2019 21:40:11 +0800
Subject: [PATCH 043/298] =?UTF-8?q?[refine]=E5=85=81=E8=AE=B8=20docker-com?=
=?UTF-8?q?pose.yml=20=E5=AE=9A=E4=B9=89=E5=AF=86=E7=A0=81=E8=80=8C?=
=?UTF-8?q?=E6=97=A0=E9=A1=BB=E4=BF=AE=E6=94=B9=20setting.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Config/setting.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/Config/setting.py b/Config/setting.py
index a74e69a32..66b8f0866 100644
--- a/Config/setting.py
+++ b/Config/setting.py
@@ -24,9 +24,11 @@ class ConfigError(BaseException):
if DB_TYPE == 'SSDB':
DB_HOST = getenv('ssdb_host', '127.0.0.1')
DB_PORT = getenv('ssdb_port', '6379')
+ DB_PASSWORD = getenv('ssdb_password', '6379')
elif DB_TYPE == 'MONGODB':
DB_HOST = getenv('mongodb_host', '127.0.0.1')
DB_PORT = getenv('mongodb_host', '27017')
+ DB_PASSWORD = getenv('mongodb_password', '6379')
else:
raise ConfigError('Unknown database type, your environment variable `db_type` should be one of SSDB/MONGODB.')
@@ -36,7 +38,7 @@ class ConfigError(BaseException):
"HOST": DB_HOST,
"PORT": DB_PORT,
"NAME": "proxy",
- "PASSWORD": ""
+ "PASSWORD": DB_PASSWORD
}
}
From bb4a7b9367a74645d1bfecbf92299260ef4bde0f Mon Sep 17 00:00:00 2001
From: houbaron
Date: Wed, 8 May 2019 21:44:56 +0800
Subject: [PATCH 044/298] =?UTF-8?q?[refine]=E8=AE=BE=E7=BD=AE=E9=BB=98?=
=?UTF-8?q?=E8=AE=A4=E5=AF=86=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Config/setting.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Config/setting.py b/Config/setting.py
index 66b8f0866..358b0bfbc 100644
--- a/Config/setting.py
+++ b/Config/setting.py
@@ -24,11 +24,11 @@ class ConfigError(BaseException):
if DB_TYPE == 'SSDB':
DB_HOST = getenv('ssdb_host', '127.0.0.1')
DB_PORT = getenv('ssdb_port', '6379')
- DB_PASSWORD = getenv('ssdb_password', '6379')
+ DB_PASSWORD = getenv('ssdb_password', '')
elif DB_TYPE == 'MONGODB':
DB_HOST = getenv('mongodb_host', '127.0.0.1')
DB_PORT = getenv('mongodb_host', '27017')
- DB_PASSWORD = getenv('mongodb_password', '6379')
+ DB_PASSWORD = getenv('mongodb_password', '')
else:
raise ConfigError('Unknown database type, your environment variable `db_type` should be one of SSDB/MONGODB.')
From f5a4317bbc96f6396d85337bba735545c437fecd Mon Sep 17 00:00:00 2001
From: hero
Date: Sat, 11 May 2019 20:09:56 +0800
Subject: [PATCH 045/298] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=85=A8=E7=BD=91?=
=?UTF-8?q?=E4=BB=A3=E7=90=86port=E9=94=99=E8=AF=AF=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
ProxyGetter/getFreeProxy.py | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py
index cdfa843a0..330bf090a 100644
--- a/ProxyGetter/getFreeProxy.py
+++ b/ProxyGetter/getFreeProxy.py
@@ -128,8 +128,20 @@ def freeProxyFifth():
try:
# :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port
ip_addr = ''.join(each_proxy.xpath(xpath_str))
- port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0]
- yield '{}:{}'.format(ip_addr, port)
+
+ # HTML中的port是随机数,真正的端口编码在class后面的字母中。
+ # 比如这个:
+ # <span class="port CFACE">9054</span>
+ # CFACE解码后对应的是3128。
+ port = 0
+ for _ in each_proxy.xpath(".//span[contains(@class, 'port')]"
+ "/attribute::class")[0]. \
+ replace("port ", ""):
+ port *= 10
+ port += (ord(_) - ord('A'))
+ port /= 8
+
+ yield '{}:{}'.format(ip_addr, int(port))
except Exception as e:
pass
From 35f43ecbe67ba869fcb3b7f044185f79a7452699 Mon Sep 17 00:00:00 2001
From: jhao
Date: Wed, 10 Jul 2019 17:17:32 +0800
Subject: [PATCH 046/298] [update] fix 272
---
Schedule/ProxyCheck.py | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py
index 4300f7bf7..782d993d1 100644
--- a/Schedule/ProxyCheck.py
+++ b/Schedule/ProxyCheck.py
@@ -15,6 +15,12 @@
import sys
from threading import Thread
+
+try:
+ from Queue import Empty # py2
+except:
+ from queue import Empty # py3
+
sys.path.append('../')
from Util.utilFunction import validUsefulProxy
@@ -35,7 +41,10 @@ def __init__(self, queue, item_dict):
def run(self):
self.db.changeTable(self.useful_proxy_queue)
while self.queue.qsize():
- proxy = self.queue.get()
+ try:
+ proxy = self.queue.get()
+ except Empty:
+ break
count = self.item_dict[proxy]
if validUsefulProxy(proxy):
# 验证通过计数器减1
@@ -53,8 +62,3 @@ def run(self):
self.db.put(proxy, num=int(count) + 1)
self.queue.task_done()
-
-if __name__ == '__main__':
- # p = ProxyCheck()
- # p.run()
- pass
From 2f39dedbf36c3838233f452323f18ddad25f9e7b Mon Sep 17 00:00:00 2001
From: jhao
Date: Thu, 11 Jul 2019 16:39:23 +0800
Subject: [PATCH 047/298] =?UTF-8?q?[update]=20=E4=BB=A3=E7=90=86=E5=AF=B9?=
=?UTF-8?q?=E8=B1=A1=E7=B1=BB=E5=9E=8B=E5=B0=81=E8=A3=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
ProxyHelper/Proxy.py | 104 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 104 insertions(+)
create mode 100644 ProxyHelper/Proxy.py
diff --git a/ProxyHelper/Proxy.py b/ProxyHelper/Proxy.py
new file mode 100644
index 000000000..dce009e96
--- /dev/null
+++ b/ProxyHelper/Proxy.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+"""
+-------------------------------------------------
+ File Name: Proxy
+ Description : 代理对象类型封装
+ Author : JHao
+ date: 2019/7/11
+-------------------------------------------------
+ Change Activity:
+ 2019/7/11: 代理对象类型封装
+-------------------------------------------------
+"""
+__author__ = 'JHao'
+
+
+class Proxy(object):
+
+ def __init__(self, proxy):
+ if isinstance(proxy, basestring):
+ self._proxy = proxy
+ self._fail_count = 0
+ self._region = ""
+ self._type = ""
+ self._last_status = ""
+ self._last_time = ""
+
+ elif isinstance(proxy, dict):
+ self._proxy = proxy.get("proxy")
+ self._fail_count = proxy.get("fail_count")
+ self._region = proxy.get("region")
+ self._type = proxy.get("type")
+ self._last_status = proxy.get("last_status")
+ self._last_time = proxy.get("last_time")
+
+ else:
+ raise TypeError("proxy arg invalid")
+
+ @property
+ def proxy(self):
+ """ 代理 ip:port """
+ return self._proxy
+
+ @property
+ def fail_count(self):
+ """ 检测失败次数 """
+ return self._fail_count
+
+ @property
+ def region(self):
+ """ 地理位置(国家/城市) """
+ return self._region
+
+ @property
+ def type(self):
+ """ 透明/匿名/高匿 """
+ return self._type
+
+ @property
+ def last_status(self):
+ """ 最后一次检测结果 """
+ return self._last_status
+
+ @property
+ def last_time(self):
+ """ 最后一次检测时间 """
+ return self._last_time
+
+ # --- proxy method ---
+ @fail_count.setter
+ def fail_count(self, value):
+ self._fail_count = value
+
+ @region.setter
+ def region(self, value):
+ self._region = value
+
+ @type.setter
+ def type(self, value):
+ self._type = value
+
+ @last_status.setter
+ def last_status(self, value):
+ self._last_status = value
+
+ @last_time.setter
+ def last_time(self, value):
+ self._last_time = value
+
+
+def proxy2Json(proxy):
+ return {"proxy": proxy.proxy,
+ "fail_count": proxy.fail_count,
+ "region": proxy.region,
+ "type": proxy.type,
+ "last_status": proxy.last_status,
+ "last_time": proxy.last_time}
+
+
+if __name__ == '__main__':
+ p = Proxy("127.0.0.1:8080")
+
+ import json
+
+ print json.dumps(p, default=proxy2Json)
From 964061e8e80baf2534652e290385f7131f880447 Mon Sep 17 00:00:00 2001
From: jhao
Date: Thu, 11 Jul 2019 17:03:31 +0800
Subject: [PATCH 048/298] =?UTF-8?q?[update]=20=E6=97=A0=E5=BF=A7=E4=BB=A3?=
=?UTF-8?q?=E7=90=86=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
ProxyGetter/getFreeProxy.py | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py
index 330bf090a..60dd884c1 100644
--- a/ProxyGetter/getFreeProxy.py
+++ b/ProxyGetter/getFreeProxy.py
@@ -30,17 +30,14 @@ class GetFreeProxy(object):
"""
@staticmethod
- def freeProxyFirst(page=10):
+ def freeProxy01():
"""
无忧代理 http://www.data5u.com/
几乎没有能用的
- :param page: 页数
:return:
"""
url_list = [
'http://www.data5u.com/',
- 'http://www.data5u.com/free/gngn/index.shtml',
- 'http://www.data5u.com/free/gnpt/index.shtml'
]
for url in url_list:
html_tree = getHtmlTree(url)
@@ -300,7 +297,7 @@ def freeProxyWallThird():
if __name__ == '__main__':
from CheckProxy import CheckProxy
- # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst)
+ CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy01())
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth)
@@ -310,7 +307,7 @@ def freeProxyWallThird():
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEight)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen)
- CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven)
+ # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve)
# CheckProxy.checkAllGetProxyFunc()
From f0c7a0f918cad270a508bed8296cd644b5dc1722 Mon Sep 17 00:00:00 2001
From: jhao
Date: Thu, 18 Jul 2019 10:00:04 +0800
Subject: [PATCH 049/298] =?UTF-8?q?[update]=20=E7=A0=B4=E8=A7=A3=E4=BB=A3?=
=?UTF-8?q?=E7=90=8666=20=E5=8A=A0=E9=80=9F=E4=B9=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
Config/setting.py | 4 +--
ProxyGetter/CheckProxy.py | 2 +-
ProxyGetter/getFreeProxy.py | 56 +++++++++++++++++++++++++++----------
Util/utilFunction.py | 12 --------
requirements.txt | 1 +
5 files changed, 46 insertions(+), 29 deletions(-)
diff --git a/Config/setting.py b/Config/setting.py
index 358b0bfbc..4ef1b76eb 100644
--- a/Config/setting.py
+++ b/Config/setting.py
@@ -46,8 +46,8 @@ class ConfigError(BaseException):
# register the proxy getter function
PROXY_GETTER = [
- "freeProxyFirst",
- "freeProxySecond",
+ "freeProxy01",
+ "freeProxy02",
# "freeProxyThird", # 网站已不能访问
"freeProxyFourth",
"freeProxyFifth",
diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py
index 2b3fc6a29..d15be49c9 100644
--- a/ProxyGetter/CheckProxy.py
+++ b/ProxyGetter/CheckProxy.py
@@ -67,4 +67,4 @@ def checkGetProxyFunc(func):
if __name__ == '__main__':
CheckProxy.checkAllGetProxyFunc()
- CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst)
+ CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy01)
\ No newline at end of file
diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py
index 60dd884c1..1b1277af4 100644
--- a/ProxyGetter/getFreeProxy.py
+++ b/ProxyGetter/getFreeProxy.py
@@ -49,24 +49,52 @@ def freeProxy01():
print(e)
@staticmethod
- def freeProxySecond(count=20):
+ def freeProxy02(count=20):
"""
代理66 http://www.66ip.cn/
:param count: 提取数量
:return:
"""
urls = [
- "http://www.66ip.cn/mo.php?sxb=&tqsl={count}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=",
- "http://www.66ip.cn/nmtq.php?getnum={count}"
- "&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip",
- ]
- request = WebRequest()
- for _ in urls:
- url = _.format(count=count)
- html = request.get(url).content
- ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}", html)
- for ip in ips:
- yield ip.strip()
+ "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=",
+ "http://www.66ip.cn/nmtq.php?getnum={}&isp=0&anonymoustype=0&s"
+ "tart=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip"
+ ]
+
+ try:
+ import execjs
+ import requests
+
+ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
+ 'Accept': '*/*',
+ 'Connection': 'keep-alive',
+ 'Accept-Language': 'zh-CN,zh;q=0.8'}
+ session = requests.session()
+ src = session.get("http://www.66ip.cn/", headers=headers).text
+ src = src.split("")[0] + '}'
+ src = src.replace("")[0] + '}'
+ src = src.replace("")[0] + '}'
+ src = src.replace("")[0] + '}'
- src = src.replace("")[0] + '}'
+# src = src.replace("")[0] + '}'
-# src = src.replace("")[0] + '}'
- src = src.replace("