diff --git a/.github/workflows/docker-image-latest.yml b/.github/workflows/docker-image-latest.yml new file mode 100644 index 000000000..6c7e00ac6 --- /dev/null +++ b/.github/workflows/docker-image-latest.yml @@ -0,0 +1,35 @@ +name: Publish Docker image latest + +on: + push: + branches: + - 'master' + +jobs: + + push_to_registry: + name: Push Docker image to Docker Hub + runs-on: ubuntu-latest + + steps: + - name: Check out the repo + uses: actions/checkout@v2 + + - name: Log in to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v3 + with: + images: jhao104/proxy_pool + + - name: Build and push Docker image + uses: docker/build-push-action@v2 + with: + context: . + push: true + tags: jhao104/proxy_pool:latest diff --git a/.github/workflows/docker-image-tags.yml b/.github/workflows/docker-image-tags.yml new file mode 100644 index 000000000..9a59645ad --- /dev/null +++ b/.github/workflows/docker-image-tags.yml @@ -0,0 +1,36 @@ +name: Publish Docker image tags + +on: + push: + tags: + - '*' + +jobs: + + push_to_registry: + name: Push Docker image to Docker Hub + runs-on: ubuntu-latest + + steps: + - name: Check out the repo + uses: actions/checkout@v2 + + - name: Log in to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v3 + with: + images: jhao104/proxy_pool + + - name: Build and push Docker image + uses: docker/build-push-action@v2 + with: + context: . 
+ push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.gitignore b/.gitignore index ee40bb363..4dae2645e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .idea/ -*.pyc \ No newline at end of file +docs/_build +*.pyc +*.log diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..529d8cf2d --- /dev/null +++ b/.travis.yml @@ -0,0 +1,16 @@ +language: python +python: + - "2.7" + - "3.5" + - "3.6" + - "3.7" + - "3.8" + - "3.9" + - "3.10" + - "3.11" +os: + - linux +install: + - pip install -r requirements.txt + +script: python test.py diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py deleted file mode 100644 index 9a59d61c9..000000000 --- a/Api/ProxyApi.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: ProxyApi.py - Description : - Author : JHao - date: 2016/12/4 -------------------------------------------------- - Change Activity: - 2016/12/4: -------------------------------------------------- -""" -__author__ = 'JHao' - -from flask import Flask, jsonify, request -import sys - -sys.path.append('../') - -from Manager.ProxyManager import ProxyManager - -app = Flask(__name__) - -api_list = { - 'get': u'get an usable proxy', - 'refresh': u'refresh proxy pool', - 'get_all': u'get all proxy from proxy pool', - 'delete?proxy=127.0.0.1:8080': u'delete an unable proxy', -} - - -@app.route('/') -def index(): - return jsonify(api_list) - - -@app.route('/get/') -def get(): - proxy = ProxyManager().get() - return proxy - - -@app.route('/refresh/') -def refresh(): - ProxyManager().refresh() - return 'success' - - -@app.route('/get_all/') -def getAll(): - proxys = ProxyManager().getAll() - return jsonify(proxys) - - -@app.route('/delete/', methods=['GET']) -def delete(): - proxy = request.args.get('proxy') - ProxyManager().delete(proxy) - return 'success' - - -if __name__ == '__main__': - app.run() diff 
--git a/Config.ini b/Config.ini deleted file mode 100644 index 1306e98af..000000000 --- a/Config.ini +++ /dev/null @@ -1,14 +0,0 @@ -[DB] -type = SSDB -host = security -port = 57888 -name = proxy - -[ProxyGetter] -;register the proxy getter function -freeProxyFirst = 1 -freeProxySecond = 1 -freeProxyThird = 1 -freeProxyFourth = 1 -freeProxyFifth = 1 - diff --git a/DB/DbClient.py b/DB/DbClient.py deleted file mode 100644 index 1a41de08d..000000000 --- a/DB/DbClient.py +++ /dev/null @@ -1,93 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: DbClient.py - Description : DB工厂类 - Author : JHao - date: 2016/12/2 -------------------------------------------------- - Change Activity: - 2016/12/2: -------------------------------------------------- -""" -__author__ = 'JHao' - -import os -import sys -from Util.GetConfig import GetConfig -from Util.utilClass import Singleton - -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - - -class DbClient(object): - """ - DbClient DB工厂类 提供get/put/pop/delete/getAll/changeTable方法 - - 目前存放代理的table/collection/hash有两种: - raw_proxy: 存放原始的代理; - useful_proxy_queue: 存放检验后的代理; - - 抽象方法定义: - get: 随机返回一个代理; - put: 放回一个代理; - delete: 删除指定代理; - getAll: 返回所有代理; - changeTable: 切换 table or collection or hash - - 所有方法需要相应类去具体实现: - SSDB:SsdbClient.py - - """ - - __metaclass__ = Singleton - - def __init__(self): - """ - init - :return: - """ - self.config = GetConfig() - self.__initDbClient() - - def __initDbClient(self): - """ - init DB Client - :return: - """ - __type = None - if "SSDB" == self.config.db_type: - __type = "SsdbClient" - else: - pass - assert __type, 'type error, Not support DB type: {}'.format(self.config.db_type) - self.client = getattr(__import__(__type), __type)(name=self.config.db_name, - host=self.config.db_host, - port=self.config.db_port) - - def get(self, **kwargs): - return self.client.get(**kwargs) - - def put(self, value, **kwargs): - return 
self.client.put(value, **kwargs) - - def pop(self, **kwargs): - return self.client.pop(**kwargs) - - def delete(self, value, **kwargs): - return self.client.delete(value, **kwargs) - - def getAll(self): - return self.client.getAll() - - def changeTable(self, name): - self.client.changeTable(name) - - -if __name__ == "__main__": - account = DbClient() - print account.get() - account.changeTable('use') - account.put('ac') - print(account) diff --git a/DB/MongodbClient.py b/DB/MongodbClient.py deleted file mode 100644 index 89b3ccbb2..000000000 --- a/DB/MongodbClient.py +++ /dev/null @@ -1,63 +0,0 @@ -# coding: utf-8 - -__author__ = 'Maps' - -from pymongo import MongoClient -import random -import json - - -class MongodbClient(object): - - def __init__(self, name, host, port): - self.name = name - self.client = MongoClient(host, port) - self.db = self.client.proxy - - - def changeTable(self, name): - self.name = name - - - def get(self): - proxy = self.getAll() - return random.choice(proxy) if proxy else None - - - def put(self, value): - if self.db[self.name].find_one({'proxy': value}): - return None - else: - self.db[self.name].insert({'proxy': value}) - - - def pop(self): - value = self.get() - if value: - self.delete(value) - return value - - - def delete(self, value): - self.db[self.name].remove({'proxy': value}) - - - def getAll(self): - return [p['proxy'] for p in self.db[self.name].find()] - - - def clean(self): - self.client.drop_database('proxy') - - - def delete_all(self): - self.db[self.name].remove() - - -if __name__ == "__main__": - db = MongodbClient('first', 'localhost', 27017) - db.put('127.0.0.1:1') - db2 = MongodbClient('second', 'localhost', 27017) - db2.put('127.0.0.1:2') - db.clean() - diff --git a/DB/RedisClient.py b/DB/RedisClient.py deleted file mode 100644 index 0f4b4ee53..000000000 --- a/DB/RedisClient.py +++ /dev/null @@ -1,67 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python - -''' -self.name为Redis中的一个key -''' - -import random 
-import json -import redis - -class ReidsClient(object): - """ - Reids client - """ - - def __init__(self, name, host, port): - """ - init - :param name: - :param host: - :param port: - :return: - """ - self.name = name - self.__conn = redis.Redis(host=host, port=port, db=0) - - def get(self): - """ - get an item - :return: - """ - values = self.__conn.smembers(name=self.name) - - return random.choice(list(values)) if values else None - - def put(self, value): - """ - put an item - :param value: - :return: - """ - value = json.dump(value, ensure_ascii=False).encode('utf-8') if isinstance(value, (dict, list)) else value - return self.__conn.sadd(self.name, value) - - def pop(self): - """ - pop an item - :return: - """ - value = self.get() - if value: - self.__conn.spop(self.name, value) - return value - - def delete(self, value): - """ - delete an item - :param key: - :return: - """ - self.__conn.srem(self.name, value) - - def getAll(self): - return self.__conn.smembers(self.name) - - diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py deleted file mode 100644 index 125e2d590..000000000 --- a/DB/SsdbClient.py +++ /dev/null @@ -1,88 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: SsdbClient.py - Description : 封装SSDB操作 - Author : JHao - date: 2016/12/2 -------------------------------------------------- - Change Activity: - 2016/12/2: -------------------------------------------------- -""" -__author__ = 'JHao' - -from ssdb.connection import BlockingConnectionPool -from ssdb import SSDB -import random -import json - - -class SsdbClient(object): - """ - SSDB client - - SSDB中代理存放的容器为hash: - 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为None,以后扩展可能会加入代理属性; - 验证后供flask使用的代理存放在name为useful_proxy_queue的hash中,key为代理的ip:port,value为None,以后扩展可能会加入代理属性; - - """ - - def __init__(self, name, host, port): - """ - init - :param name: hash name - :param host: ssdb host - :param port: ssdb port - :return: - """ 
- self.name = name - self.__conn = SSDB(connection_pool=BlockingConnectionPool(host=host, port=port)) - - def get(self): - """ - get an item - - 从useful_proxy_queue随机获取一个可用代理, 使用前需要调用changeTable("useful_proxy_queue") - :return: - """ - values = self.__conn.hgetall(name=self.name) - return random.choice(values.keys()) if values else None - - def put(self, value): - """ - put an item - - 将代理放入hash, 使用changeTable指定hash name - :param value: - :return: - """ - value = json.dump(value, ensure_ascii=False).encode('utf-8') if isinstance(value, (dict, list)) else value - return self.__conn.hset(self.name, value, None) - - def pop(self): - """ - pop an item - - 弹出一个代理, 使用changeTable指定hash name - :return: - """ - key = self.get() - if key: - self.__conn.hdel(self.name, key) - return key - - def delete(self, key): - """ - delete an item - :param key: - :return: - """ - self.__conn.hdel(self.name, key) - - def getAll(self): - return self.__conn.hgetall(self.name).keys() - - def changeTable(self, name): - self.name = name diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..89019cd7f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.6-alpine + +MAINTAINER jhao104 + +WORKDIR /app + +COPY ./requirements.txt . + +# apk repository +RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.ustc.edu.cn/g' /etc/apk/repositories + +# timezone +RUN apk add -U tzdata && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && apk del tzdata + +# runtime environment +RUN apk add musl-dev gcc libxml2-dev libxslt-dev && \ + pip install --no-cache-dir -r requirements.txt && \ + apk del gcc musl-dev + +COPY . . 
+ +EXPOSE 5010 + +ENTRYPOINT [ "sh", "start.sh" ] diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..158b3ac08 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 J_hao104 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py deleted file mode 100644 index a1096ca76..000000000 --- a/Manager/ProxyManager.py +++ /dev/null @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: ProxyManager.py - Description : - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: -------------------------------------------------- -""" -__author__ = 'JHao' - -from DB.DbClient import DbClient -from Util.GetConfig import GetConfig -from ProxyGetter.getFreeProxy import GetFreeProxy - - -class ProxyManager(object): - """ - ProxyManager - """ - - def __init__(self): - self.db = DbClient() - self.config = GetConfig() - self.raw_proxy_queue = 'raw_proxy' - self.useful_proxy_queue = 'useful_proxy_queue' - - def refresh(self): - """ - fetch proxy into Db by ProxyGetter - :return: - """ - for proxyGetter in self.config.proxy_getter_functions: - proxy_set = set() - # fetch raw proxy - for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): - proxy_set.add(proxy) - - # store raw proxy - self.db.changeTable(self.raw_proxy_queue) - for proxy in proxy_set: - self.db.put(proxy) - - def get(self): - """ - return a useful proxy - :return: - """ - self.db.changeTable(self.useful_proxy_queue) - return self.db.pop() - - def delete(self, proxy): - """ - delete proxy from pool - :param proxy: - :return: - """ - self.db.changeTable(self.useful_proxy_queue) - self.db.delete(proxy) - - def getAll(self): - """ - get all proxy from pool - :return: - """ - self.db.changeTable(self.useful_proxy_queue) - return self.db.getAll() - - -if __name__ == '__main__': - pp = ProxyManager() - pp.refresh() diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py deleted file mode 100644 index add5aee41..000000000 --- a/ProxyGetter/getFreeProxy.py +++ /dev/null @@ -1,110 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" 
-------------------------------------------------- - File Name: GetFreeProxy.py - Description : 抓取免费代理 - Author : JHao - date: 2016/11/25 -------------------------------------------------- - Change Activity: - 2016/11/25: -------------------------------------------------- -""" -import re -import sys -import requests - -reload(sys) -sys.setdefaultencoding('utf-8') - -from Util.utilFunction import robustCrawl, getHtmlTree - - -class GetFreeProxy(object): - """ - proxy getter - """ - - def __init__(self): - pass - - @staticmethod - @robustCrawl - def freeProxyFirst(page=10): - """ - 抓取快代理IP http://www.kuaidaili.com/ - :param page: 翻页数 - :return: - """ - url_list = ('http://www.kuaidaili.com/proxylist/{page}/'.format(page=page) for page in range(1, page + 1)) - # 页数不用太多, 后面的全是历史IP, 可用性不高 - for url in url_list: - tree = getHtmlTree(url) - proxy_list = tree.xpath('.//div[@id="index_free_list"]//tbody/tr') - for proxy in proxy_list: - yield ':'.join(proxy.xpath('./td/text()')[0:2]) - - @staticmethod - @robustCrawl - def freeProxySecond(proxy_number=100): - """ - 抓取代理66 http://www.66ip.cn/ - :param proxy_number: 代理数量 - :return: - """ - url = "http://m.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( - proxy_number) - html = requests.get(url).content - for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): - yield proxy - - @staticmethod - @robustCrawl - def freeProxyThird(days=1): - """ - 抓取有代理 http://www.youdaili.net/Daili/http/ - :param days: - :return: - """ - url = "http://www.youdaili.net/Daili/http/" - tree = getHtmlTree(url) - page_url_list = tree.xpath('.//div[@class="chunlist"]/ul//a/@href')[0:days] - for page_url in page_url_list: - html = requests.get(page_url).content - proxy_list = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html) - for proxy in proxy_list: - yield proxy - - @staticmethod - @robustCrawl - def freeProxyFourth(): - """ - 抓取西刺代理 http://api.xicidaili.com/free2016.txt 
- :return: - """ - url = "http://api.xicidaili.com/free2016.txt" - html = requests.get(url).content - for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): - yield proxy - - @staticmethod - @robustCrawl - def freeProxyFifth(): - """ - 抓取guobanjia http://www.goubanjia.com/free/gngn/index.shtml - :return: - """ - url = "http://www.goubanjia.com/free/gngn/index.shtml" - tree = getHtmlTree(url) - # 现在每天最多放15个(一页) - for i in xrange(15): - d = tree.xpath('.//table[@class="table"]/tbody/tr[{}]/td'.format(i + 1))[0] - o = d.xpath('.//span/text() | .//div/text()') - yield ''.join(o[:-1]) + ':' + o[-1] - - -if __name__ == '__main__': - gg = GetFreeProxy() - for e in gg.freeProxyFifth(): - print e diff --git a/README.md b/README.md index 0161cb780..32d3a3518 100644 --- a/README.md +++ b/README.md @@ -1,150 +1,245 @@ -爬虫代理IP池 +ProxyPool 爬虫代理IP池 ======= -[![Yii2](https://img.shields.io/badge/Powered_by-Yii_Framework-green.svg?style=flat)](http://www.spiderpy.cn/) +[![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool) +[![](https://img.shields.io/badge/Powered%20by-@j_hao104-green.svg)](http://www.spiderpy.cn/blog/) +[![Packagist](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/jhao104/proxy_pool/blob/master/LICENSE) +[![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool/graphs/contributors) +[![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool) + ______ ______ _ + | ___ \_ | ___ \ | | + | |_/ / \__ __ __ _ __ _ | |_/ /___ ___ | | + | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | + | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ + \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____\ + __ / / + /___ / -> 在公司做分布式深网爬虫,搭建了一套稳定的代理池服务,为上千个爬虫提供有效的代理,保证各个爬虫拿到的都是对应网站有效的代理IP,从而保证爬虫快速稳定的运行,当然在公司做的东西不能开源出来。不过呢,闲暇时间手痒,所以就想利用一些免费的资源搞一个简单的代理池服务。 - +### 
ProxyPool -### 1、问题 +爬虫代理IP池项目,主要功能为定时采集网上发布的免费代理验证入库,定时验证入库的代理保证代理的可用性,提供API和CLI两种使用方式。同时你也可以扩展代理源以增加代理池IP的质量和数量。 -* 代理IP从何而来? +* 文档: [document](https://proxy-pool.readthedocs.io/zh/latest/) [![Documentation Status](https://readthedocs.org/projects/proxy-pool/badge/?version=latest)](https://proxy-pool.readthedocs.io/zh/latest/?badge=latest) -  刚自学爬虫的时候没有代理IP就去西刺、快代理之类有免费代理的网站去爬,还是有个别代理能用。当然,如果你有更好的代理接口也可以自己接入。 -  免费代理的采集也很简单,无非就是:访问页面页面 —> 正则/xpath提取 —> 保存 +* 支持版本: [![](https://img.shields.io/badge/Python-2.7-green.svg)](https://docs.python.org/2.7/) +[![](https://img.shields.io/badge/Python-3.5-blue.svg)](https://docs.python.org/3.5/) +[![](https://img.shields.io/badge/Python-3.6-blue.svg)](https://docs.python.org/3.6/) +[![](https://img.shields.io/badge/Python-3.7-blue.svg)](https://docs.python.org/3.7/) +[![](https://img.shields.io/badge/Python-3.8-blue.svg)](https://docs.python.org/3.8/) +[![](https://img.shields.io/badge/Python-3.9-blue.svg)](https://docs.python.org/3.9/) +[![](https://img.shields.io/badge/Python-3.10-blue.svg)](https://docs.python.org/3.10/) +[![](https://img.shields.io/badge/Python-3.11-blue.svg)](https://docs.python.org/3.11/) -* 如何保证代理质量? +* 测试地址: http://demo.spiderpy.cn (勿压谢谢) -  可以肯定免费的代理IP大部分都是不能用的,不然别人为什么还提供付费的(不过事实是很多代理商的付费IP也不稳定,也有很多是不能用)。所以采集回来的代理IP不能直接使用,可以写检测程序不断的去用这些代理访问一个稳定的网站,看是否可以正常使用。这个过程可以使用多线程或异步的方式,因为检测代理是个很慢的过程。 +* 付费代理推荐: [luminati-china](https://get.brightdata.com/github_jh). 国外的亮数据BrightData(以前叫luminati)被认为是代理市场领导者,覆盖全球的7200万IP,大部分是真人住宅IP,成功率扛扛的。付费套餐多种,需要高质量代理IP的可以注册后联系中文客服。[申请免费试用](https://get.brightdata.com/github_jh) 目前有50%折扣优惠活动。(PS:用不明白的同学可以参考这个[使用教程](https://www.cnblogs.com/jhao/p/15611785.html))。 -* 采集回来的代理如何存储? -  这里不得不推荐一个高性能支持多种数据结构的NoSQL数据库[SSDB](http://ssdb.io/docs/zh_cn/),用于代理Redis。支持队列、hash、set、k-v对,支持T级别数据。是做分布式爬虫很好中间存储工具。 +### 运行项目 -* 如何让爬虫更简单的使用这些代理? 
+##### 下载代码: -  答案肯定是做成服务咯,python有这么多的web框架,随便拿一个来写个api供爬虫调用。这样有很多好处,比如:当爬虫发现代理不能使用可以主动通过api去delete代理IP,当爬虫发现代理池IP不够用时可以主动去refresh代理池。这样比检测程序更加靠谱。 +* git clone -### 2、代理池设计 +```bash +git clone git@github.com:jhao104/proxy_pool.git +``` -  代理池由四部分组成: +* releases -* ProxyGetter: +```bash +https://github.com/jhao104/proxy_pool/releases 下载对应zip文件 +``` -  代理获取接口,目前有5个免费代理源,每调用一次就会抓取这个5个网站的最新代理放入DB,可自行添加额外的代理获取接口; +##### 安装依赖: -* DB: +```bash +pip install -r requirements.txt +``` -  用于存放代理IP,现在暂时只支持SSDB。至于为什么选择SSDB,大家可以参考这篇[文章](https://www.sdk.cn/news/2684),个人觉得SSDB是个不错的Redis替代方案,如果你没有用过SSDB,安装起来也很简单,可以参考[这里](https://github.com/jhao104/memory-notes/blob/master/SSDB/SSDB%E5%AE%89%E8%A3%85%E9%85%8D%E7%BD%AE%E8%AE%B0%E5%BD%95.md); +##### 更新配置: -* Schedule: -  计划任务用户定时去检测DB中的代理可用性,删除不可用的代理。同时也会主动通过ProxyGetter去获取最新代理放入DB; +```python +# setting.py 为项目配置文件 -* ProxyApi: +# 配置API服务 -  代理池的外部接口,由于现在这么代理池功能比较简单,花两个小时看了下[Flask](http://flask.pocoo.org/),愉快的决定用Flask搞定。功能是给爬虫提供get/delete/refresh等接口,方便爬虫直接使用。 - -![设计](https://pic2.zhimg.com/v2-f2756da2986aa8a8cab1f9562a115b55_b.png) +HOST = "0.0.0.0" # IP +PORT = 5000 # 监听端口 -### 3、代码模块 -  Python中高层次的数据结构,动态类型和动态绑定,使得它非常适合于快速应用开发,也适合于作为胶水语言连接已有的软件部件。用Python来搞这个代理IP池也很简单,代码分为6个模块: +# 配置数据库 -* Api: +DB_CONN = 'redis://:pwd@127.0.0.1:8888/0' -  api接口相关代码,目前api是由Flask实现,代码也非常简单。客户端请求传给Flask,Flask调用ProxyManager中的实现,包括`get/delete/refresh/get_all`; -* DB: +# 配置 ProxyFetcher -  数据库相关代码,目前数据库是采用SSDB。代码用工厂模式实现,方便日后扩展其他类型数据库; +PROXY_FETCHER = [ + "freeProxy01", # 这里是启用的代理抓取方法名,所有fetch方法位于fetcher/proxyFetcher.py + "freeProxy02", + # .... 
+] +``` -* Manager: +#### 启动项目: -  `get/delete/refresh/get_all`等接口的具体实现类,目前代理池只负责管理proxy,日后可能会有更多功能,比如代理和爬虫的绑定,代理和账号的绑定等等; +```bash +# 如果已经具备运行条件, 可用通过proxyPool.py启动。 +# 程序分为: schedule 调度程序 和 server Api服务 -* ProxyGetter: +# 启动调度程序 +python proxyPool.py schedule -  代理获取的相关代码,目前抓取了[快代理](http://www.kuaidaili.com)、[代理66](http://www.66ip.cn/)、[有代理](http://www.youdaili.net/Daili/http/)、[西刺代理](http://api.xicidaili.com/free2016.txt)、[guobanjia](http://www.goubanjia.com/free/gngn/index.shtml)这个五个网站的免费代理,经测试这个5个网站每天更新的可用代理只有六七十个,当然也支持自己扩展代理接口; +# 启动webApi服务 +python proxyPool.py server -* Schedule: +``` -  定时任务相关代码,现在只是实现定时去刷新代码,并验证可用代理,采用多进程方式; +### Docker Image -* Util: +```bash +docker pull jhao104/proxy_pool -  存放一些公共的模块方法或函数,包含`GetConfig`:读取配置文件config.ini的类,`ConfigParse`: 集成重写ConfigParser的类,使其对大小写敏感, `Singleton`:实现单例,`LazyProperty`:实现类属性惰性计算。等等; +docker run --env DB_CONN=redis://:password@ip:port/0 -p 5010:5010 jhao104/proxy_pool:latest +``` +### docker-compose -* 其他文件: +项目目录下运行: +``` bash +docker-compose up -d +``` -  配置文件:Config.ini,数据库配置和代理获取接口配置,可以在GetFreeProxy中添加新的代理获取方法,并在Config.ini中注册即可使用; +### 使用 -### 4、安装 +* Api -下载代码: -``` -git clone git@github.com:jhao104/proxy_pool.git +启动web服务后, 默认配置下会开启 http://127.0.0.1:5010 的api接口服务: -或者直接到https://github.com/jhao104/proxy_pool 下载zip文件 -``` +| api | method | Description | params| +| ----| ---- | ---- | ----| +| / | GET | api介绍 | None | +| /get | GET | 随机获取一个代理| 可选参数: `?type=https` 过滤支持https的代理| +| /pop | GET | 获取并删除一个代理| 可选参数: `?type=https` 过滤支持https的代理| +| /all | GET | 获取所有代理 |可选参数: `?type=https` 过滤支持https的代理| +| /count | GET | 查看代理数量 |None| +| /delete | GET | 删除代理 |`?proxy=host:ip`| -安装依赖: -``` -pip install -r requirements.txt + +* 爬虫使用 + +  如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: + +```python +import requests + +def get_proxy(): + return requests.get("http://127.0.0.1:5010/get/").json() + +def delete_proxy(proxy): + requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) + +# your spider code + +def getHtml(): + 
# .... + retry_count = 5 + proxy = get_proxy().get("proxy") + while retry_count > 0: + try: + html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) + # 使用代理访问 + return html + except Exception: + retry_count -= 1 + # 删除代理池中代理 + delete_proxy(proxy) + return None ``` -启动: +### 扩展代理 + +  项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,所以如果直接运行可能拿到的代理质量不理想。所以,提供了代理获取的扩展方法。 + +  添加一个新的代理源方法如下: + +* 1、首先在[ProxyFetcher](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L21)类中添加自定义的获取代理的静态方法, +该方法需要以生成器(yield)形式返回`host:ip`格式的代理,例如: +```python + +class ProxyFetcher(object): + # .... + + # 自定义代理源获取方法 + @staticmethod + def freeProxyCustom1(): # 命名不和已有重复即可 + + # 通过某网站或者某接口或某数据库获取代理 + # 假设你已经拿到了一个代理列表 + proxies = ["x.x.x.x:3128", "x.x.x.x:80"] + for proxy in proxies: + yield proxy + # 确保每个proxy都是 host:ip正确的格式返回 ``` -需要分别启动定时任务和api -到Config.ini中配置你的SSDB -项目目录下: ->>>python -m Schedule.ProxyRefreshSchedule +* 2、添加好方法后,修改[setting.py](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47)文件中的`PROXY_FETCHER`项: + +  在`PROXY_FETCHER`下添加自定义方法的名字: -到Api目录下: ->>>python -m Api.ProxyApi +```python +PROXY_FETCHER = [ + "freeProxy01", + "freeProxy02", + # .... 
+ "freeProxyCustom1" # # 确保名字和你添加方法名字一致 +] ``` -### 5、使用 -  定时任务启动后,会通过代理获取方法fetch所有代理放入数据库并验证。此后默认每20分钟会重复执行一次。定时任务启动大概一两分钟后,便可在SSDB中看到刷新出来的可用的代理: - -![useful_proxy](https://pic2.zhimg.com/v2-12f9b7eb72f60663212f317535a113d1_b.png) - -  启动ProxyApi.py后即可在浏览器中使用接口获取代理,一下是浏览器中的截图: -  index页面: +  `schedule` 进程会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 -![index](https://pic3.zhimg.com/v2-a867aa3db1d413fea8aeeb4c693f004a_b.png) - -  get: +### 免费代理源 -![get](https://pic1.zhimg.com/v2-f54b876b428893235533de20f2edbfe0_b.png) + 目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)): + + | 代理名称 | 状态 | 更新速度 | 可用率 | 地址 | 代码 | + |---------------| ---- | -------- | ------ | ----- |------------------------------------------------| + | 66代理 | ✔ | ★ | * | [地址](http://www.66ip.cn/) | [`freeProxy02`](/fetcher/proxyFetcher.py#L50) | + | 开心代理 | ✔ | ★ | * | [地址](http://www.kxdaili.com/) | [`freeProxy03`](/fetcher/proxyFetcher.py#L63) | + | FreeProxyList | ✔ | ★ | * | [地址](https://www.freeproxylists.net/zh/) | [`freeProxy04`](/fetcher/proxyFetcher.py#L74) | + | 快代理 | ✔ | ★ | * | [地址](https://www.kuaidaili.com/) | [`freeProxy05`](/fetcher/proxyFetcher.py#L92) | + | 冰凌代理 | ✔ | ★★★ | * | [地址](https://www.binglx.cn/) | [`freeProxy06`](/fetcher/proxyFetcher.py#L111) | + | 云代理 | ✔ | ★ | * | [地址](http://www.ip3366.net/) | [`freeProxy07`](/fetcher/proxyFetcher.py#L123) | + | 小幻代理 | ✔ | ★★ | * | [地址](https://ip.ihuan.me/) | [`freeProxy08`](/fetcher/proxyFetcher.py#L133) | + | 免费代理库 | ✔ | ☆ | * | [地址](http://ip.jiangxianli.com/) | [`freeProxy09`](/fetcher/proxyFetcher.py#L143) | + | 89代理 | ✔ | ☆ | * | [地址](https://www.89ip.cn/) | [`freeProxy10`](/fetcher/proxyFetcher.py#L154) | + | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.ne) | [`freeProxy11`](/fetcher/proxyFetcher.py#L164) | -  get_all: + + 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。 -![get_all](https://pic3.zhimg.com/v2-5c79f8c07e04f9ef655b9bea406d0306_b.png) 
- +### 问题反馈 -  爬虫中使用,如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: -``` -import requests +  任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,同时也可以到我的[博客](http://www.spiderpy.cn/blog/message)中留言。 -def get_proxy(): - return requests.get("http://127.0.0.1:5000/get/").content +  你的反馈会让此项目变得更加完美。 -def delete_proxy(proxy): - requests.get("http://127.0.0.1:5000/delete/?proxy={}".format(proxy)) +### 贡献代码 -# your spider code +  本项目仅作为基本的通用的代理池架构,不接收特有功能(当然,不限于特别好的idea)。 -def spider(): - # .... - requests.get('https://www.example.com', proxies={"http": "http://{}".format(get_proxy)}) - # .... +  本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,我会尽力改进,使她更加完美。 -``` +  这里感谢以下contributor的无私奉献: + +  [@kangnwh](https://github.com/kangnwh) | [@bobobo80](https://github.com/bobobo80) | [@halleywj](https://github.com/halleywj) | [@newlyedward](https://github.com/newlyedward) | [@wang-ye](https://github.com/wang-ye) | [@gladmo](https://github.com/gladmo) | [@bernieyangmh](https://github.com/bernieyangmh) | [@PythonYXY](https://github.com/PythonYXY) | [@zuijiawoniu](https://github.com/zuijiawoniu) | [@netAir](https://github.com/netAir) | [@scil](https://github.com/scil) | [@tangrela](https://github.com/tangrela) | [@highroom](https://github.com/highroom) | [@luocaodan](https://github.com/luocaodan) | [@vc5](https://github.com/vc5) | [@1again](https://github.com/1again) | [@obaiyan](https://github.com/obaiyan) | [@zsbh](https://github.com/zsbh) | [@jiannanya](https://github.com/jiannanya) | [@Jerry12228](https://github.com/Jerry12228) + + +### Release Notes + + [changelog](https://github.com/jhao104/proxy_pool/blob/master/docs/changelog.rst) -### 6、最后 -  时间仓促,功能和代码都比较简陋,以后有时间再改进。喜欢的在github上给个star。感谢! 
\ No newline at end of file +Featured|HelloGitHub diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py deleted file mode 100644 index d678b75cb..000000000 --- a/Schedule/ProxyRefreshSchedule.py +++ /dev/null @@ -1,74 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: ProxyRefreshSchedule.py - Description : 代理定时刷新 - Author : JHao - date: 2016/12/4 -------------------------------------------------- - Change Activity: - 2016/12/4: 代理定时刷新 -------------------------------------------------- -""" -__author__ = 'JHao' - -from apscheduler.schedulers.blocking import BlockingScheduler -from multiprocessing import Process -import requests -import time -import sys - -sys.path.append('../') - -from Manager.ProxyManager import ProxyManager - - -class ProxyRefreshSchedule(ProxyManager): - """ - 代理定时刷新 - """ - - def __init__(self): - ProxyManager.__init__(self) - - def validProxy(self): - self.db.changeTable(self.raw_proxy_queue) - raw_proxy = self.db.pop() - while raw_proxy: - proxies = {"http": "http://{proxy}".format(proxy=raw_proxy), - "https": "https://{proxy}".format(proxy=raw_proxy)} - try: - r = requests.get('https://www.baidu.com/', proxies=proxies, timeout=50, verify=False) - if r.status_code == 200: - self.db.changeTable(self.useful_proxy_queue) - self.db.put(raw_proxy) - except Exception as e: - # print e - pass - self.db.changeTable(self.raw_proxy_queue) - raw_proxy = self.db.pop() - - -def refreshPool(): - pp = ProxyRefreshSchedule() - pp.validProxy() - - -def main(process_num=100): - p = ProxyRefreshSchedule() - p.refresh() - - for num in range(process_num): - P = Process(target=refreshPool, args=()) - P.start() - print '{time}: refresh complete!'.format(time=time.ctime()) - - -if __name__ == '__main__': - # pp = ProxyRefreshSchedule() - # pp.main() - main() - sched = BlockingScheduler() - sched.add_job(main, 'interval', minutes=20) - sched.start() diff --git 
a/Schedule/__init__.py b/Schedule/__init__.py deleted file mode 100644 index e94e59d11..000000000 --- a/Schedule/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -""" -------------------------------------------------- - File Name: __init__.py.py - Description : - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: -------------------------------------------------- -""" -__author__ = 'JHao' \ No newline at end of file diff --git a/Util/GetConfig.py b/Util/GetConfig.py deleted file mode 100644 index 766a29d54..000000000 --- a/Util/GetConfig.py +++ /dev/null @@ -1,59 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: GetConfig.py - Description : fetch config from config.ini - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: get db property func -------------------------------------------------- -""" -__author__ = 'JHao' - -import os -from Util.utilClass import ConfigParse -from Util.utilClass import LazyProperty - - -class GetConfig(object): - """ - to get config from config.ini - """ - - def __init__(self): - self.pwd = os.path.split(os.path.realpath(__file__))[0] - self.config_path = os.path.join(os.path.split(self.pwd)[0], 'Config.ini') - self.config_file = ConfigParse() - self.config_file.read(self.config_path) - - @LazyProperty - def db_type(self): - return self.config_file.get('DB', 'type') - - @LazyProperty - def db_name(self): - return self.config_file.get('DB', 'name') - - @LazyProperty - def db_host(self): - return self.config_file.get('DB', 'host') - - @LazyProperty - def db_port(self): - return int(self.config_file.get('DB', 'port')) - - @LazyProperty - def proxy_getter_functions(self): - return self.config_file.options('ProxyGetter') - - -if __name__ == '__main__': - gg = GetConfig() - print gg.db_type - print gg.db_name - print 
gg.db_host - print gg.db_port - print gg.proxy_getter_functions diff --git a/Util/utilClass.py b/Util/utilClass.py deleted file mode 100644 index 34f83acfb..000000000 --- a/Util/utilClass.py +++ /dev/null @@ -1,61 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: utilClass.py - Description : tool class - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: Class LazyProperty - 2016/12/4: rewrite ConfigParser -------------------------------------------------- -""" -__author__ = 'JHao' - - -class LazyProperty(object): - """ - LazyProperty - explain: http://www.spiderpy.cn/blog/5/ - """ - - def __init__(self, func): - self.func = func - - def __get__(self, instance, owner): - if instance is None: - return self - else: - value = self.func(instance) - setattr(instance, self.func.__name__, value) - return value - - -from ConfigParser import ConfigParser - - -class ConfigParse(ConfigParser): - """ - rewrite ConfigParser, for support upper option - """ - - def __init__(self): - ConfigParser.__init__(self) - - def optionxform(self, optionstr): - return optionstr - - -class Singleton(type): - """ - Singleton Metaclass - """ - - _inst = {} - - def __call__(cls, *args, **kwargs): - if cls not in cls._inst: - cls._inst[cls] = super(Singleton, cls).__call__(*args) - return cls._inst[cls] diff --git a/Util/utilFunction.py b/Util/utilFunction.py deleted file mode 100644 index 2e3816f22..000000000 --- a/Util/utilFunction.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: utilFunction.py - Description : tool function - Author : JHao - date: 2016/11/25 -------------------------------------------------- - Change Activity: - 2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree -------------------------------------------------- -""" - - -# noinspection 
PyPep8Naming -def robustCrawl(func): - def decorate(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as e: - print u"sorry, 抓取出错。错误原因:" - print e - - return decorate - - -def verifyProxy(proxy): - """ - 检查代理格式 - :param proxy: - :return: - """ - import re - verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}" - return True if re.findall(verify_regex, proxy) else False - - -def getHtmlTree(url, **kwargs): - """ - 获取html树 - :param url: - :param kwargs: - :return: - """ - import requests - from lxml import etree - html = requests.get(url=url).content - return etree.HTML(html) diff --git a/_config.yml b/_config.yml new file mode 100644 index 000000000..c4192631f --- /dev/null +++ b/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-cayman \ No newline at end of file diff --git a/Api/__init__.py b/api/__init__.py similarity index 94% rename from Api/__init__.py rename to api/__init__.py index c511f3103..09c93434c 100644 --- a/Api/__init__.py +++ b/api/__init__.py @@ -10,4 +10,5 @@ 2016/12/3: ------------------------------------------------- """ -__author__ = 'JHao' \ No newline at end of file +__author__ = 'JHao' + diff --git a/api/proxyApi.py b/api/proxyApi.py new file mode 100644 index 000000000..bd2de57e2 --- /dev/null +++ b/api/proxyApi.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +# !/usr/bin/env python +""" +------------------------------------------------- + File Name: ProxyApi.py + Description : WebApi + Author : JHao + date: 2016/12/4 +------------------------------------------------- + Change Activity: + 2016/12/04: WebApi + 2019/08/14: 集成Gunicorn启动方式 + 2020/06/23: 新增pop接口 + 2022/07/21: 更新count接口 +------------------------------------------------- +""" +__author__ = 'JHao' + +import platform +from werkzeug.wrappers import Response +from flask import Flask, jsonify, request + +from util.six import iteritems +from helper.proxy import Proxy +from handler.proxyHandler import ProxyHandler +from handler.configHandler import ConfigHandler 
+ +app = Flask(__name__) +conf = ConfigHandler() +proxy_handler = ProxyHandler() + + +class JsonResponse(Response): + @classmethod + def force_type(cls, response, environ=None): + if isinstance(response, (dict, list)): + response = jsonify(response) + + return super(JsonResponse, cls).force_type(response, environ) + + +app.response_class = JsonResponse + +api_list = [ + {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"}, + {"url": "/pop", "params": "", "desc": "get and delete a proxy"}, + {"url": "/delete", "params": "proxy: 'e.g. 127.0.0.1:8080'", "desc": "delete an unable proxy"}, + {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"}, + {"url": "/count", "params": "", "desc": "return proxy count"} + # 'refresh': 'refresh proxy pool', +] + + +@app.route('/') +def index(): + return {'url': api_list} + + +@app.route('/get/') +def get(): + https = request.args.get("type", "").lower() == 'https' + proxy = proxy_handler.get(https) + return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} + + +@app.route('/pop/') +def pop(): + https = request.args.get("type", "").lower() == 'https' + proxy = proxy_handler.pop(https) + return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} + + +@app.route('/refresh/') +def refresh(): + # TODO refresh会有守护程序定时执行,由api直接调用性能较差,暂不使用 + return 'success' + + +@app.route('/all/') +def getAll(): + https = request.args.get("type", "").lower() == 'https' + proxies = proxy_handler.getAll(https) + return jsonify([_.to_dict for _ in proxies]) + + +@app.route('/delete/', methods=['GET']) +def delete(): + proxy = request.args.get('proxy') + status = proxy_handler.delete(Proxy(proxy)) + return {"code": 0, "src": status} + + +@app.route('/count/') +def getCount(): + proxies = proxy_handler.getAll() + http_type_dict = {} + source_dict = {} + for proxy in proxies: + http_type = 'https' if proxy.https else 'http' + http_type_dict[http_type] = http_type_dict.get(http_type, 0) + 1 + for 
source in proxy.source.split('/'): + source_dict[source] = source_dict.get(source, 0) + 1 + return {"http_type": http_type_dict, "source": source_dict, "count": len(proxies)} + + +def runFlask(): + if platform.system() == "Windows": + app.run(host=conf.serverHost, port=conf.serverPort) + else: + import gunicorn.app.base + + class StandaloneApplication(gunicorn.app.base.BaseApplication): + + def __init__(self, app, options=None): + self.options = options or {} + self.application = app + super(StandaloneApplication, self).__init__() + + def load_config(self): + _config = dict([(key, value) for key, value in iteritems(self.options) + if key in self.cfg.settings and value is not None]) + for key, value in iteritems(_config): + self.cfg.set(key.lower(), value) + + def load(self): + return self.application + + _options = { + 'bind': '%s:%s' % (conf.serverHost, conf.serverPort), + 'workers': 4, + 'accesslog': '-', # log to stdout + 'access_log_format': '%(h)s %(l)s %(t)s "%(r)s" %(s)s "%(a)s"' + } + StandaloneApplication(app, _options).run() + + +if __name__ == '__main__': + runFlask() diff --git a/DB/__init__.py b/db/__init__.py similarity index 100% rename from DB/__init__.py rename to db/__init__.py diff --git a/db/dbClient.py b/db/dbClient.py new file mode 100644 index 000000000..4d9554b18 --- /dev/null +++ b/db/dbClient.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +# !/usr/bin/env python +""" +------------------------------------------------- + File Name: DbClient.py + Description : DB工厂类 + Author : JHao + date: 2016/12/2 +------------------------------------------------- + Change Activity: + 2016/12/02: DB工厂类 + 2020/07/03: 取消raw_proxy储存 +------------------------------------------------- +""" +__author__ = 'JHao' + +import os +import sys + +from util.six import urlparse, withMetaclass +from util.singleton import Singleton + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + + +class DbClient(withMetaclass(Singleton)): + """ + DbClient DB工厂类 
提供get/put/update/pop/delete/exists/getAll/clean/getCount/changeTable方法 + + + 抽象方法定义: + get(): 随机返回一个proxy; + put(proxy): 存入一个proxy; + pop(): 顺序返回并删除一个proxy; + update(proxy): 更新指定proxy信息; + delete(proxy): 删除指定proxy; + exists(proxy): 判断指定proxy是否存在; + getAll(): 返回所有代理; + clean(): 清除所有proxy信息; + getCount(): 返回proxy统计信息; + changeTable(name): 切换操作对象 + + + 所有方法需要相应类去具体实现: + ssdb: ssdbClient.py + redis: redisClient.py + mongodb: mongodbClient.py + + """ + + def __init__(self, db_conn): + """ + init + :return: + """ + self.parseDbConn(db_conn) + self.__initDbClient() + + @classmethod + def parseDbConn(cls, db_conn): + db_conf = urlparse(db_conn) + cls.db_type = db_conf.scheme.upper().strip() + cls.db_host = db_conf.hostname + cls.db_port = db_conf.port + cls.db_user = db_conf.username + cls.db_pwd = db_conf.password + cls.db_name = db_conf.path[1:] + return cls + + def __initDbClient(self): + """ + init DB Client + :return: + """ + __type = None + if "SSDB" == self.db_type: + __type = "ssdbClient" + elif "REDIS" == self.db_type: + __type = "redisClient" + else: + pass + assert __type, 'type error, Not support DB type: {}'.format(self.db_type) + self.client = getattr(__import__(__type), "%sClient" % self.db_type.title())(host=self.db_host, + port=self.db_port, + username=self.db_user, + password=self.db_pwd, + db=self.db_name) + + def get(self, https, **kwargs): + return self.client.get(https, **kwargs) + + def put(self, key, **kwargs): + return self.client.put(key, **kwargs) + + def update(self, key, value, **kwargs): + return self.client.update(key, value, **kwargs) + + def delete(self, key, **kwargs): + return self.client.delete(key, **kwargs) + + def exists(self, key, **kwargs): + return self.client.exists(key, **kwargs) + + def pop(self, https, **kwargs): + return self.client.pop(https, **kwargs) + + def getAll(self, https): + return self.client.getAll(https) + + def clear(self): + return self.client.clear() + + def changeTable(self, name): + 
self.client.changeTable(name) + + def getCount(self): + return self.client.getCount() + + def test(self): + return self.client.test() diff --git a/db/redisClient.py b/db/redisClient.py new file mode 100644 index 000000000..e66614d7e --- /dev/null +++ b/db/redisClient.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +""" +----------------------------------------------------- + File Name: redisClient.py + Description : 封装Redis相关操作 + Author : JHao + date: 2019/8/9 +------------------------------------------------------ + Change Activity: + 2019/08/09: 封装Redis相关操作 + 2020/06/23: 优化pop方法, 改用hscan命令 + 2021/05/26: 区别http/https代理 +------------------------------------------------------ +""" +__author__ = 'JHao' + +from redis.exceptions import TimeoutError, ConnectionError, ResponseError +from redis.connection import BlockingConnectionPool +from handler.logHandler import LogHandler +from random import choice +from redis import Redis +import json + + +class RedisClient(object): + """ + Redis client + + Redis中代理存放的结构为hash: + key为ip:port, value为代理属性的字典; + + """ + + def __init__(self, **kwargs): + """ + init + :param host: host + :param port: port + :param password: password + :param db: db + :return: + """ + self.name = "" + kwargs.pop("username") + self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, + timeout=5, + socket_timeout=5, + **kwargs)) + + def get(self, https): + """ + 返回一个代理 + :return: + """ + if https: + items = self.__conn.hvals(self.name) + proxies = list(filter(lambda x: json.loads(x).get("https"), items)) + return choice(proxies) if proxies else None + else: + proxies = self.__conn.hkeys(self.name) + proxy = choice(proxies) if proxies else None + return self.__conn.hget(self.name, proxy) if proxy else None + + def put(self, proxy_obj): + """ + 将代理放入hash, 使用changeTable指定hash name + :param proxy_obj: Proxy obj + :return: + """ + data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) + return data + + def pop(self, https): 
+ """ + 弹出一个代理 + :return: dict {proxy: value} + """ + proxy = self.get(https) + if proxy: + self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) + return proxy if proxy else None + + def delete(self, proxy_str): + """ + 移除指定代理, 使用changeTable指定hash name + :param proxy_str: proxy str + :return: + """ + return self.__conn.hdel(self.name, proxy_str) + + def exists(self, proxy_str): + """ + 判断指定代理是否存在, 使用changeTable指定hash name + :param proxy_str: proxy str + :return: + """ + return self.__conn.hexists(self.name, proxy_str) + + def update(self, proxy_obj): + """ + 更新 proxy 属性 + :param proxy_obj: + :return: + """ + return self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) + + def getAll(self, https): + """ + 字典形式返回所有代理, 使用changeTable指定hash name + :return: + """ + items = self.__conn.hvals(self.name) + if https: + return list(filter(lambda x: json.loads(x).get("https"), items)) + else: + return items + + def clear(self): + """ + 清空所有代理, 使用changeTable指定hash name + :return: + """ + return self.__conn.delete(self.name) + + def getCount(self): + """ + 返回代理数量 + :return: + """ + proxies = self.getAll(https=False) + return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} + + def changeTable(self, name): + """ + 切换操作对象 + :param name: + :return: + """ + self.name = name + + def test(self): + log = LogHandler('redis_client') + try: + self.getCount() + except TimeoutError as e: + log.error('redis connection time out: %s' % str(e), exc_info=True) + return e + except ConnectionError as e: + log.error('redis connection error: %s' % str(e), exc_info=True) + return e + except ResponseError as e: + log.error('redis connection error: %s' % str(e), exc_info=True) + return e + + diff --git a/db/ssdbClient.py b/db/ssdbClient.py new file mode 100644 index 000000000..0f5c00054 --- /dev/null +++ b/db/ssdbClient.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- +# !/usr/bin/env python +""" 
+------------------------------------------------- + File Name: ssdbClient.py + Description : 封装SSDB操作 + Author : JHao + date: 2016/12/2 +------------------------------------------------- + Change Activity: + 2016/12/2: + 2017/09/22: PY3中 redis-py返回的数据是bytes型 + 2017/09/27: 修改pop()方法 返回{proxy:value}字典 + 2020/07/03: 2.1.0 优化代码结构 + 2021/05/26: 区分http和https代理 +------------------------------------------------- +""" +__author__ = 'JHao' +from redis.exceptions import TimeoutError, ConnectionError, ResponseError +from redis.connection import BlockingConnectionPool +from handler.logHandler import LogHandler +from random import choice +from redis import Redis +import json + + +class SsdbClient(object): + """ + SSDB client + + SSDB中代理存放的结构为hash: + key为代理的ip:por, value为代理属性的字典; + """ + + def __init__(self, **kwargs): + """ + init + :param host: host + :param port: port + :param password: password + :return: + """ + self.name = "" + kwargs.pop("username") + self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, + timeout=5, + socket_timeout=5, + **kwargs)) + + def get(self, https): + """ + 从hash中随机返回一个代理 + :return: + """ + if https: + items_dict = self.__conn.hgetall(self.name) + proxies = list(filter(lambda x: json.loads(x).get("https"), items_dict.values())) + return choice(proxies) if proxies else None + else: + proxies = self.__conn.hkeys(self.name) + proxy = choice(proxies) if proxies else None + return self.__conn.hget(self.name, proxy) if proxy else None + + def put(self, proxy_obj): + """ + 将代理放入hash + :param proxy_obj: Proxy obj + :return: + """ + result = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) + return result + + def pop(self, https): + """ + 顺序弹出一个代理 + :return: proxy + """ + proxy = self.get(https) + if proxy: + self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) + return proxy if proxy else None + + def delete(self, proxy_str): + """ + 移除指定代理, 使用changeTable指定hash name + :param proxy_str: proxy str + 
:return: + """ + self.__conn.hdel(self.name, proxy_str) + + def exists(self, proxy_str): + """ + 判断指定代理是否存在, 使用changeTable指定hash name + :param proxy_str: proxy str + :return: + """ + return self.__conn.hexists(self.name, proxy_str) + + def update(self, proxy_obj): + """ + 更新 proxy 属性 + :param proxy_obj: + :return: + """ + self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) + + def getAll(self, https): + """ + 字典形式返回所有代理, 使用changeTable指定hash name + :return: + """ + item_dict = self.__conn.hgetall(self.name) + if https: + return list(filter(lambda x: json.loads(x).get("https"), item_dict.values())) + else: + return item_dict.values() + + def clear(self): + """ + 清空所有代理, 使用changeTable指定hash name + :return: + """ + return self.__conn.delete(self.name) + + def getCount(self): + """ + 返回代理数量 + :return: + """ + proxies = self.getAll(https=False) + return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} + + def changeTable(self, name): + """ + 切换操作对象 + :param name: + :return: + """ + self.name = name + + def test(self): + log = LogHandler('ssdb_client') + try: + self.getCount() + except TimeoutError as e: + log.error('ssdb connection time out: %s' % str(e), exc_info=True) + return e + except ConnectionError as e: + log.error('ssdb connection error: %s' % str(e), exc_info=True) + return e + except ResponseError as e: + log.error('ssdb connection error: %s' % str(e), exc_info=True) + return e diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 000000000..9d1a10ba4 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +version: '2' +services: + proxy_pool: + build: . 
+ container_name: proxy_pool + ports: + - "5010:5010" + links: + - proxy_redis + environment: + DB_CONN: "redis://@proxy_redis:6379/0" + proxy_redis: + image: "redis" + container_name: proxy_redis \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..d4bb2cbb9 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 000000000..e3889882c --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1,124 @@ +.. _changelog: + +ChangeLog +========== + +2.4.2 (2024-01-18) +------------------ + +1. 代理格式检查支持需认证的代理格式 `username:password@ip:port` ; (2023-03-10) +2. 新增代理源 **稻壳代理**; (2023-05-15) +3. 新增代理源 **冰凌代理**; (2023-01-18) + +2.4.1 (2022-07-17) +------------------ + +1. 新增代理源 **FreeProxyList**; (2022-07-21) +2. 新增代理源 **FateZero**; (2022-08-01) +3. 新增代理属性 ``region``; (2022-08-16) + +2.4.0 (2021-11-17) +------------------ + +1. 移除无效代理源 **神鸡代理**; (2021-11-16) +2. 移除无效代理源 **极速代理**; (2021-11-16) +3. 移除代理源 **西拉代理**; (2021-11-16) +4. 新增代理源 **蝶鸟IP**; (2021-11-16) +5. 新增代理源 **PROXY11**; (2021-11-16) +6. 多线程采集代理; (2021-11-17) + +2.3.0 (2021-05-27) +------------------ + +1. 修复Dockerfile时区问题; (2021-04-12) +2. 新增Proxy属性 ``source``, 标记代理来源; (2021-04-13) +3. 
新增Proxy属性 ``https``, 标记支持https的代理; (2021-05-27) + +2.2.0 (2021-04-08) +------------------ + +1. 启动时检查数据库连通性; +2. 新增免费代理源 **米扑代理**; +3. 新增免费代理源 **Pzzqz**; +4. 新增免费代理源 **神鸡代理**; +5. 新增免费代理源 **极速代理**; +6. 新增免费代理源 **小幻代理**; + +2.1.1 (2021-02-23) +------------------ + +1. Fix Bug `#493`_, 新增时区配置; (2020-08-12) +2. 修复 **66代理** 采集; (2020-11-04) +3. 修复 **全网代理** 采集, 解决HTML端口加密问题; (2020-11-04) +4. 新增 **代理盒子** 免费源; (2020-11-04) +5. 新增 ``POOL_SIZE_MIN`` 配置项, runProxyCheck时, 剩余代理少于POOL_SIZE_MIN触发抓取; (2021-02-23) + +.. _#493: https://github.com/jhao104/proxy_pool/issues/493 + +2.1.0 (2020.07) +------------------ + +1. 新增免费代理源 **西拉代理** (2020-03-30) +2. Fix Bug `#356`_ `#401`_ +3. 优化Docker镜像体积; (2020-06-19) +4. 优化配置方式; +5. 优化代码结构; +6. 不再储存raw_proxy, 抓取后直接验证入库; + +.. _#401: https://github.com/jhao104/proxy_pool/issues/401 +.. _#356: https://github.com/jhao104/proxy_pool/issues/356 + +2.0.1 (2019.10) +----------------- + +1. 新增免费代理源 **89免费代理**; +#. 新增免费代理源 **齐云代理** + +2.0.0 (2019.08) +------------------ + +1. WebApi集成Gunicorn方式启动, Windows平台暂不支持; +#. 优化Proxy调度程序; +#. 扩展Proxy属性; +#. 新增cli工具, 更加方便启动proxyPool + +1.14 (2019.07) +----------------- + +1. 修复 Queue阻塞导致的 ``ProxyValidSchedule`` 假死bug; +#. 修改代理源 **云代理** 抓取; +#. 修改代理源 **码农代理** 抓取; +#. 修改代理源 **代理66** 抓取, 引入 ``PyExecJS`` 模块破解加速乐动态Cookies加密; + +1.13 (2019.02) +----------------- + +1. 使用.py文件替换.ini作为配置文件; + +#. 优化代理采集部分; + +1.12 (2018.04) +----------------- + +1. 优化代理格式检查; + +#. 增加代理源; + +#. fix bug `#122`_ `#126`_ + +.. _#122: https://github.com/jhao104/proxy_pool/issues/122 +.. _#126: https://github.com/jhao104/proxy_pool/issues/126 + +1.11 (2017.08) +----------------- + +1. 使用多线程验证useful_pool; + +1.10 (2016.11) +----------------- + +1. 第一版; + +#. 支持PY2/PY3; + +#. 代理池基本功能; diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..3fc072f37 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,71 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +import sphinx_rtd_theme + +# -- Project information ----------------------------------------------------- + +project = 'ProxyPool' +copyright = '2020, jhao104' +author = 'jhao104' + +master_doc = 'index' + +# The full version, including alpha/beta/rc tags +release = '2.1.0' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = 'zh_CN' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/docs/dev/ext_fetcher.rst b/docs/dev/ext_fetcher.rst new file mode 100644 index 000000000..e91bdf181 --- /dev/null +++ b/docs/dev/ext_fetcher.rst @@ -0,0 +1,36 @@ +.. ext_fetcher + +扩展代理源 +----------- + +项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,如果直接运行可能拿到的代理质量不理想。因此提供了用户自定义扩展代理获取的方法。 + +如果要添加一个新的代理获取方法, 过程如下: + +1. 首先在 `ProxyFetcher`_ 类中添加自定义的获取代理的静态方法,该方法需要以生成器(yield)形式返回 ``host:port`` 格式的代理字符串, 例如: + +.. code-block:: python + + class ProxyFetcher(object): + # .... + # 自定义代理源获取方法 + @staticmethod + def freeProxyCustom01(): # 命名不和已有重复即可 + # 通过某网站或者某接口或某数据库获取代理 + # 假设你已经拿到了一个代理列表 + proxies = ["x.x.x.x:3128", "x.x.x.x:80"] + for proxy in proxies: + yield proxy + # 确保每个proxy都是 host:port正确的格式返回 + +2. 添加好方法后,修改配置文件 `setting.py`_ 中的 ``PROXY_FETCHER`` 项, 加入刚才添加的自定义方法的名字: + +.. code-block:: python + + PROXY_FETCHER = [ + # .... + "freeProxyCustom01" # # 确保名字和你添加方法名字一致 + ] + +.. _ProxyFetcher: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L20 +.. _setting.py: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47 \ No newline at end of file diff --git a/docs/dev/ext_validator.rst b/docs/dev/ext_validator.rst new file mode 100644 index 000000000..a42d73a92 --- /dev/null +++ b/docs/dev/ext_validator.rst @@ -0,0 +1,61 @@ +.. 
ext_validator +代理校验 +----------- + +内置校验 +>>>>>>>>> + +项目中使用的代理校验方法全部定义在 `validator.py`_ 中, 通过 `ProxyValidator`_ 类中提供的装饰器来区分。校验方法返回 ``True`` 表示 +校验通过, 返回 ``False`` 表示校验不通过。 + +* 代理校验方法分为三类: ``preValidator`` 、 ``httpValidator`` 、 ``httpsValidator``: + + * **preValidator**: 预校验,在代理抓取后验证前调用,目前实现了 `formatValidator`_ 校验代理IP格式是否合法; + * **httpValidator**: 代理可用性校验,通过则认为代理可用, 目前实现了 `httpTimeOutValidator`_ 校验; + * **httpsValidator**: 校验代理是否支持https,目前实现了 `httpsTimeOutValidator`_ 校验。 + + +.. _validator.py: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py +.. _ProxyValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L29 +.. _formatValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L51 +.. _httpTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L58 +.. _httpsTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L71 + +每种校验可以定义多个方法,只有 **所有** 方法都返回 ``True`` 的情况下才视为该校验通过,校验方法执行顺序为: 先执行 **httpValidator** , 前者通过后再执行 **httpsValidator** 。 +只有 `preValidator` 校验通过的代理才会进入可用性校验, `httpValidator` 校验通过后认为代理可用准备更新入代理池, `httpsValidator` 校验通过后视为代理支持https更新代理的 `https` 属性为 `True` 。 + +扩展校验 +>>>>>>>>> + +在 `validator.py`_ 已有自定义校验的示例,自定义函数需返回True或者False,使用 `ProxyValidator`_ 中提供的装饰器来区分校验类型。 下面是两个例子: + +* 1. 自定义一个代理可用性的校验(``addHttpValidator``): + +.. code-block:: python + + @ProxyValidator.addHttpValidator + def customValidatorExample01(proxy): + """自定义代理可用性校验函数""" + proxies = {"http": "http://{proxy}".format(proxy=proxy)} + try: + r = requests.get("http://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5) + return True if r.status_code == 200 and len(r.content) > 200 else False + except Exception as e: + return False + +* 2. 自定义一个代理是否支持https的校验(``addHttpsValidator``): + +.. 
code-block:: python + + @ProxyValidator.addHttpsValidator + def customValidatorExample02(proxy): + """自定义代理是否支持https校验函数""" + proxies = {"https": "https://{proxy}".format(proxy=proxy)} + try: + r = requests.get("https://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5, verify=False) + return True if r.status_code == 200 and len(r.content) > 200 else False + except Exception as e: + return False + +注意,比如在运行代理可用性校验时,所有被 ``ProxyValidator.addHttpValidator`` 装饰的函数会被依次按定义顺序执行,只有当所有函数都返回True时才会判断代理可用。 ``HttpsValidator`` 运行机制也是如此。 diff --git a/docs/dev/index.rst b/docs/dev/index.rst new file mode 100644 index 000000000..704902c06 --- /dev/null +++ b/docs/dev/index.rst @@ -0,0 +1,11 @@ +========= +开发指南 +========= + +.. module:: dev + +.. toctree:: + :maxdepth: 2 + + ext_fetcher + ext_validator diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..6a52ed4c6 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,116 @@ +.. ProxyPool documentation master file, created by + sphinx-quickstart on Wed Jul 8 16:13:42 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +ProxyPool +===================================== + +:: + + **************************************************************** + *** ______ ********************* ______ *********** _ ******** + *** | ___ \_ ******************** | ___ \ ********* | | ******** + *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** + *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** + *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** + *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** + **** __ / / ***** + ************************* /___ / ******************************* + ************************* ******************************** + **************************************************************** + +Python爬虫代理IP池 + +安装 +----- + +* 下载代码 + +.. 
code-block:: console + + $ git clone git@github.com:jhao104/proxy_pool.git + +* 安装依赖 + +.. code-block:: console + + $ pip install -r requirements.txt + +* 更新配置 + +.. code-block:: python + + HOST = "0.0.0.0" + PORT = 5000 + + DB_CONN = 'redis://@127.0.0.1:8888' + + PROXY_FETCHER = [ + "freeProxy01", + "freeProxy02", + # .... + ] + +* 启动项目 + +.. code-block:: console + + $ python proxyPool.py schedule + $ python proxyPool.py server + +使用 +______ + +* API + +============ ======== ================ ============== +Api Method Description Params +============ ======== ================ ============== +/ GET API介绍 无 +/get GET 返回一个代理 可选参数: `?type=https` 过滤支持https的代理 +/pop GET 返回并删除一个代理 可选参数: `?type=https` 过滤支持https的代理 +/all GET 返回所有代理 可选参数: `?type=https` 过滤支持https的代理 +/count GET 返回代理数量 无 +/delete GET 删除指定代理 `?proxy=host:port` +============ ======== ================ ============== + + +* 爬虫 + +.. code-block:: python + + import requests + + def get_proxy(): + return requests.get("http://127.0.0.1:5010/get?type=https").json() + + def delete_proxy(proxy): + requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) + + # your spider code + + def getHtml(): + # .... + retry_count = 5 + proxy = get_proxy().get("proxy") + while retry_count > 0: + try: + html = requests.get('https://www.example.com', proxies={"http": "http://{}".format(proxy), "https": "https://{}".format(proxy)}) + # 使用代理访问 + return html + except Exception: + retry_count -= 1 + # 删除代理池中代理 + delete_proxy(proxy) + return None + +Contents +-------- + +.. toctree:: + :maxdepth: 2 + + user/index + dev/index + changelog diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..922152e96 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. 
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/user/how_to_config.rst b/docs/user/how_to_config.rst new file mode 100644 index 000000000..a04e0dba6 --- /dev/null +++ b/docs/user/how_to_config.rst @@ -0,0 +1,83 @@ +.. how_to_config + +配置参考 +--------- + +配置文件 ``setting.py`` 位于项目的主目录下, 配置主要分为四类: **服务配置** 、 **数据库配置** 、 **采集配置** 、 **校验配置**. + +服务配置 +>>>>>>>>> + +* ``HOST`` + + API服务监听的IP, 本机访问设置为 ``127.0.0.1``, 开启远程访问设置为: ``0.0.0.0``. + +* ``PORT`` + + API服务监听的端口. + +数据库配置 +>>>>>>>>>>> + +* ``DB_CONN`` + + 用户存放代理IP的数据库URI, 配置格式为: ``db_type://[[user]:[pwd]]@ip:port/[db]``. + + 目前支持的db_type有: ``ssdb`` 、 ``redis``. + + 配置示例: + +.. code-block:: python + + # SSDB IP: 127.0.0.1 Port: 8888 + DB_CONN = 'ssdb://@127.0.0.1:8888' + # SSDB IP: 127.0.0.1 Port: 8888 Password: 123456 + DB_CONN = 'ssdb://:123456@127.0.0.1:8888' + + # Redis IP: 127.0.0.1 Port: 6379 + DB_CONN = 'redis://@127.0.0.1:6379' + # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 + DB_CONN = 'redis://:123456@127.0.0.1:6379' + # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 DB: 15 + DB_CONN = 'redis://:123456@127.0.0.1:6379/15' + + +* ``TABLE_NAME`` + + 存放代理的数据载体名称, ssdb和redis的存放结构为hash. + +采集配置 +>>>>>>>>> + +* ``PROXY_FETCHER`` + + 启用的代理采集方法名, 代理采集方法位于 ``fetcher/proxyFetcher.py`` 类中. + + 由于各个代理源的稳定性不容易掌握, 当某个代理采集方法失效时, 可以在该配置中注释掉其名称. + + 如果有增加某些代理采集方法, 也请在该配置中添加其方法名, 具体请参考 :doc:`/dev/ext_fetcher`. + + 调度程序每次执行采集任务时都会再次加载该配置, 保证每次运行的采集方法都是有效的. 
+ +校验配置 +>>>>>>>>> + +* ``HTTP_URL`` + + 用于检验代理是否可用的地址, 默认为 ``http://httpbin.org``, 可根据使用场景修改为其他地址. + +* ``HTTPS_URL`` + + 用于检验代理是否支持HTTPS的地址, 默认为 ``https://www.qq.com``, 可根据使用场景修改为其他地址. + +* ``VERIFY_TIMEOUT`` + + 检验代理的超时时间, 默认为 ``10`` , 单位秒. 使用代理访问 ``HTTP(S)_URL`` 耗时超过 ``VERIFY_TIMEOUT`` 时, 视为代理不可用. + +* ``MAX_FAIL_COUNT`` + + 检验代理允许最大失败次数, 默认为 ``0``, 即出错一次即删除. + +* ``POOL_SIZE_MIN`` + + 代理检测定时任务运行前若代理数量小于 `POOL_SIZE_MIN`, 则先运行抓取程序. \ No newline at end of file diff --git a/docs/user/how_to_run.rst b/docs/user/how_to_run.rst new file mode 100644 index 000000000..91bc41ff9 --- /dev/null +++ b/docs/user/how_to_run.rst @@ -0,0 +1,73 @@ +.. how_to_run + + +如何运行 +--------- + +下载代码 +>>>>>>>>> + +本项目需要下载代码到本地运行, 通过 ``git`` 下载: + +.. code-block:: console + + $ git clone git@github.com:jhao104/proxy_pool.git + +或者下载特定的 ``release`` 版本: + +.. code-block:: console + + https://github.com/jhao104/proxy_pool/releases + +安装依赖 +>>>>>>>>> + +到项目目录下使用 ``pip`` 安装依赖库: + +.. code-block:: console + + $ pip install -r requirements.txt + + +更新配置 +>>>>>>>>> + +配置文件 ``setting.py`` 位于项目的主目录下: + +.. code-block:: python + + # 配置API服务 + + HOST = "0.0.0.0" # IP + PORT = 5000 # 监听端口 + + # 配置数据库 + + DB_CONN = 'redis://@127.0.0.1:8888/0' + + # 配置 ProxyFetcher + + PROXY_FETCHER = [ + "freeProxy01", # 这里是启用的代理抓取方法,所有fetch方法位于fetcher/proxyFetcher.py + "freeProxy02", + # .... + ] + +更多配置请参考 :doc:`/user/how_to_config` + +启动项目 +>>>>>>>>> + +如果已配置好运行环境, 具备运行条件, 可以通过 ``proxyPool.py`` 启动. ``proxyPool.py`` 是项目的CLI入口. +完整程序包含两部份: ``schedule`` 调度程序和 ``server`` API服务, 调度程序负责采集和验证代理, API服务提供代理服务HTTP接口. + +通过命令行程序分别启动调度程序和API服务: + +.. code-block:: console + + # 启动调度程序 + $ python proxyPool.py schedule + + # 启动webApi服务 + $ python proxyPool.py server + diff --git a/docs/user/how_to_use.rst b/docs/user/how_to_use.rst new file mode 100644 index 000000000..981749732 --- /dev/null +++ b/docs/user/how_to_use.rst @@ -0,0 +1,63 @@ +.. how_to_use + +如何使用 +---------- + +爬虫代码要对接代理池目前有两种方式: 一是通过调用API接口使用, 二是直接读取数据库. 
+ +调用API +>>>>>>>>> + +启动ProxyPool的 ``server`` 后会提供如下几个http接口: + +============ ======== ================ ============== +Api Method Description Arg +============ ======== ================ ============== +/ GET API介绍 无 +/get GET 随机返回一个代理 无 +/all GET 返回所有代理 无 +/count GET 返回代理数量 无 +/delete GET 删除指定代理 proxy=host:port +============ ======== ================ ============== + +在代码中可以通过封装上面的API接口来使用代理, 例子: + +.. code-block:: python + + import requests + + def get_proxy(): + return requests.get("http://127.0.0.1:5010/get/").json() + + def delete_proxy(proxy): + requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) + + # your spider code + + def getHtml(): + # .... + retry_count = 5 + proxy = get_proxy().get("proxy") + while retry_count > 0: + try: + # 使用代理访问 + html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) + return html + except Exception: + retry_count -= 1 + # 删除代理池中代理 + delete_proxy(proxy) + return None + +本例中我们在本地 ``127.0.0.1`` 启动端口为 ``5010`` 的 ``server``, 使用 ``/get`` 接口获取代理, ``/delete`` 删除代理. + +读数据库 +>>>>>>>>> + +目前支持配置两种数据库: ``REDIS`` 、 ``SSDB``. + +* **REDIS** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** + +* **SSDB** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** + +可以在代码中自行读取. diff --git a/docs/user/index.rst b/docs/user/index.rst new file mode 100644 index 000000000..97b156d31 --- /dev/null +++ b/docs/user/index.rst @@ -0,0 +1,12 @@ +========= +用户指南 +========= + +.. module:: user + +.. 
class ProxyFetcher(object):
    """
    Free proxy scrapers.

    Every ``freeProxyNN`` static method is a generator that downloads one
    public proxy list through ``WebRequest`` and yields the proxies it finds
    as strings (``"ip:port"`` for the table/regex based scrapers).  Nothing
    is validated here.
    """

    @staticmethod
    def freeProxy01():
        """
        Zdaye (站大爷) https://www.zdaye.com/dayProxy.html

        Reads the newest post on the index page and, only when that post is
        less than five minutes old, walks through all of its pages.
        """
        start_url = "https://www.zdaye.com/dayProxy.html"
        html_tree = WebRequest().get(start_url, verify=False).tree
        latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip()
        from datetime import datetime
        interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S")
        # total_seconds(), not .seconds: .seconds drops the days component, so
        # a post that is e.g. one day and two minutes old looked "fresh".
        if interval.total_seconds() < 300:  # only collect updates from the last 5 minutes
            target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip()
            while target_url:
                _tree = WebRequest().get(target_url, verify=False).tree
                for tr in _tree.xpath("//table//tr"):
                    ip = "".join(tr.xpath("./td[1]/text()")).strip()
                    port = "".join(tr.xpath("./td[2]/text()")).strip()
                    yield "%s:%s" % (ip, port)
                # follow the "next page" link until there is none
                next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href")
                target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False
                sleep(5)

    @staticmethod
    def freeProxy02():
        """
        66ip (代理66) http://www.66ip.cn/
        """
        url = "http://www.66ip.cn/"
        resp = WebRequest().get(url, timeout=10).tree
        for i, tr in enumerate(resp.xpath("(//table)[3]//tr")):
            if i > 0:  # row 0 is the table header
                ip = "".join(tr.xpath("./td[1]/text()")).strip()
                port = "".join(tr.xpath("./td[2]/text()")).strip()
                yield "%s:%s" % (ip, port)

    @staticmethod
    def freeProxy03():
        """ Kxdaili (开心代理) """
        target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"]
        for url in target_urls:
            tree = WebRequest().get(url).tree
            for tr in tree.xpath("//table[@class='active']//tr")[1:]:
                ip = "".join(tr.xpath('./td[1]/text()')).strip()
                port = "".join(tr.xpath('./td[2]/text()')).strip()
                yield "%s:%s" % (ip, port)

    @staticmethod
    def freeProxy04():
        """ FreeProxyList https://www.freeproxylists.net/zh/ """
        url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50"
        tree = WebRequest().get(url, verify=False).tree
        # ``urllib.parse`` only exists on Python 3; fall back for Python 2.
        try:
            from urllib.parse import unquote
        except ImportError:
            from urllib import unquote

        def parse_ip(input_str):
            """Return the first IPv4 address in the URL-encoded script text, or None."""
            html_str = unquote(input_str)
            ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str)
            return ips[0] if ips else None

        for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"):
            ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip())
            port = "".join(tr.xpath('./td[2]/text()')).strip()
            if ip:
                yield "%s:%s" % (ip, port)

    @staticmethod
    def freeProxy05(page_count=1):
        """
        Kuaidaili (快代理) https://www.kuaidaili.com

        :param page_count: number of pages to read from each of the two lists
        """
        url_pattern = [
            'https://www.kuaidaili.com/free/inha/{}/',
            'https://www.kuaidaili.com/free/intr/{}/'
        ]
        url_list = [pattern.format(page_index)
                    for page_index in range(1, page_count + 1)
                    for pattern in url_pattern]

        for url in url_list:
            tree = WebRequest().get(url).tree
            proxy_list = tree.xpath('.//table//tr')
            sleep(1)  # required: without the pause the second request returns no data
            for tr in proxy_list[1:]:
                yield ':'.join(tr.xpath('./td/text()')[0:2])

    @staticmethod
    def freeProxy06():
        """ Binglx (冰凌代理) https://www.binglx.cn """
        url = "https://www.binglx.cn/?page=1"
        try:
            tree = WebRequest().get(url).tree
            proxy_list = tree.xpath('.//table//tr')
            for tr in proxy_list[1:]:
                yield ':'.join(tr.xpath('./td/text()')[0:2])
        except Exception as e:
            # best effort: report the failure and yield nothing more
            print(e)

    @staticmethod
    def freeProxy07():
        """ ip3366 (云代理) """
        urls = ['http://www.ip3366.net/free/?stype=1', "http://www.ip3366.net/free/?stype=2"]
        for url in urls:
            r = WebRequest().get(url, timeout=10)
            proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text)
            for proxy in proxies:
                yield ":".join(proxy)

    @staticmethod
    def freeProxy08():
        """ ihuan (小幻代理) """
        urls = ['https://ip.ihuan.me/address/5Lit5Zu9.html']
        for url in urls:
            r = WebRequest().get(url, timeout=10)
            proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?(\d+)', r.text)
            for proxy in proxies:
                yield ":".join(proxy)

    @staticmethod
    def freeProxy09(page_count=1):
        """
        Jiangxianli (免费代理库)

        :param page_count: number of result pages to read
        """
        for i in range(1, page_count + 1):
            url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i)
            html_tree = WebRequest().get(url, verify=False).tree
            for index, tr in enumerate(html_tree.xpath("//table//tr")):
                if index == 0:  # header row
                    continue
                yield ":".join(tr.xpath("./td/text()")[0:2]).strip()

    @staticmethod
    def freeProxy10():
        """ 89ip (89免费代理) """
        r = WebRequest().get("https://www.89ip.cn/index_1.html", timeout=10)
        proxies = re.findall(
            r'[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?[\s\S]*?[\s\S]*?(\d+)[\s\S]*?',
            r.text)
        for proxy in proxies:
            yield ':'.join(proxy)

    @staticmethod
    def freeProxy11():
        """ Docip (稻壳代理) https://www.docip.net/ """
        r = WebRequest().get("https://www.docip.net/data/free.json", timeout=10)
        try:
            for each in r.json['data']:
                yield each['ip']
        except Exception as e:
            # best effort: report the failure and yield nothing more
            print(e)
class ConfigHandler(withMetaclass(Singleton)):
    """
    Read-only view of the project configuration.

    Each value is taken from the environment variable of the same name when
    it is set, otherwise from the ``setting`` module.  ``LazyProperty``
    values are computed by the getters below; ``fetchers`` reloads
    ``setting`` on every access.
    """

    def __init__(self):
        pass

    @LazyProperty
    def serverHost(self):
        """ Address the API server listens on. """
        return os.environ.get("HOST", setting.HOST)

    @LazyProperty
    def serverPort(self):
        """
        Port the API server listens on, always as an int.

        Environment values arrive as strings, so they are converted here,
        matching the other numeric options.
        """
        return int(os.environ.get("PORT", setting.PORT))

    @LazyProperty
    def dbConn(self):
        """ Database connection URI. """
        return os.getenv("DB_CONN", setting.DB_CONN)

    @LazyProperty
    def tableName(self):
        """ Name of the table (hash) that stores the proxies. """
        return os.getenv("TABLE_NAME", setting.TABLE_NAME)

    @property
    def fetchers(self):
        """ Enabled fetcher names; ``setting`` is reloaded on every access. """
        reload_six(setting)
        return setting.PROXY_FETCHER

    @LazyProperty
    def httpUrl(self):
        """ URL used to check that a proxy works over HTTP. """
        return os.getenv("HTTP_URL", setting.HTTP_URL)

    @LazyProperty
    def httpsUrl(self):
        """ URL used to check that a proxy works over HTTPS. """
        return os.getenv("HTTPS_URL", setting.HTTPS_URL)

    @LazyProperty
    def verifyTimeout(self):
        """ Validation request timeout. """
        return int(os.getenv("VERIFY_TIMEOUT", setting.VERIFY_TIMEOUT))

    @LazyProperty
    def maxFailCount(self):
        """ Maximum tolerated failure count for a proxy. """
        return int(os.getenv("MAX_FAIL_COUNT", setting.MAX_FAIL_COUNT))

    @LazyProperty
    def poolSizeMin(self):
        """ Minimum pool size. """
        return int(os.getenv("POOL_SIZE_MIN", setting.POOL_SIZE_MIN))

    @LazyProperty
    def proxyRegion(self):
        """
        Whether region lookup is enabled.

        ``bool()`` on an environment string is True for every non-empty
        value, including ``"False"`` and ``"0"``, so string values are parsed
        explicitly; non-string values from ``setting`` keep plain truthiness.
        """
        value = os.getenv("PROXY_REGION", setting.PROXY_REGION)
        if isinstance(value, str):
            return value.strip().lower() not in ("", "0", "false", "no", "off")
        return bool(value)

    @LazyProperty
    def timezone(self):
        """ Timezone name handed to the scheduler. """
        return os.getenv("TIMEZONE", setting.TIMEZONE)
LogHandler + """ + + def __init__(self, name, level=DEBUG, stream=True, file=True): + self.name = name + self.level = level + logging.Logger.__init__(self, self.name, level=level) + if stream: + self.__setStreamHandler__() + if file: + if platform.system() != "Windows": + self.__setFileHandler__() + + def __setFileHandler__(self, level=None): + """ + set file handler + :param level: + :return: + """ + file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name)) + # 设置日志回滚, 保存在log目录, 一天保存一个文件, 保留15天 + file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15) + file_handler.suffix = '%Y%m%d.log' + if not level: + file_handler.setLevel(self.level) + else: + file_handler.setLevel(level) + formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') + + file_handler.setFormatter(formatter) + self.file_handler = file_handler + self.addHandler(file_handler) + + def __setStreamHandler__(self, level=None): + """ + set stream handler + :param level: + :return: + """ + stream_handler = logging.StreamHandler() + formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') + stream_handler.setFormatter(formatter) + if not level: + stream_handler.setLevel(self.level) + else: + stream_handler.setLevel(level) + self.addHandler(stream_handler) + + +if __name__ == '__main__': + log = LogHandler('test') + log.info('this is a test msg') diff --git a/handler/proxyHandler.py b/handler/proxyHandler.py new file mode 100644 index 000000000..32e215e5d --- /dev/null +++ b/handler/proxyHandler.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: ProxyHandler.py + Description : + Author : JHao + date: 2016/12/3 +------------------------------------------------- + Change Activity: + 2016/12/03: + 2020/05/26: 区分http和https +------------------------------------------------- +""" +__author__ = 'JHao' + +from 
class ProxyHandler(object):
    """ Proxy CRUD operator: wraps the DB client and converts stored JSON into ``Proxy`` objects. """

    def __init__(self):
        self.conf = ConfigHandler()
        self.db = DbClient(self.conf.dbConn)
        self.db.changeTable(self.conf.tableName)

    def get(self, https=False):
        """
        Return one proxy without removing it.

        :param https: passed through to the DB client (True/False)
        :return: ``Proxy`` built from the stored JSON, or None when the DB returns nothing
        """
        proxy = self.db.get(https)
        return Proxy.createFromJson(proxy) if proxy else None

    def pop(self, https=False):
        """
        Return one proxy and remove it from the pool.

        ``https`` now defaults to False, like ``get`` and ``getAll``;
        existing callers that pass it explicitly are unaffected.

        :param https: passed through to the DB client (True/False)
        :return: ``Proxy`` built from the stored JSON, or None when the DB returns nothing
        """
        proxy = self.db.pop(https)
        if proxy:
            return Proxy.createFromJson(proxy)
        return None

    def put(self, proxy):
        """
        Store a proxy in the pool.

        :param proxy: object handed unchanged to the DB client's ``put``
        """
        self.db.put(proxy)

    def delete(self, proxy):
        """
        Delete a proxy from the pool.

        :param proxy: object exposing a ``proxy`` attribute, which is the key passed to the DB
        :return: whatever the DB client's ``delete`` returns
        """
        return self.db.delete(proxy.proxy)

    def getAll(self, https=False):
        """
        Return every proxy in the pool as a list of ``Proxy`` objects.

        :param https: passed through to the DB client (True/False)
        """
        proxies = self.db.getAll(https)
        return [Proxy.createFromJson(_) for _ in proxies]

    def exists(self, proxy):
        """
        Check whether a proxy is already stored.

        :param proxy: object exposing a ``proxy`` attribute, which is the key passed to the DB
        :return: whatever the DB client's ``exists`` returns
        """
        return self.db.exists(proxy.proxy)

    def getCount(self):
        """
        Return the pool statistics.

        :return: ``{'count': <result of the DB client's getCount()>}``
        """
        total_use_proxy = self.db.getCount()
        return {'count': total_use_proxy}
class DoValidator(object):
    """ Runs the validators registered on ``ProxyValidator`` against a proxy. """

    conf = ConfigHandler()

    @classmethod
    def validator(cls, proxy, work_type):
        """
        Validation entry point: check the proxy and update its bookkeeping.

        The HTTPS check only runs when the HTTP check passed.  On success the
        fail counter is decremented (never below zero); on failure it is
        incremented.  The region is looked up only for ``work_type == "raw"``.

        Args:
            proxy: Proxy object
            work_type: "raw" or "use"
        Returns:
            the same Proxy object, updated in place
        """
        http_r = cls.httpValidator(proxy)
        https_r = False if not http_r else cls.httpsValidator(proxy)

        proxy.check_count += 1
        proxy.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        proxy.last_status = True if http_r else False
        if http_r:
            if proxy.fail_count > 0:
                proxy.fail_count -= 1
            proxy.https = True if https_r else False
            if work_type == "raw":
                proxy.region = cls.regionGetter(proxy) if cls.conf.proxyRegion else ""
        else:
            proxy.fail_count += 1
        return proxy

    @classmethod
    def httpValidator(cls, proxy):
        """ True when every registered HTTP validator accepts ``proxy.proxy``. """
        for func in ProxyValidator.http_validator:
            if not func(proxy.proxy):
                return False
        return True

    @classmethod
    def httpsValidator(cls, proxy):
        """ True when every registered HTTPS validator accepts ``proxy.proxy``. """
        for func in ProxyValidator.https_validator:
            if not func(proxy.proxy):
                return False
        return True

    @classmethod
    def preValidator(cls, proxy):
        """ True when every registered pre-validator accepts ``proxy`` (the raw string). """
        for func in ProxyValidator.pre_validator:
            if not func(proxy):
                return False
        return True

    @classmethod
    def regionGetter(cls, proxy):
        """
        Look up the region of the proxy's host; return ``'error'`` on any failure.

        ``except Exception`` replaces the bare ``except`` so KeyboardInterrupt
        and SystemExit are no longer swallowed inside worker threads.
        """
        try:
            url = 'https://searchplugin.csdn.net/api/v1/ip/get?ip=%s' % proxy.proxy.split(':')[0]
            r = WebRequest().get(url=url, retry_time=1, timeout=2).json
            return r['data']['address']
        except Exception:
            return 'error'


class _ThreadChecker(Thread):
    """ Worker thread that drains a queue of proxies and validates each one. """

    def __init__(self, work_type, target_queue, thread_name):
        Thread.__init__(self, name=thread_name)
        self.work_type = work_type
        self.log = LogHandler("checker")
        self.proxy_handler = ProxyHandler()
        self.target_queue = target_queue
        self.conf = ConfigHandler()

    def run(self):
        """ Validate proxies until the queue is empty, then exit. """
        self.log.info("{}ProxyCheck - {}: start".format(self.work_type.title(), self.name))
        while True:
            try:
                # non-blocking: an empty queue means the work is finished
                proxy = self.target_queue.get(block=False)
            except Empty:
                self.log.info("{}ProxyCheck - {}: complete".format(self.work_type.title(), self.name))
                break
            proxy = DoValidator.validator(proxy, self.work_type)
            if self.work_type == "raw":
                self.__ifRaw(proxy)
            else:
                self.__ifUse(proxy)
            self.target_queue.task_done()

    def __ifRaw(self, proxy):
        """ Newly fetched proxy: store it when it passed and is not already known. """
        if proxy.last_status:
            if self.proxy_handler.exists(proxy):
                self.log.info('RawProxyCheck - {}: {} exist'.format(self.name, proxy.proxy.ljust(23)))
            else:
                self.log.info('RawProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23)))
                self.proxy_handler.put(proxy)
        else:
            self.log.info('RawProxyCheck - {}: {} fail'.format(self.name, proxy.proxy.ljust(23)))

    def __ifUse(self, proxy):
        """ Stored proxy: save the new state, or delete it once fail_count exceeds maxFailCount. """
        if proxy.last_status:
            self.log.info('UseProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23)))
            self.proxy_handler.put(proxy)
        else:
            if proxy.fail_count > self.conf.maxFailCount:
                self.log.info('UseProxyCheck - {}: {} fail, count {} delete'.format(self.name,
                                                                                    proxy.proxy.ljust(23),
                                                                                    proxy.fail_count))
                self.proxy_handler.delete(proxy)
            else:
                self.log.info('UseProxyCheck - {}: {} fail, count {} keep'.format(self.name,
                                                                                  proxy.proxy.ljust(23),
                                                                                  proxy.fail_count))
                self.proxy_handler.put(proxy)


def Checker(tp, queue):
    """
    Run 20 ``_ThreadChecker`` workers over the queue and wait for all of them.

    :param tp: raw/use
    :param queue: Proxy Queue
    :return:
    """
    thread_list = list()
    for index in range(20):
        thread_list.append(_ThreadChecker(tp, queue, "thread_%s" % str(index).zfill(2)))

    for thread in thread_list:
        # ``daemon`` attribute instead of the deprecated ``setDaemon()``
        thread.daemon = True
        thread.start()

    for thread in thread_list:
        thread.join()
class _ThreadFetcher(Thread):
    """ Thread that runs one ``ProxyFetcher`` method and collects its output in a shared dict. """

    def __init__(self, fetch_source, proxy_dict):
        Thread.__init__(self)
        self.fetch_source = fetch_source
        self.proxy_dict = proxy_dict
        self.fetcher = getattr(ProxyFetcher, fetch_source, None)
        self.log = LogHandler("fetcher")
        self.conf = ConfigHandler()
        self.proxy_handler = ProxyHandler()

    def run(self):
        """
        Iterate the fetcher and record every proxy it yields.

        A proxy already in ``proxy_dict`` only gets this source added.  Any
        exception raised by the fetcher is logged and ends this thread's
        collection.
        """
        self.log.info("ProxyFetch - {func}: start".format(func=self.fetch_source))
        try:
            for proxy in self.fetcher():
                # strip first so the log shows the exact key that is stored
                proxy = proxy.strip()
                self.log.info('ProxyFetch - %s: %s ok' % (self.fetch_source, proxy.ljust(23)))
                if proxy in self.proxy_dict:
                    self.proxy_dict[proxy].add_source(self.fetch_source)
                else:
                    self.proxy_dict[proxy] = Proxy(
                        proxy, source=self.fetch_source)
        except Exception as e:
            self.log.error("ProxyFetch - {func}: error".format(func=self.fetch_source))
            self.log.error(str(e))


class Fetcher(object):
    """ Runs every configured fetcher in its own thread and yields the pre-validated results. """
    name = "fetcher"

    def __init__(self):
        self.log = LogHandler(self.name)
        self.conf = ConfigHandler()

    def run(self):
        """
        Fetch proxies with every fetcher named in the configuration.

        Names that do not resolve to a callable attribute of ``ProxyFetcher``
        are logged and skipped.  After all threads finish, each collected
        ``Proxy`` whose string passes ``DoValidator.preValidator`` is yielded.

        :return: generator of Proxy objects
        """
        proxy_dict = dict()
        thread_list = list()
        self.log.info("ProxyFetch : start")

        for fetch_source in self.conf.fetchers:
            self.log.info("ProxyFetch - {func}: start".format(func=fetch_source))
            fetcher = getattr(ProxyFetcher, fetch_source, None)
            if not fetcher:
                self.log.error("ProxyFetch - {func}: class method not exists!".format(func=fetch_source))
                continue
            if not callable(fetcher):
                self.log.error("ProxyFetch - {func}: must be class method".format(func=fetch_source))
                continue
            thread_list.append(_ThreadFetcher(fetch_source, proxy_dict))

        for thread in thread_list:
            # ``daemon`` attribute instead of the deprecated ``setDaemon()``
            thread.daemon = True
            thread.start()

        for thread in thread_list:
            thread.join()

        self.log.info("ProxyFetch - all complete!")
        for _ in proxy_dict.values():
            if DoValidator.preValidator(_.proxy):
                yield _
class Proxy(object):
    """
    Value object describing one proxy and its check history.

    ``source`` is kept internally as a list of fetcher names and exposed as a
    single ``'/'``-joined string, which is also the serialised form.
    """

    def __init__(self, proxy, fail_count=0, region="", anonymous="",
                 source="", check_count=0, last_status="", last_time="", https=False):
        self._proxy = proxy
        self._fail_count = fail_count
        self._region = region
        self._anonymous = anonymous
        # "".split('/') is [''], which later produced sources such as "/x";
        # empty segments are therefore dropped.
        self._source = [name for name in source.split('/') if name]
        self._check_count = check_count
        self._last_status = last_status
        self._last_time = last_time
        self._https = https

    @classmethod
    def createFromJson(cls, proxy_json):
        """ Build a Proxy from the JSON text produced by ``to_json``; missing keys get defaults. """
        _dict = json.loads(proxy_json)
        return cls(proxy=_dict.get("proxy", ""),
                   fail_count=_dict.get("fail_count", 0),
                   region=_dict.get("region", ""),
                   anonymous=_dict.get("anonymous", ""),
                   source=_dict.get("source", ""),
                   check_count=_dict.get("check_count", 0),
                   last_status=_dict.get("last_status", ""),
                   last_time=_dict.get("last_time", ""),
                   https=_dict.get("https", False)
                   )

    @property
    def proxy(self):
        """ Proxy address, ip:port (read-only). """
        return self._proxy

    @property
    def fail_count(self):
        """ Number of failed checks. """
        return self._fail_count

    @fail_count.setter
    def fail_count(self, value):
        self._fail_count = value

    @property
    def region(self):
        """ Geographic location (country/city). """
        return self._region

    @region.setter
    def region(self, value):
        self._region = value

    @property
    def anonymous(self):
        """ Anonymity (read-only). """
        return self._anonymous

    @property
    def source(self):
        """ Proxy sources joined with '/' (read-only; extend with ``add_source``). """
        return '/'.join(self._source)

    @property
    def check_count(self):
        """ Number of checks performed. """
        return self._check_count

    @check_count.setter
    def check_count(self, value):
        self._check_count = value

    @property
    def last_status(self):
        """ Result of the latest check: True -> usable; False -> unusable. """
        return self._last_status

    @last_status.setter
    def last_status(self, value):
        self._last_status = value

    @property
    def last_time(self):
        """ Time of the latest check. """
        return self._last_time

    @last_time.setter
    def last_time(self, value):
        self._last_time = value

    @property
    def https(self):
        """ Whether the proxy supports https. """
        return self._https

    @https.setter
    def https(self, value):
        self._https = value

    @property
    def to_dict(self):
        """ Attributes as a dict. """
        return {"proxy": self.proxy,
                "https": self.https,
                "fail_count": self.fail_count,
                "region": self.region,
                "anonymous": self.anonymous,
                "source": self.source,
                "check_count": self.check_count,
                "last_status": self.last_status,
                "last_time": self.last_time}

    @property
    def to_json(self):
        """ Attributes as JSON text (non-ASCII characters are kept as-is). """
        return json.dumps(self.to_dict, ensure_ascii=False)

    def add_source(self, source_str):
        """
        Record one more source for this proxy.

        Empty names and names already present are ignored.  Insertion order
        is preserved, so ``source`` is deterministic (the previous
        ``list(set(...))`` reshuffled it arbitrarily).
        """
        if source_str and source_str not in self._source:
            self._source.append(source_str)
def __runProxyFetch():
    """ Fetch proxies from all configured sources, then validate them as "raw" proxies. """
    proxy_queue = Queue()
    proxy_fetcher = Fetcher()

    # collect everything first; Checker consumes the queue afterwards
    for proxy in proxy_fetcher.run():
        proxy_queue.put(proxy)

    Checker("raw", proxy_queue)


def __runProxyCheck():
    """
    Re-validate every proxy currently stored in the pool.

    When the stored count is below ``poolSizeMin`` a fetch round runs first.
    """
    proxy_handler = ProxyHandler()
    proxy_queue = Queue()
    # NOTE(review): assumes the DB client's getCount() returns a dict with a
    # "total" key - confirm against db/dbClient.
    if proxy_handler.db.getCount().get("total", 0) < proxy_handler.conf.poolSizeMin:
        __runProxyFetch()
    for proxy in proxy_handler.getAll():
        proxy_queue.put(proxy)
    Checker("use", proxy_queue)


def runScheduler():
    """
    Run one fetch immediately, then start the blocking scheduler.

    Two interval jobs are registered: fetch every 4 minutes and check every
    2 minutes.  ``scheduler.start()`` is the last call in this function.
    """
    __runProxyFetch()

    timezone = ConfigHandler().timezone
    scheduler_log = LogHandler("scheduler")
    scheduler = BlockingScheduler(logger=scheduler_log, timezone=timezone)

    scheduler.add_job(__runProxyFetch, 'interval', minutes=4, id="proxy_fetch", name="proxy采集")
    scheduler.add_job(__runProxyCheck, 'interval', minutes=2, id="proxy_check", name="proxy检查")
    # NOTE(review): neither job names an executor, so the 'processpool'
    # executor below looks unused - confirm before removing it.
    executors = {
        'default': {'type': 'threadpool', 'max_workers': 20},
        'processpool': ProcessPoolExecutor(max_workers=5)
    }
    job_defaults = {
        'coalesce': False,
        'max_instances': 10
    }

    scheduler.configure(executors=executors, job_defaults=job_defaults, timezone=timezone)

    scheduler.start()


if __name__ == '__main__':
    runScheduler()
class ProxyValidator(withMetaclass(Singleton)):
    """Registry of proxy validation callbacks.

    Three class-level lists hold the registered functions; the
    ``add*Validator`` classmethods are meant to be used as decorators and
    return the function unchanged after appending it.
    """
    pre_validator = []
    http_validator = []
    https_validator = []

    @classmethod
    def addPreValidator(cls, func):
        """Register *func* in ``pre_validator`` and return it."""
        cls.pre_validator.append(func)
        return func

    @classmethod
    def addHttpValidator(cls, func):
        """Register *func* in ``http_validator`` and return it."""
        cls.http_validator.append(func)
        return func

    @classmethod
    def addHttpsValidator(cls, func):
        """Register *func* in ``https_validator`` and return it."""
        cls.https_validator.append(func)
        return func


@ProxyValidator.addPreValidator
def formatValidator(proxy):
    """Return True when *proxy* matches IP_REGEX in full."""
    return IP_REGEX.fullmatch(proxy) is not None


@ProxyValidator.addHttpValidator
def httpTimeOutValidator(proxy):
    """Return True when a HEAD request to ``conf.httpUrl`` through *proxy*
    answers 200 within ``conf.verifyTimeout``; any exception means False."""
    proxy_map = {
        "http": "http://{proxy}".format(proxy=proxy),
        "https": "https://{proxy}".format(proxy=proxy),
    }
    try:
        response = head(conf.httpUrl, headers=HEADER, proxies=proxy_map, timeout=conf.verifyTimeout)
    except Exception:
        return False
    return response.status_code == 200


@ProxyValidator.addHttpsValidator
def httpsTimeOutValidator(proxy):
    """Return True when a HEAD request to ``conf.httpsUrl`` through *proxy*
    answers 200 within ``conf.verifyTimeout`` (certificate verification is
    disabled); any exception means False."""
    proxy_map = {
        "http": "http://{proxy}".format(proxy=proxy),
        "https": "https://{proxy}".format(proxy=proxy),
    }
    try:
        response = head(conf.httpsUrl, headers=HEADER, proxies=proxy_map, timeout=conf.verifyTimeout,
                        verify=False)
    except Exception:
        return False
    return response.status_code == 200


@ProxyValidator.addHttpValidator
def customValidatorExample(proxy):
    """Example of a custom validator: decide whether *proxy* is usable and
    return True/False. This sample accepts everything."""
    return True
# Root command group. ``-h``/``--help`` come from CONTEXT_SETTINGS and
# ``--version`` reports VERSION from setting.py.
# NOTE(review): click presumably shows these docstrings as command help
# text, so they are deliberately left untranslated - confirm before editing.
@click.group(context_settings=CONTEXT_SETTINGS)
@click.version_option(version=VERSION)
def cli():
    """ProxyPool cli工具"""


# ``schedule`` sub-command: print the banner, then start the scheduler
# through helper.launcher.startScheduler.
@cli.command(name="schedule")
def schedule():
    """ 启动调度程序 """
    click.echo(BANNER)
    startScheduler()


# ``server`` sub-command: print the banner, then start the API server
# through helper.launcher.startServer.
@cli.command(name="server")
def server():
    """ 启动api服务 """
    click.echo(BANNER)
    startServer()


if __name__ == '__main__':
    cli()
2019/2/15: +------------------------------------------------- +""" + +BANNER = r""" +**************************************************************** +*** ______ ********************* ______ *********** _ ******** +*** | ___ \_ ******************** | ___ \ ********* | | ******** +*** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** +*** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** +*** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** +*** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** +**** __ / / ***** +************************* /___ / ******************************* +************************* ******************************** +**************************************************************** +""" + +VERSION = "2.4.0" + +# ############### server config ############### +HOST = "0.0.0.0" + +PORT = 5010 + +# ############### database config ################### +# db connection uri +# example: +# Redis: redis://:password@ip:port/db +# Ssdb: ssdb://:password@ip:port +DB_CONN = 'redis://:pwd@127.0.0.1:6379/0' + +# proxy table name +TABLE_NAME = 'use_proxy' + + +# ###### config the proxy fetch function ###### +PROXY_FETCHER = [ + "freeProxy01", + "freeProxy02", + "freeProxy03", + "freeProxy04", + "freeProxy05", + "freeProxy06", + "freeProxy07", + "freeProxy08", + "freeProxy09", + "freeProxy10", + "freeProxy11" +] + +# ############# proxy validator ################# +# 代理验证目标网站 +HTTP_URL = "http://httpbin.org" + +HTTPS_URL = "https://www.qq.com" + +# 代理验证时超时时间 +VERIFY_TIMEOUT = 10 + +# 近PROXY_CHECK_COUNT次校验中允许的最大失败次数,超过则剔除代理 +MAX_FAIL_COUNT = 0 + +# 近PROXY_CHECK_COUNT次校验中允许的最大失败率,超过则剔除代理 +# MAX_FAIL_RATE = 0.1 + +# proxyCheck时代理数量少于POOL_SIZE_MIN触发抓取 +POOL_SIZE_MIN = 20 + +# ############# proxy attributes ################# +# 是否启用代理地域属性 +PROXY_REGION = True + +# ############# scheduler config ################# + +# Set the timezone for the scheduler forcely (optional) +# If it is running on a VM, and +# "ValueError: Timezone offset does not 
match system offset" +# was raised during scheduling. +# Please uncomment the following line and set a timezone for the scheduler. +# Otherwise it will detect the timezone from the system automatically. + +TIMEZONE = "Asia/Shanghai" diff --git a/start.sh b/start.sh new file mode 100755 index 000000000..4c9b48f5d --- /dev/null +++ b/start.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +python proxyPool.py server & +python proxyPool.py schedule \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 000000000..b1c7ca1e2 --- /dev/null +++ b/test.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: test.py + Description : + Author : JHao + date: 2017/3/7 +------------------------------------------------- + Change Activity: + 2017/3/7: +------------------------------------------------- +""" +__author__ = 'JHao' + +from test import testProxyValidator +from test import testConfigHandler +from test import testLogHandler +from test import testDbClient + +if __name__ == '__main__': + print("ConfigHandler:") + testConfigHandler.testConfig() + + print("LogHandler:") + testLogHandler.testLogHandler() + + print("DbClient:") + testDbClient.testDbClient() + + print("ProxyValidator:") + testProxyValidator.testProxyValidator() diff --git a/__init__.py b/test/__init__.py similarity index 56% rename from __init__.py rename to test/__init__.py index c511f3103..d314f9455 100644 --- a/__init__.py +++ b/test/__init__.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- """ ------------------------------------------------- - File Name: __init__.py - Description : - Author : JHao - date: 2016/12/3 + File Name: __init__ + Description : + Author : JHao + date: 2019/2/15 ------------------------------------------------- Change Activity: - 2016/12/3: + 2019/2/15: ------------------------------------------------- """ -__author__ = 'JHao' \ No newline at end of file +__author__ = 'JHao' diff --git 
def testConfig():
    """Print the main ConfigHandler settings and check ``fetchers`` is a list.

    The fetcher list is then printed twice more with a 5 second pause
    after each print.
    """
    settings = ConfigHandler()
    for attr_name in ("dbConn", "serverPort", "serverHost", "tableName"):
        print(getattr(settings, attr_name))
    assert isinstance(settings.fetchers, list)
    print(settings.fetchers)

    remaining = 2
    while remaining > 0:
        print(settings.fetchers)
        sleep(5)
        remaining -= 1


def testDbClient():
    """Check that DbClient.parseDbConn splits SSDB and Redis URIs correctly."""
    # ---- ssdb ----
    ssdb_conf = DbClient.parseDbConn("ssdb://:password@127.0.0.1:8888")
    assert ssdb_conf.db_type == "SSDB"
    assert ssdb_conf.db_pwd == "password"
    assert ssdb_conf.db_host == "127.0.0.1"
    assert ssdb_conf.db_port == 8888

    # ---- redis ----
    # Parsed only after the SSDB assertions above have run.
    redis_conf = DbClient.parseDbConn("redis://:password@127.0.0.1:6379/1")
    assert redis_conf.db_type == "REDIS"
    assert redis_conf.db_pwd == "password"
    assert redis_conf.db_host == "127.0.0.1"
    assert redis_conf.db_port == 6379
    assert redis_conf.db_name == "1"
    print("DbClient ok!")
def testLogHandler():
    """Emit one info and one error record through a 'test' LogHandler."""
    log = LogHandler('test')
    log.info('this is info')
    log.error('this is error')


def testProxyClass():
    """Round-trip a Proxy through its JSON form and print each stage."""
    proxy = Proxy("127.0.0.1:8080")

    print(proxy.to_json)

    # ``Proxy.source`` is a read-only property (no setter is defined), so
    # assigning to it raises AttributeError; sources are added through
    # add_source().
    proxy.add_source("test")

    proxy_str = json.dumps(proxy.to_dict, ensure_ascii=False)

    print(proxy_str)

    print(Proxy.createFromJson(proxy_str).to_dict)
def testProxyFetcher():
    """Run every configured fetcher, print each proxy it yields and finally
    print how many non-empty proxies each fetcher produced."""
    fetcher_names = ConfigHandler().fetchers
    totals = dict.fromkeys(fetcher_names, 0)
    for fetcher_name in fetcher_names:
        fetch = getattr(ProxyFetcher, fetcher_name.strip())
        for proxy in fetch():
            if not proxy:
                continue
            print('{func}: fetch proxy {proxy}'.format(func=fetcher_name, proxy=proxy))
            totals[fetcher_name] += 1
    for fetcher_name, count in totals.items():
        print(fetcher_name, count)


def testProxyValidator():
    """Print every registered pre-, http- and https-validator, in that order."""
    registries = (
        ProxyValidator.pre_validator,
        ProxyValidator.http_validator,
        ProxyValidator.https_validator,
    )
    for registry in registries:
        for validator in registry:
            print(validator)
def testRedisClient():
    """Smoke-test the Redis backend against redis://:pwd@127.0.0.1:6379.

    Runs put / get / exists / pop / getAll / getCount on the "use_proxy"
    table and prints each result. Needs a reachable Redis server.
    """
    from db.dbClient import DbClient
    from helper.proxy import Proxy

    client = DbClient("redis://:pwd@127.0.0.1:6379")
    client.changeTable("use_proxy")
    sample_json = '{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}'
    sample = Proxy.createFromJson(sample_json)

    print("put: ", client.put(sample))

    print("get: ", client.get(https=None))

    print("exists: ", client.exists("27.38.96.101:9797"))

    print("exists: ", client.exists("27.38.96.101:8888"))

    print("pop: ", client.pop(https=None))

    print("getAll: ", client.getAll(https=None))

    print("getCount", client.getCount())


def testSsdbClient():
    """Smoke-test the SSDB backend against ssdb://@127.0.0.1:8888.

    Runs put / get / exists / getAll / clear / getCount on the
    "use_proxy" table and prints each result. Needs a reachable SSDB
    server.
    """
    from db.dbClient import DbClient
    from helper.proxy import Proxy

    client = DbClient("ssdb://@127.0.0.1:8888")
    client.changeTable("use_proxy")
    sample_json = '{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}'
    sample = Proxy.createFromJson(sample_json)

    print("put: ", client.put(sample))

    print("get: ", client.get(https=None))

    print("exists: ", client.exists("27.38.96.101:9797"))

    print("exists: ", client.exists("27.38.96.101:8888"))

    print("getAll: ", client.getAll(https=None))

    # pop() is not exercised for SSDB in this script:
    # print("pop: ", db.pop(https=None))

    print("clear: ", client.clear())

    print("getCount", client.getCount())
class LazyProperty(object):
    """
    Compute-once attribute descriptor.

    On first access through an instance the wrapped function is called and
    its result is stored on the instance under the function's own name, so
    later lookups find the stored value instead of this descriptor.
    Access through the class returns the descriptor itself.
    explain: http://www.spiderpy.cn/blog/5/
    """

    def __init__(self, func):
        self.func = func

    def __get__(self, instance, owner):
        if instance is not None:
            computed = self.func(instance)
            setattr(instance, self.func.__name__, computed)
            return computed
        return self
class Singleton(type):
    """
    Singleton Metaclass

    The first call of a class using this metaclass creates its instance
    and caches it in ``_inst``; every later call returns that cached
    instance and ignores the arguments it was given.
    """

    # class -> its single instance, shared by all classes using this metaclass
    _inst = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._inst:
            # Forward keyword arguments too; they were previously dropped,
            # so ``Cls(key=value)`` silently built the instance without them.
            cls._inst[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._inst[cls]


import sys

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

if PY3:
    def iteritems(d, **kw):
        """Return an iterator over the (key, value) pairs of *d*."""
        return iter(d.items(**kw))
else:
    def iteritems(d, **kw):
        """Return an iterator over the (key, value) pairs of *d*."""
        return d.iteritems(**kw)

if PY3:
    from urllib.parse import urlparse
else:
    from urlparse import urlparse

if PY3:
    try:
        from importlib import reload as reload_six
    except ImportError:
        # Python 3.0-3.3 only offer reload() through imp; imp itself was
        # removed in Python 3.12, so importlib is tried first.
        from imp import reload as reload_six
else:
    reload_six = reload

if PY3:
    from queue import Empty, Queue
else:
    from Queue import Empty, Queue


def withMetaclass(meta, *bases):
    """Create a base class with a metaclass.

    :param meta: the metaclass the final class should have
    :param bases: base classes for the final class
    :return: a temporary class to inherit from
    """

    # This requires a bit of explanation: the basic idea is to make a dummy
    # metaclass for one level of class instantiation that replaces itself with
    # the actual metaclass.
    class MetaClass(meta):

        def __new__(cls, name, this_bases, d):
            return meta(name, bases, d)

    return type.__new__(MetaClass, 'temporary_class', (), {})
class WebRequest(object):
    """Small wrapper around ``requests.get`` with retries and parse helpers.

    ``get()`` stores the HTTP response on ``self.response`` and returns the
    WebRequest itself, so ``tree``, ``text`` and ``json`` can be read from
    the returned object.
    """
    name = "web_request"

    def __init__(self, *args, **kwargs):
        self.log = LogHandler(self.name, file=False)
        # Blank placeholder until the first get() call.
        self.response = Response()

    @property
    def user_agent(self):
        """
        return an User-Agent at random
        :return: one User-Agent string picked with random.choice
        """
        ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        ]
        return random.choice(ua_list)

    @property
    def header(self):
        """
        basic header
        :return: a new dict of default request headers with a random User-Agent
        """
        return {'User-Agent': self.user_agent,
                'Accept': '*/*',
                'Connection': 'keep-alive',
                'Accept-Language': 'zh-CN,zh;q=0.8'}

    def get(self, url, header=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs):
        """
        get method
        :param url: target url
        :param header: extra headers (dict) merged over the default header
        :param retry_time: number of attempts before giving up
        :param retry_interval: seconds to sleep between attempts
        :param timeout: network timeout
        :return: self; ``self.response`` holds the HTTP response, or a blank
                 Response with status_code 200 when every attempt failed
        """
        headers = self.header
        if header and isinstance(header, dict):
            headers.update(header)
        while True:
            try:
                self.response = requests.get(url, headers=headers, timeout=timeout, *args, **kwargs)
                return self
            except Exception as e:
                self.log.error("requests: %s error: %s" % (url, str(e)))
                retry_time -= 1
                if retry_time <= 0:
                    resp = Response()
                    resp.status_code = 200
                    # Store the blank response. It used to be discarded,
                    # leaving self.response pointing at whatever an earlier
                    # get() on this object had fetched.
                    self.response = resp
                    return self
                self.log.info("retry %s second after" % retry_interval)
                time.sleep(retry_interval)

    @property
    def tree(self):
        """Parse the response body with ``lxml.etree.HTML`` and return the result."""
        return etree.HTML(self.response.content)

    @property
    def text(self):
        """Return the response body as text."""
        return self.response.text

    @property
    def json(self):
        """Return the decoded JSON body; on any error log it and return {}."""
        try:
            return self.response.json()
        except Exception as e:
            self.log.error(str(e))
            return {}