IP代理池python


IP代理池

为什么要自己建一个呢,因为啊,你一个ip爬取数据的时候,太快或者多次访问会被屏蔽。所以我们就要隔一段时间换一个ip就行。

IP地址

https://www.xicidaili.com/nn/ (西刺代理)
优点:ip很多很多 。
缺点:可用的很少。

http://www.gatherproxy.com/zh/proxylist/country/?c=China (国外的需要vpn)
优点:可用性较高,可用的代理比较多。
缺点:需要翻墙才行哦。

步骤(* 以西刺代理为例子)

访问以上两个网站并且把ip和端口爬取下来保存到一个列表当中

def get_ip(z, y):
    """Scrape proxies from xicidaili pages z..y (inclusive) into the
    global ``ip`` list as "addr:port" strings.

    z, y: first and last page number to crawl.
    """
    global ip
    # Compile once; re.S is the author's intent (the original passed it as
    # parsel's replace_entities positional arg by mistake).
    cell_re = re.compile(r'<td>(.*?)</td>', re.S)
    for page in range(z, y + 1):
        url = 'https://www.xicidaili.com/nn/%s' % str(page)
        response = requests.get(url, headers=dailichi())
        sle = parsel.Selector(response.text)
        cells = sle.xpath('//*[@id="ip_list"]').re(cell_re)
        # In the table, every 5th cell is the address and the next one
        # is the port.
        addrs = cells[::5]
        ports = cells[1::5]
        # BUGFIX: the original reset ``ip = []`` on every page (discarding
        # earlier pages) and then appended "addr:port" pairs onto the same
        # list that still held the bare addresses.
        ip.extend(a + ':' + p for a, p in zip(addrs, ports))
        # Crawl slowly with a random pause so the site does not ban us.
        time.sleep(random.randint(1, 4))
        print('-------------------------')

这里把它爬到了ip这个全局变量里面

验证可用性

def yanzheng():
    """Worker: pop proxies from the shared ``ip`` list, probe each one
    against Baidu, and append the working ones to 可用ip.txt.

    Runs until the shared list is empty; safe to start from several
    threads because every list access is guarded by the ``gg`` lock.
    """
    global ip
    while True:
        gg.acquire()
        if len(ip) == 0:
            gg.release()
            break
        ip_chi = ip.pop()
        gg.release()
        proxies = {
            "http": "http://" + ip_chi,
            "https": "http://" + ip_chi,
        }
        try:
            requests.get('https://www.baidu.com', proxies=proxies, timeout=3)
        except requests.RequestException:
            # Proxy is dead or too slow -- skip it.  (The original used a
            # bare ``except:`` which also hid file-write errors.)
            continue
        # ``with`` guarantees the file is closed even if the write raises.
        with open('可用ip.txt', 'a') as file:
            file.write(ip_chi + '\n')

这里我爬取ip是单线程(别问我为什么不用多线程,容易被封),验证的时候就不存在了,直接多线程验证,快点,然后,把可用ip保存到一个txt文档里面

使用可用ip代理访问网址

def requests_get(url):
    """GET ``url`` through the first working proxy saved in 可用ip.txt.

    Returns the requests.Response from the first proxy that answers
    within 5 seconds, or None when every saved proxy fails.
    """
    import requests
    with open('可用ip.txt', 'r') as file:
        # Filter out the empty string left by the trailing newline; the
        # original tried a bogus "http://" proxy for it.
        saved = [line for line in file.read().split('\n') if line]
    for proxy in saved:
        proxies = {
            "http": "http://%s" % str(proxy),
            "https": "http://%s" % str(proxy),
        }
        try:
            return requests.get(url, proxies=proxies, timeout=5)
        except requests.RequestException:
            # This proxy failed -- try the next saved one.
            pass
    return None

读取可用ip进行requests请求就完事了

完整代码

import requests
import parsel
import re
import random
import time
import threading
import os
ip=[]  # shared pool of proxy candidates ("addr:port" strings), filled by get_ip()
gg = threading.Lock()  # guards all access to ``ip`` from the checker threads
def dailichi():
    """Return a request-header dict whose User-Agent is picked at random
    from a pool of real browser strings, so successive requests do not
    all look identical to the server."""
    user_agents = (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; WOW64; Trident/4.0; SLCC1)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; WOW64; Trident/4.0; SLCC1)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    )
    return {'User-Agent': random.choice(user_agents)}
def get_ip(z, y):
    """Scrape proxies from xicidaili pages z..y (inclusive) into the
    global ``ip`` list as "addr:port" strings.

    z, y: first and last page number to crawl.
    """
    global ip
    # BUGFIX: the regex lost its <td>...</td> markup (it read r'(.*?)',
    # which matches nothing useful); restored from the working snippet.
    # Compile once; re.S was originally passed as parsel's
    # replace_entities positional arg by mistake.
    cell_re = re.compile(r'<td>(.*?)</td>', re.S)
    for page in range(z, y + 1):
        url = 'https://www.xicidaili.com/nn/%s' % str(page)
        response = requests.get(url, headers=dailichi())
        sle = parsel.Selector(response.text)
        cells = sle.xpath('//*[@id="ip_list"]').re(cell_re)
        # In the table, every 5th cell is the address and the next one
        # is the port.
        addrs = cells[::5]
        ports = cells[1::5]
        # BUGFIX: the original reset ``ip = []`` on every page (discarding
        # earlier pages) and then appended "addr:port" pairs onto the same
        # list that still held the bare addresses.
        ip.extend(a + ':' + p for a, p in zip(addrs, ports))
        # Crawl slowly with a random pause so the site does not ban us.
        time.sleep(random.randint(1, 4))
        print('-------------------------')

def yanzheng():
    """Worker: pop proxies from the shared ``ip`` list, probe each one
    against Baidu, and append the working ones to 可用ip.txt.

    Runs until the shared list is empty; safe to start from several
    threads because every list access is guarded by the ``gg`` lock.
    """
    global ip
    while True:
        gg.acquire()
        if len(ip) == 0:
            gg.release()
            break
        ip_chi = ip.pop()
        gg.release()
        proxies = {
            "http": "http://" + ip_chi,
            "https": "http://" + ip_chi,
        }
        try:
            requests.get('https://www.baidu.com', proxies=proxies, timeout=3)
        except requests.RequestException:
            # Proxy is dead or too slow -- skip it.  (The original used a
            # bare ``except:`` which also hid file-write errors.)
            continue
        # ``with`` guarantees the file is closed even if the write raises.
        with open('可用ip.txt', 'a') as file:
            file.write(ip_chi + '\n')
def requests_get(url):
    """GET ``url`` through the first working proxy saved in 可用ip.txt.

    Returns the requests.Response from the first proxy that answers
    within 5 seconds, or None when every saved proxy fails.
    """
    # (dropped the redundant function-local ``import requests``; the
    # module already imports it at the top.)
    with open('可用ip.txt', 'r') as file:
        # Filter out the empty string left by the trailing newline; the
        # original tried a bogus "http://" proxy for it.
        saved = [line for line in file.read().split('\n') if line]
    for proxy in saved:
        proxies = {
            "http": "http://%s" % str(proxy),
            "https": "http://%s" % str(proxy),
        }
        try:
            return requests.get(url, proxies=proxies, timeout=5)
        except requests.RequestException:
            # This proxy failed -- try the next saved one.
            pass
    return None
def get_ipchi():
    """Rebuild 可用ip.txt from scratch: scrape fresh proxies, then
    validate them with a pool of 20 checker threads, blocking until
    all checkers have finished."""
    if os.path.exists('可用ip.txt'):
        os.remove('可用ip.txt')
    get_ip(1, 1)
    workers = [threading.Thread(target=yanzheng) for _ in range(20)]
    for t in workers:
        t.start()
    # BUGFIX: join the threads instead of busy-spinning on
    # threading.enumerate(), which burned a full CPU core while waiting
    # (and would hang forever if any unrelated thread existed).
    for t in workers:
        t.join()
get_ipchi()

# 以上会爬取可用ip并且保存到(可用ip.txt)这个文件里
这里访问百度为例子
直接调用之前写的requests_get函数

# BUGFIX: the helper defined above is named requests_get (requests_url
# does not exist), and the attribute is status_code, not status.code.
response = requests_get("https://www.baidu.com")
print(response.text)
print(response.status_code)

# 是完全没问题的

国外的ip池,代码

import requests
import parsel
import re
import random
import time
import threading
import os
ip=[]  # shared pool of proxy candidates ("addr:port" strings), filled by get_ip()
gg = threading.Lock()  # guards all access to ``ip`` from the checker threads
def dailichi():
    """Return a request-header dict whose User-Agent is picked at random
    from a pool of real browser strings, so successive requests do not
    all look identical to the server."""
    user_agents = (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; WOW64; Trident/4.0; SLCC1)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; WOW64; Trident/4.0; SLCC1)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    )
    return {'User-Agent': random.choice(user_agents)}
def get_ip():
    """Scrape Chinese proxies from gatherproxy.com into the global ``ip``
    list as "addr:port" strings.

    The page embeds proxies as JSON-ish text; ports are hex-encoded
    (e.g. "1F90" -> 8080).
    """
    global ip
    response = requests.get(
        'http://www.gatherproxy.com/zh/proxylist/country/?c=China',
        headers=dailichi())
    table = parsel.Selector(response.text).xpath('//*[@id="tblproxy"]')
    # NOTE: the original passed re.S as parsel's replace_entities
    # positional argument by mistake; it was never a regex flag, so
    # dropping it preserves behavior.
    addrs = table.re(r'"PROXY_IP":"(.*?)"')
    hex_ports = table.re(r'"PROXY_PORT":"(.*?)"')
    # Decode the hex port strings back to decimal text.
    ports = [str(int(p, 16)) for p in hex_ports]
    ip.extend(a + ':' + p for a, p in zip(addrs, ports))

def yanzheng():
    """Worker: pop proxies from the shared ``ip`` list, probe each one
    against a test site, and append the working ones to 可用ip.txt.

    Runs until the shared list is empty; safe to start from several
    threads because every list access is guarded by the ``gg`` lock.
    """
    global ip
    while True:
        gg.acquire()
        if len(ip) == 0:
            gg.release()
            break
        ip_chi = ip.pop()
        gg.release()
        proxies = {
            "http": "http://" + ip_chi,
            "https": "http://" + ip_chi,
        }
        try:
            requests.get('https://mcheika.com/', proxies=proxies, timeout=3)
        except requests.RequestException:
            # Proxy is dead or too slow -- skip it.  (The original used a
            # bare ``except:`` which also hid file-write errors.)
            continue
        # ``with`` guarantees the file is closed even if the write raises.
        with open('可用ip.txt', 'a') as file:
            file.write(ip_chi + '\n')
def requests_get(url):
    """GET ``url`` through the first working proxy saved in 可用ip.txt.

    Returns the requests.Response from the first proxy that answers
    within 5 seconds, or None when every saved proxy fails.
    """
    # (dropped the redundant function-local ``import requests``; the
    # module already imports it at the top.)
    with open('可用ip.txt', 'r') as file:
        # Filter out the empty string left by the trailing newline; the
        # original tried a bogus "http://" proxy for it.
        saved = [line for line in file.read().split('\n') if line]
    for proxy in saved:
        proxies = {
            "http": "http://%s" % str(proxy),
            "https": "http://%s" % str(proxy),
        }
        try:
            # BUGFIX: the original had a bare ``return`` here, so callers
            # always got None even when the request succeeded.
            return requests.get(url, proxies=proxies, timeout=5)
        except requests.RequestException:
            # This proxy failed -- try the next saved one.
            pass
    return None
def get_ipchi():
    """Rebuild 可用ip.txt from scratch: scrape fresh proxies, then
    validate them with a pool of 20 checker threads, blocking until
    all checkers have finished."""
    if os.path.exists('可用ip.txt'):
        os.remove('可用ip.txt')
    get_ip()
    workers = [threading.Thread(target=yanzheng) for _ in range(20)]
    for t in workers:
        t.start()
    # BUGFIX: join the threads instead of busy-spinning on
    # threading.enumerate(), which burned a full CPU core while waiting
    # (and would hang forever if any unrelated thread existed).
    for t in workers:
        t.join()
get_ipchi()

再见,爱你们。


文章作者: anlen123
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 anlen123 !
 上一篇
如何共存python2和python3 如何共存python2和python3
PS:为什么做这一期博客呢?因为前些天做大数的题的时候需要用到python2我的环境是Python3所有我打算配置一个Python2的环境 下载python3和python2的安装包程序这是python官网自行下载点击此处 安装python
2019-10-03 anlen123
下一篇 
python简单多线程学习笔记 python简单多线程学习笔记
多线程原理多线程(英语:multithreading),是指从软件或者硬件上实现多个线程并发执行的技术。具有多线程能力的计算机因有硬件支持而能够在同一时间执行多于一个线程,进而提升整体处理性能。具有这种能力的系统包括对称多处理机、多核心处理
2019-08-30 anlen123
  目录