IP Proxy Pool
Why build one yourself? Because when you scrape from a single IP, requesting too fast or too many times will get that IP blocked. The fix is simply to switch to a different IP every so often.
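In requests, "using another IP" just means routing the request through a proxy. A minimal sketch of the idea (the address 1.2.3.4:8080 is a made-up placeholder, not a real proxy):

import requests

# Made-up proxy address, for illustration only.
proxies = {
    "http": "http://1.2.3.4:8080",
    "https": "http://1.2.3.4:8080",
}
# The target site now sees the proxy's IP instead of ours.
response = requests.get("https://www.baidu.com", proxies=proxies, timeout=5)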
IP sources
https://www.xicidaili.com/nn/ (西刺代理)
Pros: a huge number of IPs.
Cons: very few of them actually work.
http://www.gatherproxy.com/zh/proxylist/country/?c=China (overseas site; requires a VPN)
Pros: a high proportion of working proxies.
Cons: you need a VPN to reach the site at all.
Steps (using 西刺代理 as the example)
Visit the site above, scrape the IPs and ports, and save them into a list.
def get_ip(z, y):
    global ip
    for xx in range(z, y + 1):
        url = 'https://www.xicidaili.com/nn/%s' % str(xx)
        response = requests.get(url, headers=dailichi())
        sle = parsel.Selector(response.text)
        # guo = sle.xpath('//*[@id="ip_list"]').re(r'<img src=".*?.png" alt="(.*?)">', re.S)
        # The <td> captures repeat in groups of five per table row:
        # index 0 is the IP, index 1 the port.
        jihe = sle.xpath('//*[@id="ip_list"]').re(r'<td>(.*?)</td>', re.S)
        hosts = jihe[::5]
        duankou = jihe[1::5]
        for x in range(len(hosts)):
            ip.append(hosts[x] + ':' + duankou[x])
        time.sleep(random.randint(1, 4))  # throttle, or the proxy site bans us
        print('-------------------------')
This collects everything into the global list ip.
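At this point ip is just a flat list of host:port strings; a quick sanity check might look like this (the addresses shown are made-up values):

get_ip(1, 1)   # scrape just page 1
print(ip[:2])  # e.g. ['121.232.148.72:9000', '58.220.95.30:10174'] (made-up)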
Validating availability
def yanzheng():
    global ip
    while True:
        # Take the next candidate under the lock so threads don't race on the list.
        gg.acquire()
        if len(ip) == 0:
            gg.release()
            break
        ip_chi = ip.pop()
        gg.release()
        proxies = {
            "http": "http://" + ip_chi,
            "https": "http://" + ip_chi,
        }
        try:
            # A proxy that can fetch Baidu within 3 seconds counts as usable.
            req = requests.get('https://www.baidu.com', proxies=proxies, timeout=3)
            file = open('可用ip.txt', 'a')
            file.write(ip_chi + '\n')
            file.close()
        except:
            pass  # dead proxy, drop it silently
The scraping itself is single-threaded (don't ask why it isn't multithreaded: hammering the proxy site gets you banned). Validation has no such constraint, so it runs in multiple threads for speed, and every IP that works is appended to a txt file.
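A minimal sketch of that fan-out (the same pattern appears in get_ipchi in the full code below): 20 threads all run yanzheng, draining the shared ip list under the gg lock, while the main thread waits for them to finish.

threads = [threading.Thread(target=yanzheng) for x in range(20)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait until every validator has drained the list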
Fetching a URL through a usable proxy
def requests_get(url):
    # url = 'https://www.doutula.com/photo/list/?page=1'
    import requests
    file = open('可用ip.txt', 'r')
    txt = file.read().split('\n')
    file.close()
    for ip in txt:
        if not ip:  # skip the empty entry left by the trailing '\n'
            continue
        proxies = {
            "http": "http://%s" % str(ip),
            "https": "http://%s" % str(ip),
        }
        try:
            # Return the first response obtained through a working proxy.
            response = requests.get(url, proxies=proxies, timeout=5)
            return response
        except:
            pass  # this proxy died since validation; try the next one
Read the saved IPs, make the requests call through one of them, and you're done.
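Usage is then a single call (Baidu is just a stand-in target; note the function returns None if no saved proxy works):

response = requests_get('https://www.baidu.com')
if response is not None:  # None means every saved proxy failed
    print(response.status_code)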
Full code
import requests
import parsel
import re
import random
import time
import threading
import os
ip=[]
gg = threading.Lock()
def dailichi():
    # Despite the name, this returns a random User-Agent header, not a proxy.
    daili = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; WOW64; Trident/4.0; SLCC1)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; WOW64; Trident/4.0; SLCC1)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
    ]
    dai = random.choice(daili)
    head = {
        'User-Agent': dai
    }
    return head
def get_ip(z, y):
    global ip
    for xx in range(z, y + 1):
        url = 'https://www.xicidaili.com/nn/%s' % str(xx)
        response = requests.get(url, headers=dailichi())
        sle = parsel.Selector(response.text)
        # guo = sle.xpath('//*[@id="ip_list"]').re(r'<img src=".*?.png" alt="(.*?)">', re.S)
        # The <td> captures repeat in groups of five per table row:
        # index 0 is the IP, index 1 the port.
        jihe = sle.xpath('//*[@id="ip_list"]').re(r'<td>(.*?)</td>', re.S)
        hosts = jihe[::5]
        duankou = jihe[1::5]
        for x in range(len(hosts)):
            ip.append(hosts[x] + ':' + duankou[x])
        time.sleep(random.randint(1, 4))  # throttle, or the proxy site bans us
        print('-------------------------')
def yanzheng():
    global ip
    while True:
        # Take the next candidate under the lock so threads don't race on the list.
        gg.acquire()
        if len(ip) == 0:
            gg.release()
            break
        ip_chi = ip.pop()
        gg.release()
        proxies = {
            "http": "http://" + ip_chi,
            "https": "http://" + ip_chi,
        }
        try:
            # A proxy that can fetch Baidu within 3 seconds counts as usable.
            req = requests.get('https://www.baidu.com', proxies=proxies, timeout=3)
            file = open('可用ip.txt', 'a')
            file.write(ip_chi + '\n')
            file.close()
        except:
            pass  # dead proxy, drop it silently
def requests_get(url):
    # url = 'https://www.doutula.com/photo/list/?page=1'
    import requests
    file = open('可用ip.txt', 'r')
    txt = file.read().split('\n')
    file.close()
    for ip in txt:
        if not ip:  # skip the empty entry left by the trailing '\n'
            continue
        proxies = {
            "http": "http://%s" % str(ip),
            "https": "http://%s" % str(ip),
        }
        try:
            # Return the first response obtained through a working proxy.
            response = requests.get(url, proxies=proxies, timeout=5)
            return response
        except:
            pass  # this proxy died since validation; try the next one
def get_ipchi():
    if os.path.exists('可用ip.txt'):
        os.remove('可用ip.txt')  # start from a clean file
    get_ip(1, 1)
    # Launch 20 validator threads and wait until they have drained the list.
    threads = [threading.Thread(target=yanzheng) for x in range(20)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
get_ipchi()
# The code above scrapes the usable IPs and saves them to the file 可用ip.txt
Taking Baidu as the example, just call the requests_get function we wrote above:
response = requests_get("https://www.baidu.com")
print(response.text)
print(response.status_code)
# works with no problem at all
Overseas IP pool, full code
import requests
import parsel
import re
import random
import time
import threading
import os
ip=[]
gg = threading.Lock()
def dailichi():
    # Despite the name, this returns a random User-Agent header, not a proxy.
    daili = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; WOW64; Trident/4.0; SLCC1)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; WOW64; Trident/4.0; SLCC1)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
    ]
    dai = random.choice(daili)
    head = {
        'User-Agent': dai
    }
    return head
def get_ip():
    global ip
    response = requests.get('http://www.gatherproxy.com/zh/proxylist/country/?c=China', headers=dailichi())
    sle = parsel.Selector(response.text)
    # The page embeds the proxy list in inline JS; pull IPs and ports out with regex.
    ipp = sle.xpath('//*[@id="tblproxy"]').re('"PROXY_IP":"(.*?)"', re.S)
    duankou_16 = sle.xpath('//*[@id="tblproxy"]').re('"PROXY_PORT":"(.*?)"', re.S)
    # Ports are hex-encoded in the JS, so decode them to decimal.
    duankou = []
    for x in duankou_16:
        duankou.append(str(int(x, 16)))
    for x in zip(ipp, duankou):
        ip.append(x[0] + ":" + x[1])
def yanzheng():
    global ip
    while True:
        # Take the next candidate under the lock so threads don't race on the list.
        gg.acquire()
        if len(ip) == 0:
            gg.release()
            break
        ip_chi = ip.pop()
        gg.release()
        proxies = {
            "http": "http://" + ip_chi,
            "https": "http://" + ip_chi,
        }
        try:
            # A proxy that can fetch the test site within 3 seconds counts as usable.
            req = requests.get('https://mcheika.com/', proxies=proxies, timeout=3)
            file = open('可用ip.txt', 'a')
            file.write(ip_chi + '\n')
            file.close()
        except:
            pass  # dead proxy, drop it silently
def requests_get(url):
    # url = 'https://www.doutula.com/photo/list/?page=1'
    import requests
    file = open('可用ip.txt', 'r')
    txt = file.read().split('\n')
    file.close()
    for ip in txt:
        if not ip:  # skip the empty entry left by the trailing '\n'
            continue
        proxies = {
            "http": "http://%s" % str(ip),
            "https": "http://%s" % str(ip),
        }
        try:
            # Return the first response obtained through a working proxy.
            response = requests.get(url, proxies=proxies, timeout=5)
            return response
        except:
            pass  # this proxy died since validation; try the next one
def get_ipchi():
    if os.path.exists('可用ip.txt'):
        os.remove('可用ip.txt')  # start from a clean file
    get_ip()
    # Launch 20 validator threads and wait until they have drained the list.
    threads = [threading.Thread(target=yanzheng) for x in range(20)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
get_ipchi()
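One detail worth calling out in this version: gatherproxy stores the port as a hex string in its inline JS, which is why get_ip decodes each one with int(x, 16). A quick check of the conversion:

# "PROXY_PORT":"1F90" in the page source decodes to the familiar port 8080:
print(int('1F90', 16))  # 8080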