- 2022.06.08
- 增加代理功能
- 2022.08.30
- 增加支持使用分块编码(响应头包含Transfer-Encoding: chunked)的网站
为了新项目学习Python,使用urllib库抓取网页内容,项目中如果每次抓取内容都用原生写的话会有一堆重复代码,就做了个简单的封装,顺便记录一下。目前能满足项目使用,如果以后有更新的话都会同步到这里。
废话不多说,直接上代码
# coding=UTF-8
import os
import random
import gzip
import zlib
import urllib.parse
import urllib.request
from urllib.error import URLError, HTTPError


class GetHtml:
    """Thin wrapper around ``urllib`` for fetching pages with GET or POST.

    Every public request method takes an optional ``params`` dict. The keys
    this class reads are:

    * ``data``    -- request fields (dict or pre-encoded string).
    * ``charset`` -- encoding used for the POST body (default ``utf8``).
    * ``headers`` -- extra request headers (dict).
    * ``ua``      -- User-Agent; takes precedence over ``headers``.
    * ``cookie``  -- value of the ``Cookie`` header.
    * ``timeout`` -- request timeout in seconds (default 10).
    * ``proxy``   -- ``{'address': str | list}`` or ``{'file': path}``.

    Request methods return the response body decoded as UTF-8, or a string
    starting with ``'Error : '`` when the request or decoding fails.
    """

    # User-Agent sent when the caller supplies none.
    _DEFAULT_UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/101.0.0.0 Safari/537.36')

    def get(self, url, params=None):
        """Send a GET request, appending ``params['data']`` to the query string."""
        params = params if isinstance(params, dict) else {}
        data = params.get('data')
        if data:
            # Accept a ready-made query string as well as a dict, like post().
            query = data if isinstance(data, str) else urllib.parse.urlencode(data)
            if query:
                url += ('&' if '?' in url else '?') + query
        return self.__request(url, None, params)

    def post(self, url, params=None):
        """Send a POST request with ``params['data']`` as the body."""
        params = params if isinstance(params, dict) else {}
        # Body encoding; utf8 unless the caller overrides it.
        charset = self.__getv(params, 'charset', 'utf8')
        data = self.__getv(params, 'data', {})
        if isinstance(data, dict):
            # Convert the fields to query-string form, then to bytes.
            data = urllib.parse.urlencode(data).encode(charset)
        elif isinstance(data, str):
            data = data.encode(charset)
        return self.__request(url, data, params)

    def auto(self, url, params=None):
        """Dispatch to post() when ``params`` has a ``data`` key, else get()."""
        params = params if isinstance(params, dict) else {}
        if 'data' in params:
            return self.post(url, params)
        return self.get(url, params)

    def proxy(self, proxys):
        """Return one randomly chosen proxy address.

        ``proxys`` either has a ``file`` key (path to a text file with one
        ``ip:port`` per line) or an ``address`` key (a single address string
        or an iterable of addresses).

        Raises:
            SystemExit: the proxy file does not exist.
            ValueError: no proxy address is available to choose from.
        """
        if 'file' in proxys:
            path = proxys['file']
            if not os.path.isfile(path):
                raise SystemExit('error: 代理文件不存在!')
            with open(path, 'r', encoding='utf-8') as f:
                # Skip blank lines so an empty string is never picked.
                candidates = [line.strip() for line in f if line.strip()]
        else:
            addresses = proxys['address']
            # list() on a str would split it into single characters.
            if isinstance(addresses, str):
                candidates = [addresses]
            else:
                candidates = list(addresses)
        if not candidates:
            raise ValueError('no proxy address available')
        return random.choice(candidates)

    @staticmethod
    def _decompress(body, encoding):
        """Undo the ``Content-Encoding`` applied to ``body`` (bytes)."""
        encoding = (encoding or '').strip().lower()
        if encoding in ('gzip', 'x-gzip'):
            return gzip.decompress(body)
        if encoding == 'deflate':
            try:
                return zlib.decompress(body)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib header.
                return zlib.decompress(body, -zlib.MAX_WBITS)
        return body

    def __request(self, url, data, params):
        """Build and send the request; return decoded text or an error string."""
        # Copy so the caller's headers dict is not modified.
        headers = dict(self.__getv(params, 'headers', {}) or {})
        if 'ua' in params:
            # A 'ua' entry in params takes precedence.
            headers['User-Agent'] = params['ua']
        elif 'User-Agent' not in headers:
            headers['User-Agent'] = self._DEFAULT_UA
        if 'cookie' in params:
            headers['Cookie'] = params['cookie']
        timeout = self.__getv(params, 'timeout', 10)

        if 'proxy' in params:
            proxyaddr = self.proxy(params['proxy'])
            handler = urllib.request.ProxyHandler(
                {'http': proxyaddr, 'https': proxyaddr})
            # Use a per-request opener: install_opener() would route every
            # later request in the process through this proxy.
            open_url = urllib.request.build_opener(handler).open
        else:
            open_url = urllib.request.urlopen

        req = urllib.request.Request(url=url, data=data, headers=headers)
        try:
            with open_url(req, timeout=timeout) as response:
                body = response.read()
                encoding = response.headers.get('Content-Encoding')
            # Compression is independent of Transfer-Encoding, so decode the
            # body whenever Content-Encoding says it is compressed.
            return self._decompress(body, encoding).decode('utf8')
        except HTTPError as e:
            return 'Error : code %s' % e.code
        except URLError as e:
            return 'Error : Reason %s' % e.reason
        except Exception as e:
            return 'Error : %s' % e

    def __getv(self, data, key, default=None):
        """Return ``data[key]`` when ``data`` is a dict containing it, else ``default``."""
        if isinstance(data, dict):
            return data.get(key, default)
        return default
调用方法
写一个简单的 PHP 脚本来测试效果,通过 http://www.test.com/test.php 访问:
<?php
// Print the server, POST and cookie data PHP received for this request.
$sections = [
    'SERVER' => $_SERVER,
    'POST' => $_POST,
    'COOKIE' => $_COOKIE,
];
foreach ($sections as $label => $values) {
    echo $label . ' : ';
    print_r($values);
}
GET
# Plain GET: the query string is written directly into the URL.
http = GetHtml()
url = 'http://www.test.com/test.php?name=notevm'
html = http.get(url)
print(html)
# The GET fields can also be passed as a dict under the 'data' key:
#   params = {'data': {'name': 'notevm', 'user': 'admin'}}
#   html = http.get(url, params)

POST
# POST the form fields given under the 'data' key.
http = GetHtml()
url = 'http://www.test.com/test.php'
# 'data' may also be a pre-encoded string, e.g.
#   data = "site=www.notevm.com&name=notevm&pass=123456"
data = dict([
    ('site', 'www.notevm.com'),
    ('name', 'notevm'),
    ('pass', '123456'),
])
params = {'data': data}
html = http.post(url, params)
print(html)

带User-Agent、Cookie
# GET with a custom User-Agent and Cookie supplied through params.
http = GetHtml()
url = 'http://www.test.com/test.php'
data = dict([
    ('site', 'www.notevm.com'),
    ('name', 'notevm'),
    ('pass', '123456'),
])
params = {'data': data}
params['ua'] = 'python www.notevm.com'
params['cookie'] = 'user=notevm;url=www.notevm.com'
html = http.get(url, params)
print(html)

自定义header信息
# GET with an extra request header passed under the 'headers' key.
http = GetHtml()
url = 'http://www.test.com/test.php'
headers = {'token': 'notevm-123456'}
params = {'headers': headers}
html = http.get(url, params)
print(html)

自动
# auto() picks the method: POST when params contains 'data', GET otherwise.
http = GetHtml()
url = 'http://www.test.com/test.php?name=notevm'

# GET: put the parameters in the URL and do not pass 'data' in params,
# because passing 'data' makes auto() send a POST request.
html = http.auto(url)
print(html)

# POST: 'data' is present in params.
data = {'site': 'www.notevm.com'}
params = {'data': data}
html = http.auto(url, params)
print(html)
更新功能示例
一次设置多个代理地址会随机取一个来使用
# When several proxy addresses are configured, one is picked at random.
http = GetHtml()
url = 'http://www.test.com/test.php?name=notevm'

# Option 1: list the proxy addresses inline.
proxy = {
    'address': [
        '192.168.1.1:8881',
        '192.168.1.2:8882',
        '192.168.1.3:8883',
    ],
}

# Option 2: keep the addresses in a text file, one "ip:port" per line,
# e.g. 192.168.1.1:8881. This assignment replaces option 1 above.
proxy = {'file': './proxy.txt'}

params = {'proxy': proxy}
html = http.get(url, params)
print(html)
原创文章,作者:小哆啦,如若转载,请注明出处:https://www.notevm.com/a/5693.html