# Martin-word-crawler_related/daili.py

import functools
import re

import requests
from bs4 import BeautifulSoup

def try_run(func):
    """Decorator: run ``func`` and report a failure instead of raising.

    The wrapped callable forwards all positional and keyword arguments to
    ``func`` and returns its result. If ``func`` raises an ordinary
    exception, the failure is reported through ``err`` and the wrapper
    returns ``None`` implicitly.

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt``
    and ``SystemExit`` still propagate and the program can be stopped.
    """
    @functools.wraps(func)                          # keep the wrapped function's name/docstring
    def run(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            err('尝试运行失败')

    return run

def err(type, num=None):
    """Print an error report to standard output.

    ``type`` is always printed. ``num`` is printed after it, separated by a
    space, unless it is ``None`` (the default), in which case only ``type``
    is printed.
    """
    # Build the argument tuple first so there is a single print call.
    fields = (type,) if num is None else (type, num)
    print(*fields)

@try_run
def get_url(url, type=None, encoding='UTF-8', timeout=10):
    """Fetch ``url`` with a GET request and return its content.

    Args:
        url: Address to request.
        type: Selects the return form. ``'text'`` returns the raw response
            text without BeautifulSoup parsing; ``'html.parser'`` returns a
            BeautifulSoup tree built with the ``html.parser`` backend; any
            other value (including the default ``None``) uses ``lxml``.
            Switch to ``'html.parser'`` if ``lxml`` causes compatibility
            problems.
        encoding: Encoding assigned to the response before its text is read.
        timeout: Timeout passed to ``requests.get``, so a stalled server
            raises instead of blocking forever.

    Returns:
        The response text or a BeautifulSoup object, or ``None`` when the
        status code is not 200 (the code is reported through ``err``).

    The ``try_run`` decorator handles exceptions raised here (for example
    connection errors), so a failed request does not propagate to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    html = requests.get(url, headers=headers, timeout=timeout)
    html.encoding = encoding
    # Compare by value: identity checks against literals are unreliable.
    if html.status_code != 200:
        err('request 错误', html.status_code)
        return None
    if type == 'text':
        return html.text
    if type == 'html.parser':
        return BeautifulSoup(html.text, 'html.parser')
    return BeautifulSoup(html.text, 'lxml')

@try_run
def for_ip(data):
    """Return the first ``ip:port`` substring found in ``data``.

    When ``data`` contains no such substring the body raises; that
    exception is left to the ``try_run`` decorator.
    """
    found = re.search(r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\:[0-9]{1,5}", data)
    # ``found`` is None when nothing matched, so .group() raises here.
    return found.group(0)

def ip_clear(list):
    """Normalise a list of raw proxy strings to clean ``ip:port`` entries.

    Each element is passed through ``for_ip``; elements for which it returns
    ``None`` are dropped. The list is updated in place and also returned,
    so callers may use either the argument or the return value.

    The result is built as a new list first and then written back. Deleting
    items while indexing over the original length skips elements and runs
    past the end of the shortened list.
    """
    cleaned = [ip for ip in map(for_ip, list) if ip is not None]
    list[:] = cleaned                               # preserve in-place mutation for callers
    return list

def ip_89ip():
    """Fetch proxy entries from the 89ip.cn export page.

    The page is requested as raw text. The proxy entries are taken from the
    segment after the second ``</script>`` tag, split on ``<br>`` (the final
    piece is discarded), and cleaned with ``ip_clear``.

    Returns:
        A list of cleaned entries. An empty list is returned when the page
        could not be fetched or does not have the expected structure.
    """
    html = get_url('http://www.89ip.cn/tqdl.html?api=1&num=9999&port=&address=&isp=', 'text')
    if html is None:
        # get_url yields None on failure and has already reported it.
        return []
    sections = html.split('</script>')
    if len(sections) < 3:
        # Fewer than two </script> tags: the layout is not what we expect.
        err('89ip 页面解析失败')
        return []
    entries = sections[2].split('<br>')[:-1]
    return ip_clear(entries)

def ip_test(proxies, timeout=10):
    """Check a proxy by asking Baidu which IP the request appears to come from.

    Args:
        proxies: Proxy mapping handed to ``requests.get`` as ``proxies``.
        timeout: Timeout passed to ``requests.get``. Without one, a dead or
            stalled proxy can block the call indefinitely.

    Returns:
        A tuple ``(ip, where)``: the first IPv4 address found in the result
        cell, and the text following it with all whitespace removed.

    Raises:
        Exceptions from ``requests.get`` are not caught. If the expected
        result block or an IPv4 address is missing from the page, the lookups
        below raise (``AttributeError`` / ``IndexError``).
    """
    url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
    response = requests.get(url, proxies=proxies, timeout=timeout)
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    detail = soup.find('div', {'class': 'c-span21 c-span-last op-ip-detail'})
    cell_text = detail.find('td').text
    ip = re.findall(r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}", cell_text)[0]
    # Everything after the address is the location description.
    where = cell_text.split(ip)[1]
    # Keep only the non-whitespace runs and glue them together.
    where = ''.join(re.findall(r"\S*\S", where))
    return ip, where