Martin-word-crawler_related
/
baidu_pic.py

import requests
import time
import re
import os

# Current Unix time in milliseconds
def __now():
    """Return the current Unix timestamp as an integer number of milliseconds."""
    millis = time.time() * 1000
    return int(millis)

# Replace characters that are not allowed in file names
def text(text):
    """Return *text* with every file-name-illegal character replaced by "_"."""
    # Character class covering: / \ : * ? " < > |
    illegal_chars = re.compile(r"[\/\\\:\*\?\"\<\>\|]")
    return illegal_chars.sub("_", text)

# Build the query parameters for an image search [keyword, zero-based page index]
def __params(name, page):
    """Build the query-string parameters for one page of image search results.

    Args:
        name: Search keyword; sent as both ``queryWord`` and ``word``.
        page: Zero-based page index; converted to the ``pn`` offset
            (``page * 30``).

    Returns:
        dict: Parameters intended to be passed as ``params=`` to
        ``requests.get``.
    """
    page_size = 30
    return {
        "tn": "resultjson_com",
        "ipn": "rj",
        "ct": "201326592",
        "fp": "result",
        "queryWord": name,
        "cl": 2,
        "lm": -1,
        "ie": "utf-8",
        "oe": "utf-8",
        "word": name,
        "nc": 1,
        "pn": page * page_size,
        "rn": page_size,
        # Timestamp used as a bare key (presumably for cache busting).
        # requests omits parameters whose value is None, so an empty string
        # is used to make sure the key really appears in the query string.
        __now(): "",
    }

# Search for images matching a keyword across a number of result pages
def pic_find(name, page):
    """Collect image entries for *name* from the first *page* result pages.

    Args:
        name: Search keyword.
        page: Number of result pages to request (page indices 0 .. page - 1).

    Returns:
        list[dict]: One ``{'name': ..., 'url': ...}`` dict per result, where
        ``name`` is the entry's ``fromPageTitleEnc`` and ``url`` is its
        ``thumbURL``. Either value is None when the entry lacks that key.
    """
    pic = []
    for page_index in range(page):
        response = requests.get(
            'https://image.baidu.com/search/acjson',
            params=__params(name, page_index),
        )
        response.encoding = 'utf-8'
        # 'data' can be absent or null in the payload; treat that as an
        # empty page instead of failing on None[:-1].
        entries = response.json().get('data') or []
        # The final element of each page is skipped, as the original code
        # did (presumably a trailing placeholder entry -- TODO confirm).
        for entry in entries[:-1]:
            pic.append({
                'name': entry.get('fromPageTitleEnc'),
                'url': entry.get('thumbURL'),
            })
    return pic

# Save the images into the given directory
def pic_save(pic, download):
    """Download every image in *pic* and write it into the *download* directory.

    Args:
        pic: Iterable of dicts with ``'name'`` and ``'url'`` keys (the shape
            produced by ``pic_find``).
        download: Destination directory path, with or without a trailing
            separator. An empty string means the current directory.

    Each file is named ``<sanitized name><n>.<ext>``, where ``ext`` is the
    last three characters of the URL and ``n`` is the smallest non-negative
    integer that does not collide with an existing file.
    """
    for entry in pic:
        url = entry["url"]
        if not url:
            # Entries without a URL cannot be downloaded; skip them.
            continue
        # The extension is guessed from the URL tail; sanitize it so that a
        # stray '/' or '?' cannot produce an invalid path.
        extension = text(url[-3:])
        response = requests.get(url, stream=True)
        # os.path.join copes with both "dir" and "dir/" (and with ""), which
        # replaces the identity comparison against the '/' literal.
        base = os.path.join(download, text(entry["name"] or ""))
        counter = 0
        while os.path.exists(base + str(counter) + '.' + extension):
            counter += 1
        target = base + str(counter) + '.' + extension
        with open(target, 'wb') as f:
            f.write(response.content)