# skyjoy-vsplider / threading_demo.py

import requests
from lxml import etree
import threading # 导入threading模块
from queue import Queue #导入queue模块
import time  #导入time模块
from utils.db_pymysql import Database

# Browser-like User-Agent passed as `headers=` on the HTTP requests in this file.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.88 Safari/537.36'
    ),
}

# Connection settings handed to Database(conf).
# NOTE(review): the password is hard-coded in source; consider loading it from
# the environment or an untracked config file.
conf = dict(
    host="localhost",
    port=3306,
    user="root",
    pw="105.46888",
    db="eacoophp_vsplider",
)

# Genre label (as scraped, with the "片" suffix removed) -> numeric channel id.
# presumably these ids match rows in the site's channel table; verify.
ch = {
    "动作": 1,
    "喜剧": 2,
    "爱情": 3,
    "科幻": 4,
    "恐怖": 5,
    "剧情": 6,
    "战争": 7,
    "记录": 8,
    "音乐": 9,
    "伦理": 10,
    "福利": 11,
}

# Crawl video detail pages and store them in the database.
def _detail_sql_text(value):
    """Return *value* as a single-quoted SQL literal with quote characters removed.

    The statements are assembled as plain strings, so a stray quote in
    scraped text would break (or alter) the query.
    NOTE(review): dropping quotes is the file's existing strategy; a
    parameterized query would be the proper fix, but the Database helper's
    API is not visible from this file.
    """
    return "'" + str(value).replace("'", "").replace("\"", "") + "'"


def _detail_first(hits):
    """Return the first XPath hit, or ``""`` when the list is empty."""
    return hits[0] if hits else ""


def _detail_fetch(page_url, attempts=3, timeout=30):
    """Download *page_url* and return the response text.

    Retries up to *attempts* times with a 5 s pause between tries and
    returns ``None`` when every attempt failed.
    """
    for attempt in range(1, attempts + 1):
        try:
            return requests.get(page_url, headers=headers, timeout=timeout).text
        except requests.RequestException:
            if attempt == attempts:
                break
            print("爬取文章详情页异常：5s后重试")
            time.sleep(5)
    print("爬取文章详情页异常：已放弃 {u}".format(u=page_url))
    return None


def _detail_channel_ids(labels):
    """Translate scraped genre labels into a ``",1,2,"`` style id string via ``ch``."""
    ids = ","
    for label in labels:
        key = str(label).replace("片", "").replace("\xa0", "")
        if key in ch:
            token = str(ch[key]) + ","
            # Compare against ",<id>," so that id 1 is not mistaken for the
            # tail of an already present ",11,".
            if ("," + token) not in ids:
                ids += token
            print(ids)
    return ids


def _detail_store_links(db, video_id, entries, link_type, now):
    """Insert one ``eacoo_play`` row per ``title$url`` entry for *video_id*."""
    for num, entry in enumerate(entries):
        parts = entry.split('$')
        if len(parts) < 2:
            # No "$" separator, so there is no URL to store.
            continue
        db.insert("eacoo_play", {
            "type": str(link_type),
            "video": str(video_id),
            "title": _detail_sql_text(parts[0]),
            "url": _detail_sql_text(parts[1]),
            "num": str(num),
            "status": str(1),
            "create_time": "'" + now + "'",
            "update_time": "'" + now + "'"})


def get_detail_html(detail_url_list, id):
    """Consume detail-page jobs from a queue, scrape them and store the result.

    Each queue item is a ``"type|url|channel"`` string. The page at ``url`` is
    downloaded and parsed; if no ``eacoo_video`` row with the same name exists,
    one is inserted, followed by ``eacoo_play`` rows for the two play lists
    (types 1 and 2) and the download list (type 3).

    The loop has no exit condition of its own: it blocks on
    ``detail_url_list.get()`` forever.

    Args:
        detail_url_list: queue yielding ``"type|url|channel"`` strings; a
            non-empty ``channel`` overrides the channel ids derived from the
            page.
        id: worker number, used only in the progress message.
    """
    info_xpath = '//div[contains(@class, "vodinfobox")]/ul/li[%d]/span/text()'
    while True:
        url = detail_url_list.get().split('|')  # blocks until a job is available
        if len(url) < 3:
            print("skip malformed queue item:", url)
            continue
        html = _detail_fetch(url[1])
        if html is None:
            continue
        s = etree.HTML(html)
        if s is None:
            # Nothing parseable came back (e.g. an empty body).
            continue
        name = s.xpath('//h2/text()')
        if not name:
            # xpath() returns a list, so "no title" is an empty list, not None.
            continue
        # NOTE(review): a new Database object is created per page and never
        # explicitly released here; confirm the helper cleans up after itself.
        db = Database(conf)
        # Look up the same quote-stripped name that the insert below stores.
        if db.count("eacoo_video", " name=" + _detail_sql_text(name[0])) > 0:
            print(str(name[0]), "存在，已跳过！")
            time.sleep(2)
            continue
        print(name[0])

        othername = s.xpath(info_xpath % 1)
        director = s.xpath(info_xpath % 2)
        actor = s.xpath(info_xpath % 3)
        channel = s.xpath(info_xpath % 4)
        area = s.xpath(info_xpath % 5)
        year = s.xpath(info_xpath % 7)
        timelong = s.xpath(info_xpath % 8)
        ads = s.xpath('//div[contains(@class, "vodh")]/span/text()')
        star = s.xpath('//div[contains(@class, "vodh")]/label/text()')
        introduce = s.xpath('//div[contains(@class, "vodplayinfo")]/text()')
        tags = s.xpath('//div[contains(@class, "tags")]/span/text()')
        conver = s.xpath('//div[contains(@class, "vodImg")]/img/@src')
        play1 = s.xpath('//*[@id="play_1"]/ul/li/text()')
        play2 = s.xpath('//*[@id="play_2"]/ul/li/text()')
        down1 = s.xpath('//*[@id="down_1"]/ul/li/text()')

        # Progress output, in the same order as the fields were scraped.
        for hits in (othername, director, actor):
            if hits:
                print(hits[0])
        if channel:
            print(channel)
        for hits in (area, year, timelong, ads, star, introduce):
            if hits:
                print(hits[0])
        print(tags)
        if conver:
            print(conver[0])
        print(play1)
        print(play2)
        print(down1)

        time.sleep(2)  # throttle between detail pages
        print("thread {id}: get {url} detail finished".format(id=id, url=url[1]))

        chs = _detail_channel_ids(channel)
        if url[2]:
            # An explicit channel list on the job wins over the scraped one.
            chs = url[2]
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
        row = {"type": str(url[0]),
               "channel": "'" + str(chs) + "'",
               "name": _detail_sql_text(_detail_first(name)),
               "othername": _detail_sql_text(_detail_first(othername)),
               "director": _detail_sql_text(_detail_first(director)),
               "actor": _detail_sql_text(_detail_first(actor)),
               "area": _detail_sql_text(_detail_first(area)),
               "year": _detail_sql_text(_detail_first(year)),
               "timelong": _detail_sql_text(_detail_first(timelong)),
               "ads": _detail_sql_text(_detail_first(ads)),
               "star": _detail_sql_text(_detail_first(star)),
               "introduce": _detail_sql_text(_detail_first(introduce)),
               "tags": _detail_sql_text(','.join(tags)),
               "conver": _detail_sql_text(_detail_first(conver)),
               "status": str(1),
               "recom": str(0),
               "create_time": "'" + now + "'",
               "update_time": "'" + now + "'"}
        # Kept in its own variable so the worker number in `id` is not
        # overwritten for the next iteration's progress message.
        video_id = db.insert("eacoo_video", row)
        print("{id}{n}入库success!".format(id=video_id, n=name[0]))
        # presumably insert() returns the new row id; verify against the helper.
        if video_id and video_id > 0:
            _detail_store_links(db, video_id, play1, 1, now)
            _detail_store_links(db, video_id, play2, 2, now)
            _detail_store_links(db, video_id, down1, 3, now)

# Crawl the listing pages and enqueue every detail-page URL found.
def _list_fetch(page_url, attempts=3, timeout=30):
    """Download *page_url* and return the response text.

    Retries up to *attempts* times with a 5 s pause between tries and
    returns ``None`` when every attempt failed.
    """
    for attempt in range(1, attempts + 1):
        try:
            return requests.get(page_url, headers=headers, timeout=timeout).text
        except requests.RequestException:
            if attempt == attempts:
                break
            print("爬取文章列表页异常：5s后重试")
            time.sleep(5)
    print("爬取文章列表页异常：已放弃 {u}".format(u=page_url))
    return None


def get_detail_url(queue, url):
    """Walk the configured listing pages and queue one job per detail link.

    Args:
        queue: queue that receives ``"type|http://www.zuidazy5.com/<href>|channel"``
            strings, one per link found.
        url: list of specs shaped ``"<list url with *>|<type>|<first>-<last>|<channel>"``.
            ``*`` in the list URL is replaced by the page number.

    The page range is treated as inclusive ``first``..``last``.
    NOTE(review): previously the first number was parsed but then overwritten
    by the loop variable, so crawling always started at page 1; confirm that
    "first-last" is the intended meaning of the range field.
    """
    count = 0
    for spec in url:
        listurl = spec.split('|')
        p2 = listurl[2].split('-')
        first_page = int(p2[0])
        last_page = int(p2[1])
        channel = listurl[3] if len(listurl) > 3 else ""
        for page in range(first_page, last_page + 1):
            data = _list_fetch(listurl[0].replace("*", str(page)))
            if data is None:
                continue
            s = etree.HTML(data)
            if s is None:
                # Nothing parseable came back (e.g. an empty body).
                continue
            href = s.xpath('//div[contains(@class, "xing_vb")]/ul/li/span[2]/a/@href')
            time.sleep(1)  # throttle between listing pages
            for x in href:
                count += 1
                # FIFO queue: links are handed out in the order they are found.
                queue.put("{t}|http://www.zuidazy5.com/{u}|{c}".format(u=x, t=listurl[1], c=channel))
                print("get detail url {u} 入队列就绪！".format(u=x))
    print("count:", str(count))

# Entry point.
class StoppableQueue(Queue):
    """FIFO queue that ends the consuming thread when it hands out ``STOP``.

    ``get()`` raises ``SystemExit`` instead of returning the ``STOP`` marker.
    Inside a ``threading.Thread`` an uncaught ``SystemExit`` ends that thread
    quietly, so consumers that loop on ``get()`` forever can still be shut
    down by queueing one ``STOP`` per consumer after the last real item.
    """

    STOP = object()

    def get(self, block=True, timeout=None):
        item = super().get(block, timeout)
        if item is self.STOP:
            raise SystemExit
        return item


if __name__ == "__main__":
    # Listing specs: "<list url with *>|<type>|<first>-<last>|<channel override>".
    default_url = [
        # 'http://www.zuidazy5.com/?m=vod-type-id-1-pg-*.html|1|40-50|',
        # 'http://www.zuidazy5.com/?m=vod-type-id-2-pg-*.html|2|6-30|',
        # 'http://www.zuidazy5.com/?m=vod-type-id-3-pg-*.html|3|1-19|',
        # 'http://www.zuidazy5.com/?m=vod-type-id-4-pg-*.html|4|1-20|',
        # 'http://www.zuidazy5.com/?m=vod-type-id-16-pg-*.html|1|1-20|,11,',
        'http://www.zuidazy5.com/?m=vod-type-id-17-pg-*.html|1|1-20|,10,',
        # 'http://www.zuidazy5.com/?m=vod-type-id-18-pg-*.html|1|1-7|,9,',
    ]
    worker_count = 3
    # Thread-safe FIFO queue holding at most 100000 pending detail jobs.
    detail_url_queue = StoppableQueue(maxsize=100000)

    # One producer collects detail URLs from the listing pages ...
    producer = threading.Thread(target=get_detail_url, args=(detail_url_queue, default_url))
    # ... and `worker_count` consumers scrape the detail pages.
    workers = [
        threading.Thread(target=get_detail_html, args=(detail_url_queue, number))
        for number in range(worker_count)
    ]

    start_time = time.time()
    producer.start()
    for worker in workers:
        worker.start()

    # Once the producer is done, queue one stop marker per worker. The queue is
    # FIFO, so every real job is handed out before the markers are reached.
    producer.join()
    for _ in workers:
        detail_url_queue.put(StoppableQueue.STOP)
    for worker in workers:
        worker.join()

    # All threads have finished: report the total crawl time.
    print("last time: {} s".format(time.time()-start_time))