1 В избранное 0 Ответвления 0

OSCHINA-MIRROR/skyjoy-vsplider

Присоединиться к Gitlife
Откройте для себя и примите участие в публичных проектах с открытым исходным кодом с участием более 10 миллионов разработчиков. Приватные репозитории также полностью бесплатны :)
Присоединиться бесплатно
Клонировать/Скачать
threading_demo.py 9.4 КБ
Копировать Редактировать Web IDE Исходные данные Просмотреть построчно История
skyJOY2015 Отправлено 20.04.2020 11:23 5e2f538
import requests
from lxml import etree
import threading # 导入threading模块
from queue import Queue #导入queue模块
import time #导入time模块
from utils.db_pymysql import Database
# HTTP headers sent with every page request (a desktop Chrome User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.88 Safari/537.36'
    ),
}

# Connection settings handed to utils.db_pymysql.Database.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a config file that is not committed.
conf = dict(
    host="localhost",
    port=3306,
    user="root",
    pw="105.46888",
    db="eacoophp_vsplider",
)

# Genre label -> numeric channel id; ids are assigned 1..11 in listing order.
ch = {
    label: code
    for code, label in enumerate(
        ("动作", "喜剧", "爱情", "科幻", "恐怖", "剧情",
         "战争", "记录", "音乐", "伦理", "福利"),
        start=1,
    )
}
# Crawl article detail pages
def _first(nodes):
    """Return the first XPath hit, or an empty string when there is none."""
    return nodes[0] if nodes else ""


def _quoted(value):
    """Wrap *value* in single quotes, unchanged, for splicing into SQL."""
    return "'" + str(value) + "'"


def _sql_text(value):
    """Strip embedded quote characters from *value*, then single-quote it."""
    return _quoted(str(value).replace("'", "").replace("\"", ""))


def _insert_play_rows(db, video_id, kind, entries, now):
    """Insert one eacoo_play row per "title$url" entry.

    *kind* is written to the "type" column (the caller uses 1 and 2 for the
    two play lists and 3 for the download list). "num" is the entry's index
    in *entries*.
    """
    for num, entry in enumerate(entries):
        parts = entry.split('$')
        if len(parts) < 2:
            # No '$' separator means there is no URL part to store; skip the
            # entry rather than fail on parts[1].
            continue
        db.insert("eacoo_play", {
            "type": str(kind),
            "video": str(video_id),
            "title": _quoted(parts[0]),
            "url": _quoted(parts[1]),
            "num": str(num),
            "status": str(1),
            "create_time": _quoted(now),
            "update_time": _quoted(now),
        })


def get_detail_html(detail_url_list, id):
    """Consume detail-page tasks from a queue and store each video.

    Every task is a string of the form "type|detail_url|channel_override",
    which is what get_detail_url puts on the queue. For each task the page is
    fetched, its fields are extracted with XPath and printed, a row is
    inserted into "eacoo_video", and one row per play/download link is
    inserted into "eacoo_play". Videos whose name is already present in
    "eacoo_video" are skipped.

    Args:
        detail_url_list: Queue of task strings. A ``None`` item is treated as
            a stop marker and makes the worker return.
        id: Worker number; only used in the progress message.

    NOTE(review): values are spliced into SQL fragments as pre-quoted strings
    because that is the format this code hands to Database.count/insert. The
    Database API is not visible here, so switching to parameterized queries
    is left as a follow-up.
    """
    info_xpath = '//div[contains(@class, "vodinfobox")]/ul/li[{}]/span/text()'
    while True:
        task = detail_url_list.get()  # blocks until an item is available
        if task is None:
            return
        url = task.split('|')

        try:
            data = requests.get(url[1], headers=headers).text
        except Exception:
            # The message announces a retry, but the failed task is not put
            # back on the queue: after the pause the loop takes the next one.
            print("爬取文章详情页异常:5s后重试")
            time.sleep(5)
            continue

        s = etree.HTML(data)
        name = s.xpath('//h2/text()')
        if not name:
            # The result is indexed like a list below, so "no title found"
            # shows up as an empty result, never as None.
            continue

        # Use the same quote-stripped name for the duplicate check that is
        # stored on insert; otherwise names containing quotes never match
        # (and would break the WHERE fragment).
        name_sql = _sql_text(name[0])
        db = Database(conf)
        if db.count("eacoo_video", " name=" + name_sql) > 0:
            print(str(name[0]), "存在,已跳过!")
            time.sleep(2)
            continue
        print(name[0])

        othername = s.xpath(info_xpath.format(1))
        if othername:
            print(othername[0])
        director = s.xpath(info_xpath.format(2))
        if director:
            print(director[0])
        actor = s.xpath(info_xpath.format(3))
        if actor:
            print(actor[0])
        channel = s.xpath(info_xpath.format(4))
        if channel:
            print(channel)
        area = s.xpath(info_xpath.format(5))
        if area:
            print(area[0])
        year = s.xpath(info_xpath.format(7))
        if year:
            print(year[0])
        timelong = s.xpath(info_xpath.format(8))
        if timelong:
            print(timelong[0])
        ads = s.xpath('//div[contains(@class, "vodh")]/span/text()')
        if ads:
            print(ads[0])
        star = s.xpath('//div[contains(@class, "vodh")]/label/text()')
        if star:
            print(star[0])
        introduce = s.xpath('//div[contains(@class, "vodplayinfo")]/text()')
        if introduce:
            print(introduce[0])
        tags = s.xpath('//div[contains(@class, "tags")]/span/text()')
        print(tags)
        conver = s.xpath('//div[contains(@class, "vodImg")]/img/@src')
        if conver:
            print(conver[0])
        play1 = s.xpath('//*[@id="play_1"]/ul/li/text()')
        print(play1)
        play2 = s.xpath('//*[@id="play_2"]/ul/li/text()')
        print(play2)
        down1 = s.xpath('//*[@id="down_1"]/ul/li/text()')
        print(down1)

        time.sleep(2)  # pause between detail-page requests
        print("thread {id}: get {url} detail finished".format(id=id, url=url[1]))

        # Build a ",a,b," list of channel ids from the page's genre labels.
        chs = ","
        for raw in channel:
            cx = str(raw).replace("片", "").replace("\xa0", "")
            if cx in ch:
                marker = str(ch[cx]) + ","
                # Match on ",<id>," so that id 1 is not mistaken for being
                # present when only 11 is in the list.
                if ("," + marker) not in chs:
                    chs += marker
        print(chs)
        # A non-empty third task field overrides the computed channel list.
        if len(url) > 2 and url[2]:
            chs = url[2]

        now = time.strftime("%Y-%m-%d %H:%M:%S")
        row = {
            "type": str(url[0]),
            "channel": _quoted(chs),
            "name": name_sql,
            "othername": _sql_text(_first(othername)),
            "director": _sql_text(_first(director)),
            "actor": _sql_text(_first(actor)),
            "area": _quoted(_first(area)),
            "year": _quoted(_first(year)),
            "timelong": _quoted(_first(timelong)),
            "ads": _sql_text(_first(ads)),
            "star": _quoted(_first(star)),
            "introduce": _sql_text(_first(introduce)),
            "tags": _sql_text(','.join(tags)),
            "conver": _quoted(_first(conver)),
            "status": str(1),
            "recom": str(0),
            "create_time": _quoted(now),
            "update_time": _quoted(now),
        }
        # Keep the insert result in its own name so the worker number in
        # `id` is not overwritten for later iterations.
        video_id = db.insert("eacoo_video", row)
        print("{id}{n}入库success!".format(id=video_id, n=name[0]))
        if video_id > 0:
            for kind, entries in ((1, play1), (2, play2), (3, down1)):
                _insert_play_rows(db, video_id, kind, entries, now)
# Crawl article list pages
def get_detail_url(queue, url):
    """Walk list pages and enqueue one task per detail link found.

    Each entry of *url* is a spec "list_url_template|type|first-last|channel":
    the ``*`` in the template is replaced by the page number, and pages
    ``first`` through ``last`` (inclusive) are fetched. For every link found
    a task string "type|http://www.zuidazy5.com/<href>|channel" is put on
    *queue*. The total number of queued links is printed at the end.

    Args:
        queue: Queue that receives the task strings.
        url: Iterable of spec strings in the format described above.
    """
    count = 0
    for spec in url:
        listurl = spec.split('|')
        bounds = listurl[2].split('-')
        first_page = int(bounds[0])
        last_page = int(bounds[1])
        # Honour the configured start page; previously the parsed start was
        # discarded and crawling always began at page 1.
        for page in range(first_page, last_page + 1):
            try:
                data = requests.get(listurl[0].replace("*", str(page)), headers=headers).text
            except Exception:
                # The message announces a retry, but the page is not fetched
                # again: after the pause the loop moves on to the next page.
                print("爬取文章列表页异常:5s后重试")
                time.sleep(5)
                continue
            s = etree.HTML(data)
            href = s.xpath('//div[contains(@class, "xing_vb")]/ul/li/span[2]/a/@href')
            time.sleep(1)  # pause between list-page requests
            for x in href:
                count += 1
                # put() blocks if the queue is full, which throttles this
                # producer to the pace of the consumers.
                queue.put("{t}|http://www.zuidazy5.com/{u}|{c}".format(u=x, t=listurl[1], c=listurl[3]))
                print("get detail url {u} 入队列就绪!".format(u=x))
    print("count:", str(count))
# Script entry point
if __name__ == "__main__":
    # List-page specs: "url_template|type|first-last|channel".
    # Other specs that have been used with this script:
    #   'http://www.zuidazy5.com/?m=vod-type-id-1-pg-*.html|1|40-50|'
    #   'http://www.zuidazy5.com/?m=vod-type-id-2-pg-*.html|2|6-30|'
    #   'http://www.zuidazy5.com/?m=vod-type-id-3-pg-*.html|3|1-19|'
    #   'http://www.zuidazy5.com/?m=vod-type-id-4-pg-*.html|4|1-20|'
    #   'http://www.zuidazy5.com/?m=vod-type-id-16-pg-*.html|1|1-20|,11,'
    #   'http://www.zuidazy5.com/?m=vod-type-id-18-pg-*.html|1|1-7|,9,'
    default_url = [
        'http://www.zuidazy5.com/?m=vod-type-id-17-pg-*.html|1|1-20|,10,',
    ]

    # FIFO queue shared by all threads, capped at 100000 pending tasks.
    detail_url_queue = Queue(maxsize=100000)

    # One producer collects detail URLs from the list pages ...
    thread = threading.Thread(
        target=get_detail_url,
        args=(detail_url_queue, default_url),
    )
    # ... and three consumers fetch and store the detail pages.
    html_thread = [
        threading.Thread(target=get_detail_html, args=(detail_url_queue, worker_no))
        for worker_no in range(3)
    ]

    start_time = time.time()

    thread.start()
    for worker in html_thread:
        worker.start()

    # Wait for every thread to finish before reporting the elapsed time.
    # NOTE(review): the consumers block on queue.get() and nothing in this
    # block tells them to stop, so these joins may never return - confirm.
    thread.join()
    for worker in html_thread:
        worker.join()

    print("last time: {} s".format(time.time() - start_time))

Опубликовать ( 0 )

Вы можете оставить комментарий после Вход в систему

1
https://api.gitlife.ru/oschina-mirror/skyjoy-vsplider.git
git@api.gitlife.ru:oschina-mirror/skyjoy-vsplider.git
oschina-mirror
skyjoy-vsplider
skyjoy-vsplider
master