# wpxux-study/getimg.py

#_*_coding:utf-8_*_
# Date:.2019/1/7

import requests
import re
from bs4 import BeautifulSoup


# 获取参评学校的相关信息
def get_img(url, timeout=10):
    """Print code, name, programme count and detail URL of each college at *url*.

    The page at *url* is downloaded, its first ``<tbody>`` is located and
    every ``<td>`` inside it is inspected.  A cell is reported when its text
    starts with digits (used as the college code), contains a parenthesised
    number (printed as found, parentheses included) and holds an ``<a>`` tag
    (its text is printed as the college name).  Cells missing any of the
    three are skipped instead of aborting the whole run.

    Args:
        url: Address of the index page; it is also the base onto which
            ``college/<code>.html`` is appended for each college.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.  All results are written to standard output.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The page contains no ``<tbody>`` element.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    # Decode the body as UTF-8 rather than the charset requests guessed.
    res.encoding = "utf-8"
    soup1 = BeautifulSoup(res.text, "html.parser")
    tag = soup1.find(name="tbody")
    if tag is None:
        raise ValueError("no <tbody> element found at {}".format(url))
    all_td = tag.find_all(name="td")
    print(all_td)

    # Build links with str.format on a normalised base: using the caller's
    # URL as a %-template would break on any URL that contains "%".
    base = url if url.endswith("/") else url + "/"
    id_re = re.compile(r'\d+')
    # \d+ (not \d) so that counts of ten or more are matched as well.
    count_re = re.compile(r'\(\d+\)')

    for cell in all_td:
        cell_text = cell.text
        id_match = id_re.match(cell_text)
        count_match = count_re.search(cell_text)
        link = cell.find(name='a')
        if id_match is None or count_match is None or link is None:
            # Not a college cell (e.g. header or filler); nothing to report.
            continue
        college_id = id_match.group()
        col_count = count_match.group()
        # URL of the page listing the programmes this college entered.
        new_url = "{}college/{}.html".format(base, college_id)
        print(college_id, link.text, col_count, new_url)
        # Persisting the row was left disabled in the original script:
        # models.Img.objects.create(code=college_id, college=a.text, count=col_count, url=new_url)

# Base URL of the index page handed to get_img, which prints what it finds there.
url = "http://zypt.neusoft.edu.cn/hasdb/pubfiles/gongshi2016/"
# get_img has no return statement, so ret is always None.
ret = get_img(url)


# Collect all programmes that one college entered for evaluation.
ret1 = requests.get(
    'http://zypt.neusoft.edu.cn/hasdb/pubfiles/gongshi2016/college/10459.html',
    timeout=10,  # without a timeout a stalled server would hang the script
)
ret1.raise_for_status()
ret1.encoding = "utf-8"
soup = BeautifulSoup(ret1.text, "html.parser")
tab_img = soup.find(name="tbody")
# Every <a> tag inside the table body.
a_img = tab_img.find_all(name="a")
print(a_img)
# The anchors alternate between a wanted entry and an unwanted one, so only
# the text of the even-indexed anchors is kept.
ls = [anchor.text for anchor in a_img[::2]]

print(ls)


# Scrape the evaluation application page of a single programme.
url = 'http://zypt.neusoft.edu.cn/hasdb/pubfiles/gongshi2016/detail/10463/10463_080901.html'
res = requests.get(url, timeout=10)  # a stalled server must not hang the script
res.raise_for_status()
res.encoding = "utf-8"
soup2 = BeautifulSoup(res.text, "html.parser")
tab_img = soup2.find(name="tbody")
# All <tr> tags whose class attribute is "subtitle".
td_img = tab_img.find_all(name="tr", attrs={"class": "subtitle"})
# From each row's markup keep what lies between a ">" that is directly
# followed by a digit and a later "<"; the two delimiters are sliced off.
subtitle_re = re.compile(r">\d.+<")
content_list = []
for row in td_img:
    content = subtitle_re.search(str(row))
    if content:
        content_list.append(content.group()[1:-1])
# print(content_list)

# Scrape the detail items of the application.
a_tag = tab_img.find_all(name="a")
# Ordinal labels of the detail items: text of every element with class "subitem".
xh = tab_img.find_all(attrs={"class": "subitem"})
xh_list = [label.text for label in xh]
# Content of the detail items: text of the even-indexed anchors only.
detail_list = [anchor.text for anchor in a_tag[::2]]

# Join each ordinal with its content.  zip stops at the shorter list, so a
# page with fewer "subitem" elements than kept anchors no longer raises
# IndexError.
ret_list = [
    "{} {}".format(number, detail)
    for number, detail in zip(xh_list, detail_list)
]

print(ret_list)