xinxin93666-crawler_baidu_post_bar
/
重写贴吧.py

from urllib import request
import sqlite3
import re
from tools import StrTools

class Tba(object):
    """Crawler for a single Baidu Tieba thread that stores each post in SQLite.

    The thread is fetched page by page, every post is extracted with regular
    expressions and written as one row of the ``tba`` table in ``tba.db``
    (created in the current working directory).

    Attributes are plain instance state:
        conn / cursor: the currently open SQLite connection and cursor, or
            the last ones used (they are closed after every statement).
        url: absolute URL of the thread.
        headers: HTTP headers sent with every request.
        html: source of the most recently downloaded page.
        title: thread title found by ``get_title_total``.
        total: number of pages found by ``get_title_total``.
    """

    # Patterns are compiled once at class creation instead of on every call.
    _TITLE_PATTERN = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
    _PAGE_PATTERN = re.compile(r'<span class="red">(.*?)</span>', re.S)
    # Groups: author name, badge title, badge level, post body, post tail.
    _POST_PATTERN = re.compile(r'<a.*?class="p_author_name.*?>(.*?)</a>.*?<div class="d_badge_title.*?>(.*?)</div>.*?class="d_badge_lv">(.*?)</div>.*?<cc>(.*?)</cc>.*?<div class="post-tail-wrap">(.*?)</div>', re.S)

    _CREATE_SQL = "CREATE TABLE IF NOT EXISTS tba(id INTEGER primary key, author CHAR ,title CHAR ,grade INTEGER , content TEXT, client CHAR, floor CHAR ,datetime CHAR )"
    # ``?`` placeholders let sqlite3 bind the values itself, so quotes or SQL
    # fragments inside scraped text can neither break nor alter the statement.
    _INSERT_SQL = (
        "INSERT INTO tba(author,title,grade,content,client,floor,datetime)"
        "VALUES (?,?,?,?,?,?,?)"
    )

    def __init__(self, t_id):
        """Create the ``tba`` table if needed and remember the thread URL.

        Args:
            t_id: path of the thread relative to the site root, including the
                leading slash (e.g. ``'/p/3333305665'``).
        """
        self.conn = None
        self.cursor = None
        self.create_sql()
        self.url = "https://tieba.baidu.com" + t_id
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0"}
        self.html = ''
        self.title = ''
        self.total = 0

    def get_html(self, url, max_retries=None):
        """Download ``url`` and store the decoded page source in ``self.html``.

        Every failure is printed and the request is tried again.

        Args:
            url: address of the page to download.
            max_retries: maximum number of retries after the first failed
                attempt. ``None`` (the default) retries forever, which is the
                historical behaviour of this method.

        Raises:
            Exception: the last error, once ``max_retries`` retries have also
                failed. Never raised when ``max_retries`` is ``None``.
        """
        failures = 0
        while True:
            try:
                # 1. Build the request object.
                req = request.Request(url=url, headers=self.headers)
                # 2. Send it; the ``with`` block closes the response even if
                #    reading or decoding fails.
                with request.urlopen(req, timeout=3) as response:
                    # 3. Read the body and decode it, dropping invalid bytes.
                    self.html = response.read().decode("utf-8", "ignore")
            except Exception as e:
                print(e)
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                continue
            else:
                break

    def get_title_total(self):
        """Extract the thread title and page count from ``self.html``.

        ``self.title`` and ``self.total`` are only updated when the
        corresponding pattern matches; otherwise they keep their old values.

        Raises:
            ValueError: if the matched page-count text is not an integer.
        """
        title_res = self._TITLE_PATTERN.search(self.html)
        if title_res:
            self.title = title_res.group(1)

        page_res = self._PAGE_PATTERN.search(self.html)
        if page_res:
            self.total = int(page_res.group(1))

    def parse_html(self):
        """Extract every post from ``self.html`` and insert it into the table.

        The raw author and content fragments are passed through
        ``StrTools.strip_symbol`` and the post tail through
        ``StrTools.process_come_from``, which is expected to return a
        ``(client, floor, datetime)`` triple.
        NOTE(review): both helpers live in the project's ``tools`` module;
        their exact behaviour is not visible here.
        """
        for raw_author, title, grade, raw_content, tail in self._POST_PATTERN.findall(self.html):
            author = StrTools.strip_symbol(raw_author)
            content = StrTools.strip_symbol(raw_content)
            come_from, floor, posted_at = StrTools.process_come_from(tail)
            self.insert_sql(author, title, grade, content, come_from, floor, posted_at)

    def connect_sql(self):
        """Open ``tba.db`` and keep the connection and a cursor on ``self``."""
        self.conn = sqlite3.connect("tba.db")
        self.cursor = self.conn.cursor()

    def close_sql(self):
        """Commit pending changes, then close the cursor and the connection."""
        self.conn.commit()
        self.cursor.close()
        self.conn.close()

    def create_sql(self):
        """Create the ``tba`` table if it does not exist yet."""
        self.connect_sql()
        try:
            self.cursor.execute(self._CREATE_SQL)
        finally:
            # Release the connection even when the statement fails.
            self.close_sql()

    def insert_sql(self, *args):
        """Insert one post into the ``tba`` table.

        Args:
            *args: exactly seven values, in column order: author, title,
                grade, content, client, floor, datetime.

        Raises:
            sqlite3.Error: if the number of values does not match the seven
                placeholders or the statement cannot be executed.
        """
        self.connect_sql()
        try:
            self.cursor.execute(self._INSERT_SQL, args)
        finally:
            # Release the connection even when the statement fails.
            self.close_sql()

    def run(self):
        """Crawl the whole thread: read title and page count, then every page."""
        self.get_html(self.url)
        self.get_title_total()
        print("正在爬取:{},共{}页，请稍后...".format(self.title, self.total))
        # Visit every page of the thread in order.
        for page in range(1, self.total + 1):
            # Build the address of this page.
            url = self.url + '?pn={}'.format(page)
            print("正在爬取第%s页" % page)
            self.get_html(url=url)
            self.parse_html()
            if page == self.total:
                print("程序结束后数据爬取完成")


if __name__ == "__main__":
    # Script entry point: crawl one hard-coded thread, identified by its
    # path relative to the site root.
    thread_path = '/p/3333305665'
    crawler = Tba(t_id=thread_path)
    crawler.run()