#coding:utf-8
import re, os, shutil, sys, traceback
import urllib2, socket, cookielib
from threading import Thread, stack_size, Lock
from Queue import LifoQueue, Queue
import time
from gzip import GzipFile
from StringIO import StringIO
from sets import Set
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    """Treat every HTTP redirect as an HTTPError so redirected pages are not followed."""
    def http_error_302(self, req, fp, code, msg, headers):
        raise urllib2.HTTPError(req, 302, "302 HTTPRedirect.", headers, fp)
    http_error_301 = http_error_303 = http_error_307 = http_error_302
class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip capabilities to urllib2 requests"""

    # add headers to requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # decode
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(fileobj=StringIO(resp.read()), mode="r")
            # addinfourl wraps the decoded stream so it still behaves like a response
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO(deflate(resp.read()))
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
# deflate support
import zlib
def deflate(data):
    # zlib only provides the zlib compress format, not the raw deflate format;
    # so on top of all there's this workaround:
    try:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)
class Fetcher:
    '''
    html Fetcher

    basic usage
    -----------
    from fetcher import Fetcher
    f = Fetcher()
    f.get(url)

    post
    ----
    req = urllib2.Request(...)
    f.post(req)

    multi-thread
    ------------
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    while f.taskleft():
        url, html = f.pop()
        deal_with(url, html)
    '''
    def __init__(self, timeout=15, threads=None, stacksize=32768*16, loginfunc=None, home=None, baseurl=None):
        #proxy_support = urllib2.ProxyHandler({'http':'http://localhost:3128'})
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        encoding_support = ContentEncodingProcessor()
        #self.opener = urllib2.build_opener(cookie_support,encoding_support,proxy_support,urllib2.HTTPHandler)
        self.opener = urllib2.build_opener(SmartRedirectHandler, cookie_support, encoding_support, urllib2.HTTPHandler)
        self.req = urllib2.Request('http://www.baidu.com')
        socket.setdefaulttimeout(timeout)
        #self.q_req = LifoQueue()
        #self.q_ans = LifoQueue()
        self.q_req = Queue()
        self.q_ans = Queue()
        self.s_url = Set()
        self.lock = Lock()
        self.running = 0
        self.home = home
        self.baseurl = baseurl
        self.threads = threads  # keep even when None so push() can fall back to get()
        if loginfunc:
            self.opener = loginfunc(self.opener)
        if threads:
            stack_size(stacksize)
            for i in range(threads):
                t = Thread(target=self.threadget)
                t.setDaemon(True)
                t.start()
    def __del__(self):
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        # unfinished work = queued requests + answers not yet popped + requests in flight
        return self.q_req.qsize() + self.q_ans.qsize() + self.running
    def push(self, req, repeat=3):
        if not self.threads:
            print 'no thread, return get instead'
            return self.get(req, repeat)
        if req not in self.s_url:
            self.q_req.put(req)
            self.s_url.add(req)
            open(self.home+'_log.txt', 'a').write(req+'\n')
    def pop(self):
        try:
            data = self.q_ans.get(block=True, timeout=15)
            self.q_ans.task_done()
        except:
            data = ['', '']
        return data
    def threadget(self):
        while True:
            req = self.q_req.get()
            with self.lock:
                self.running += 1
            ans = self.get(req)
            print 'got', req
            self.q_ans.put((req, ans))
            try:
                self.q_req.task_done()
            except:
                pass
            with self.lock:
                self.running -= 1
            time.sleep(0.1)  # don't spam
    def proxyisworking(self):
        try:
            self.opener.open('http://www.baidu.com').read(1024)
            return True
        except Exception, what:
            print what
            return False
    def get(self, req, repeat=3):
        '''
        HTTP GET req, retrying up to `repeat` times on failure.
        The page text is returned on success, '' on failure.
        '''
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception, what:
            print what, '[get err][try='+str(repeat)+']'+req
            if repeat > 0:
                time.sleep(0.1)  # don't spam
                return self.get(req, repeat-1)
            else:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                err = req + '\n'.join('!! ' + line for line in lines) + '\n'
                if err.find('HTTP Error 302') < 0:
                    open(self.home+'_err.txt', 'a').write(req+'\n')  # Log it or whatever here
                print 'GET Failed', req
                return ''
        return data
    def post(self, req, repeat=3):
        '''
        HTTP POST req, retrying up to `repeat` times on failure.
        The response text (or True if the body is empty) is returned on success;
        False is returned if req is not a urllib2.Request.
        '''
        if not isinstance(req, urllib2.Request):
            print 'post method needs a urllib2.Request as argument'
            return False
        else:
            r = self.get(req, repeat)
            if r:
                return r
            else:
                return True
class SiteCopyer:
    def __init__(self, url):
        self.baseurl = url
        self.home = self.baseurl.split('/')[2]
        print "baseurl: "+self.baseurl
        print "home: "+self.home
        self.f = Fetcher(threads=3, home=self.home, baseurl=self.baseurl)
        self.create_dir()
        self.adlist = []
        self.init_adlist()

    def init_adlist(self):
        # snippets listed here are stripped from saved pages by trim_ad();
        # line endings are normalised to \r\n so they match the fetched HTML
        adlist = []
        for ad in adlist:
            ad = ad.replace('\r\n', '\n')
            ad = ad.replace('\n', '\r\n')
            self.adlist.append(ad)
    def create_dir(self):
        try:
            shutil.rmtree(self.home)
        except Exception, what:
            print what
        try:
            os.mkdir(self.home)
        except Exception, what:
            print what
    def full_link(self, link, baseurl=None):
        '''Resolve a (possibly relative) link against baseurl, dropping any query string.'''
        if not baseurl:
            baseurl = self.baseurl
        if '?' in link:
            link = link.rsplit('?', 1)[0]
        if link.find('://') < 0:
            if link.startswith('/'):
                # root-relative: keep scheme://host from baseurl
                link = '/'.join(baseurl.split('/', 3)[:3]) + link
            elif link.startswith('../'):
                while link.startswith('../'):
                    baseurl = baseurl.rsplit('/', 2)[0]
                    link = link[3:]
                link = baseurl+'/'+link
            else:
                link = baseurl.rsplit('/', 1)[0]+'/'+link
        #print "full_link: "+link
        return link
    def link_alias(self, link, mkdirs=False):
        link = self.full_link(link)
        link = link.replace(self.baseurl, '/')
        if mkdirs:
            tdir = link.rsplit('/', 1)[0]
            try:
                os.makedirs(self.home+'/'+tdir)
            except:
                pass
        return link
    def strip_link(self, link):
        '''Strip surrounding quotes and trailing slashes; return '' for links to skip.'''
        if link and (link[0] in ['"', "'"]):
            link = link[1:]
        while link and (link[-1] in ['"', "'"]):
            link = link[:-1]
        while link.endswith('/'):
            link = link[:-1]
        if link == "#":
            return ''
        if link and (link[0] not in ["<", "'", '"']) and ('feed' not in link):
            return link
        return ''
    def trim_ad(self, page):
        for ad in self.adlist:
            page = page.replace(ad, '')
        return page
    def copy(self):
        page = self.f.get(self.baseurl)
        links = re.compile(r'<link[^>]*href=(.*?)[ >]', re.I).findall(page)
        links.extend(re.compile(r'<frame[^>]*src=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'<a[^>]*href=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'<script[^>]*src=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'<img[^>]*src=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'.location=(.*?)[;]', re.I).findall(page))
        links.append('index.htm')
        templinks = []
        for link in links:
            slink = self.strip_link(link)
            if slink:
                templinks.append(slink)
        links = templinks
        for link in links:
            print "links: "+link
            if not link.startswith('http') or link.find(self.baseurl) >= 0:
                self.f.push(self.full_link(link))
                #print "links_F: "+self.full_link(link)
        open(self.home+'/index.html', 'wb').write(page)
        while self.f.taskleft():
            url, page = self.f.pop()
            linkspush = []
            url = self.strip_link(url)
            if not url:
                # pop() timed out and returned an empty result; skip it
                continue
            pagename = url.rsplit('/', 1)[1]
            print "pagename: "+pagename
            if pagename == self.home:
                continue
            if pagename.find('.') < 0:
                url = url + '.html'
            if url.endswith('.css'):
                links = re.compile(r'url\([\'"]?(.*?)[\'"]?\)').findall(page)
                templinks = []
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in links:
                    newurl = self.full_link(link, url)
                    if not newurl.startswith('http') or newurl.find(self.baseurl) >= 0:
                        linkspush.append(newurl)
            elif url.endswith('.html') or url.endswith('.htm') or url.endswith('/'):
                templinks = []
                links = re.compile(r'<a[^>]*href=(.*?)[ >]', re.I).findall(page)
                for link in links:
                    slink = self.strip_link(link)
                    if slink and slink != "#":
                        if slink.endswith('.css') or slink.endswith('.html') \
                           or slink.endswith('.htm') or slink.endswith('.txt'):
                            templinks.append(slink)
                links = re.compile(r'<img[^>]*src=(.*?)[ >]', re.I).findall(page)
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in links:
                    newurl = self.full_link(link, url)
                    if not newurl.startswith('http') or newurl.find(self.baseurl) >= 0:
                        linkspush.append(newurl)
            for link in linkspush:
                if link.startswith(self.baseurl):
                    getfile = self.home+self.link_alias(link)
                    if not os.path.isfile(getfile):
                        self.f.push(link)
            print 'write to', self.home+self.link_alias(url)
            if len(page) > 0:
                open(self.home+'_ok.txt', 'a').write(url+'\n')
                try:
                    if url.endswith('.html') or url.endswith('.htm') or url.endswith('/'):
                        open(self.home+self.link_alias(url, True), 'wb').write(self.trim_ad(page))
                    else:
                        open(self.home+self.link_alias(url, True), 'wb').write(page)
                except Exception, what:
                    print what
if __name__ == "__main__":
    if len(sys.argv) == 2:
        url = sys.argv[1]
        SiteCopyer(url).copy()
    else:
        print "Usage: python "+sys.argv[0]+" url"