#coding:utf-8
import re, os, shutil, sys, traceback
import urllib2, socket, cookielib
from threading import Thread, stack_size, Lock
from Queue import LifoQueue, Queue
import time
from gzip import GzipFile
from StringIO import StringIO
from sets import Set
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    """Treat every HTTP redirect as an HTTPError so redirected pages are not followed."""
    def http_error_302(self, req, fp, code, msg, headers):
        raise urllib2.HTTPError(req, 302, "302 HTTPRedirect.", headers, fp)
    http_error_301 = http_error_303 = http_error_307 = http_error_302
class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip capabilities to urllib2 requests"""

    # add headers to requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # decode
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(fileobj=StringIO(resp.read()), mode="r")
            # addinfourl wraps the decoded stream so it still behaves like a response
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO(deflate(resp.read()))
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
# deflate support
import zlib
def deflate(data):
    # zlib only provides the zlib compress format, not the raw deflate format;
    # so on top of all there's this workaround:
    try:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)
class Fetcher:
    '''
    html Fetcher

    basic usage
    -----------
    from fetcher import Fetcher
    f = Fetcher()
    f.get(url)

    post
    ----
    req = urllib2.Request(...)
    f.post(req)

    multi-thread
    ------------
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    while f.taskleft():
        url, html = f.pop()
        deal_with(url, html)
    '''
    def __init__(self, timeout=15, threads=None, stacksize=32768*16, loginfunc=None, home=None, baseurl=None):
        #proxy_support = urllib2.ProxyHandler({'http':'http://localhost:3128'})
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        encoding_support = ContentEncodingProcessor()
        #self.opener = urllib2.build_opener(cookie_support,encoding_support,proxy_support,urllib2.HTTPHandler)
        self.opener = urllib2.build_opener(SmartRedirectHandler, cookie_support, encoding_support, urllib2.HTTPHandler)
        self.req = urllib2.Request('http://www.baidu.com')
        socket.setdefaulttimeout(timeout)
        #self.q_req = LifoQueue()
        #self.q_ans = LifoQueue()
        self.q_req = Queue()
        self.q_ans = Queue()
        self.s_url = Set()
        self.lock = Lock()
        self.running = 0
        self.home = home
        self.baseurl = baseurl
        self.threads = threads  # keep even when None so push() can fall back to get()
        if loginfunc:
            self.opener = loginfunc(self.opener)
        if threads:
            stack_size(stacksize)
            for i in range(threads):
                t = Thread(target=self.threadget)
                t.setDaemon(True)
                t.start()
    def __del__(self):
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        # unfinished work = queued requests + answers not yet popped + requests in flight
        return self.q_req.qsize() + self.q_ans.qsize() + self.running
    def push(self, req, repeat=3):
        if not self.threads:
            print 'no thread, return get instead'
            return self.get(req, repeat)
        if req not in self.s_url:
            self.q_req.put(req)
            self.s_url.add(req)
            open(self.home+'_log.txt', 'a').write(req+'\n')
    def pop(self):
        try:
            data = self.q_ans.get(block=True, timeout=15)
            self.q_ans.task_done()
        except:
            data = ['', '']
        return data
    def threadget(self):
        while True:
            req = self.q_req.get()
            with self.lock:
                self.running += 1
            ans = self.get(req)
            print 'got', req
            self.q_ans.put((req, ans))
            try:
                self.q_req.task_done()
            except:
                pass
            with self.lock:
                self.running -= 1
            time.sleep(0.1)  # don't spam
    def proxyisworking(self):
        try:
            self.opener.open('http://www.baidu.com').read(1024)
            return True
        except Exception, what:
            print what
            return False
    def get(self, req, repeat=3):
        '''
        HTTP GET req, retrying up to `repeat` times on failure.
        The page text is returned on success, '' on failure.
        '''
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception, what:
            print what, '[get err][try='+str(repeat)+']'+req
            if repeat > 0:
                time.sleep(0.1)  # don't spam
                return self.get(req, repeat-1)
            else:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                err = req + '\n'.join('!! ' + line for line in lines) + '\n'
                if err.find('HTTP Error 302') < 0:
                    open(self.home+'_err.txt', 'a').write(req+'\n')  # Log it or whatever here
                print 'GET Failed', req
                return ''
        return data
    def post(self, req, repeat=3):
        '''
        HTTP POST req, retrying up to `repeat` times on failure.
        The response text (or True if the body is empty) is returned on success;
        False is returned if req is not a urllib2.Request.
        '''
        if not isinstance(req, urllib2.Request):
            print 'post method needs a urllib2.Request as argument'
            return False
        else:
            r = self.get(req, repeat)
            if r:
                return r
            else:
                return True
class SiteCopyer:
    def __init__(self, url):
        self.baseurl = url
        self.home = self.baseurl.split('/')[2]
        print "baseurl: "+self.baseurl
        print "home: "+self.home
        self.f = Fetcher(threads=3, home=self.home, baseurl=self.baseurl)
        self.create_dir()
        self.adlist = []
        self.init_adlist()

    def init_adlist(self):
        # snippets listed here are stripped from saved pages by trim_ad();
        # line endings are normalised to \r\n so they match the fetched HTML
        adlist = []
        for ad in adlist:
            ad = ad.replace('\r\n', '\n')
            ad = ad.replace('\n', '\r\n')
            self.adlist.append(ad)
    def create_dir(self):
        try:
            shutil.rmtree(self.home)
        except Exception, what:
            print what
        try:
            os.mkdir(self.home)
        except Exception, what:
            print what
    def full_link(self, link, baseurl=None):
        '''Resolve a (possibly relative) link against baseurl, dropping any query string.'''
        if not baseurl:
            baseurl = self.baseurl
        if '?' in link:
            link = link.rsplit('?', 1)[0]
        if link.find('://') < 0:
            if link.startswith('/'):
                # root-relative: keep scheme://host from baseurl
                link = '/'.join(baseurl.split('/', 3)[:3]) + link
            elif link.startswith('../'):
                while link.startswith('../'):
                    baseurl = baseurl.rsplit('/', 2)[0]
                    link = link[3:]
                link = baseurl+'/'+link
            else:
                link = baseurl.rsplit('/', 1)[0]+'/'+link
        #print "full_link: "+link
        return link
    def link_alias(self, link, mkdirs=False):
        link = self.full_link(link)
        link = link.replace(self.baseurl, '/')
        if mkdirs:
            tdir = link.rsplit('/', 1)[0]
            try:
                os.makedirs(self.home+'/'+tdir)
            except:
                pass
        return link
    def strip_link(self, link):
        '''Strip surrounding quotes and trailing slashes; return '' for links to skip.'''
        if link and (link[0] in ['"', "'"]):
            link = link[1:]
        while link and (link[-1] in ['"', "'"]):
            link = link[:-1]
        while link.endswith('/'):
            link = link[:-1]
        if link == "#":
            return ''
        if link and (link[0] not in ["<", "'", '"']) and ('feed' not in link):
            return link
        return ''
    def trim_ad(self, page):
        for ad in self.adlist:
            page = page.replace(ad, '')
        return page
    def copy(self):
        page = self.f.get(self.baseurl)
        links = re.compile(r'<link[^>]*href=(.*?)[ >]', re.I).findall(page)
        links.extend(re.compile(r'<frame[^>]*src=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'<a[^>]*href=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'<script[^>]*src=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'<img[^>]*src=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'.location=(.*?)[;]', re.I).findall(page))
        links.append('index.htm')
        templinks = []
        for link in links:
            slink = self.strip_link(link)
            if slink:
                templinks.append(slink)
        links = templinks
        for link in links:
            print "links: "+link
            if not link.startswith('http') or link.find(self.baseurl) >= 0:
                self.f.push(self.full_link(link))
                #print "links_F: "+self.full_link(link)
        open(self.home+'/index.html', 'wb').write(page)
        while self.f.taskleft():
            url, page = self.f.pop()
            linkspush = []
            url = self.strip_link(url)
            if not url:
                # pop() timed out and returned an empty result; skip it
                continue
            pagename = url.rsplit('/', 1)[1]
            print "pagename: "+pagename
            if pagename == self.home:
                continue
            if pagename.find('.') < 0:
                url = url + '.html'
            if url.endswith('.css'):
                links = re.compile(r'url\([\'"]?(.*?)[\'"]?\)').findall(page)
                templinks = []
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in links:
                    newurl = self.full_link(link, url)
                    if not newurl.startswith('http') or newurl.find(self.baseurl) >= 0:
                        linkspush.append(newurl)
            elif url.endswith('.html') or url.endswith('.htm') or url.endswith('/'):
                templinks = []
                links = re.compile(r'<a[^>]*href=(.*?)[ >]', re.I).findall(page)
                for link in links:
                    slink = self.strip_link(link)
                    if slink and slink != "#":
                        if slink.endswith('.css') or slink.endswith('.html') \
                           or slink.endswith('.htm') or slink.endswith('.txt'):
                            templinks.append(slink)
                links = re.compile(r'<img[^>]*src=(.*?)[ >]', re.I).findall(page)
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in links:
                    newurl = self.full_link(link, url)
                    if not newurl.startswith('http') or newurl.find(self.baseurl) >= 0:
                        linkspush.append(newurl)
            for link in linkspush:
                if link.startswith(self.baseurl):
                    getfile = self.home+self.link_alias(link)
                    if not os.path.isfile(getfile):
                        self.f.push(link)
            print 'write to', self.home+self.link_alias(url)
            if len(page) > 0:
                open(self.home+'_ok.txt', 'a').write(url+'\n')
                try:
                    if url.endswith('.html') or url.endswith('.htm') or url.endswith('/'):
                        open(self.home+self.link_alias(url, True), 'wb').write(self.trim_ad(page))
                    else:
                        open(self.home+self.link_alias(url, True), 'wb').write(page)
                except Exception, what:
                    print what
if __name__ == "__main__":
    if len(sys.argv) == 2:
        url = sys.argv[1]
        SiteCopyer(url).copy()
    else:
        print "Usage: python "+sys.argv[0]+" url"