home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2009 June / maximum-cd-2009-06.iso / DiscContents / digsby_setup.exe / lib / util / httptools.pyo (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2009-02-26  |  11.4 KB  |  375 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyo (Python 2.5)
  3.  
  4. from __future__ import with_statement
  5. from callbacks import callsback
  6. from threads import threaded
  7. from threads.timeout_thread import Timer
  8. from net import build_opener, build_cookie
  9. import re
  10. import StringIO
  11. import cookielib
  12. import urllib2
  13. import logging
  14. import lxml.etree as ET
  15. import lxml.html as HTML
  16. import operator
  17. from contextlib import closing
  18. itemgetter0 = operator.itemgetter(0)
  19. log = logging.getLogger('httptools')
  20.  
  21. class RequestOpener(object):
  22.     max_redirects = 5
  23.     retries = 3
  24.     pause_for_attempts = 1
  25.     js_redirect_res = ((re.compile('window\\.location\\.replace\\("(.*?)"\\);'), 1),)
  26.     
  27.     def __init__(self, opener, request, data = None, **kwds):
  28.         self.openfunc = getattr(opener, 'open', opener)
  29.         retries = kwds.pop('retries', None)
  30.         if retries is not None:
  31.             self.retries = retries
  32.         
  33.         if isinstance(request, basestring):
  34.             request = urllib2.Request.make_request(request, data, **kwds)
  35.         
  36.         self.request = request
  37.         self._sub_requester = None
  38.         self.callback = None
  39.  
  40.     
  41.     def open(self, callback = None):
  42.         if self.callback is not None:
  43.             raise Exception('Request already in progress')
  44.         
  45.         self.callback = callback
  46.         self._attempt_open()
  47.  
  48.     open = callsback(open)
  49.     
  50.     def _attempt_open(self):
  51.         self.openfunc(self.request, success = self._check_success, error = self._check_error)
  52.  
  53.     
  54.     def preprocess_response(self, resp):
  55.         closing(resp).__enter__()
  56.         
  57.         try:
  58.             data = resp.read()
  59.         finally:
  60.             pass
  61.  
  62.         sio = StringIO.StringIO(data)
  63.         for attr in ('read', 'seek', 'close', 'tell'):
  64.             setattr(resp, attr, getattr(sio, attr))
  65.         
  66.         resp._stringio = sio
  67.         resp.content = data
  68.         return resp
  69.  
  70.     
  71.     def _check_success(self, resp):
  72.         
  73.         try:
  74.             resp = self.preprocess_response(resp)
  75.         except Exception:
  76.             e = None
  77.             self._on_error(e)
  78.             return None
  79.  
  80.         redir = self.can_redirect(resp)
  81.         if redir:
  82.             return self.redirect(redir)
  83.         else:
  84.             error = self.check_resp_for_errors(resp)
  85.             if error is None:
  86.                 self.finish('success', resp)
  87.             else:
  88.                 self._on_error(error)
  89.  
  90.     
  91.     def _redirect_success(self, resp):
  92.         self._sub_requester = None
  93.         self.finish('success', resp)
  94.  
  95.     
  96.     def _redirect_error(self, err = None):
  97.         self._sub_requester = None
  98.         self._on_error(err)
  99.  
  100.     
  101.     def can_redirect(self, resp):
  102.         if getattr(self, '_redirect_count', 0) > self.max_redirects:
  103.             return False
  104.         
  105.         if self._sub_requester is not None:
  106.             return False
  107.         
  108.         return self.make_redirect_request(resp)
  109.  
  110.     
  111.     def redirect(self, redirect):
  112.         new = self._sub_requester = type(self)(self.openfunc, redirect)
  113.         setattr(new, '_redirect_count', getattr(self, '_redirect_count', 0) + 1)
  114.         new.open(success = self._redirect_success, error = self._redirect_error)
  115.  
  116.     
  117.     def make_redirect_request(self, resp):
  118.         for redirecter in (self._find_http_redirect, self._find_js_redirect):
  119.             redirect = redirecter(resp)
  120.             if redirect is not None:
  121.                 if not redirect.startswith('http'):
  122.                     if not redirect.startswith('/'):
  123.                         redirect = '/' + redirect
  124.                     
  125.                     redirect = self.request.get_type() + '://' + self.request.get_host() + redirect
  126.                 
  127.                 log.debug('got redirect: %r', redirect)
  128.                 return redirect
  129.                 continue
  130.         
  131.  
  132.     
  133.     def _find_http_redirect(self, resp):
  134.         if resp.code in (301, 302):
  135.             return resp.headers.get('Location', None)
  136.         
  137.  
  138.     
  139.     def _find_js_redirect(self, resp):
  140.         for redirect_re, url_group_id in self.js_redirect_res:
  141.             match = redirect_re.search(resp.content)
  142.             if match:
  143.                 new_url = match.group(url_group_id)
  144.                 if new_url:
  145.                     return new_url
  146.                 
  147.             new_url
  148.         
  149.  
  150.     
  151.     def check_resp_for_errors(self, resp):
  152.         pass
  153.  
  154.     
  155.     def _check_error(self, err = None):
  156.         self._on_error(err)
  157.  
  158.     
  159.     def _on_error(self, e = None):
  160.         self.retries -= 1
  161.         if self.retries:
  162.             if self.pause_for_attempts > 0:
  163.                 Timer(self.pause_for_attempts, self._attempt_open).start()
  164.             else:
  165.                 self._attempt_open()
  166.         else:
  167.             self.finish('error', e)
  168.  
  169.     
  170.     def finish(self, result, *args):
  171.         cb = self.callback
  172.         self.callback = None
  173.         self._sub_request = None
  174.         self.request = None
  175.         self.openfunc = None
  176.         getattr(cb, result, (lambda : pass))(*args)
  177.  
  178.  
  179.  
  180. def dispatcher(what, arg_getter):
  181.     
  182.     def dispatch(self, *args):
  183.         name = arg_getter(args)
  184.         handler = getattr(self, '%s_%s' % (what, name), getattr(self, '%s_default' % what, None))
  185.         if handler is not None:
  186.             return handler(*args)
  187.         else:
  188.             log.error('No default handler for %r', what)
  189.  
  190.     return dispatch
  191.  
  192.  
  193. class WebScraper(object):
  194.     CookieJarFactory = cookielib.CookieJar
  195.     HttpOpenerFactory = staticmethod(build_opener)
  196.     RequestFactory = staticmethod(urllib2.Request.make_request)
  197.     domain = None
  198.     urls = { }
  199.     
  200.     def __init__(self):
  201.         self._waiting = set()
  202.         self._callbacks = { }
  203.         self.init_http()
  204.  
  205.     
  206.     def init_http(self):
  207.         self._jar = self.CookieJarFactory()
  208.         self.http = self.HttpOpenerFactory(urllib2.HTTPCookieProcessor(self._jar))
  209.  
  210.     
  211.     def get_cookie(self, key, default = sentinel, domain = None, path = '/'):
  212.         if domain is None:
  213.             domain = self.domain
  214.         
  215.         val = default
  216.         
  217.         try:
  218.             self._jar._cookies_lock.__enter__()
  219.             
  220.             try:
  221.                 val = self._jar._cookies[domain][path][key].value
  222.             finally:
  223.                 pass
  224.  
  225.         except (AttributeError, KeyError):
  226.             e = None
  227.             if val is sentinel:
  228.                 raise e
  229.             else:
  230.                 return val
  231.         except:
  232.             val is sentinel
  233.  
  234.         return val
  235.  
  236.     
  237.     def set_cookie(self, key, value, domain = None, path = '/'):
  238.         if domain is None:
  239.             domain = self.domain
  240.         
  241.         self._jar._cookies_lock.__enter__()
  242.         
  243.         try:
  244.             domain_dict = self._jar._cookies.setdefault(domain, { })
  245.             path_dict = domain_dict.setdefault(path, { })
  246.             cookie = path_dict.get(key, None)
  247.             if cookie is None:
  248.                 cookie = build_cookie(key, value, domain = domain, path = path)
  249.                 path_dict[key] = cookie
  250.             else:
  251.                 cookie.value = value
  252.         finally:
  253.             pass
  254.  
  255.  
  256.     
  257.     def set_waiting(self, *things):
  258.         self._waiting.update(things)
  259.  
  260.     
  261.     def clear_waiting(self, *things):
  262.         self._waiting -= set(things)
  263.         if not self._waiting:
  264.             self.done_waiting()
  265.         
  266.  
  267.     
  268.     def done_waiting(self):
  269.         pass
  270.  
  271.     
  272.     def request(self, name, callback = None):
  273.         if name in self._waiting:
  274.             log.warning('already waiting for %r', name)
  275.             return None
  276.         
  277.         self._callbacks[name] = callback
  278.         req = self.build_request(name)
  279.         self.perform_request(name, req)
  280.  
  281.     request = callsback(request)
  282.     
  283.     def perform_request(self, name, req):
  284.         self.set_waiting(name)
  285.         if req is None:
  286.             return self.error_handler(name)(Exception('No request created for %r' % name))
  287.         
  288.         reqopen = RequestOpener(threaded(self.http.open), req)
  289.         reqopen.open(success = self.success_handler(name), error = self.error_handler(name))
  290.  
  291.     
  292.     def error_handler(self, name):
  293.         
  294.         def handler(e = (None, None)):
  295.             self.clear_waiting(name)
  296.             cb = self._callbacks.pop(name, None)
  297.             retval = self.handle_error(name, e)
  298.             if cb is not None:
  299.                 cb.error(e)
  300.             
  301.             return retval
  302.  
  303.         return handler
  304.  
  305.     
  306.     def success_handler(self, name):
  307.         
  308.         def handler(resp):
  309.             self.clear_waiting(name)
  310.             
  311.             try:
  312.                 resp = self.preprocess_resp(name, resp)
  313.             except Exception:
  314.                 exc = None
  315.                 self.handle_error(name, exc)
  316.                 return None
  317.  
  318.             
  319.             try:
  320.                 newresp = self.handle_success(name, resp)
  321.             except Exception:
  322.                 exc = None
  323.                 self.handle_error(name, exc)
  324.                 return None
  325.  
  326.             if newresp is not None:
  327.                 resp = newresp
  328.             
  329.             cb = self._callbacks.pop(name, None)
  330.             if cb is not None:
  331.                 cb.success(resp)
  332.             
  333.             return newresp
  334.  
  335.         return handler
  336.  
  337.     build_request = dispatcher('build_request', itemgetter0)
  338.     handle_error = dispatcher('handle_error', itemgetter0)
  339.     preprocess_resp = dispatcher('preprocess_resp', itemgetter0)
  340.     handle_success = dispatcher('handle_success', itemgetter0)
  341.     
  342.     def build_request_default(self, name):
  343.         link = self.urls[name]
  344.         if callable(link):
  345.             link = link()
  346.         
  347.         return self.RequestFactory(link)
  348.  
  349.     
  350.     def handle_error_default(self, name, e):
  351.         log.error('Error requesting %r: %r', name, e)
  352.  
  353.     
  354.     def handle_success_default(self, name, resp):
  355.         if resp.document is not None:
  356.             print HTML.tostring(resp.document, pretty_print = True)
  357.         else:
  358.             print 'Got None for lxml doc. code/status= %r' % ((resp.code, resp.msg, str(resp.headers)),)
  359.  
  360.     
  361.     def preprocess_resp_default(self, name, resp):
  362.         data = resp.content
  363.         if data:
  364.             document = HTML.fromstring(data, base_url = resp.geturl())
  365.             document.make_links_absolute()
  366.             resp.document = document
  367.         else:
  368.             resp.document = None
  369.         return resp
  370.  
  371.  
# Script entry point: currently a no-op, so running this module directly does
# nothing beyond executing the definitions in this file.
if __name__ == '__main__':
    pass
  374.  
  375.