home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2012 January / maximum-cd-2012-01.iso / DiscContents / digsby_setup.exe / lib / util / httptools.pyo (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2011-10-05  |  14.2 KB  |  458 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyo (Python 2.6)
  3.  
  4. from __future__ import with_statement
  5. from callbacks import callsback
  6. from threads import threaded
  7. from threads.timeout_thread import Timer
  8. from net import build_opener, build_cookie
  9. from common.asynchttp.cookiejartypes import CookieJarHTTPMaster
  10. import contextlib
  11. import re
  12. import traceback
  13. import StringIO
  14. import cookielib
  15. import urlparse
  16. import urllib2
  17. import logging
  18. import lxml.etree as ET
  19. import lxml.html as HTML
  20. import operator
  21. from contextlib import closing
# Picks the first positional argument; passed to dispatcher() below as the
# arg_getter that selects the per-request handler name.
itemgetter0 = operator.itemgetter(0)
# Module-wide logger.
log = logging.getLogger('httptools')
  24.  
  25. class RequestOpener(object):
  26.     max_redirects = 5
  27.     retries = 3
  28.     pause_for_attempts = 1
  29.     js_redirect_res = ((re.compile('window\\.location\\.replace\\("(.*?)"\\);'), 1),)
  30.     request_cls = urllib2.Request
  31.     
  32.     def __init__(self, opener, request, data = None, **kwds):
  33.         self.openfunc = getattr(opener, 'open', opener)
  34.         retries = kwds.pop('retries', None)
  35.         if retries is not None:
  36.             self.retries = retries
  37.         
  38.         if isinstance(request, basestring):
  39.             request = self.request_cls.make_request(request, data, **kwds)
  40.         
  41.         self.request = request
  42.         self._sub_requester = None
  43.         self.callback = None
  44.  
  45.     
  46.     def open(self, callback = None):
  47.         if self.callback is not None:
  48.             raise Exception('Request already in progress')
  49.         self.callback is not None
  50.         self.callback = callback
  51.         self._attempt_open()
  52.  
  53.     open = callsback(open)
  54.     
  55.     def _attempt_open(self):
  56.         self.openfunc(self.request, success = self._check_success, error = self._check_error)
  57.  
  58.     
  59.     def preprocess_response(self, resp):
  60.         closing(resp).__enter__()
  61.         
  62.         try:
  63.             data = resp.read()
  64.         finally:
  65.             pass
  66.  
  67.         c_encoding = resp.headers.get('Content-Encoding', 'identity')
  68.         sio = StringIO.StringIO(data)
  69.         for attr in ('read', 'seek', 'close', 'tell'):
  70.             setattr(resp, attr, getattr(sio, attr))
  71.         
  72.         resp._stringio = sio
  73.         resp.content = data
  74.         return resp
  75.  
  76.     
  77.     def _check_success(self, resp):
  78.         
  79.         try:
  80.             resp = self.preprocess_response(resp)
  81.         except Exception:
  82.             e = None
  83.             self._on_error(e)
  84.             return None
  85.  
  86.         redir = self.can_redirect(resp)
  87.         if redir:
  88.             return self.redirect(redir)
  89.         error = self.check_resp_for_errors(resp)
  90.         if error is None:
  91.             self.finish('success', resp)
  92.         else:
  93.             self._on_error(error)
  94.  
  95.     
  96.     def _redirect_success(self, resp):
  97.         self._sub_requester = None
  98.         self.finish('success', resp)
  99.  
  100.     
  101.     def _redirect_error(self, err = None):
  102.         self._sub_requester = None
  103.         self._on_error(err)
  104.  
  105.     
  106.     def can_redirect(self, resp):
  107.         if getattr(self, '_redirect_count', 0) > self.max_redirects:
  108.             return False
  109.         if self._sub_requester is not None:
  110.             return False
  111.         return self.make_redirect_request(resp)
  112.  
  113.     
  114.     def redirect(self, redirect):
  115.         new = self._sub_requester = type(self)(self.openfunc, redirect)
  116.         setattr(new, '_redirect_count', getattr(self, '_redirect_count', 0) + 1)
  117.         new.open(success = self._redirect_success, error = self._redirect_error)
  118.  
  119.     
  120.     def make_redirect_request(self, resp):
  121.         for redirecter in (self._find_http_redirect, self._find_js_redirect):
  122.             redirect = redirecter(resp)
  123.             if redirect is not None:
  124.                 if not redirect.startswith('http'):
  125.                     if not redirect.startswith('/'):
  126.                         redirect = '/' + redirect
  127.                     
  128.                     redirect = self.request.get_type() + '://' + self.request.get_host() + redirect
  129.                 
  130.                 parsed = urlparse.urlparse(redirect)
  131.                 if parsed.path == '':
  132.                     d = parsed._asdict()
  133.                     d['path'] = '/'
  134.                     redirect = urlparse.urlunparse(type(parsed)(**d))
  135.                 
  136.                 log.debug('got redirect: %r', redirect)
  137.                 return redirect
  138.         
  139.  
  140.     
  141.     def _find_http_redirect(self, resp):
  142.         if resp.code in (301, 302):
  143.             return resp.headers.get('Location', None)
  144.  
  145.     
  146.     def _find_js_redirect(self, resp):
  147.         for redirect_re, url_group_id in self.js_redirect_res:
  148.             match = redirect_re.search(resp.content)
  149.             if match:
  150.                 new_url = match.group(url_group_id)
  151.                 if new_url:
  152.                     return new_url
  153.                 continue
  154.             new_url
  155.         
  156.  
  157.     
  158.     def check_resp_for_errors(self, resp):
  159.         pass
  160.  
  161.     
  162.     def _check_error(self, err = None, resp = None):
  163.         if resp is not None:
  164.             self._on_error((err, resp))
  165.         else:
  166.             self._on_error(err)
  167.  
  168.     
  169.     def _on_error(self, e = None):
  170.         self.retries -= 1
  171.         if self.retries:
  172.             if self.pause_for_attempts > 0:
  173.                 Timer(self.pause_for_attempts, self._attempt_open).start()
  174.             else:
  175.                 self._attempt_open()
  176.         else:
  177.             self.finish('error', e)
  178.  
  179.     
  180.     def finish(self, result, *args):
  181.         cb = self.callback
  182.         self.callback = None
  183.         self._sub_request = None
  184.         self.request = None
  185.         self.openfunc = None
  186.         getattr(cb, result, (lambda : pass))(*args)
  187.  
  188.  
  189.  
  190. def dispatcher(what, arg_getter):
  191.     
  192.     def dispatch(self, *args, **req_options):
  193.         name = arg_getter(args)
  194.         handler = getattr(self, '%s_%s' % (what, name), getattr(self, '%s_default' % what, None))
  195.         if handler is not None:
  196.             return handler(*args, **req_options)
  197.         log.error('No default handler for %r', what)
  198.  
  199.     return dispatch
  200.  
  201.  
  202. class WebScraperBase(object):
  203.     CookieJarFactory = cookielib.CookieJar
  204.     HttpOpenerFactory = staticmethod(build_opener)
  205.     RequestFactory = staticmethod(urllib2.Request.make_request)
  206.     
  207.     def RequestOpenerFactory(cls, open, req, **kwds):
  208.         return RequestOpener(threaded(open), req, **kwds)
  209.  
  210.     RequestOpenerFactory = classmethod(RequestOpenerFactory)
  211.     domain = None
  212.     urls = { }
  213.     
  214.     def __init__(self):
  215.         self._waiting = set()
  216.         self._callbacks = { }
  217.         self.init_http()
  218.         self._batching = False
  219.         self._batchqueue = []
  220.  
  221.     
  222.     def init_http(self):
  223.         self._jar = self.CookieJarFactory()
  224.         self.http = self.HttpOpenerFactory(urllib2.HTTPCookieProcessor(self._jar))
  225.  
  226.     
  227.     def get_cookie(self, key, default = sentinel, domain = None, path = '/'):
  228.         if domain is None:
  229.             domain = self.domain
  230.         
  231.         val = default
  232.         
  233.         try:
  234.             self._jar._cookies_lock.__enter__()
  235.             
  236.             try:
  237.                 val = self._jar._cookies[domain][path][key].value
  238.             finally:
  239.                 pass
  240.  
  241.         except (AttributeError, KeyError):
  242.             e = None
  243.             if val is sentinel:
  244.                 raise e
  245.             val is sentinel
  246.             return val
  247.  
  248.         return val
  249.  
  250.     
  251.     def set_cookie(self, key, value, domain = None, path = '/'):
  252.         if domain is None:
  253.             domain = self.domain
  254.         
  255.         self._jar._cookies_lock.__enter__()
  256.         
  257.         try:
  258.             domain_dict = self._jar._cookies.setdefault(domain, { })
  259.             path_dict = domain_dict.setdefault(path, { })
  260.             cookie = path_dict.get(key, None)
  261.             if cookie is None:
  262.                 cookie = build_cookie(key, value, domain = domain, path = path)
  263.                 path_dict[key] = cookie
  264.             else:
  265.                 cookie.value = value
  266.         finally:
  267.             pass
  268.  
  269.  
  270.     
  271.     def set_waiting(self, *things):
  272.         self._waiting.update(things)
  273.  
  274.     
  275.     def clear_waiting(self, *things):
  276.         self._waiting -= set(things)
  277.         if not self._waiting:
  278.             self.done_waiting()
  279.         
  280.  
  281.     
  282.     def done_waiting(self):
  283.         pass
  284.  
  285.     
  286.     def batch(self):
  287.         if self._batching:
  288.             raise Exception("Can't do more than one batch of requests at a time.")
  289.         self._batching
  290.         self._batching = True
  291.         
  292.         try:
  293.             yield self
  294.         finally:
  295.             self._batching = False
  296.             while self._batchqueue:
  297.                 (name, req, req_options) = self._batchqueue.pop(0)
  298.                 self.perform_request(name, req, **req_options)
  299.  
  300.  
  301.     batch = contextlib.contextmanager(batch)
  302.     
  303.     def request(self, name, callback = None, **req_options):
  304.         if name in self._waiting:
  305.             log.warning('already waiting for %r', name)
  306.             return None
  307.         self._callbacks[name] = callback
  308.         req = self.build_request(name, **req_options)
  309.         if self._batching:
  310.             self.set_waiting(name)
  311.             self._batchqueue.append((name, req, req_options))
  312.             return None
  313.         self.perform_request(name, req, **req_options)
  314.  
  315.     request = callsback(request)
  316.     
  317.     def perform_request(self, name, req, **req_options):
  318.         self.set_waiting(name)
  319.         if req is None:
  320.             return self.error_handler(name, req_options)(Exception('No request created for %r' % name))
  321.         reqopen = self.RequestOpenerFactory(self.http.open, req, **req_options)
  322.         reqopen.open(success = self.success_handler(name, req_options), error = self.error_handler(name, req_options))
  323.  
  324.     
  325.     def error_handler(self, name, req_options):
  326.         
  327.         def handler(e = (None, None, None)):
  328.             
  329.             try:
  330.                 e = self.preprocess_resp(name, e, **req_options)
  331.             except Exception:
  332.                 exc = None
  333.                 if not req_options.get('quiet', False):
  334.                     traceback.print_exc()
  335.                 
  336.             except:
  337.                 req_options.get('quiet', False)
  338.  
  339.             self.clear_waiting(name)
  340.             cb = self._callbacks.pop(name, None)
  341.             retval = self.handle_error(name, e, **req_options)
  342.             if cb is not None:
  343.                 cb.error(e)
  344.             
  345.             return retval
  346.  
  347.         return handler
  348.  
  349.     
  350.     def success_handler(self, name, req_options):
  351.         
  352.         def handler(resp):
  353.             
  354.             try:
  355.                 resp = self.preprocess_resp(name, resp, **req_options)
  356.             except Exception:
  357.                 exc = None
  358.                 traceback.print_exc()
  359.                 self.error_handler(name, req_options)(exc)
  360.                 return None
  361.  
  362.             
  363.             try:
  364.                 newresp = self.handle_success(name, resp, **req_options)
  365.             except Exception:
  366.                 exc = None
  367.                 traceback.print_exc()
  368.                 self.error_handler(name, req_options)(exc)
  369.                 return None
  370.  
  371.             if newresp is not None:
  372.                 resp = newresp
  373.             
  374.             cb = self._callbacks.pop(name, None)
  375.             if cb is not None:
  376.                 cb.success(resp)
  377.             
  378.             self.clear_waiting(name)
  379.             return newresp
  380.  
  381.         return handler
  382.  
  383.     build_request = dispatcher('build_request', itemgetter0)
  384.     handle_error = dispatcher('handle_error', itemgetter0)
  385.     preprocess_resp = dispatcher('preprocess_resp', itemgetter0)
  386.     handle_success = dispatcher('handle_success', itemgetter0)
  387.     
  388.     def build_request_default(self, name, **req_options):
  389.         link = self.urls[name]
  390.         if callable(link):
  391.             link = link()
  392.         
  393.         return self.RequestFactory(link, **req_options)
  394.  
  395.     
  396.     def handle_error_default(self, name, e, **req_options):
  397.         log.error('Error requesting %r (options = %r): %r', name, req_options, e)
  398.  
  399.     
  400.     def handle_success_default(self, name, resp, **req_options):
  401.         if resp.document is not None:
  402.             log.debug_s('document body: %r', HTML.tostring(resp.document, pretty_print = True))
  403.         else:
  404.             log.info('Got None for lxml doc. code/status= %r', resp.code, resp.msg, str(resp.headers))
  405.  
  406.     
  407.     def preprocess_resp_default(self, name, resp, **req_options):
  408.         data = resp.content
  409.         if data:
  410.             document = HTML.fromstring(data, base_url = resp.geturl())
  411.             document.make_links_absolute()
  412.             resp.document = document
  413.         else:
  414.             resp.document = None
  415.         return resp
  416.  
  417.  
  418.  
  419. class AsyncRequestOpener(RequestOpener):
  420.     request_cls = CookieJarHTTPMaster.request_cls
  421.     
  422.     def _check_success(self, req, resp):
  423.         return super(AsyncRequestOpener, self)._check_success(resp)
  424.  
  425.     
  426.     def _check_error(self, req, resp = None):
  427.         if resp == None:
  428.             resp = req
  429.         
  430.         return super(AsyncRequestOpener, self)._check_error(resp)
  431.  
  432.  
  433.  
  434. class AsyncWebScraper(WebScraperBase):
  435.     HttpOpenerFactory = CookieJarHTTPMaster
  436.     
  437.     def RequestOpenerFactory(self, open, req, **kwds):
  438.         return AsyncRequestOpener(open, req, **kwds)
  439.  
  440.     
  441.     def init_http(self):
  442.         self._jar = self.CookieJarFactory()
  443.         self.http = self.HttpOpenerFactory(jar = self._jar)
  444.  
  445.     
  446.     def RequestFactory(self, *a, **k):
  447.         headers = dict(getattr(self.http, 'addheaders', { }))
  448.         headers.update(k.get('headers', { }))
  449.         k['headers'] = headers
  450.         ret = self.http.request_cls.make_request(*a, **k)
  451.         return ret
  452.  
  453.  
# Module-level alias: WebScraper refers to the asynchronous implementation.
WebScraper = AsyncWebScraper
# Running the module as a script does nothing.
if __name__ == '__main__':
    pass
  457.  
  458.