home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 February / maximum-cd-2011-02.iso / DiscContents / digsby_setup85.exe / lib / util / httptools.pyo (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2010-11-24  |  14.0 KB  |  451 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyo (Python 2.6)
  3.  
  4. from __future__ import with_statement
  5. from callbacks import callsback
  6. from threads import threaded
  7. from threads.timeout_thread import Timer
  8. from net import build_opener, build_cookie
  9. from common.asynchttp.cookiejartypes import CookieJarHTTPMaster
  10. import contextlib
  11. import re
  12. import traceback
  13. import StringIO
  14. import cookielib
  15. import urllib2
  16. import logging
  17. import lxml.etree as ET
  18. import lxml.html as HTML
  19. import operator
  20. from contextlib import closing
  21. itemgetter0 = operator.itemgetter(0)
  22. log = logging.getLogger('httptools')
  23.  
  24. class RequestOpener(object):
  25.     max_redirects = 5
  26.     retries = 3
  27.     pause_for_attempts = 1
  28.     js_redirect_res = ((re.compile('window\\.location\\.replace\\("(.*?)"\\);'), 1),)
  29.     request_cls = urllib2.Request
  30.     
  31.     def __init__(self, opener, request, data = None, **kwds):
  32.         self.openfunc = getattr(opener, 'open', opener)
  33.         retries = kwds.pop('retries', None)
  34.         if retries is not None:
  35.             self.retries = retries
  36.         
  37.         if isinstance(request, basestring):
  38.             request = self.request_cls.make_request(request, data, **kwds)
  39.         
  40.         self.request = request
  41.         self._sub_requester = None
  42.         self.callback = None
  43.  
  44.     
  45.     def open(self, callback = None):
  46.         if self.callback is not None:
  47.             raise Exception('Request already in progress')
  48.         self.callback is not None
  49.         self.callback = callback
  50.         self._attempt_open()
  51.  
  52.     open = callsback(open)
  53.     
  54.     def _attempt_open(self):
  55.         self.openfunc(self.request, success = self._check_success, error = self._check_error)
  56.  
  57.     
  58.     def preprocess_response(self, resp):
  59.         closing(resp).__enter__()
  60.         
  61.         try:
  62.             data = resp.read()
  63.         finally:
  64.             pass
  65.  
  66.         c_encoding = resp.headers.get('Content-Encoding', 'identity')
  67.         sio = StringIO.StringIO(data)
  68.         for attr in ('read', 'seek', 'close', 'tell'):
  69.             setattr(resp, attr, getattr(sio, attr))
  70.         
  71.         resp._stringio = sio
  72.         resp.content = data
  73.         return resp
  74.  
  75.     
  76.     def _check_success(self, resp):
  77.         
  78.         try:
  79.             resp = self.preprocess_response(resp)
  80.         except Exception:
  81.             e = None
  82.             self._on_error(e)
  83.             return None
  84.  
  85.         redir = self.can_redirect(resp)
  86.         if redir:
  87.             return self.redirect(redir)
  88.         error = self.check_resp_for_errors(resp)
  89.         if error is None:
  90.             self.finish('success', resp)
  91.         else:
  92.             self._on_error(error)
  93.  
  94.     
  95.     def _redirect_success(self, resp):
  96.         self._sub_requester = None
  97.         self.finish('success', resp)
  98.  
  99.     
  100.     def _redirect_error(self, err = None):
  101.         self._sub_requester = None
  102.         self._on_error(err)
  103.  
  104.     
  105.     def can_redirect(self, resp):
  106.         if getattr(self, '_redirect_count', 0) > self.max_redirects:
  107.             return False
  108.         if self._sub_requester is not None:
  109.             return False
  110.         return self.make_redirect_request(resp)
  111.  
  112.     
  113.     def redirect(self, redirect):
  114.         new = self._sub_requester = type(self)(self.openfunc, redirect)
  115.         setattr(new, '_redirect_count', getattr(self, '_redirect_count', 0) + 1)
  116.         new.open(success = self._redirect_success, error = self._redirect_error)
  117.  
  118.     
  119.     def make_redirect_request(self, resp):
  120.         for redirecter in (self._find_http_redirect, self._find_js_redirect):
  121.             redirect = redirecter(resp)
  122.             if redirect is not None:
  123.                 if not redirect.startswith('http'):
  124.                     if not redirect.startswith('/'):
  125.                         redirect = '/' + redirect
  126.                     
  127.                     redirect = self.request.get_type() + '://' + self.request.get_host() + redirect
  128.                 
  129.                 log.debug('got redirect: %r', redirect)
  130.                 return redirect
  131.         
  132.  
  133.     
  134.     def _find_http_redirect(self, resp):
  135.         if resp.code in (301, 302):
  136.             return resp.headers.get('Location', None)
  137.  
  138.     
  139.     def _find_js_redirect(self, resp):
  140.         for redirect_re, url_group_id in self.js_redirect_res:
  141.             match = redirect_re.search(resp.content)
  142.             if match:
  143.                 new_url = match.group(url_group_id)
  144.                 if new_url:
  145.                     return new_url
  146.                 continue
  147.             new_url
  148.         
  149.  
  150.     
  151.     def check_resp_for_errors(self, resp):
  152.         pass
  153.  
  154.     
  155.     def _check_error(self, err = None, resp = None):
  156.         if resp is not None:
  157.             self._on_error((err, resp))
  158.         else:
  159.             self._on_error(err)
  160.  
  161.     
  162.     def _on_error(self, e = None):
  163.         self.retries -= 1
  164.         if self.retries:
  165.             if self.pause_for_attempts > 0:
  166.                 Timer(self.pause_for_attempts, self._attempt_open).start()
  167.             else:
  168.                 self._attempt_open()
  169.         else:
  170.             self.finish('error', e)
  171.  
  172.     
  173.     def finish(self, result, *args):
  174.         cb = self.callback
  175.         self.callback = None
  176.         self._sub_request = None
  177.         self.request = None
  178.         self.openfunc = None
  179.         getattr(cb, result, (lambda : pass))(*args)
  180.  
  181.  
  182.  
  183. def dispatcher(what, arg_getter):
  184.     
  185.     def dispatch(self, *args, **req_options):
  186.         name = arg_getter(args)
  187.         handler = getattr(self, '%s_%s' % (what, name), getattr(self, '%s_default' % what, None))
  188.         if handler is not None:
  189.             return handler(*args, **req_options)
  190.         log.error('No default handler for %r', what)
  191.  
  192.     return dispatch
  193.  
  194.  
  195. class WebScraperBase(object):
  196.     CookieJarFactory = cookielib.CookieJar
  197.     HttpOpenerFactory = staticmethod(build_opener)
  198.     RequestFactory = staticmethod(urllib2.Request.make_request)
  199.     
  200.     def RequestOpenerFactory(cls, open, req, **kwds):
  201.         return RequestOpener(threaded(open), req, **kwds)
  202.  
  203.     RequestOpenerFactory = classmethod(RequestOpenerFactory)
  204.     domain = None
  205.     urls = { }
  206.     
  207.     def __init__(self):
  208.         self._waiting = set()
  209.         self._callbacks = { }
  210.         self.init_http()
  211.         self._batching = False
  212.         self._batchqueue = []
  213.  
  214.     
  215.     def init_http(self):
  216.         self._jar = self.CookieJarFactory()
  217.         self.http = self.HttpOpenerFactory(urllib2.HTTPCookieProcessor(self._jar))
  218.  
  219.     
  220.     def get_cookie(self, key, default = sentinel, domain = None, path = '/'):
  221.         if domain is None:
  222.             domain = self.domain
  223.         
  224.         val = default
  225.         
  226.         try:
  227.             self._jar._cookies_lock.__enter__()
  228.             
  229.             try:
  230.                 val = self._jar._cookies[domain][path][key].value
  231.             finally:
  232.                 pass
  233.  
  234.         except (AttributeError, KeyError):
  235.             e = None
  236.             if val is sentinel:
  237.                 raise e
  238.             val is sentinel
  239.             return val
  240.  
  241.         return val
  242.  
  243.     
  244.     def set_cookie(self, key, value, domain = None, path = '/'):
  245.         if domain is None:
  246.             domain = self.domain
  247.         
  248.         self._jar._cookies_lock.__enter__()
  249.         
  250.         try:
  251.             domain_dict = self._jar._cookies.setdefault(domain, { })
  252.             path_dict = domain_dict.setdefault(path, { })
  253.             cookie = path_dict.get(key, None)
  254.             if cookie is None:
  255.                 cookie = build_cookie(key, value, domain = domain, path = path)
  256.                 path_dict[key] = cookie
  257.             else:
  258.                 cookie.value = value
  259.         finally:
  260.             pass
  261.  
  262.  
  263.     
  264.     def set_waiting(self, *things):
  265.         self._waiting.update(things)
  266.  
  267.     
  268.     def clear_waiting(self, *things):
  269.         self._waiting -= set(things)
  270.         if not self._waiting:
  271.             self.done_waiting()
  272.         
  273.  
  274.     
  275.     def done_waiting(self):
  276.         pass
  277.  
  278.     
  279.     def batch(self):
  280.         if self._batching:
  281.             raise Exception("Can't do more than one batch of requests at a time.")
  282.         self._batching
  283.         self._batching = True
  284.         
  285.         try:
  286.             yield self
  287.         finally:
  288.             self._batching = False
  289.             while self._batchqueue:
  290.                 (name, req, req_options) = self._batchqueue.pop(0)
  291.                 self.perform_request(name, req, **req_options)
  292.  
  293.  
  294.     batch = contextlib.contextmanager(batch)
  295.     
  296.     def request(self, name, callback = None, **req_options):
  297.         if name in self._waiting:
  298.             log.warning('already waiting for %r', name)
  299.             return None
  300.         self._callbacks[name] = callback
  301.         req = self.build_request(name, **req_options)
  302.         if self._batching:
  303.             self.set_waiting(name)
  304.             self._batchqueue.append((name, req, req_options))
  305.             return None
  306.         self.perform_request(name, req, **req_options)
  307.  
  308.     request = callsback(request)
  309.     
  310.     def perform_request(self, name, req, **req_options):
  311.         self.set_waiting(name)
  312.         if req is None:
  313.             return self.error_handler(name, req_options)(Exception('No request created for %r' % name))
  314.         reqopen = self.RequestOpenerFactory(self.http.open, req, **req_options)
  315.         reqopen.open(success = self.success_handler(name, req_options), error = self.error_handler(name, req_options))
  316.  
  317.     
  318.     def error_handler(self, name, req_options):
  319.         
  320.         def handler(e = (None, None, None)):
  321.             
  322.             try:
  323.                 e = self.preprocess_resp(name, e, **req_options)
  324.             except Exception:
  325.                 exc = None
  326.                 if not req_options.get('quiet', False):
  327.                     traceback.print_exc()
  328.                 
  329.             except:
  330.                 req_options.get('quiet', False)
  331.  
  332.             self.clear_waiting(name)
  333.             cb = self._callbacks.pop(name, None)
  334.             retval = self.handle_error(name, e, **req_options)
  335.             if cb is not None:
  336.                 cb.error(e)
  337.             
  338.             return retval
  339.  
  340.         return handler
  341.  
  342.     
  343.     def success_handler(self, name, req_options):
  344.         
  345.         def handler(resp):
  346.             
  347.             try:
  348.                 resp = self.preprocess_resp(name, resp, **req_options)
  349.             except Exception:
  350.                 exc = None
  351.                 traceback.print_exc()
  352.                 self.error_handler(name, req_options)(exc)
  353.                 return None
  354.  
  355.             
  356.             try:
  357.                 newresp = self.handle_success(name, resp, **req_options)
  358.             except Exception:
  359.                 exc = None
  360.                 traceback.print_exc()
  361.                 self.error_handler(name, req_options)(exc)
  362.                 return None
  363.  
  364.             if newresp is not None:
  365.                 resp = newresp
  366.             
  367.             cb = self._callbacks.pop(name, None)
  368.             if cb is not None:
  369.                 cb.success(resp)
  370.             
  371.             self.clear_waiting(name)
  372.             return newresp
  373.  
  374.         return handler
  375.  
  376.     build_request = dispatcher('build_request', itemgetter0)
  377.     handle_error = dispatcher('handle_error', itemgetter0)
  378.     preprocess_resp = dispatcher('preprocess_resp', itemgetter0)
  379.     handle_success = dispatcher('handle_success', itemgetter0)
  380.     
  381.     def build_request_default(self, name, **req_options):
  382.         link = self.urls[name]
  383.         if callable(link):
  384.             link = link()
  385.         
  386.         return self.RequestFactory(link, **req_options)
  387.  
  388.     
  389.     def handle_error_default(self, name, e, **req_options):
  390.         log.error('Error requesting %r (options = %r): %r', name, req_options, e)
  391.  
  392.     
  393.     def handle_success_default(self, name, resp, **req_options):
  394.         if resp.document is not None:
  395.             log.debug_s('document body: %r', HTML.tostring(resp.document, pretty_print = True))
  396.         else:
  397.             log.info('Got None for lxml doc. code/status= %r', resp.code, resp.msg, str(resp.headers))
  398.  
  399.     
  400.     def preprocess_resp_default(self, name, resp, **req_options):
  401.         data = resp.content
  402.         if data:
  403.             document = HTML.fromstring(data, base_url = resp.geturl())
  404.             document.make_links_absolute()
  405.             resp.document = document
  406.         else:
  407.             resp.document = None
  408.         return resp
  409.  
  410.  
  411.  
  412. class AsyncRequestOpener(RequestOpener):
  413.     request_cls = CookieJarHTTPMaster.request_cls
  414.     
  415.     def _check_success(self, req, resp):
  416.         return super(AsyncRequestOpener, self)._check_success(resp)
  417.  
  418.     
  419.     def _check_error(self, req, resp = None):
  420.         if resp == None:
  421.             resp = req
  422.         
  423.         return super(AsyncRequestOpener, self)._check_error(resp)
  424.  
  425.  
  426.  
  427. class AsyncWebScraper(WebScraperBase):
  428.     HttpOpenerFactory = CookieJarHTTPMaster
  429.     
  430.     def RequestOpenerFactory(self, open, req, **kwds):
  431.         return AsyncRequestOpener(open, req, **kwds)
  432.  
  433.     
  434.     def init_http(self):
  435.         self._jar = self.CookieJarFactory()
  436.         self.http = self.HttpOpenerFactory(jar = self._jar)
  437.  
  438.     
  439.     def RequestFactory(self, *a, **k):
  440.         headers = dict(getattr(self.http, 'addheaders', { }))
  441.         headers.update(k.get('headers', { }))
  442.         k['headers'] = headers
  443.         ret = self.http.request_cls.make_request(*a, **k)
  444.         return ret
  445.  
  446.  
  447. WebScraper = AsyncWebScraper
  448. if __name__ == '__main__':
  449.     pass
  450.  
  451.