# (web-page navigation residue from the scrape source, not code:
#  "home *** CD-ROM | disk | FTP | other *** search")
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- import time
- import htmlentitydefs
- import logging
- import socket
- import urllib2
- import urllib
- import httplib
- import sgmllib
- from urllib2 import URLError, HTTPError, BaseHandler
- from cStringIO import StringIO
- from _clientcookie import CookieJar
- from _headersutil import is_html
- from _html import unescape, unescape_charref
- from _request import Request
- from _response import closeable_response, response_seek_wrapper
- import _rfc3986
- import _sockettimeout
- debug = logging.getLogger('mechanize').debug
- debug_robots = logging.getLogger('mechanize.robots').debug
- CHUNK = 1024
- DEFAULT_ENCODING = 'latin-1'
-
# Pick a readline-capable file wrapper factory once, at import time.
# Python < 2.5's socket._fileobject lacks the `close` argument, in which
# case _fileobject itself already has the signature we need; otherwise we
# must wrap it so close=True is passed.  The decompiled code had lost the
# try/except/else shape: the def ran unconditionally, clobbering the
# old-Python fallback with a wrapper that would raise TypeError there.
try:
    socket._fileobject('fake socket', close = True)
except TypeError:
    # old Python: no close argument; use _fileobject directly
    create_readline_wrapper = socket._fileobject
else:

    def create_readline_wrapper(fh):
        # wrap fh so the returned object has readline()/readlines()
        return socket._fileobject(fh, close = True)
-
-
class HTTPRedirectHandler(BaseHandler):
    """Handle 30x redirections and "refresh" pseudo-redirections.

    Keeps a per-chain visit count on the request (redirect_dict) so that
    both repeated visits to one URL and over-long chains are detected and
    aborted with an HTTPError.
    """
    # maximum number of times a single URL may be revisited in one chain
    max_repeats = 4
    # maximum total number of redirections in one chain
    max_redirections = 10

    def redirect_request(self, newurl, req, fp, code, msg, headers):
        """Return a Request for the redirect target, or raise HTTPError.

        Strictly, 301/302 in response to a POST must not be redirected
        without user confirmation, but browsers follow them as GET, so we
        allow them regardless of data.  307 preserves the request method
        and therefore is only followed when the request carries no data.
        """
        if code in (301, 302, 303, 'refresh') or \
           (code == 307 and not req.has_data()):
            new = Request(newurl,
                          headers = req.headers,
                          origin_req_host = req.get_origin_req_host(),
                          unverifiable = True,
                          visit = False)
            # remember the request that started the chain
            new._origin_req = getattr(req, '_origin_req', req)
            return new
        raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers incorrectly send multiple Location headers
        # (and presumably URI headers too): use the first one.
        if headers.has_key('location'):
            newurl = headers.getheaders('location')[0]
        elif headers.has_key('uri'):
            newurl = headers.getheaders('uri')[0]
        else:
            return None
        newurl = _rfc3986.clean_url(newurl, 'latin-1')
        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

        new = self.redirect_request(newurl, req, fp, code, msg, headers)
        if new is None:
            return None

        # Loop detection: redirect_dict maps visited URL -> visit count
        # and travels with the chain via the new request.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close fp until we are sure we won't need it for HTTPError.
        fp.read()
        fp.close()
        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_refresh = http_error_302
    inf_msg = 'The HTTP server returned a redirect error that would lead to an infinite loop.\nThe last 30x error message was:\n'
-
-
class EndOfHeadError(Exception):
    """Signals that the HTML <head> section has ended during parsing."""
    pass
-
-
class AbstractHeadParser:
    """Shared machinery for the two head parsers below.

    Collects (http-equiv, content) pairs from <meta> tags into
    self.http_equiv, and aborts parsing via EndOfHeadError as soon as a
    non-head element or </head> is seen.
    """
    # elements that may legitimately appear in or around <head>
    head_elems = ('html', 'head', 'title', 'base', 'script', 'style',
                  'meta', 'link', 'object')
    _entitydefs = htmlentitydefs.name2codepoint
    _encoding = DEFAULT_ENCODING

    def __init__(self):
        # accumulated (header-name, value) pairs from http-equiv metas
        self.http_equiv = []

    def start_meta(self, attrs):
        http_equiv = None
        content = None
        for key, value in attrs:
            if key == 'http-equiv':
                http_equiv = self.unescape_attr_if_required(value)
            elif key == 'content':
                content = self.unescape_attr_if_required(value)
        if http_equiv is not None and content is not None:
            self.http_equiv.append((http_equiv, content))

    def end_head(self):
        raise EndOfHeadError()

    def handle_entityref(self, name):
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        escaped_attrs = {}
        for key, val in attrs.items():
            escaped_attrs[key] = self.unescape_attr(val)
        return escaped_attrs

    def unknown_entityref(self, ref):
        # Pass an unknown entity through verbatim.
        self.handle_data('&%s;' % ref)

    def unknown_charref(self, ref):
        # Pass an unknown character reference through verbatim.  The
        # decompiled source emitted '%s;', dropping the leading '&#'
        # and corrupting the round-tripped text (cf. unknown_entityref).
        self.handle_data('&#%s;' % ref)
-
-
- try:
- import HTMLParser
- except ImportError:
- pass
-
-
class XHTMLCompatibleHeadParser(AbstractHeadParser, HTMLParser.HTMLParser):
    """Head parser built on HTMLParser (tolerates XHTML-style markup)."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        # Dispatch to start_<tag> or do_<tag> when defined; silently
        # ignore tags with no handler.  (The decompiled source had lost
        # the else: clauses, calling method(attrs) unconditionally.)
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                pass  # unknown tag
            else:
                method(attrs)
        else:
            method(attrs)

    def handle_endtag(self, tag):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        try:
            method = getattr(self, 'end_' + tag)
        except AttributeError:
            pass  # unknown tag
        else:
            method()

    def unescape(self, name):
        # Called by HTMLParser for attribute values.
        return self.unescape_attr(name)

    def unescape_attr_if_required(self, name):
        # HTMLParser already unescaped the attribute via unescape().
        return name
-
-
class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
    """Head parser built on the more forgiving SGMLParser."""

    def _not_called(self):
        # Placeholder handler passed for unknown start tags; handle_starttag
        # only invokes the method for 'meta', so this must never run.
        pass

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, method, attrs):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        if tag == 'meta':
            method(attrs)

    def unknown_starttag(self, tag, attrs):
        self.handle_starttag(tag, self._not_called, attrs)

    def handle_endtag(self, tag, method):
        if tag in self.head_elems:
            method()
        else:
            raise EndOfHeadError()

    def unescape_attr_if_required(self, name):
        # SGMLParser does not unescape attribute values itself.
        return self.unescape_attr(name)
-
-
-
def parse_head(fileobj, parser):
    """Return a list of (key, value) http-equiv pairs from the document head.

    Feeds fileobj to parser in CHUNK-sized pieces until the parser raises
    EndOfHeadError or a short read signals end of data.  The decompiled
    source had `while None:`, so the loop body never executed and the
    function always returned an empty result without reading anything.
    """
    while 1:
        data = fileobj.read(CHUNK)
        try:
            parser.feed(data)
        except EndOfHeadError:
            break
        if len(data) != CHUNK:
            break  # end of file
    return parser.http_equiv
-
-
class HTTPEquivProcessor(BaseHandler):
    """Append META HTTP-EQUIV headers to the regular HTTP headers."""
    handler_order = 300  # before handlers that look at the headers

    def __init__(self, head_parser_class = HeadParser,
                 i_want_broken_xhtml_support = False):
        self.head_parser_class = head_parser_class
        self._allow_xhtml = i_want_broken_xhtml_support

    def http_response(self, request, response):
        if not hasattr(response, 'seek'):
            response = response_seek_wrapper(response)

        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders('content-type')
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    # rewind so downstream handlers see the whole body
                    response.seek(0)
            except (HTMLParser.HTMLParseError, sgmllib.SGMLParseError):
                pass
            else:
                # The decompiled source had lost this else:, so a parse
                # error fell through to a NameError on html_headers.
                for hdr, val in html_headers:
                    # add the meta-derived value as a real header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ': ' + val
                    for line in text.split('\n'):
                        http_message.headers.append(line + '\n')

        return response

    https_response = http_response
-
-
class HTTPCookieProcessor(BaseHandler):
    """Handle HTTP cookies.

    Public attributes:

    cookiejar: the CookieJar in use (a fresh one when none is supplied)
    """

    def __init__(self, cookiejar = None):
        # default to a new, empty jar when the caller supplies none
        self.cookiejar = CookieJar() if cookiejar is None else cookiejar

    def http_request(self, request):
        # attach any matching cookies to the outgoing request
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # harvest cookies set by the server
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
-
-
- try:
- import robotparser
- except ImportError:
- pass
-
-
class MechanizeRobotFileParser(robotparser.RobotFileParser):
    """RobotFileParser that fetches robots.txt through a mechanize opener."""

    def __init__(self, url = '', opener = None):
        robotparser.RobotFileParser.__init__(self, url)
        self._opener = opener
        self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT

    def set_opener(self, opener = None):
        import _opener
        if opener is None:
            opener = _opener.OpenerDirector()
        self._opener = opener

    def set_timeout(self, timeout):
        self._timeout = timeout

    def read(self):
        """Fetch self.url and feed it to the parser."""
        if self._opener is None:
            self.set_opener()
        req = Request(self.url, unverifiable = True, visit = False,
                      timeout = self._timeout)
        try:
            f = self._opener.open(req)
        except HTTPError as response:
            # An HTTPError *is* a usable response; its code is examined
            # below.  (The decompiled `f = None` lost the bound exception
            # and crashed on f.readline().)
            f = response
        except (IOError, socket.error, OSError) as exc:
            debug_robots('ignoring error opening %r: %s' % (self.url, exc))
            return None

        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        status = f.code
        if status == 401 or status == 403:
            # authorization required / forbidden: be conservative
            self.disallow_all = True
            debug_robots('disallow all')
        elif status >= 400:
            # no usable robots.txt: everything is allowed
            self.allow_all = True
            debug_robots('allow all')
        elif status == 200 and lines:
            debug_robots('parse lines')
            self.parse(lines)
-
-
-
class RobotExclusionError(urllib2.HTTPError):
    """HTTPError raised when robots.txt forbids fetching a URL.

    Carries the offending request as .request in addition to the usual
    HTTPError arguments.
    """

    def __init__(self, request, *args):
        # apply() is deprecated (and removed in Python 3); plain argument
        # unpacking is equivalent and clearer.
        urllib2.HTTPError.__init__(self, *args)
        self.request = request
-
-
class HTTPRobotRulesProcessor(BaseHandler):
    """Refuse requests that the target host's robots.txt disallows."""
    # run late, after handlers that may rewrite the request
    handler_order = 800

    # mimetools.Message is the pre-httplib.HTTPMessage fallback.  The
    # decompiled source had lost the else:, assigning HTTPMessage
    # unconditionally (a NameError when the import had failed).
    try:
        from httplib import HTTPMessage
    except:
        from mimetools import Message
        http_response_class = Message
    else:
        http_response_class = HTTPMessage

    def __init__(self, rfp_class = MechanizeRobotFileParser):
        self.rfp_class = rfp_class
        self.rfp = None
        self._host = None

    def http_request(self, request):
        scheme = request.get_type()
        if scheme not in ('http', 'https'):
            # robots exclusion only applies to HTTP
            return request

        if request.get_selector() == '/robots.txt':
            # /robots.txt itself is always OK to fetch
            return request

        host = request.get_host()

        # robots.txt requests don't get a referer header
        origin_req = getattr(request, '_origin_req', None)
        if (origin_req is not None and
            origin_req.get_selector() == '/robots.txt' and
            origin_req.get_host() == host):
            return request

        if host != self._host:
            # new host: (re)fetch and parse its robots.txt
            self.rfp = self.rfp_class()
            try:
                self.rfp.set_opener(self.parent)
            except AttributeError:
                debug('%r instance does not support set_opener' %
                      self.rfp.__class__)
            self.rfp.set_url(scheme + '://' + host + '/robots.txt')
            self.rfp.set_timeout(request.timeout)
            self.rfp.read()
            self._host = host

        ua = request.get_header('User-agent', '')
        if self.rfp.can_fetch(ua, request.get_full_url()):
            return request
        # XXX This should really have raised URLError; kept as an
        # HTTPError subclass for compatibility with older client code.
        msg = 'request disallowed by robots.txt'
        raise RobotExclusionError(request, request.get_full_url(), 403, msg,
                                  self.http_response_class(StringIO()),
                                  StringIO(msg))

    https_request = http_request
-
-
class HTTPRefererProcessor(BaseHandler):
    """Add a Referer header from the last response URL seen.

    Single-slot implementation: only the URL of the most recent response
    is remembered.
    """

    def __init__(self):
        self.referer = None

    def http_request(self, request):
        # don't override an explicitly supplied Referer
        if self.referer is None or request.has_header('Referer'):
            return request
        request.add_unredirected_header('Referer', self.referer)
        return request

    def http_response(self, request, response):
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response
-
-
def clean_refresh_url(url):
    """Strip a matched pair of surrounding quotes from a Refresh URL.

    e.g. Firefox 1.5 quotes the URL in its Refresh headers.  The
    decompiled condition was mis-grouped as `(A or B or C) and D`, which
    failed to strip double-quoted URLs and wrongly stripped mismatched
    pairs like `"url'`.
    """
    if ((url.startswith('"') and url.endswith('"')) or
        (url.startswith("'") and url.endswith("'"))):
        url = url[1:-1]
    return _rfc3986.clean_url(url, 'latin-1')
-
-
def parse_refresh_header(refresh):
    """Parse a Refresh header value into (pause, url).

    "5" -> (5.0, None); "1; url=X" -> (1.0, cleaned X).

    Raises ValueError when the pause is not a number or the part after
    ';' is not a url=... spec.
    """
    ii = refresh.find(';')
    if ii != -1:
        pause = float(refresh[:ii])
        newurl_spec = refresh[ii + 1:]
        jj = newurl_spec.find('=')
        key = None
        if jj != -1:
            key = newurl_spec[:jj]
            newurl = clean_refresh_url(newurl_spec[jj + 1:])
        if key is None or key.strip().lower() != 'url':
            raise ValueError()
    else:
        pause = float(refresh)
        newurl = None
    return (pause, newurl)
-
-
class HTTPRefreshProcessor(BaseHandler):
    """Perform HTTP Refresh redirections.

    Only 200 responses carrying a Refresh header are acted on.  With the
    default max_time of 0, only zero-delay refreshes are followed; set
    max_time to None to follow all, and honor_time to False to skip the
    sleep before redirecting.
    """
    handler_order = 1000

    def __init__(self, max_time = 0, honor_time = True):
        self.max_time = max_time
        self.honor_time = honor_time
        self._sleep = time.sleep  # indirection point, e.g. for tests

    def http_response(self, request, response):
        code, msg = response.code, response.msg
        hdrs = response.info()

        if code != 200 or not hdrs.has_key('refresh'):
            return response

        refresh = hdrs.getheaders('refresh')[0]
        try:
            pause, newurl = parse_refresh_header(refresh)
        except ValueError:
            debug('bad Refresh header: %r' % refresh)
            return response

        if newurl is None:
            # refresh to the same URL
            newurl = response.geturl()

        if self.max_time is not None and pause > self.max_time:
            debug('Refresh header ignored: %r' % refresh)
            return response

        if pause > 0.001 and self.honor_time:
            self._sleep(pause)
        hdrs['location'] = newurl
        # hand off to the redirect machinery ('http' here is deliberate)
        return self.parent.error('http', request, response,
                                 'refresh', msg, hdrs)

    https_response = http_response
-
-
class HTTPErrorProcessor(BaseHandler):
    """Process non-200 HTTP error responses.

    Any response whose code is not exactly 200 is handed to the opener's
    error machinery; its result (or exception) becomes the result of the
    open.
    """
    handler_order = 1000

    def http_response(self, request, response):
        code = response.code
        if code == 200:
            return response
        return self.parent.error('http', request, response, code,
                                 response.msg, response.info())

    https_response = http_response
-
-
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: turn any unhandled HTTP error into an exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # re-raise an existing HTTPError unchanged; otherwise synthesise one
        if isinstance(fp, urllib2.HTTPError):
            raise fp
        raise urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
-
-
-
class AbstractHTTPHandler(BaseHandler):
    """Shared request/connection plumbing for HTTPHandler and HTTPSHandler."""

    def __init__(self, debuglevel = 0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers (Content-*, Host, opener addheaders)."""
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type', 'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        (scheme, sel) = urllib.splittype(request.get_selector())
        (sel_host, sel_path) = urllib.splithost(sel)
        if not request.has_header('Host'):
            # prefer the host from an absolute-URI selector (proxying);
            # the decompiled code had dropped the `sel_host or` part
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Issue the request over a connection built by http_class."""
        host_port = req.get_host()
        if not host_port:
            raise URLError('no host given')

        try:
            h = http_class(host_port, timeout = req.timeout)
        except TypeError:
            # Python < 2.6: connection classes lack a timeout argument
            h = http_class(host_port)
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # The file-like response wrapper can't cope with persistent
        # connections, so force the connection closed after this request.
        headers['Connection'] = 'close'
        # normalise header-name capitalisation
        headers = dict(
            [(name.title(), val) for name, val in headers.items()])
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error as err:  # XXX what error?
            raise URLError(err)

        # Wrap the HTTPResponse in socket's file object adapter so the
        # returned object has readline()/readlines(); the adapter calls
        # recv(), so delegate recv() to read().
        r.recv = r.read
        fp = create_readline_wrapper(r)

        resp = closeable_response(fp, r.msg, req.get_full_url(),
                                  r.status, r.reason)
        return resp
-
-
-
class HTTPHandler(AbstractHTTPHandler):
    """Plain-HTTP flavour of AbstractHTTPHandler."""

    def http_open(self, req):
        # ordinary (non-TLS) connection class
        conn_factory = httplib.HTTPConnection
        return self.do_open(conn_factory, req)

    http_request = AbstractHTTPHandler.do_request_
-
if hasattr(httplib, 'HTTPS'):

    class HTTPSConnectionFactory:
        """Build HTTPSConnections carrying a fixed client key/cert pair."""

        def __init__(self, key_file, cert_file):
            self._key_file = key_file
            self._cert_file = cert_file

        def __call__(self, hostport):
            return httplib.HTTPSConnection(
                hostport,
                key_file = self._key_file,
                cert_file = self._cert_file)

    class HTTPSHandler(AbstractHTTPHandler):
        """HTTPS flavour of AbstractHTTPHandler with optional client certs."""

        def __init__(self, client_cert_manager = None):
            AbstractHTTPHandler.__init__(self)
            self.client_cert_manager = client_cert_manager

        def https_open(self, req):
            mgr = self.client_cert_manager
            if mgr is None:
                conn_factory = httplib.HTTPSConnection
            else:
                # look up the key/cert for this URL and bake them into
                # the connection factory
                (key_file, cert_file) = mgr.find_key_cert(req.get_full_url())
                conn_factory = HTTPSConnectionFactory(key_file, cert_file)
            return self.do_open(conn_factory, req)

        https_request = AbstractHTTPHandler.do_request_
-
-
-