home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- import re
- import copy
- import htmlentitydefs
- import sgmllib
- import ClientForm
- import _request
- from _headersutil import split_header_words, is_html as _is_html
- import _rfc3986
# Encoding handed to the default EncodingFinder; it is what that finder
# returns when no content-type header carries a charset parameter.
DEFAULT_ENCODING = 'latin-1'
# Matches a run of whitespace; users of this pattern substitute a single
# space for every run.
COMPRESS_RE = re.compile(r'\s+')
-
class ParseError(ClientForm.ParseError):
    """Error raised by the factories in this module when parsing fails."""
-
-
class CachingGeneratorFunction(object):
    """Callable that replays one iterable as many times as it is called.

    Items pulled from the wrapped iterable are remembered, so each call
    first yields what has already been seen and then continues pulling
    from the shared underlying iterator.
    """

    def __init__(self, iterable):
        # iter() is taken exactly once: every generator created by
        # __call__ shares this single iterator.
        self._iterator = iter(iterable)
        self._cache = []

    def __call__(self):
        seen = self._cache
        # Replay the remembered items; the length is re-read on every step
        # so items appended by another running generator are picked up.
        position = 0
        while position < len(seen):
            yield seen[position]
            position += 1

        # Then pull whatever is still left, remembering each item.
        for fresh in self._iterator:
            seen.append(fresh)
            yield fresh
-
-
-
-
class EncodingFinder:
    """Work out a response's character encoding from its headers."""

    def __init__(self, default_encoding):
        # Returned by encoding() when no charset parameter is found.
        self._default_encoding = default_encoding

    def encoding(self, response):
        """Return the first ``charset`` value found in content-type headers.

        Falls back to the default encoding given to the constructor when
        none of the headers carries a charset parameter.
        """
        content_types = response.info().getheaders('content-type')
        for header_value in content_types:
            parsed = split_header_words([header_value])
            for key, value in parsed[0]:
                if key == 'charset':
                    return value
        return self._default_encoding
-
-
-
class ResponseTypeFinder:
    """Decide whether a response should be treated as HTML."""

    def __init__(self, allow_xhtml):
        # Passed through unchanged to the header-based HTML check.
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        """Return the result of the header/URL based HTML check.

        ``encoding`` is accepted but not consulted here.
        """
        return _is_html(
            response.info().getheaders('content-type'),
            response.geturl(),
            self._allow_xhtml)
-
-
-
class Args:
    """Expose the entries of a mapping as read-only attributes.

    A copy of the mapping is kept in ``dictionary``.  Note that assigning
    an attribute on an instance does not modify ``dictionary``.
    """

    def __init__(self, args_map):
        self.dictionary = dict(args_map)

    def __getattr__(self, key):
        # Only reached when normal attribute lookup fails.
        mapping = self.dictionary
        if key in mapping:
            return mapping[key]
        # Not a stored argument: defer to the class, which raises
        # AttributeError for unknown names.
        return getattr(self.__class__, key)
-
-
-
-
def form_parser_args(select_default = False, form_parser_class = None, request_class = None, backwards_compat = False):
    """Bundle the form-parser keyword arguments into an Args object."""
    settings = {
        'select_default': select_default,
        'form_parser_class': form_parser_class,
        'request_class': request_class,
        'backwards_compat': backwards_compat,
    }
    return Args(settings)
-
-
class Link:
    """A link found in a document.

    Stores the base URL, the URL as written, the URL joined against the
    base (``absolute_url``), the link text, the tag name and the tag's
    attributes.
    """

    def __init__(self, base_url, url, text, tag, attrs):
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url = url
        self.text = text
        self.tag = tag
        self.attrs = attrs

    def __cmp__(self, other):
        """Return 0 when url, text, tag and attrs all match, else -1."""
        for field in ('url', 'text', 'tag', 'attrs'):
            try:
                differs = getattr(self, field) != getattr(other, field)
            except AttributeError:
                # An object lacking one of the fields never compares equal.
                return -1
            if differs:
                return -1
        return 0

    def __repr__(self):
        shown = (self.base_url, self.url, self.text, self.tag, self.attrs)
        return 'Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)' % shown
-
-
-
class LinksFactory:
    """Produce Link objects from a response using a pull parser.

    ``urltags`` maps a tag name to the attribute holding its URL.  The
    response, base URL and encoding are supplied through set_response().
    """

    def __init__(self, link_parser_class=None, link_class=Link, urltags=None):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        # NOTE(review): link_class is stored but links() instantiates Link
        # directly -- confirm whether that is intended.
        self.link_class = link_class
        if urltags is None:
            urltags = {
                'a': 'href',
                'area': 'href',
                'frame': 'src',
                'iframe': 'src',
            }
        self.urltags = urltags
        self._response = None
        self._encoding = None
        # Initialised here so links() does not fail with AttributeError
        # when called before set_response().
        self._base_url = None

    def set_response(self, response, base_url, encoding):
        """Remember the response to parse, its base URL and its encoding."""
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Yield a Link for every URL-carrying tag in the response.

        A ``base`` tag with an ``href`` replaces the base URL used for
        all links that follow it.  Tags without a URL are skipped.

        Raises ParseError (wrapping the original error) when the parser
        raises sgmllib.SGMLParseError.
        """
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)

        try:
            wanted = list(self.urltags.keys()) + ['base']
            for token in p.tags(*wanted):
                if token.type == 'endtag':
                    continue

                if token.data == 'base':
                    base_href = dict(token.attrs).get('href')
                    if base_href is not None:
                        base_url = base_href
                    continue

                attrs = dict(token.attrs)
                tag = token.data
                text = None
                url = attrs.get(self.urltags[tag])
                if not url:
                    continue

                url = _rfc3986.clean_url(url, encoding)
                # Only a non-self-closing <a> has text up to its end tag.
                if tag == 'a' and token.type != 'startendtag':
                    text = p.get_compressed_text(('endtag', tag))

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError as exc:
            # Keep the original error as the argument instead of None.
            raise ParseError(exc)
-
-
-
-
class FormsFactory:
    """Produce forms from a response via ClientForm.ParseResponseEx.

    After forms() has run, ``global_form`` holds the first item returned
    by the parser; the remaining items are returned to the caller.
    """

    def __init__(self, select_default=False, form_parser_class=None, request_class=None, backwards_compat=False):
        import ClientForm
        self.select_default = select_default
        if form_parser_class is None:
            form_parser_class = ClientForm.FormParser
        self.form_parser_class = form_parser_class
        if request_class is None:
            request_class = _request.Request
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        """Remember the response and encoding; forget any global form."""
        self._response = response
        self.encoding = encoding
        self.global_form = None

    def forms(self):
        """Parse the stored response and return its forms.

        Raises ParseError (wrapping the original error) when ClientForm
        raises ClientForm.ParseError.
        """
        import ClientForm
        encoding = self.encoding

        try:
            forms = ClientForm.ParseResponseEx(
                self._response,
                select_default=self.select_default,
                form_parser_class=self.form_parser_class,
                request_class=self.request_class,
                encoding=encoding,
                _urljoin=_rfc3986.urljoin,
                _urlparse=_rfc3986.urlsplit,
                _urlunparse=_rfc3986.urlunsplit)
        except ClientForm.ParseError as exc:
            # Keep the original error as the argument instead of None.
            raise ParseError(exc)

        self.global_form = forms[0]
        return forms[1:]
-
-
-
class TitleFactory:
    """Extract the document title from a response using a pull parser."""

    def __init__(self):
        self._response = None
        self._encoding = None

    def set_response(self, response, encoding):
        """Remember the response to parse and its encoding."""
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        """Collect text from ``parser`` up to the closing title tag.

        Entity and character references are unescaped, any other tag met
        before the end tag is kept as its string form, and whitespace
        runs are collapsed to single spaces.
        """
        import _pullparser
        text = []
        # Loop until the end tag or the end of input; the decompiled
        # ``while None`` never executed and always produced ''.
        while True:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                break

            if tok.type == 'data':
                text.append(str(tok))
            elif tok.type == 'entityref':
                t = unescape('&%s;' % tok.data, parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == 'charref':
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ('starttag', 'endtag', 'startendtag'):
                if tok.type == 'endtag' and tok.data == 'title':
                    break
                text.append(str(tok))
        return COMPRESS_RE.sub(' ', ''.join(text).strip())

    def title(self):
        """Return the title text, or None when there is no title tag.

        Raises ParseError (wrapping the original error) when the parser
        raises sgmllib.SGMLParseError.
        """
        import _pullparser
        p = _pullparser.TolerantPullParser(self._response, encoding=self._encoding)

        try:
            try:
                p.get_tag('title')
            except _pullparser.NoMoreTokensError:
                return None
            return self._get_title_text(p)
        except sgmllib.SGMLParseError as exc:
            # Keep the original error as the argument instead of None.
            raise ParseError(exc)
-
-
-
-
def unescape(data, entities, encoding):
    """Replace entity and character references in ``data``.

    ``entities`` maps an entity name to its code point.  Numeric
    references are delegated to unescape_charref().  A replacement is
    encoded with ``encoding`` unless that is None.  References that are
    unknown, malformed or not representable in ``encoding`` are left
    exactly as written.  ``data`` is returned unchanged when it is None
    or contains no ``&``.
    """
    if data is None or '&' not in data:
        return data

    # Use unichr where the interpreter has it, otherwise the builtin chr.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def replace_entities(match):
        ent = match.group()
        if ent[1] == '#':
            try:
                return unescape_charref(ent[2:-1], encoding)
            except (ValueError, OverflowError):
                # Not a valid number or code point: keep the text.
                return ent

        codepoint = entities.get(ent[1:-1])
        if codepoint is None:
            return ent
        repl = to_char(codepoint)
        if encoding is None:
            return repl
        try:
            return repl.encode(encoding)
        except UnicodeError:
            return ent

    # The pattern must start with '&' (optionally followed by '#');
    # without that prefix it is not even a valid regular expression.
    return re.sub(r'&#?[A-Za-z0-9]+?;', replace_entities, data)
-
-
def unescape_charref(data, encoding):
    """Convert the body of a numeric character reference to a character.

    ``data`` is the text between ``&#`` and ``;``: decimal digits, or
    ``x`` followed by hexadecimal digits.  The character is returned as
    is when ``encoding`` is None, otherwise encoded with ``encoding``.
    If it cannot be encoded, the original reference text is returned.

    Raises ValueError when ``data`` is not a valid number.
    """
    name, base = data, 10
    if name.startswith('x'):
        name, base = name[1:], 16

    # Use unichr where the interpreter has it, otherwise the builtin chr.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    uc = to_char(int(name, base))
    if encoding is None:
        return uc

    try:
        return uc.encode(encoding)
    except UnicodeError:
        # Not representable in the target encoding: keep the reference.
        return '&#%s;' % data
-
- import _beautifulsoup
- import ClientForm
# Form parser classes built by ClientForm from the two BeautifulSoup classes.
(RobustFormParser, NestingRobustFormParser) = ClientForm._create_bs_classes(_beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup)
# Replace sgmllib's character-reference pattern so hexadecimal references
# (``&#x..;``) are captured too.  The pattern has to begin with ``&#``:
# without it the pattern cannot match at the start of a reference.
sgmllib.charref = re.compile('&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]')
-
class MechanizeBs(_beautifulsoup.BeautifulSoup):
    """BeautifulSoup subclass that unescapes references using an encoding."""

    # Entity name -> code point table handed to unescape().
    _entitydefs = htmlentitydefs.name2codepoint
    # (pattern, replacement) pairs: put a space before a self-closing
    # '/>' and drop whitespace directly after '<!'.
    PARSER_MASSAGE = [
        (re.compile('(<[^<>]*)/>'), (lambda x: x.group(1) + ' />')),
        (re.compile('<!\\s+([^<>]*)>'), (lambda x: '<!' + x.group(1) + '>'))]

    def __init__(self, encoding, text=None, avoidParserProblems=True, initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        """Feed the character for numeric reference ``ref`` as data."""
        # The reference must be rebuilt with its '&#' prefix; without an
        # '&' unescape() returns its input unchanged.
        t = unescape('&#%s;' % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def handle_entityref(self, ref):
        """Feed the character for named entity ``ref`` as data."""
        t = unescape('&%s;' % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def unescape_attrs(self, attrs):
        """Return ``attrs`` as (key, value) pairs with values unescaped."""
        return [(key, unescape(val, self._entitydefs, self._encoding))
                for key, val in attrs]
-
-
-
class RobustLinksFactory:
    """Produce Link objects by walking an already-parsed soup.

    ``urltags`` maps a tag name to the attribute holding its URL.  The
    soup, base URL and encoding are supplied through set_soup().
    """

    compress_re = COMPRESS_RE

    def __init__(self, link_parser_class=None, link_class=Link, urltags=None):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        # NOTE(review): link_class is stored but links() instantiates Link
        # directly -- confirm whether that is intended.
        self.link_class = link_class
        if urltags is None:
            urltags = {
                'a': 'href',
                'area': 'href',
                'frame': 'src',
                'iframe': 'src',
            }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        """Remember the parsed soup, its base URL and its encoding."""
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        """Yield a Link for every URL-carrying tag in the soup.

        A ``base`` tag with an ``href`` replaces the base URL used for
        all links that follow it.  Tags without a URL are skipped.  Link
        text is whitespace-compressed; when a tag has no text it is ''
        for ``a`` tags and None for every other tag.
        """
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        # Built once instead of once per visited node.
        wanted = list(self.urltags.keys()) + ['base']
        for ch in bs.recursiveChildGenerator():
            if not isinstance(ch, _beautifulsoup.Tag) or ch.name not in wanted:
                continue

            link = ch
            attrs = bs.unescape_attrs(link.attrs)
            attrs_dict = dict(attrs)
            if link.name == 'base':
                base_href = attrs_dict.get('href')
                if base_href is not None:
                    base_url = base_href
                continue

            url = attrs_dict.get(self.urltags[link.name])
            if not url:
                continue

            url = _rfc3986.clean_url(url, encoding)
            text = link.fetchText(lambda t: True)
            if text:
                text = self.compress_re.sub(' ', ' '.join(text).strip())
            elif link.name == 'a':
                text = ''
            else:
                text = None
            yield Link(base_url, url, text, link.name, attrs)
-
-
-
-
class RobustFormsFactory(FormsFactory):
    """FormsFactory whose default form parser is RobustFormParser."""

    def __init__(self, *args, **kwds):
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            # Write into the dictionary that is passed on below.  Setting
            # ``args.form_parser_class`` would only create an instance
            # attribute on Args and leave the dictionary entry as None,
            # so the robust parser would never be used.
            args.dictionary['form_parser_class'] = RobustFormParser

        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        """Remember the response to parse and its encoding."""
        self._response = response
        self.encoding = encoding
-
-
-
class RobustTitleFactory:
    """Extract the document title from an already-parsed soup."""

    def __init__(self):
        self._bs = None
        self._encoding = None

    def set_soup(self, soup, encoding):
        """Remember the parsed soup and its encoding."""
        self._bs = soup
        self._encoding = encoding

    def title(self):
        """Return the title's inner markup with whitespace compressed.

        Returns None when the soup has no title element.
        """
        import _beautifulsoup
        title = self._bs.first('title')
        if title == _beautifulsoup.Null:
            return None
        # Join the string form of every child node; the decompiled code
        # called a list here, which cannot work.
        inner_html = ''.join([str(node) for node in title.contents])
        return COMPRESS_RE.sub(' ', inner_html.strip())
-
-
-
class Factory:
    """Combine forms, links and title factories for one response.

    The attributes named in LAZY_ATTRS are computed on first access and
    then stored on the instance; set_response() discards them again.
    """

    LAZY_ATTRS = [
        'encoding',
        'is_html',
        'title',
        'global_form']

    def __init__(self, forms_factory, links_factory, title_factory, encoding_finder=EncodingFinder(DEFAULT_ENCODING), response_type_finder=ResponseTypeFinder(allow_xhtml=False)):
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder
        self.set_response(None)

    def set_request_class(self, request_class):
        """Set the request class used by the forms factory."""
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Store ``response`` and drop everything cached for the old one."""
        self._response = response
        self._forms_genf = None
        self._links_genf = None
        self._get_title = None
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                # Never computed for the previous response.
                pass

    def __getattr__(self, name):
        # Only reached when normal lookup fails, i.e. before a lazy
        # attribute has been stored on the instance.
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == 'encoding':
            self.encoding = self._encoding_finder.encoding(copy.copy(self._response))
            return self.encoding
        elif name == 'is_html':
            self.is_html = self._response_type_finder.is_html(copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == 'title':
            # Store the value before returning it; returning self.title
            # without storing it first would re-enter this method forever.
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == 'global_form':
            # forms() stores global_form on the instance as a side effect.
            self.forms()
            return self.global_form

    def forms(self):
        """Return an iterator over the forms of the current response."""
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(self._forms_factory.forms())
            except:
                # Reset all cached state, then let the error propagate.
                self.set_response(self._response)
                raise
            self.global_form = getattr(self._forms_factory, 'global_form', None)
        return self._forms_genf()

    def links(self):
        """Return an iterator over the links of the current response."""
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(self._links_factory.links())
            except:
                # Reset all cached state, then let the error propagate.
                self.set_response(self._response)
                raise
        return self._links_genf()
-
-
-
class DefaultFactory(Factory):
    """Factory wired up with the pull-parser based factories."""

    def __init__(self, i_want_broken_xhtml_support = False):
        forms_factory = FormsFactory()
        links_factory = LinksFactory()
        title_factory = TitleFactory()
        type_finder = ResponseTypeFinder(allow_xhtml = i_want_broken_xhtml_support)
        Factory.__init__(
            self,
            forms_factory = forms_factory,
            links_factory = links_factory,
            title_factory = title_factory,
            response_type_finder = type_finder)

    def set_response(self, response):
        """Store ``response`` and give each sub-factory its own copy."""
        Factory.set_response(self, response)
        if response is None:
            return
        self._forms_factory.set_response(copy.copy(response), self.encoding)
        self._links_factory.set_response(copy.copy(response), response.geturl(), self.encoding)
        self._title_factory.set_response(copy.copy(response), self.encoding)
-
-
-
-
class RobustFactory(Factory):
    """Factory wired up with the soup based (robust) factories."""

    def __init__(self, i_want_broken_xhtml_support = False, soup_class = None):
        forms_factory = RobustFormsFactory()
        links_factory = RobustLinksFactory()
        title_factory = RobustTitleFactory()
        type_finder = ResponseTypeFinder(allow_xhtml = i_want_broken_xhtml_support)
        Factory.__init__(
            self,
            forms_factory = forms_factory,
            links_factory = links_factory,
            title_factory = title_factory,
            response_type_finder = type_finder)
        # Fall back to the module's own soup class when none is given.
        self._soup_class = MechanizeBs if soup_class is None else soup_class

    def set_response(self, response):
        """Store ``response``, parse it once and share the soup.

        The forms factory receives a copy of the response; the links and
        title factories receive the soup built from the response body.
        """
        Factory.set_response(self, response)
        if response is None:
            return
        # Read the body before the encoding is looked up, as before.
        data = response.read()
        soup = self._soup_class(self.encoding, data)
        self._forms_factory.set_response(copy.copy(response), self.encoding)
        self._links_factory.set_soup(soup, response.geturl(), self.encoding)
        self._title_factory.set_soup(soup, self.encoding)
-
-
-
-