home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- import re
- import htmlentitydefs
- import sgmllib
- import HTMLParser
- from xml.sax import saxutils
- from _html import unescape, unescape_charref
-
class NoMoreTokensError(Exception):
    """Signal that the token source has been exhausted."""
-
-
class Token:
    """One parsed markup event.

    Attributes:
        type: kind of token; ``__str__`` understands 'starttag',
            'startendtag', 'endtag', 'charref', 'entityref', 'data',
            'comment', 'decl' and 'pi'.
        data: the tag name, or the text payload for non-tag tokens.
        attrs: for tags, an iterable of ``(name, value)`` pairs; ``None``
            otherwise.

    A Token iterates as, and compares equal to, the 3-tuple
    ``(type, data, attrs)``.
    """

    def __init__(self, type, data, attrs=None):
        # NOTE: the parameter name ``type`` shadows the builtin, but it is
        # part of the public (keyword-callable) interface and so is kept.
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        """Iterate over ``(type, data, attrs)`` so tokens can be unpacked."""
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        """Compare against any iterable of exactly three items.

        Anything that cannot be unpacked into three items is simply
        unequal, rather than raising from the comparison.
        """
        try:
            other_type, other_data, other_attrs = other
        except (TypeError, ValueError):
            return False
        return bool(
            self.type == other_type
            and self.data == other_data
            and self.attrs == other_attrs
        )

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        args = ', '.join(map(repr, [self.type, self.data, self.attrs]))
        return self.__class__.__name__ + '(%s)' % args

    def __str__(self):
        """Render the token back to markup text.

        Raises:
            ValueError: if ``self.type`` is not one of the known kinds.
        """
        kind = self.type
        if kind in ('starttag', 'startendtag'):
            # Local import keeps the class usable on its own; the module is
            # part of the standard library.
            from xml.sax import saxutils
            parts = []
            for key, value in (self.attrs or ()):
                if value is None:
                    # Attribute without a value, e.g. <input checked>.
                    parts.append(' %s' % key)
                else:
                    parts.append(' %s=%s' % (key, saxutils.quoteattr(value)))
            attrs = ''.join(parts)
            if kind == 'starttag':
                return '<%s%s>' % (self.data, attrs)
            return '<%s%s />' % (self.data, attrs)
        if kind == 'endtag':
            return '</%s>' % self.data
        if kind == 'charref':
            # Numeric character reference, e.g. &#38;
            return '&#%s;' % self.data
        if kind == 'entityref':
            return '&%s;' % self.data
        if kind == 'data':
            return self.data
        if kind == 'comment':
            return '<!--%s-->' % self.data
        if kind == 'decl':
            return '<!%s>' % self.data
        if kind == 'pi':
            return '<?%s>' % self.data
        raise ValueError('unknown token type: %r' % (kind,))
-
-
-
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield ``fn(*args, **kwds)`` repeatedly until it raises ``exception``.

    Args:
        fn: callable invoked once per item produced.
        exception: exception class (or tuple of classes) that marks the end
            of iteration; it is swallowed and the generator finishes.
        *args, **kwds: forwarded unchanged to every call of ``fn``.

    Any other exception raised by ``fn`` propagates to the consumer.
    """
    while True:
        try:
            value = fn(*args, **kwds)
        except exception:
            # A plain return ends the generator; raising StopIteration from
            # inside a generator is turned into RuntimeError by PEP 479.
            return
        yield value
-
-
class _AbstractParser:
    """Pull-style token interface layered on top of a push parser.

    This class supplies the ``handle_*`` callbacks, which queue Token
    objects, and the ``get_*`` methods, which hand them out on demand.
    It calls ``self.feed(data)`` but does not define it: a concrete
    subclass must mix in a parser class that provides ``feed`` and invokes
    the ``handle_*`` callbacks.

    Class attributes:
        chunk: number of characters requested from the file object per read.
        compress_re: pattern matching runs of whitespace, used by
            ``get_compressed_text``.
    """

    chunk = 1024
    compress_re = re.compile(r'\s+')

    def __init__(self, fh, textify=None, encoding='ascii', entitydefs=None):
        """Set up the parser state.

        Args:
            fh: object with a ``read(size)`` method supplying the markup.
            textify: mapping from element name to either an attribute name
                or a callable taking the Token; used by ``get_text`` to turn
                opening tags into text.  Defaults to
                ``{'img': 'alt', 'applet': 'alt'}``; a fresh dict is created
                per instance so that mutating ``self.textify`` on one parser
                cannot leak into others.
            encoding: passed through to the ``unescape`` helpers.
            entitydefs: entity-name mapping passed through to ``unescape``;
                defaults to ``htmlentitydefs.name2codepoint``.
        """
        if textify is None:
            textify = {'img': 'alt', 'applet': 'alt'}
        self._fh = fh
        self._tokenstack = []  # used as a FIFO: append at end, pop at front
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self):
        return self

    def tags(self, *names):
        """Return an iterator over ``get_tag(*names)`` until input runs out."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Return an iterator over ``get_token(*tokentypes)`` until input runs out."""
        return iter_until_exception(
            self.get_token, NoMoreTokensError, *tokentypes)

    def next(self):
        """Return the next token, raising StopIteration at end of input."""
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    # Python 3 spelling of the iterator protocol method.
    __next__ = next

    def get_token(self, *tokentypes):
        """Return the next queued Token, reading more input when needed.

        Args:
            *tokentypes: if given, tokens whose ``type`` is not listed are
                discarded and the search continues.

        Raises:
            NoMoreTokensError: when the file object returns no more data
                and no matching token is queued.
        """
        while True:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if not tokentypes or token.type in tokentypes:
                    return token
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            # feed() comes from the mixed-in parser; it is expected to call
            # back into the handle_* methods, which refill the queue.
            self.feed(data)

    def unget_token(self, token):
        """Push ``token`` back so that it is the next one returned."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next start, end or start-end tag token.

        Args:
            *names: if given, only tags whose ``data`` (element name) is
                listed are returned; other tags are skipped.

        Raises:
            NoMoreTokensError: propagated from ``get_token``.
        """
        while True:
            tok = self.get_token()
            if tok.type not in ('starttag', 'endtag', 'startendtag'):
                continue
            if not names or tok.data in names:
                return tok

    def get_text(self, endat=None):
        """Collect text up to a tag and return it as one string.

        Data tokens are appended verbatim; entity and character references
        are passed through ``unescape`` / ``unescape_charref``.  For an
        opening tag whose element name is a key of ``self.textify``:

        * a callable value is called with the token and its result appended;
        * otherwise, if the token has attrs, the value of every attribute
          named by the mapping is appended, followed by ``[NAME]`` (the
          upper-cased element name).

        Args:
            endat: ``(type, name)`` tuple identifying the tag to stop at.
                With ``None``, collection stops at the first tag of any kind.

        The tag that ends collection is pushed back onto the queue.  When
        input runs out first, the most recently obtained token (if any) is
        pushed back and the text gathered so far is returned; no exception
        is raised.
        """
        text = []
        tok = None
        while True:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # Re-queue the last token we did get, not the failed one.
                if tok is not None:
                    self.unget_token(tok)
                break
            if tok.type == 'data':
                text.append(tok.data)
            elif tok.type == 'entityref':
                text.append(
                    unescape('&%s;' % tok.data, self._entitydefs,
                             self.encoding))
            elif tok.type == 'charref':
                text.append(unescape_charref(tok.data, self.encoding))
            elif tok.type in ('starttag', 'endtag', 'startendtag'):
                tag_name = tok.data
                if tok.type in ('starttag', 'startendtag'):
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append('[%s]' % tag_name.upper())
                if endat is None or endat == (tok.type, tag_name):
                    self.unget_token(tok)
                    break
        return ''.join(text)

    def get_compressed_text(self, *args, **kwds):
        """Like ``get_text`` but stripped, with whitespace runs collapsed to one space."""
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(' ', text)

    # --- parser callbacks: each one queues a Token of the matching type ---

    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token('startendtag', tag, attrs))

    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token('starttag', tag, attrs))

    def handle_endtag(self, tag):
        self._tokenstack.append(Token('endtag', tag))

    def handle_charref(self, name):
        self._tokenstack.append(Token('charref', name))

    def handle_entityref(self, name):
        self._tokenstack.append(Token('entityref', name))

    def handle_data(self, data):
        self._tokenstack.append(Token('data', data))

    def handle_comment(self, data):
        self._tokenstack.append(Token('comment', data))

    def handle_decl(self, decl):
        self._tokenstack.append(Token('decl', decl))

    def unknown_decl(self, data):
        # Unknown declarations are queued as ordinary 'decl' tokens.
        self._tokenstack.append(Token('decl', data))

    def handle_pi(self, data):
        self._tokenstack.append(Token('pi', data))

    def unescape_attr(self, name):
        """Run ``unescape`` on one attribute value with this parser's settings."""
        return unescape(name, self._entitydefs, self.encoding)

    def unescape_attrs(self, attrs):
        """Return a new list of ``(key, value)`` pairs with each value unescaped."""
        return [(key, self.unescape_attr(val)) for key, val in attrs]
-
-
-
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser that uses HTMLParser.HTMLParser as its push parser."""

    def __init__(self, *args, **kwds):
        # Each base is initialised explicitly: the HTMLParser state first,
        # then the token-queue state, which receives all the arguments.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unescape(self, name):
        """Delegate to ``unescape_attr`` so this parser's entity settings are used."""
        return self.unescape_attr(name)
-
-
-
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser that uses sgmllib.SGMLParser as its push parser."""

    def __init__(self, *args, **kwds):
        # Each base is initialised explicitly: the SGMLParser state first,
        # then the token-queue state, which receives all the arguments.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        """Queue a 'starttag' token whose attribute values have been unescaped."""
        self._tokenstack.append(
            Token('starttag', tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        """Queue an 'endtag' token for ``tag``."""
        self._tokenstack.append(Token('endtag', tag))
-
-
-
def _test():
    """Run the doctests found in the ``_pullparser`` module and return the result."""
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)


if __name__ == '__main__':
    _test()
-
-