home *** CD-ROM | disk | FTP | other *** search
Wrap
# Source Generated with Decompyle++ # File: in.pyc (Python 2.6) from __future__ import generators __author__ = 'Leonard Richardson (leonardr@segfault.org)' __version__ = '2.1.1' __date__ = '$Date: 2004/10/18 00:14:20 $' __copyright__ = 'Copyright (c) 2004-2005 Leonard Richardson' __license__ = 'PSF' from sgmllib import SGMLParser, SGMLParseError import types import re import sgmllib sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') class NullType(object): def __new__(cls): return Null def __call__(self, *args, **kwargs): return Null def __getattr__(self, attr): return Null def __getitem__(self, item): return Null def __setattr__(self, attr, value): pass def __setitem__(self, item, value): pass def __len__(self): return 0 def __iter__(self): return iter([]) def __contains__(self, item): return False def __repr__(self): return 'Null' Null = object.__new__(NullType) class PageElement: def setup(self, parent = Null, previous = Null): self.parent = parent self.previous = previous self.next = Null self.previousSibling = Null self.nextSibling = Null if self.parent and self.parent.contents: self.previousSibling = self.parent.contents[-1] self.previousSibling.nextSibling = self def findNext(self, name = None, attrs = { }, text = None): return self._first(self.fetchNext, name, attrs, text) firstNext = findNext def fetchNext(self, name = None, attrs = { }, text = None, limit = None): return self._fetch(name, attrs, text, limit, self.nextGenerator) def findNextSibling(self, name = None, attrs = { }, text = None): return self._first(self.fetchNextSiblings, name, attrs, text) firstNextSibling = findNextSibling def fetchNextSiblings(self, name = None, attrs = { }, text = None, limit = None): return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator) def findPrevious(self, name = None, attrs = { }, text = None): return self._first(self.fetchPrevious, name, attrs, text) def fetchPrevious(self, name = None, attrs = { }, text = None, limit = None): return self._fetch(name, attrs, text, limit, self.previousGenerator) firstPrevious = findPrevious def findPreviousSibling(self, name = None, attrs = { }, text = None): return self._first(self.fetchPreviousSiblings, name, attrs, text) firstPreviousSibling = findPreviousSibling def fetchPreviousSiblings(self, name = None, attrs = { }, text = None, limit = None): return self._fetch(name, attrs, text, limit, self.previousSiblingGenerator) def findParent(self, name = None, attrs = { }): r = Null l = self.fetchParents(name, attrs, 1) if l: r = l[0] return r firstParent = findParent def fetchParents(self, name = None, attrs = { }, limit = None): return self._fetch(name, attrs, None, limit, self.parentGenerator) def _first(self, method, name, attrs, text): r = Null l = method(name, attrs, text, 1) if l: r = l[0] return r def _fetch(self, name, attrs, text, limit, generator): if not hasattr(attrs, 'items'): attrs = { 'class': attrs } results = [] g = generator() while True: try: i = g.next() except StopIteration: break found = None if isinstance(i, Tag): if not text: if not name or self._matches(i, name): match = True for attr, matchAgainst in attrs.items(): check = i.get(attr) if not self._matches(check, matchAgainst): match = False break continue if match: found = i elif text: if self._matches(i, text): found = i if found: results.append(found) if limit and len(results) >= limit: break len(results) >= limit return results def nextGenerator(self): i = self while i: i = i.next yield i def nextSiblingGenerator(self): i = self while i: i = i.nextSibling yield i def previousGenerator(self): i = self while i: i = i.previous yield i def previousSiblingGenerator(self): i = self while i: i = i.previousSibling yield i def parentGenerator(self): i = self while i: i = i.parent yield i def _matches(self, chunk, howToMatch): if isList(chunk) and not isinstance(chunk, Tag): for tag in chunk: if isinstance(tag, NavigableText) and self._matches(tag, howToMatch): return True return False if callable(howToMatch): return howToMatch(chunk) if not isinstance(chunk, basestring): chunk = str(chunk) if hasattr(howToMatch, 'match'): return howToMatch.search(chunk) if isList(howToMatch): return chunk in howToMatch if hasattr(howToMatch, 'items'): return howToMatch.has_key(chunk) return str(howToMatch) == chunk class NavigableText(PageElement): def __getattr__(self, attr): if attr == 'string': return self raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) class NavigableString(str, NavigableText): pass class NavigableUnicodeString(unicode, NavigableText): pass class Tag(PageElement): def __init__(self, name, attrs = None, parent = Null, previous = Null): self.name = name if attrs == None: attrs = [] self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False def get(self, key, default = None): return self._getAttrMap().get(key, default) def __getitem__(self, key): return self._getAttrMap()[key] def __iter__(self): return iter(self.contents) def __len__(self): return len(self.contents) def __contains__(self, x): return x in self.contents def __nonzero__(self): return True def __setitem__(self, key, value): self._getAttrMap() self.attrMap[key] = value found = False for i in range(0, len(self.attrs)): if self.attrs[i][0] == key: self.attrs[i] = (key, value) found = True continue if not found: self.attrs.append((key, value)) self._getAttrMap()[key] = value def __delitem__(self, key): for item in self.attrs: if item[0] == key: self.attrs.remove(item) self._getAttrMap() if self.attrMap.has_key(key): del self.attrMap[key] continue def __call__(self, *args, **kwargs): return apply(self.fetch, args, kwargs) def __getattr__(self, tag): if len(tag) > 3 and tag.rfind('Tag') == len(tag) - 3: return self.first(tag[:-3]) if tag.find('__') != 0: return self.first(tag) def __eq__(self, other): if not hasattr(other, 'name') and not hasattr(other, 'attrs') and not hasattr(other, 'contents') and self.name != other.name and self.attrs != other.attrs or len(self) != len(other): return False for i in range(0, len(self.contents)): if self.contents[i] != other.contents[i]: return False return True def __ne__(self, other): return not (self == other) def __repr__(self): return str(self) def __unicode__(self): return self.__str__(1) def __str__(self, needUnicode = None, showStructureIndent = None): attrs = [] if self.attrs: for key, val in self.attrs: attrs.append('%s="%s"' % (key, val)) close = '' closeTag = '' if self.isSelfClosing(): close = ' /' else: closeTag = '</%s>' % self.name indentIncrement = None if showStructureIndent != None: indentIncrement = showStructureIndent if not self.hidden: indentIncrement += 1 contents = self.renderContents(indentIncrement, needUnicode = needUnicode) if showStructureIndent: space = '\n%s' % ' ' * showStructureIndent if self.hidden: s = contents else: s = [] attributeString = '' if attrs: attributeString = ' ' + ' '.join(attrs) if showStructureIndent: s.append(space) s.append('<%s%s%s>' % (self.name, attributeString, close)) s.append(contents) if closeTag and showStructureIndent != None: s.append(space) s.append(closeTag) s = ''.join(s) isUnicode = type(s) == types.UnicodeType if needUnicode and not isUnicode: s = unicode(s) elif isUnicode and needUnicode == False: s = str(s) return s def prettify(self, needUnicode = None): return self.__str__(needUnicode, showStructureIndent = True) def renderContents(self, showStructureIndent = None, needUnicode = None): s = [] for c in self: text = None if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType: text = unicode(c) elif isinstance(c, Tag): s.append(c.__str__(needUnicode, showStructureIndent)) elif needUnicode: text = unicode(c) else: text = str(c) if text: if showStructureIndent != None: if text[-1] == '\n': text = text[:-1] s.append(text) continue return ''.join(s) def firstText(self, text, recursive = True): return self.first(recursive = recursive, text = text) def fetchText(self, text, recursive = True, limit = None): return self.fetch(recursive = recursive, text = text, limit = limit) def first(self, name = None, attrs = { }, recursive = True, text = None): r = Null l = self.fetch(name, attrs, recursive, text, 1) if l: r = l[0] return r findChild = first def fetch(self, name = None, attrs = { }, recursive = True, text = None, limit = None): generator = self.recursiveChildGenerator if not recursive: generator = self.childGenerator return self._fetch(name, attrs, text, limit, generator) fetchChildren = fetch def isSelfClosing(self): return self.name in BeautifulSoup.SELF_CLOSING_TAGS def append(self, tag): self.contents.append(tag) def _getAttrMap(self): if not getattr(self, 'attrMap'): self.attrMap = { } for key, value in self.attrs: self.attrMap[key] = value return self.attrMap def childGenerator(self): for i in range(0, len(self.contents)): yield self.contents[i] raise StopIteration def recursiveChildGenerator(self): stack = [ (self, 0)] while stack: (tag, start) = stack.pop() if isinstance(tag, Tag): for i in range(start, len(tag.contents)): a = tag.contents[i] yield a if isinstance(a, Tag) and tag.contents: if i < len(tag.contents) - 1: stack.append((tag, i + 1)) stack.append((a, 0)) break continue raise StopIteration def isList(l): if not hasattr(l, '__iter__'): pass return type(l) in (types.ListType, types.TupleType) def buildTagMap(default, *args): built = { } for portion in args: if hasattr(portion, 'items'): for k, v in portion.items(): built[k] = v if isList(portion): for k in portion: built[k] = default built[portion] = default return built class BeautifulStoneSoup(Tag, SGMLParser): SELF_CLOSING_TAGS = { } NESTABLE_TAGS = { } RESET_NESTING_TAGS = { } QUOTE_TAGS = { } MS_CHARS = { '\x80': '€', '\x81': ' ', '\x82': '‚', '\x83': 'ƒ', '\x84': '„', '\x85': '…', '\x86': '†', '\x87': '‡', '\x88': '⁁', '\x89': '%', '\x8a': 'Š', '\x8b': '<', '\x8c': 'Œ', '\x8d': '?', '\x8e': 'Z', '\x8f': '?', '\x90': '?', '\x91': '‘', '\x92': '’', '\x93': '“', '\x94': '”', '\x95': '•', '\x96': '–', '\x97': '—', '\x98': '˜', '\x99': '™', '\x9a': 'š', '\x9b': '>', '\x9c': 'œ', '\x9d': '?', '\x9e': 'z', '\x9f': 'Ÿ' } PARSER_MASSAGE = [ (re.compile('(<[^<>]*)/>'), (lambda x: x.group(1) + ' />')), (re.compile('<!\\s+([^<>]*)>'), (lambda x: '<!' + x.group(1) + '>')), (re.compile('([\x80-\x9f])'), (lambda x: BeautifulStoneSoup.MS_CHARS.get(x.group(1))))] ROOT_TAG_NAME = '[document]' def __init__(self, text = None, avoidParserProblems = True, initialTextIsEverything = True): Tag.__init__(self, self.ROOT_TAG_NAME) if avoidParserProblems and not isList(avoidParserProblems): avoidParserProblems = self.PARSER_MASSAGE self.avoidParserProblems = avoidParserProblems SGMLParser.__init__(self) self.quoteStack = [] self.hidden = 1 self.reset() if hasattr(text, 'read'): text = text.read() if text: self.feed(text) if initialTextIsEverything: self.done() def __getattr__(self, methodName): if methodName.find('start_') == 0 and methodName.find('end_') == 0 or methodName.find('do_') == 0: return SGMLParser.__getattr__(self, methodName) if methodName.find('__') != 0: return Tag.__getattr__(self, methodName) raise AttributeError def feed(self, text): if self.avoidParserProblems: for fix, m in self.avoidParserProblems: text = fix.sub(m, text) SGMLParser.feed(self, text) def done(self): self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag() def reset(self): SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.pushTag(self) def popTag(self): tag = self.tagStack.pop() if len(self.currentTag.contents) == 1 and isinstance(self.currentTag.contents[0], NavigableText): self.currentTag.string = self.currentTag.contents[0] if self.tagStack: self.currentTag = self.tagStack[-1] return self.currentTag def pushTag(self, tag): if self.currentTag: self.currentTag.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] def endData(self): currentData = ''.join(self.currentData) if currentData: if not currentData.strip(): if '\n' in currentData: currentData = '\n' else: currentData = ' ' c = NavigableString if type(currentData) == types.UnicodeType: c = NavigableUnicodeString o = c(currentData) o.setup(self.currentTag, self.previous) if self.previous: self.previous.next = o self.previous = o self.currentTag.contents.append(o) self.currentData = [] def _popToTag(self, name, inclusivePop = True): if name == self.ROOT_TAG_NAME: return None numPops = 0 mostRecentTag = None for i in range(len(self.tagStack) - 1, 0, -1): if name == self.tagStack[i].name: numPops = len(self.tagStack) - i break continue name == self.ROOT_TAG_NAME if not inclusivePop: numPops = numPops - 1 for i in range(0, numPops): mostRecentTag = self.popTag() return mostRecentTag def _smartPop(self, name): nestingResetTriggers = self.NESTABLE_TAGS.get(name) isNestable = nestingResetTriggers != None isResetNesting = self.RESET_NESTING_TAGS.has_key(name) popTo = None inclusive = True for i in range(len(self.tagStack) - 1, 0, -1): p = self.tagStack[i] if (not p or p.name == name) and not isNestable: popTo = name break if (nestingResetTriggers != None or p.name in nestingResetTriggers or nestingResetTriggers == None) and isResetNesting and self.RESET_NESTING_TAGS.has_key(p.name): popTo = p.name inclusive = False break p = p.parent if popTo: self._popToTag(popTo, inclusive) def unknown_starttag(self, name, attrs, selfClosing = 0): if self.quoteStack: attrs = ''.join(map((lambda .0: (x, y) = .0' %s="%s"' % (x, y)), attrs)) self.handle_data('<%s%s>' % (name, attrs)) return None self.endData() if name not in self.SELF_CLOSING_TAGS and not selfClosing: self._smartPop(name) tag = Tag(name, attrs, self.currentTag, self.previous) if self.previous: self.previous.next = tag self.previous = tag self.pushTag(tag) if selfClosing or name in self.SELF_CLOSING_TAGS: self.popTag() if name in self.QUOTE_TAGS: self.quoteStack.append(name) self.literal = 1 def unknown_endtag(self, name): if self.quoteStack and self.quoteStack[-1] != name: self.handle_data('</%s>' % name) return None self.endData() self._popToTag(name) if self.quoteStack and self.quoteStack[-1] == name: self.quoteStack.pop() self.literal = len(self.quoteStack) > 0 def handle_data(self, data): self.currentData.append(data) def handle_pi(self, text): self.handle_data('<?%s>' % text) def handle_comment(self, text): self.handle_data('<!--%s-->' % text) def handle_charref(self, ref): self.handle_data('%s;' % ref) def handle_entityref(self, ref): self.handle_data('&%s;' % ref) def handle_decl(self, data): self.handle_data('<!%s>' % data) def parse_declaration(self, i): j = None if self.rawdata[i:i + 9] == '<![CDATA[': k = self.rawdata.find(']]>', i) if k == -1: k = len(self.rawdata) self.handle_data(self.rawdata[i + 9:k]) j = k + 3 else: try: j = SGMLParser.parse_declaration(self, i) except SGMLParseError: toHandle = self.rawdata[i:] self.handle_data(toHandle) j = i + len(toHandle) return j class BeautifulSoup(BeautifulStoneSoup): SELF_CLOSING_TAGS = buildTagMap(None, [ 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) QUOTE_TAGS = { 'script': None } NESTABLE_INLINE_TAGS = [ 'span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center'] NESTABLE_BLOCK_TAGS = [ 'blockquote', 'div', 'fieldset', 'ins', 'del'] NESTABLE_LIST_TAGS = { 'ol': [], 'ul': [], 'li': [ 'ul', 'ol'], 'dl': [], 'dd': [ 'dl'], 'dt': [ 'dl'] } NESTABLE_TABLE_TAGS = { 'table': [], 'tr': [ 'table', 'tbody', 'tfoot', 'thead'], 'td': [ 'tr'], 'th': [ 'tr'] } NON_NESTABLE_BLOCK_TAGS = [ 'address', 'form', 'p', 'pre'] RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', NON_NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) class ICantBelieveItsBeautifulSoup(BeautifulSoup): I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = [ 'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 'big'] I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = [ 'noscript'] NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) class BeautifulSOAP(BeautifulStoneSoup): def popTag(self): if len(self.tagStack) > 1: tag = self.tagStack[-1] parent = self.tagStack[-2] parent._getAttrMap() if isinstance(tag, Tag) and len(tag.contents) == 1 and isinstance(tag.contents[0], NavigableText) and not parent.attrMap.has_key(tag.name): parent[tag.name] = tag.contents[0] BeautifulStoneSoup.popTag(self) class RobustXMLParser(BeautifulStoneSoup): pass class RobustHTMLParser(BeautifulSoup): pass class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): pass class SimplifyingSOAPParser(BeautifulSOAP): pass if __name__ == '__main__': import sys soup = BeautifulStoneSoup(sys.stdin.read()) print soup.prettify()