home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- import urllib
- from html5lib import HTMLParser as _HTMLParser, XHTMLParser as _XHTMLParser
- from lxml import etree
- from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE
- from lxml.html._html5builder import TreeBuilder
-
# Tuple/type used by the ``isinstance`` checks of the parsing helpers below.
# ``basestring`` only exists on Python 2; where the name is missing the
# lookup raises NameError and the (bytes, str) pair is used instead.
try:
    _strings = basestring
except NameError:
    _strings = (bytes, str)
-
-
class HTMLParser(_HTMLParser):
    """html5lib ``HTMLParser`` that always builds its tree with ``TreeBuilder``."""

    def __init__(self, strict=False):
        """Initialise the base parser.

        strict: forwarded unchanged to the html5lib parser constructor.
        """
        options = dict(strict=strict, tree=TreeBuilder)
        _HTMLParser.__init__(self, **options)
-
-
-
class XHTMLParser(_XHTMLParser):
    """html5lib ``XHTMLParser`` that always builds its tree with ``TreeBuilder``."""

    def __init__(self, strict=False):
        """Initialise the base parser.

        strict: forwarded unchanged to the html5lib parser constructor.
        """
        options = dict(strict=strict, tree=TreeBuilder)
        _XHTMLParser.__init__(self, **options)
-
-
-
def _find_tag(tree, tag):
    """Look up ``tag`` under ``tree``, retrying with the XHTML namespace.

    The un-namespaced name is tried first; only when that lookup yields
    ``None`` is ``{XHTML_NAMESPACE}tag`` searched for.  Returns whatever
    ``tree.find`` returns, so ``None`` when neither spelling matches.
    """
    plain = tree.find(tag)
    if plain is None:
        qualified = '{%s}%s' % (XHTML_NAMESPACE, tag)
        return tree.find(qualified)
    return plain
-
-
def document_fromstring(html, guess_charset=True, parser=None):
    """Parse a complete document from a string and return its root element.

    html: the markup; must be an instance of ``_strings``.
    guess_charset: handed to the parser as ``useChardet``.
    parser: object providing ``parse()``; the module-level ``html_parser``
        is used when this is ``None``.

    Returns ``parser.parse(...).getroot()``.
    Raises TypeError when ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    if parser is None:
        parser = html_parser
    return parser.parse(html, useChardet=guess_charset).getroot()
-
-
def fragments_fromstring(html, no_leading_text=False, guess_charset=False,
                         parser=None):
    """Parse several fragments from a string and return them as a sequence.

    html: the markup; must be an instance of ``_strings``.
    no_leading_text: when true, a leading string item in the result is
        dropped if it is whitespace-only and rejected otherwise.  When false
        the result of ``parseFragment`` is returned untouched, so its first
        item may be a string.
    guess_charset: handed to the parser as ``useChardet``.
    parser: object providing ``parseFragment()``; the module-level
        ``html_parser`` is used when this is ``None``.

    Raises TypeError when ``html`` is not a string, and etree.ParserError
    when ``no_leading_text`` is set and there is non-blank leading text.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    if parser is None:
        parser = html_parser

    # 'div' is passed as the container argument of parseFragment.
    children = parser.parseFragment(html, 'div', useChardet=guess_charset)

    if no_leading_text and children and isinstance(children[0], _strings):
        if children[0].strip():
            raise etree.ParserError('There is leading text: %r' % children[0])
        # Whitespace-only leading text carries no content; discard it.
        del children[0]
    return children
-
-
def fragment_fromstring(html, create_parent=False, guess_charset=False,
                        parser=None):
    """Parse exactly one element from a string and return it.

    html: the markup; must be an instance of ``_strings``.
    create_parent: when true, ``html`` is wrapped in a container element
        before parsing so that the single returned element is that wrapper.
        A string value names the wrapper tag; any other true value selects
        ``'div'``.
    guess_charset, parser: forwarded to ``fragments_fromstring``.

    Raises TypeError when ``html`` is not a string, and etree.ParserError
    when the markup yields no element, more than one element, or an element
    followed by non-blank text.  (Leading text is rejected by
    ``fragments_fromstring``, which is called with ``no_leading_text=True``.)
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if create_parent:
        # The decompiled source left a dead ``if not create_parent`` branch
        # here and always used 'div'; honour an explicit tag name instead.
        if isinstance(create_parent, _strings):
            container = create_parent
        else:
            container = 'div'
        html = '<%s>%s</%s>' % (container, html, container)

    children = fragments_fromstring(html, True, guess_charset, parser)
    if not children:
        raise etree.ParserError('No elements found')
    if len(children) > 1:
        raise etree.ParserError('Multiple elements found')

    result = children[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    # Only blank trailing text can remain at this point; drop it.
    result.tail = None
    return result
-
-
def fromstring(html, guess_charset=True, parser=None):
    """Parse a string and return a document, a single element or a container.

    html: the markup; must be an instance of ``_strings``.
    guess_charset, parser: forwarded to ``document_fromstring``.

    Returns
      * the parsed document root when the first 50 characters start (after
        leading whitespace, case-insensitively) with ``<html`` or
        ``<!doctype``, or when the parsed ``head`` element has children;
      * the only child of ``body`` when ``body`` has exactly one child and
        no non-blank text around it;
      * otherwise the ``body`` element itself, retagged ``div`` if
        ``_contains_block_level_tag(body)`` is true and ``span`` if not.

    Raises TypeError when ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser, guess_charset=guess_charset)

    start = html[:50]
    if isinstance(start, bytes) and not isinstance(start, str):
        # Only true where bytes and str are distinct types: decode so the
        # str prefixes below can be compared.  ASCII with replacement is
        # enough for the characters being matched.
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')
    if len(head):
        # A non-empty head means the input was a full document.
        return doc

    body = _find_tag(doc, 'body')
    has_leading_text = body.text and body.text.strip()
    if len(body) == 1 and not has_leading_text:
        if not (body[-1].tail and body[-1].tail.strip()):
            # Exactly one element and nothing but whitespace around it.
            return body[0]

    # The body now only serves as a wrapper for the parsed content.  The
    # decompiled source returned it unchanged while leaving the imported
    # _contains_block_level_tag helper unused; rename the wrapper so it does
    # not claim to be a document body.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
-
-
def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a document from a filename, URL or file-like object.

    filename_url_or_file: a string is opened with ``urllib.urlopen``; any
        other object is passed to the parser as-is.
    guess_charset: handed to the parser as ``useChardet``.
    parser: object providing ``parse()``; the module-level ``html_parser``
        is used when this is ``None``.

    Returns the value of ``parser.parse(...)`` (the root is not extracted
    here, unlike ``document_fromstring``).
    """
    if parser is None:
        parser = html_parser

    # Use the module's string alias rather than the bare ``basestring``
    # builtin, matching the other helpers in this file.
    if not isinstance(filename_url_or_file, _strings):
        # Caller-supplied stream: the caller remains responsible for it.
        return parser.parse(filename_url_or_file, useChardet=guess_charset)

    fp = urllib.urlopen(filename_url_or_file)
    try:
        return parser.parse(fp, useChardet=guess_charset)
    finally:
        # This handle was opened here, so it is released here as well.
        fp.close()
-
# Module-level parser instances created at import time.  ``html_parser`` is
# the fallback used by the functions above when they are called with
# ``parser=None``; ``xhtml_parser`` is not referenced elsewhere in this file.
html_parser, xhtml_parser = HTMLParser(), XHTMLParser()
-