Maximum CD 2009 June

home *** CD-ROM | disk | FTP | other *** search

/ Maximum CD 2009 June / maximum-cd-2009-06.iso / DiscContents / digsby_setup.exe / lib / lxml / html / __init__.pyo (.txt) next >

Wrap

Python Compiled Bytecode | 2009-02-26 | 36.7 KB | 1,254 lines

# Source Generated with Decompyle++
# File: in.pyo (Python 2.5)
#
# NOTE: this module was recovered from optimized bytecode, so the original
# docstrings and ``assert`` statements were stripped.  The code below has
# been re-indented and the decompiler artifacts (lost list comprehensions,
# lost ``x or y`` expressions, lost format strings) have been repaired.
"""The ``lxml.html`` tool set for HTML handling."""

import threading
import re
try:
    from urlparse import urljoin
except ImportError:
    # Python 3
    from urllib.parse import urljoin
import copy
from lxml import etree
from lxml.html import defs
from lxml import cssselect
from lxml.html._setmixin import SetMixin
try:
    from UserDict import DictMixin
except ImportError:
    # DictMixin was introduced in Python 2.4
    from lxml.html._dictmixin import DictMixin
try:
    set
except NameError:
    # Python 2.3
    from sets import Set as set
try:
    bytes = __builtins__['bytes']
except (KeyError, NameError):
    # Python < 2.6
    bytes = str
try:
    unicode = __builtins__['unicode']
except (KeyError, NameError):
    # Python 3
    unicode = str
try:
    basestring = __builtins__['basestring']
except (KeyError, NameError):
    # Python 3
    basestring = (str, bytes)


def __fix_docstring(s):
    """Strip the ``u''``/``b''`` string prefix from doctest output lines in
    docstring ``s`` so the examples match the running Python version."""
    import sys
    if s is None:
        return None
    if sys.version_info[0] >= 3:
        sub = re.compile("^(\\s*)u'", re.M).sub
    else:
        sub = re.compile("^(\\s*)b'", re.M).sub
    return sub("\\1'", s)


__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links']

XHTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'

_rel_links_xpath = etree.XPath(
    'descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]',
    namespaces={'x': XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    'descendant-or-self::option|descendant-or-self::x:option',
    namespaces={'x': XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    'descendant-or-self::form|descendant-or-self::x:form',
    namespaces={'x': XHTML_NAMESPACE})
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath('descendant-or-self::*[@id=$id]')
_collect_string_content = etree.XPath('string()')
# Matches the target of a CSS ``url(...)`` reference.  (The decompiled
# pattern had ``\$`` where the parentheses belong and could never match.)
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
_css_import_re = re.compile('@import "(.*?)"')
_label_xpath = etree.XPath(
    '//label[@for=$id]|//x:label[@for=$id]',
    namespaces={'x': XHTML_NAMESPACE})
_archive_re = re.compile('[^ ]+')


def _transform_result(typ, result):
    """Serialise ``result`` to the string type ``typ`` (bytes or unicode);
    any other ``typ`` returns ``result`` unchanged."""
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    elif issubclass(typ, unicode):
        return tostring(result, encoding=unicode)
    else:
        return result


def _nons(tag):
    """Return ``tag`` without its XHTML namespace prefix, if it has one.

    Non-string tags (comments, processing instructions) pass through.
    """
    if isinstance(tag, basestring):
        if (tag[0] == '{'
                and tag[1:len(XHTML_NAMESPACE) + 1] == XHTML_NAMESPACE):
            return tag.split('}')[-1]
    return tag


class HtmlMixin(object):
    """HTML-specific helpers mixed into every lxml.html node class."""

    def base_url(self):
        """The URL of the document this node belongs to (may be None)."""
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """All ``<form>`` elements at or below this element."""
        return _forms_xpath(self)
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """The first ``<body>`` element of the document.

        Raises IndexError if the document has no body.
        """
        return self.xpath('//body|//x:body',
                          namespaces={'x': XHTML_NAMESPACE})[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """The first ``<head>`` element of the document.

        Raises IndexError if the document has no head.
        """
        return self.xpath('//head|//x:head',
                          namespaces={'x': XHTML_NAMESPACE})[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """The ``<label>`` whose ``for`` attribute names this element's id,
        or None if the element has no id or no such label exists."""
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    def _label__set(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                'You cannot set a label for an element (%r) that has no id'
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                'You can only assign label to a label element (not %r)'
                % label)
        label.set('for', id)

    def _label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    label = property(_label__get, _label__set, _label__del,
                     doc=_label__get.__doc__)

    def drop_tree(self):
        """Remove this element and all its children from the tree.

        The element's tail text is kept: it is appended to the tail of the
        previous sibling, or to the parent's text if there is none.
        """
        parent = self.getparent()
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """Remove this tag but keep its text, tail and children, which are
        merged into the parent at the position of the tag."""
        parent = self.getparent()
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # Only real elements have text to merge (not comments etc.).
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        parent[index:index + 1] = self[:]

    def find_rel_links(self, rel):
        """Return the ``<a rel=...>`` elements whose ``rel`` equals ``rel``
        (compared case-insensitively)."""
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """Return the elements whose ``class`` attribute contains the
        whitespace-separated token ``class_name``."""
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """Return the first element with the given id.

        If no element matches, return ``default[0]`` when a default was
        passed, otherwise raise KeyError.
        """
        try:
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """Return the XPath ``string()`` value of this element."""
        return _collect_string_content(self)

    def cssselect(self, expr):
        """Run the CSS selector ``expr`` against this element and return
        the result of the selector call."""
        return cssselect.CSSSelector(expr)(self)

    def make_links_absolute(self, base_url=None, resolve_base_href=True):
        """Join every link in the document with ``base_url``.

        ``base_url`` defaults to the document's own URL; TypeError is
        raised if neither is available.  When ``resolve_base_href`` is
        true, ``<base href>`` tags are applied (and removed) first.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    'No base_url given, and the document has no base_url')
        if resolve_base_href:
            self.resolve_base_href()

        def link_repl(href):
            return urljoin(base_url, href)

        # <base> handling was decided above; don't let rewrite_links()
        # resolve it again behind the caller's back.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self):
        """Apply the document's ``<base href>`` to all links and drop every
        ``<base href>`` tag.  The last such tag wins."""
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return None
        self.make_links_absolute(base_href, resolve_base_href=False)

    def iterlinks(self):
        """Yield ``(element, attribute, link, pos)`` for every link found.

        ``attribute`` is None when the link sits in the element's text
        (``<style>`` content); ``pos`` is the offset of the link inside the
        attribute value or text.
        """
        link_attrs = defs.link_attrs
        for el in self.iter():
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag != 'object':
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            else:
                # <object> attributes are relative to its codebase.
                codebase = None
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            if tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    yield (el, 'value', el.get('value'), 0)
            if tag == 'style' and el.text:
                for match in _css_url_re.finditer(el.text):
                    yield (el, None, match.group(1), match.start(1))
                for match in _css_import_re.finditer(el.text):
                    yield (el, None, match.group(1), match.start(1))
            if 'style' in attribs:
                for match in _css_url_re.finditer(attribs['style']):
                    yield (el, 'style', match.group(1), match.start(1))

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """Replace every link with ``link_repl_func(link)``.

        If the function returns None the attribute is removed (or the
        text emptied for in-text links).  With ``base_href`` given, links
        are made absolute against it first; otherwise ``<base href>`` is
        resolved when ``resolve_base_href`` is true.
        """
        if base_href is not None:
            self.make_links_absolute(base_href,
                                     resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()
        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or the element content.
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue
            if attrib is None:
                el.text = el.text[:pos] + new_link + el.text[pos + len(link):]
            else:
                cur = el.attrib[attrib]
                if not pos and len(cur) == len(link):
                    # Most common case: the link is the whole attribute.
                    el.attrib[attrib] = new_link
                else:
                    el.attrib[attrib] = (
                        cur[:pos] + new_link + cur[pos + len(link):])


class _MethodFunc(object):
    """Wrap an HtmlMixin method as a module-level function that accepts
    either an element or an HTML string as its first argument."""

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input"
                    % self.name)
            doc = fromstring(doc, **kw)
        else:
            # Must not be named ``copy``: that would shadow the module
            # needed for deepcopy() just below.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        if result is None:
            # In-place operation: hand back the (possibly re-serialised)
            # document in the type the caller passed in.
            return _transform_result(result_type, doc)
        else:
            return result


find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)


class HtmlComment(etree.CommentBase, HtmlMixin):
    pass


class HtmlElement(etree.ElementBase, HtmlMixin):
    pass


class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    pass


class HtmlEntity(etree.EntityBase, HtmlMixin):
    pass


class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """Element class lookup mapping tag names to HtmlElement subclasses.

    ``classes`` maps lower-case tag names to classes (defaults to a copy
    of ``_default_element_classes``).  ``mixins`` is a sequence of
    ``(tag_name, mixin_class)`` pairs; the name ``'*'`` applies the mixin
    to every tag present in ``classes``.
    """

    # Filled in below as the specialised element classes are defined.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Any other node type: fall through and return None.
        return None


def document_fromstring(html, parser=None, **kw):
    """Parse ``html`` as a complete document and return its root element.

    Raises etree.ParserError if the parser produces no document.
    """
    if parser is None:
        parser = html_parser
    value = etree.fromstring(html, parser, **kw)
    if value is None:
        raise etree.ParserError('Document is empty')
    return value


def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parse ``html`` as a series of fragments and return them as a list.

    The list holds the body's child elements, preceded by the leading
    text (a string) if there is any.  With ``no_leading_text`` true,
    non-whitespace leading text raises etree.ParserError instead.
    """
    if parser is None:
        parser = html_parser
    start = html[:20].lstrip().lower()
    if not start.startswith('<html') and not start.startswith('<!doctype'):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    body = bodies[0]
    elements = []
    if body.text and body.text.strip():
        if no_leading_text:
            raise etree.ParserError(
                'There is leading text: %r' % body.text)
        elements.append(body.text)
    elements.extend(body)
    return elements


def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """Parse ``html`` as a single element and return it.

    With ``create_parent`` true the fragment is wrapped in a new element
    (named by ``create_parent`` if it is a string, else ``div``).
    Otherwise etree.ParserError is raised when the fragment contains no
    element or has text after the element.
    """
    if parser is None:
        parser = html_parser
    if create_parent:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        return fragment_fromstring(
            '<%s>%s</%s>' % (create_parent, html, create_parent),
            parser=parser, base_url=base_url, **kw)
    elements = fragments_fromstring(html, parser=parser,
                                    no_leading_text=True,
                                    base_url=base_url, **kw)
    if not elements:
        raise etree.ParserError('No elements found')
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % el.tail)
    el.tail = None
    return el


def fromstring(html, base_url=None, parser=None, **kw):
    """Parse ``html`` and return a document root, a single element, or a
    ``div``/``span`` wrapper, depending on what the input looks like."""
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document.
        return document_fromstring(html, parser=parser,
                                   base_url=base_url, **kw)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Several bodies: merge them all into the first one.
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = ((body[-1].tail or '')
                                         + other_body.text)
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # There is real document structure, so keep the whole document.
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                other_head.drop_tree()
        return doc
    if (len(body) == 1
            and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        # The body holds exactly one element and no significant text.
        return body[0]
    # Otherwise turn the body itself into a generic container.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body


def parse(filename_or_url, parser=None, base_url=None, **kw):
    """Parse a file, URL or file-like object with the HTML parser and
    return the result of ``etree.parse``."""
    if parser is None:
        parser = html_parser
    return etree.parse(filename_or_url, parser, base_url=base_url, **kw)


def _contains_block_level_tag(el):
    """True if ``el`` or any descendant has a tag in ``defs.block_tags``."""
    for el in el.iter():
        if _nons(el.tag) in defs.block_tags:
            return True
    return False


def _element_name(el):
    """Return a display name for ``el``: 'comment', 'string' or its tag."""
    if isinstance(el, etree.CommentBase):
        return 'comment'
    elif isinstance(el, basestring):
        return 'string'
    else:
        return _nons(el.tag)


class FormElement(HtmlElement):
    """Represents a ``<form>`` element."""

    def inputs(self):
        """An InputGetter giving access to the form's input elements."""
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """Dict-like view of the form's field values, keyed by name."""
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        # Fields not mentioned in ``value`` are reset to None.
        prev_keys = list(self.fields.keys())
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            self.fields[key] = field_value
        for key in prev_keys:
            if key is None:
                # Unnamed input; nothing to reset.
                continue
            self.fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """Name used in reprs: the ``name`` attribute, ``#id``, or the
        form's index among the forms in the body."""
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        # iter() also finds forms nested below other elements, which a
        # findall() on the body's direct children would miss.
        forms = list(self.body.iter('form'))
        if not forms:
            forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """Return a list of ``(name, value)`` tuples for the named inputs.

        Unchecked checkboxes/radios, submit/image/reset inputs and inputs
        whose value is None are skipped.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, el.value))
            else:
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, el.value))
        return results

    def _action__get(self):
        """The form's ``action``, joined with the document's base URL when
        both are available."""
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']

    action = property(_action__get, _action__set, _action__del,
                      doc=_action__get.__doc__)

    def _method__get(self):
        """The form's ``method`` in upper case (default ``GET``)."""
        return self.get('method', 'GET').upper()

    def _method__set(self, value):
        self.set('method', value.upper())

    method = property(_method__get, _method__set, doc=_method__get.__doc__)


HtmlElementClassLookup._default_element_classes['form'] = FormElement


def submit_form(form, extra_values=None, open_http=None):
    """Submit ``form`` and return whatever ``open_http`` returns.

    ``extra_values`` (a dict or a sequence of pairs) is appended to the
    form's values.  ``open_http(method, url, values)`` defaults to
    ``open_http_urllib``.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    return open_http(form.method, form.action, values)


def open_http_urllib(method, url, values):
    """Send ``values`` to ``url`` with urllib and return the response.

    For ``GET`` the values are appended to the query string; any other
    method sends them url-encoded as the request body.
    """
    try:
        from urllib import urlencode, urlopen
    except ImportError:
        # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3's urlopen() requires a bytes request body.
            data = data.encode('ASCII')
    return urlopen(url, data)


class FieldsDict(DictMixin):
    """Dict-like access to the values of a form's inputs."""

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError('You cannot remove keys from ElementDict')

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__, self.inputs.form._name())


class InputGetter(object):
    """Name-based access to the input, select and textarea elements of a
    form.  Several radio buttons or checkboxes sharing a name are
    returned as a RadioGroup / CheckboxGroup."""

    _name_xpath = etree.XPath(
        ".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(
        ".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__, self.form._name())

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError('No input element with the name %r' % name)
        input_type = results[0].get('type')
        if input_type == 'radio' and len(results) > 1:
            group = RadioGroup(results)
            group.name = name
            return group
        elif input_type == 'checkbox' and len(results) > 1:
            group = CheckboxGroup(results)
            group.name = name
            return group
        else:
            return results[0]

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """Return the distinct input names (unnamed inputs excluded)."""
        names = set()
        for el in self:
            names.add(el.name)
        if None in names:
            names.remove(None)
        return list(names)

    def __iter__(self):
        return iter(self._all_xpath(self.form))


class InputMixin(object):
    """Mix-in for all input-like elements (input, select, textarea)."""

    def _name__get(self):
        """The element's ``name`` attribute."""
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        if 'name' in self.attrib:
            del self.attrib['name']

    name = property(_name__get, _name__set, _name__del,
                    doc=_name__get.__doc__)

    def __repr__(self):
        type_name = getattr(self, 'type', None)
        if type_name:
            type_name = ' type=%r' % type_name
        else:
            type_name = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_name)


class TextareaElement(InputMixin, HtmlElement):
    """``<textarea>`` element; ``.value`` reads and writes its text."""

    def _value__get(self):
        """The textarea's text ('' when empty)."""
        return self.text or ''

    def _value__set(self, value):
        self.text = value

    def _value__del(self):
        self.text = ''

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement


class SelectElement(InputMixin, HtmlElement):
    """``<select>`` element.

    ``.value`` is the value of the first selected option (or None); for
    ``<select multiple>`` it is a set-like MultipleSelectOptions object.
    """

    def _value__get(self):
        """The selected value, or a MultipleSelectOptions set."""
        if self.multiple:
            return MultipleSelectOptions(self)
        for el in _options_xpath(self):
            if 'selected' in el.attrib:
                return el.get('value')
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError('You must pass in a sequence')
            self.value.clear()
            self.value.update(value)
            return None
        if value is not None:
            # Locate the option first so a bad value leaves the current
            # selection untouched.
            for el in _options_xpath(self):
                if el.get('value') == value:
                    checked_option = el
                    break
            else:
                raise ValueError(
                    'There is no option with the value of %r' % value)
        for el in _options_xpath(self):
            if 'selected' in el.attrib:
                del el.attrib['selected']
        if value is not None:
            checked_option.set('selected', '')

    def _value__del(self):
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """The ``value`` attribute of every option, in document order."""
        return [el.get('value') for el in _options_xpath(self)]
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """True if this is a ``<select multiple>``."""
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set,
                        doc=_multiple__get.__doc__)


HtmlElementClassLookup._default_element_classes['select'] = SelectElement


class MultipleSelectOptions(SetMixin):
    """Set-like view of the selected values of a ``<select multiple>``.

    Adding a value selects the matching option; removing it unselects it.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """Iterator over all ``<option>`` elements of the select."""
        return iter(_options_xpath(self.select))
    options = property(options)

    def __iter__(self):
        # Only selected options are members of the set.  Yielding every
        # option would make remove()/clear() fail on unselected ones.
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError('There is no option with the value %r' % item)

    def remove(self, item):
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        'The option %r is not currently selected' % item)
                break
        else:
            raise ValueError('There is not option with the value %r' % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)


class RadioGroup(list):
    """A list of radio inputs sharing one name.

    ``.value`` is the value of the checked input (or None); assigning it
    checks the matching input and unchecks the others.
    """

    def _value__get(self):
        """The value of the checked radio input, or None."""
        for el in self:
            if 'checked' in el.attrib:
                return el.get('value')
        return None

    def _value__set(self, value):
        if value is not None:
            for el in self:
                if el.get('value') == value:
                    checked_option = el
                    break
            else:
                raise ValueError(
                    'There is no radio input with the value %r' % value)
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if value is not None:
            checked_option.set('checked', '')

    def _value__del(self):
        self.value = None

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """The ``value`` attribute of every radio input in the group."""
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, list.__repr__(self))


class CheckboxGroup(list):
    """A list of checkbox inputs sharing one name.

    ``.value`` is a set-like CheckboxValues object; assigning a sequence
    replaces the set of checked values.
    """

    def _value__get(self):
        """Set-like object holding the values of the checked boxes."""
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before clearing so a bad value doesn't wipe the
        # current state.  Strings are rejected explicitly because they
        # are iterable on Python 3.
        if isinstance(value, basestring) or not hasattr(value, '__iter__'):
            raise ValueError(
                'A CheckboxGroup (name=%r) must be set to a sequence (not %r)'
                % (self[0].name, value))
        self.value.clear()
        self.value.update(value)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, list.__repr__(self))


class CheckboxValues(SetMixin):
    """Set-like view of the checked values in a CheckboxGroup."""

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        return iter([el.get('value')
                     for el in self.group
                     if 'checked' in el.attrib])

    def add(self, value):
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                break
        else:
            raise KeyError('No checkbox with value %r' % value)

    def remove(self, value):
        for el in self.group:
            if el.get('value') == value:
                if 'checked' in el.attrib:
                    del el.attrib['checked']
                else:
                    raise KeyError(
                        'The checkbox with value %r was already unchecked'
                        % value)
                break
        else:
            raise KeyError('No checkbox with value %r' % value)

    def __repr__(self):
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.group.name)


class InputElement(InputMixin, HtmlElement):
    """``<input>`` element with ``value``, ``type``, ``checkable`` and
    ``checked`` properties."""

    def _value__get(self):
        """The input's value.

        For checkboxes and radios: the ``value`` attribute (or 'on') when
        checked, None when unchecked.
        """
        if self.checkable:
            if self.checked:
                return self.get('value') or 'on'
            else:
                return None
        return self.get('value')

    def _value__set(self, value):
        if self.checkable:
            if not value:
                self.checked = False
            else:
                self.checked = True
                if isinstance(value, basestring):
                    self.set('value', value)
        else:
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def _type__get(self):
        """The lower-cased ``type`` attribute (default 'text')."""
        return self.get('type', 'text').lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """True if this input is a checkbox or radio button."""
        return self.type in ('checkbox', 'radio')
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """Whether the input is checked.

        Raises AttributeError for inputs that are not checkable.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set,
                       doc=_checked__get.__doc__)


HtmlElementClassLookup._default_element_classes['input'] = InputElement


class LabelElement(HtmlElement):
    """``<label>`` element; ``.for_element`` is the element it labels."""

    def _for_element__get(self):
        """The element named by the ``for`` attribute, or None if the
        label has no ``for`` attribute."""
        id = self.get('for')
        if not id:
            return None
        return self.body.get_element_by_id(id)

    def _for_element__set(self, other):
        id = other.get('id')
        if not id:
            raise TypeError('Element %r has no id attribute' % other)
        self.set('for', id)

    def _for_element__del(self):
        # The association lives in ``for``; the label's own ``id`` must
        # not be touched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set,
                           _for_element__del, doc=_for_element__get.__doc__)


HtmlElementClassLookup._default_element_classes['label'] = LabelElement


def html_to_xhtml(html):
    """Move every un-namespaced element of ``html`` (an element or an
    ElementTree) into the XHTML namespace, in place."""
    try:
        html = html.getroot()
    except AttributeError:
        pass
    prefix = '{%s}' % XHTML_NAMESPACE
    for el in html.iter():
        tag = el.tag
        if isinstance(tag, basestring):
            if tag[0] != '{':
                el.tag = prefix + tag


def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every element of ``xhtml`` (an
    element or an ElementTree), in place."""
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        pass
    prefix = '{%s}' % XHTML_NAMESPACE
    prefix_len = len(prefix)
    for el in xhtml.iter(prefix + '*'):
        el.tag = el.tag[prefix_len:]


__replace_meta_content_type = re.compile(
    '<meta http-equiv="Content-Type".*?>').sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method='html'):
    """Serialise ``doc`` with ``etree.tostring``.

    Unless ``include_meta_content_type`` is true, any
    ``<meta http-equiv="Content-Type" ...>`` tag is removed from the
    serialised output.
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding)
    if not include_meta_content_type:
        html = __replace_meta_content_type('', html)
    return html


tostring.__doc__ = __fix_docstring(tostring.__doc__)


def open_in_browser(doc):
    """Write ``doc`` to a temporary ``.html`` file and open that file in
    the default web browser.  The file's URL is printed as well."""
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp() creates the file atomically, unlike os.tempnam().  The
    # file is left behind on purpose so the browser can read it.
    handle, fn = tempfile.mkstemp(suffix='.html')
    os.close(handle)
    write_doc(fn, method='html')
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)


class HTMLParser(etree.HTMLParser):
    """An HTML parser configured to return lxml.html element classes."""

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        self.set_element_class_lookup(HtmlElementClassLookup())


class XHTMLParser(etree.XMLParser):
    """An XML parser configured to return lxml.html element classes."""

    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        self.set_element_class_lookup(HtmlElementClassLookup())


def Element(*args, **kw):
    """Create a new HTML element through the default HTML parser."""
    v = html_parser.makeelement(*args, **kw)
    return v


html_parser = HTMLParser()
xhtml_parser = XHTMLParser()