Personal Computer World 2008 February

home *** CD-ROM | disk | FTP | other *** search

/ Personal Computer World 2008 February / PCWFEB08.iso / Software / Freeware / Miro 1.0 / Miro_Installer.exe / Miro_Downloader.exe / feedparser.pyc (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2007-11-12 | 99.0 KB | 3,456 lines

# Source Generated with Decompyle++ # File: in.pyc (Python 2.5) '''Universal feed parser Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds Visit http://feedparser.org/ for the latest version Visit http://feedparser.org/docs/ for the latest documentation Required: Python 2.1 or later Recommended: Python 2.3 or later Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> ''' __version__ = '4.1' __license__ = "Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.\n\nRedistribution and use in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above copyright notice,\n this list of conditions and the following disclaimer.\n* Redistributions in binary form must reproduce the above copyright notice,\n this list of conditions and the following disclaimer in the documentation\n and/or other materials provided with the distribution.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\nARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\nLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\nCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\nSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\nINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\nCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\nARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE." __author__ = 'Mark Pilgrim <http://diveintomark.org/>' __contributors__ = [ 'Jason Diamond <http://injektilo.org/>', 'John Beimler <http://john.beimler.org/>', 'Fazal Majid <http://www.majid.info/mylos/weblog/>', 'Aaron Swartz <http://aaronsw.com/>', 'Kevin Marks <http://epeus.blogspot.com/>'] _debug = 0 USER_AGENT = 'UniversalFeedParser/%s +http://feedparser.org/' % __version__ import config import prefs USER_AGENT += ' %s/%s (%s)' % (config.get(prefs.SHORT_APP_NAME), config.get(prefs.APP_VERSION), config.get(prefs.PROJECT_URL)) ACCEPT_HEADER = 'application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1' PREFERRED_XML_PARSERS = [ 'drv_libxml2'] TIDY_MARKUP = 0 PREFERRED_TIDY_INTERFACES = [ 'uTidy', 'mxTidy'] import sgmllib import re import sys import copy import urlparse import time import rfc822 import types import cgi import urllib import urllib2 try: from cStringIO import StringIO as _StringIO except: from StringIO import StringIO as _StringIO try: import gzip except: gzip = None try: import zlib except: zlib = None try: import xml.sax as xml xml.sax.make_parser(PREFERRED_XML_PARSERS) from xml.sax.saxutils import escape as _xmlescape _XML_AVAILABLE = 1 except: _XML_AVAILABLE = 0 def _xmlescape(data): data = data.replace('&', '&') data = data.replace('>', '>') data = data.replace('<', '<') return data try: import base64 import binascii except: base64 = binascii = None try: import cjkcodecs.aliases as cjkcodecs except: pass try: import iconv_codec except: pass try: import chardet if _debug: import chardet.constants as chardet chardet.constants._debug = 1 except: chardet = None class ThingsNobodyCaresAboutButMe(Exception): pass class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass class UndeclaredNamespace(Exception): pass sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') sgmllib.special = re.compile('<!') sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]') SUPPORTED_VERSIONS = { '': 'unknown', 'rss090': 'RSS 0.90', 'rss091n': 'RSS 0.91 (Netscape)', 'rss091u': 'RSS 0.91 (Userland)', 'rss092': 'RSS 0.92', 'rss093': 'RSS 0.93', 'rss094': 'RSS 0.94', 'rss20': 'RSS 2.0', 'rss10': 'RSS 1.0', 'rss': 'RSS (unknown version)', 'atom01': 'Atom 0.1', 'atom02': 'Atom 0.2', 'atom03': 'Atom 0.3', 'atom10': 'Atom 1.0', 'atom': 'Atom (unknown version)', 'cdf': 'CDF', 'hotrss': 'Hot RSS' } try: UserDict = dict except NameError: from UserDict import UserDict def dict(aList): rc = { } for k, v in aList: rc[k] = v return rc def _entry_equal(a, b): if type(a) == list and type(b) == list: if len(a) != len(b): return False for i in xrange(len(a)): if not _entry_equal(a[i], b[i]): return False continue return True try: return a.equal(b) except: try: return b.equal(a) return a == b class FeedParserDict(UserDict): keymap = { 'channel': 'feed', 'items': 'entries', 'guid': 'id', 'length': 'filesize', 'image': 'thumbnail', 'date': 'updated', 'date_parsed': 'updated_parsed', 'description': [ 'subtitle', 'summary'], 'url': [ 'href'], 'modified': 'updated', 'modified_parsed': 'updated_parsed', 'issued': 'published', 'issued_parsed': 'published_parsed', 'copyright': 'rights', 'copyright_detail': 'rights_detail', 'tagline': 'subtitle', 'tagline_detail': 'subtitle_detail' } reverse_keymap = { } for key in keymap: if type(keymap[key]) == types.ListType: for k in keymap[key]: reverse_keymap[k] = key reverse_keymap[keymap[key]] = key def __init__(self, initialData = None): if isinstance(initialData, dict): UserDict.__init__(self) for key in initialData: self[key] = initialData[key] elif initialData is not None: UserDict.__init__(self, initialData) else: UserDict.__init__(self) def reverse_key(self, key): if self.reverse_keymap.has_key(key): return self.reverse_keymap[key] else: return key def get_iter(self): class ExtendedIter: def __init__(self, container): self.container = container self.subiter = UserDict.__iter__(container) def __iter__(self): return self def next(self): return self.container.reverse_key(self.subiter.next()) return ExtendedIter(self) def equal(self, other): try: iter = other.get_iter() except: iter = other.__iter__() try: checked = { } for key in iter: if not _entry_equal(self[key], other[key]): return False checked[key] = key for key in self.get_iter(): if not checked.has_key(key): return False continue return True except: return False def __getitem__(self, key): if key == 'category': return UserDict.__getitem__(self, 'tags')[0]['term'] realkey = self.keymap.get(key, key) if type(realkey) == types.ListType: for k in realkey: if UserDict.has_key(self, k): return UserDict.__getitem__(self, k) continue None if key == 'categories' else [] if UserDict.has_key(self, key): return UserDict.__getitem__(self, key) return UserDict.__getitem__(self, realkey) def __setitem__(self, key, value): for k in self.keymap.keys(): if key == k: key = self.keymap[k] if type(key) == types.ListType: key = key[0] type(key) == types.ListType return UserDict.__setitem__(self, key, value) def get(self, key, default = None): if self.has_key(key): return self[key] else: return default def setdefault(self, key, value): if not self.has_key(key): self[key] = value return self[key] def has_key(self, key): try: if not hasattr(self, key): pass return UserDict.has_key(self, key) except AttributeError: return False def __getattr__(self, key): try: if not not key.startswith('_'): raise AssertionError return self.__getitem__(key) except: raise AttributeError, "object has no attribute '%s'" % key def __setattr__(self, key, value): if key.startswith('_') or key == 'data': self.__dict__[key] = value else: return self.__setitem__(key, value) def __contains__(self, key): return self.has_key(key) def zopeCompatibilityHack(): global FeedParserDict, FeedParserDict del FeedParserDict def FeedParserDict(aDict = None): rc = { } if aDict: rc.update(aDict) return rc _ebcdic_to_ascii_map = None def _ebcdic_to_ascii(s): global _ebcdic_to_ascii_map if not _ebcdic_to_ascii_map: emap = (0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 32, 160, 161, 162, 163, 164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33, 38, 169, 170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59, 94, 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44, 37, 95, 62, 63, 186, 187, 188, 189, 190, 191, 192, 193, 194, 96, 58, 35, 64, 39, 61, 34, 195, 97, 98, 99, 100, 101, 102, 103, 104, 105, 196, 197, 198, 199, 200, 201, 202, 106, 107, 108, 109, 110, 111, 112, 113, 114, 203, 204, 205, 206, 207, 208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 123, 65, 66, 67, 68, 69, 70, 71, 72, 73, 232, 233, 234, 235, 236, 237, 125, 74, 75, 76, 77, 78, 79, 80, 81, 82, 238, 239, 240, 241, 242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89, 90, 244, 245, 246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 250, 251, 252, 253, 254, 255) import string _ebcdic_to_ascii_map = string.maketrans(''.join(map(chr, range(256))), ''.join(map(chr, emap))) return s.translate(_ebcdic_to_ascii_map) _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') def _urljoin(base, uri): uri = _urifixer.sub('\\1\\3', uri) return urlparse.urljoin(base, uri) class _FeedParserMixin: namespaces = { '': '', 'http://backend.userland.com/rss': '', 'http://blogs.law.harvard.edu/tech/rss': '', 'http://purl.org/rss/1.0/': '', 'http://my.netscape.com/rdf/simple/0.9/': '', 'http://example.com/newformat#': '', 'http://example.com/necho': '', 'http://purl.org/echo/': '', 'uri/of/echo/namespace#': '', 'http://purl.org/pie/': '', 'http://purl.org/atom/ns#': '', 'http://www.w3.org/2005/Atom': '', 'http://purl.org/rss/1.0/modules/rss091#': '', 'http://webns.net/mvcb/': 'admin', 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', 'http://media.tangent.org/rss/1.0/': 'audio', 'http://backend.userland.com/blogChannelModule': 'blogChannel', 'http://web.resource.org/cc/': 'cc', 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons', 'http://purl.org/rss/1.0/modules/company': 'co', 'http://purl.org/rss/1.0/modules/content/': 'content', 'http://my.theinfo.org/changed/1.0/rss/': 'cp', 'http://purl.org/dc/elements/1.1/': 'dc', 'http://purl.org/dc/terms/': 'dcterms', 'http://purl.org/rss/1.0/modules/email/': 'email', 'http://purl.org/rss/1.0/modules/event/': 'ev', 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner', 'http://freshmeat.net/rss/fm/': 'fm', 'http://xmlns.com/foaf/0.1/': 'foaf', 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', 'http://postneo.com/icbm/': 'icbm', 'http://purl.org/rss/1.0/modules/image/': 'image', 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', 'http://purl.org/rss/1.0/modules/link/': 'l', 'http://search.yahoo.com/mrss': 'media', 'http://search.yahoo.com/mrss/': 'media', 'http://docs.yahoo.com/mediaModule': 'media', 'http://tools.search.yahoo.com/mrss/': 'media', 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', 'http://purl.org/rss/1.0/modules/reference/': 'ref', 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv', 'http://purl.org/rss/1.0/modules/search/': 'search', 'http://purl.org/rss/1.0/modules/slash/': 'slash', 'http://schemas.xmlsoap.org/soap/envelope/': 'soap', 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss', 'http://hacks.benhammersley.com/rss/streaming/': 'str', 'http://purl.org/rss/1.0/modules/subscription/': 'sub', 'http://purl.org/rss/1.0/modules/syndication/': 'sy', 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', 'http://purl.org/rss/1.0/modules/threading/': 'thr', 'http://purl.org/rss/1.0/modules/textinput/': 'ti', 'http://madskills.com/public/xml/rss/module/trackback/': 'trackback', 'http://wellformedweb.org/commentAPI/': 'wfw', 'http://purl.org/rss/1.0/modules/wiki/': 'wiki', 'http://www.w3.org/1999/xhtml': 'xhtml', 'http://www.w3.org/XML/1998/namespace': 'xml', 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf', 'http://participatoryculture.org/RSSModules/dtv/1.0': 'dtv' } _matchnamespaces = { } can_be_relative_uri = [ 'link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo'] can_contain_relative_uris = [ 'content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] can_contain_dangerous_markup = [ 'content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] html_types = [ 'text/html', 'application/xhtml+xml'] def __init__(self, baseuri = None, baselang = None, encoding = 'utf-8'): if _debug: sys.stderr.write('initializing FeedParser\n') if not self._matchnamespaces: for k, v in self.namespaces.items(): self._matchnamespaces[k.lower()] = v self.feeddata = FeedParserDict() self.encoding = encoding self.entries = [] self.version = '' self.namespacesInUse = { } self.infeed = 0 self.inentry = 0 self.incontent = 0 self.intextinput = 0 self.inimage = 0 self.inauthor = 0 self.incontributor = 0 self.inenclosure = 0 self.inpublisher = 0 self.insource = 0 self.sourcedata = FeedParserDict() self.contentparams = FeedParserDict() self._summaryKey = None self.namespacemap = { } self.elementstack = [] self.basestack = [] self.langstack = [] if not baseuri: pass self.baseuri = '' if not baselang: pass self.lang = None if baselang: self.feeddata['language'] = baselang def unknown_starttag(self, tag, attrs): if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs)) attrs = [ (k.lower(), v) for k, v in attrs ] attrs = [ (k, v) for k, v in attrs ] attrsD = FeedParserDict(attrs) if not attrsD.get('xml:base', attrsD.get('base')): pass baseuri = self.baseuri self.baseuri = _urljoin(self.baseuri, baseuri) lang = attrsD.get('xml:lang', attrsD.get('lang')) if lang == '': lang = None elif lang is None: lang = self.lang self.lang = lang self.basestack.append(self.baseuri) self.langstack.append(lang) for prefix, uri in attrs: if prefix.startswith('xmlns:'): self.trackNamespace(prefix[6:], uri) continue None if lang else [] if prefix == 'xmlns': self.trackNamespace(None, uri) continue if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): self.contentparams['type'] = 'application/xhtml+xml' if tag.find(':') != -1: (prefix, suffix) = tag.split(':', 1) else: prefix = '' suffix = tag prefix = self.namespacemap.get(prefix, prefix) if prefix: prefix = prefix + '_' if not prefix and tag not in ('title', 'link', 'description', 'name'): self.intextinput = 0 if not prefix and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): self.inimage = 0 methodname = '_start_' + prefix + suffix try: method = getattr(self, methodname) return method(attrsD) except AttributeError: return self.push(prefix + suffix, 1) def unknown_endtag(self, tag): if _debug: sys.stderr.write('end %s\n' % tag) if tag.find(':') != -1: (prefix, suffix) = tag.split(':', 1) else: prefix = '' suffix = tag prefix = self.namespacemap.get(prefix, prefix) if prefix: prefix = prefix + '_' methodname = '_end_' + prefix + suffix try: method = getattr(self, methodname) method() except AttributeError: self.pop(prefix + suffix) if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): self.contentparams['type'] = 'application/xhtml+xml' if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': tag = tag.split(':')[-1] self.handle_data('</%s>' % tag, escape = 0) if self.basestack: self.basestack.pop() if self.basestack and self.basestack[-1]: self.baseuri = self.basestack[-1] if self.langstack: self.langstack.pop() if self.langstack: self.lang = self.langstack[-1] def handle_charref(self, ref): if not self.elementstack: return None ref = ref.lower() if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'): text = '&#%s;' % ref elif ref[0] == 'x': c = int(ref[1:], 16) else: c = int(ref) text = unichr(c).encode('utf-8') self.elementstack[-1][2].append(text) def handle_entityref(self, ref): if not self.elementstack: return None if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref else: def name2cp(k): import htmlentitydefs if hasattr(htmlentitydefs, 'name2codepoint'): return htmlentitydefs.name2codepoint[k] k = htmlentitydefs.entitydefs[k] if k.startswith('&#') and k.endswith(';'): return int(k[2:-1]) return ord(k) try: name2cp(ref) except KeyError: text = '&%s;' % ref text = unichr(name2cp(ref)).encode('utf-8') self.elementstack[-1][2].append(text) def handle_data(self, text, escape = 1): if not self.elementstack: return None if escape and self.contentparams.get('type') == 'application/xhtml+xml': text = _xmlescape(text) self.elementstack[-1][2].append(text) def handle_comment(self, text): pass def handle_pi(self, text): pass def handle_decl(self, text): pass def parse_declaration(self, i): if _debug: sys.stderr.write('entering parse_declaration\n') if self.rawdata[i:i + 9] == '<![CDATA[': k = self.rawdata.find(']]>', i) if k == -1: k = len(self.rawdata) self.handle_data(_xmlescape(self.rawdata[i + 9:k]), 0) return k + 3 else: k = self.rawdata.find('>', i) return k + 1 def mapContentType(self, contentType): contentType = contentType.lower() if contentType == 'text': contentType = 'text/plain' elif contentType == 'html': contentType = 'text/html' elif contentType == 'xhtml': contentType = 'application/xhtml+xml' return contentType def trackNamespace(self, prefix, uri): loweruri = uri.lower() if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not (self.version): self.version = 'rss090' if loweruri == 'http://purl.org/rss/1.0/' and not (self.version): self.version = 'rss10' if loweruri == 'http://www.w3.org/2005/atom' and not (self.version): self.version = 'atom10' if loweruri.find('backend.userland.com/rss') != -1: uri = 'http://backend.userland.com/rss' loweruri = uri if self._matchnamespaces.has_key(loweruri): self.namespacemap[prefix] = self._matchnamespaces[loweruri] self.namespacesInUse[self._matchnamespaces[loweruri]] = uri elif not prefix: pass self.namespacesInUse[''] = uri def resolveURI(self, uri): if not self.baseuri: pass return _urljoin('', uri) def decodeEntities(self, element, data): return data def push(self, element, expectingText): self.elementstack.append([ element, expectingText, []]) def pop(self, element, stripWhitespace = 1): if not self.elementstack: return None if self.elementstack[-1][0] != element: return None (element, expectingText, pieces) = self.elementstack.pop() output = ''.join(pieces) if stripWhitespace: output = output.strip() if not expectingText: return output if base64 and self.contentparams.get('base64', 0): try: output = base64.decodestring(output) except binascii.Error: pass except binascii.Incomplete: pass except: None<EXCEPTION MATCH>binascii.Error None<EXCEPTION MATCH>binascii.Error if element in self.can_be_relative_uri and output: output = self.resolveURI(output) if not self.contentparams.get('base64', 0): output = self.decodeEntities(element, output) try: del self.contentparams['mode'] except KeyError: pass try: del self.contentparams['base64'] except KeyError: pass if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: if element in self.can_contain_relative_uris: output = _resolveRelativeURIs(output, self.baseuri, self.encoding) if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: if element in self.can_contain_dangerous_markup: output = sanitizeHTML(output, self.encoding) if self.encoding and type(output) != type(u''): try: output = unicode(output, self.encoding) if element == 'category': return output if self.inentry and not (self.insource): if element == 'content': self.entries[-1].setdefault(element, []) contentparams = copy.deepcopy(self.contentparams) contentparams['value'] = output self.entries[-1][element].append(contentparams) elif element == 'link': self.entries[-1][element] = output if output: self.entries[-1]['links'][-1]['href'] = output elif element == 'description': element = 'summary' self.entries[-1][element] = output if self.incontent: contentparams = copy.deepcopy(self.contentparams) contentparams['value'] = output self.entries[-1][element + '_detail'] = contentparams elif (self.infeed or self.insource) and not (self.intextinput) and not (self.inimage): context = self._getContext() if element == 'description': element = 'subtitle' context[element] = output if element == 'link': context['links'][-1]['href'] = output elif self.incontent: contentparams = copy.deepcopy(self.contentparams) contentparams['value'] = output context[element + '_detail'] = contentparams return output def pushContent(self, tag, attrsD, defaultContentType, expectingText): self.incontent += 1 self.contentparams = FeedParserDict({ 'type': self.mapContentType(attrsD.get('type', defaultContentType)), 'language': self.lang, 'base': self.baseuri }) self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams) self.push(tag, expectingText) def popContent(self, tag): value = self.pop(tag) self.incontent -= 1 self.contentparams.clear() return value def _mapToStandardPrefix(self, name): colonpos = name.find(':') if colonpos != -1: prefix = name[:colonpos] suffix = name[colonpos + 1:] prefix = self.namespacemap.get(prefix, prefix) name = prefix + ':' + suffix return name def _getAttribute(self, attrsD, name): return attrsD.get(self._mapToStandardPrefix(name)) def _isBase64(self, attrsD, contentparams): if attrsD.get('mode', '') == 'base64': return 1 else: return 0 if self.contentparams['type'].startswith('text/'): return 0 if self.contentparams['type'].endswith('+xml'): return 0 if self.contentparams['type'].endswith('/xml'): return 0 return 1 def _itsAnHrefDamnIt(self, attrsD): href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None))) if href: try: del attrsD['url'] except KeyError: pass try: del attrsD['uri'] except KeyError: pass attrsD['href'] = href return attrsD def _save(self, key, value): context = self._getContext() context.setdefault(key, value) def _start_rss(self, attrsD): versionmap = { '0.91': 'rss091u', '0.92': 'rss092', '0.93': 'rss093', '0.94': 'rss094' } if not self.version: attr_version = attrsD.get('version', '') version = versionmap.get(attr_version) if version: self.version = version elif attr_version.startswith('2.'): self.version = 'rss20' else: self.version = 'rss' def _start_dlhottitles(self, attrsD): self.version = 'hotrss' def _start_channel(self, attrsD): self.infeed = 1 self._cdf_common(attrsD) _start_feedinfo = _start_channel def _cdf_common(self, attrsD): if attrsD.has_key('lastmod'): self._start_modified({ }) self.elementstack[-1][-1] = attrsD['lastmod'] self._end_modified() if attrsD.has_key('href'): self._start_link({ }) self.elementstack[-1][-1] = attrsD['href'] self._end_link() def _start_feed(self, attrsD): self.infeed = 1 versionmap = { '0.1': 'atom01', '0.2': 'atom02', '0.3': 'atom03' } if not self.version: attr_version = attrsD.get('version') version = versionmap.get(attr_version) if version: self.version = version else: self.version = 'atom' def _end_channel(self): self.infeed = 0 _end_feed = _end_channel def _start_image(self, attrsD): self.inimage = 1 self.push('image', 0) context = self._getContext() context.setdefault('image', FeedParserDict()) def _end_image(self): self.pop('image') self.inimage = 0 def _start_textinput(self, attrsD): self.intextinput = 1 self.push('textinput', 0) context = self._getContext() context.setdefault('textinput', FeedParserDict()) _start_textInput = _start_textinput def _end_textinput(self): self.pop('textinput') self.intextinput = 0 _end_textInput = _end_textinput def _start_author(self, attrsD): self.inauthor = 1 self.push('author', 1) _start_managingeditor = _start_author _start_dc_author = _start_author _start_dc_creator = _start_author _start_itunes_author = _start_author def _end_author(self): self.pop('author') self.inauthor = 0 self._sync_author_detail() _end_managingeditor = _end_author _end_dc_author = _end_author _end_dc_creator = _end_author _end_itunes_author = _end_author def _start_itunes_owner(self, attrsD): self.inpublisher = 1 self.push('publisher', 0) def _end_itunes_owner(self): self.pop('publisher') self.inpublisher = 0 self._sync_author_detail('publisher') def _start_contributor(self, attrsD): self.incontributor = 1 context = self._getContext() context.setdefault('contributors', []) context['contributors'].append(FeedParserDict()) self.push('contributor', 0) def _end_contributor(self): self.pop('contributor') self.incontributor = 0 def _start_dc_contributor(self, attrsD): self.incontributor = 1 context = self._getContext() context.setdefault('contributors', []) context['contributors'].append(FeedParserDict()) self.push('name', 0) def _end_dc_contributor(self): self._end_name() self.incontributor = 0 def _start_name(self, attrsD): self.push('name', 0) _start_itunes_name = _start_name def _end_name(self): value = self.pop('name') if self.inpublisher: self._save_author('name', value, 'publisher') elif self.inauthor: self._save_author('name', value) elif self.incontributor: self._save_contributor('name', value) elif self.intextinput: context = self._getContext() context['textinput']['name'] = value _end_itunes_name = _end_name def _start_width(self, attrsD): self.push('width', 0) def _end_width(self): value = self.pop('width') try: value = int(value) except: value = 0 if self.inimage: context = self._getContext() context['image']['width'] = value def _start_height(self, attrsD): self.push('height', 0) def _end_height(self): value = self.pop('height') try: value = int(value) except: value = 0 if self.inimage: context = self._getContext() context['image']['height'] = value def _start_url(self, attrsD): self.push('href', 1) _start_homepage = _start_url _start_uri = _start_url def _end_url(self): value = self.pop('href') if self.inauthor: self._save_author('href', value) elif self.incontributor: self._save_contributor('href', value) elif self.inimage: context = self._getContext() context['image']['href'] = value elif self.intextinput: context = self._getContext() context['textinput']['link'] = value _end_homepage = _end_url _end_uri = _end_url def _start_email(self, attrsD): self.push('email', 0) _start_itunes_email = _start_email def _end_email(self): value = self.pop('email') if self.inpublisher: self._save_author('email', value, 'publisher') elif self.inauthor: self._save_author('email', value) elif self.incontributor: self._save_contributor('email', value) _end_itunes_email = _end_email def _getContext(self): if self.insource: context = self.sourcedata elif self.inentry: context = self.entries[-1] else: context = self.feeddata return context def _save_author(self, key, value, prefix = 'author'): context = self._getContext() context.setdefault(prefix + '_detail', FeedParserDict()) context[prefix + '_detail'][key] = value self._sync_author_detail() def _save_contributor(self, key, value): context = self._getContext() context.setdefault('contributors', [ FeedParserDict()]) context['contributors'][-1][key] = value def _sync_author_detail(self, key = 'author'): context = self._getContext() detail = context.get('%s_detail' % key) if detail: name = detail.get('name') email = detail.get('email') if name and email: context[key] = '%s (%s)' % (name, email) elif name: context[key] = name elif email: context[key] = email else: author = context.get(key) if not author: return None emailmatch = re.search('(([a-zA-Z0-9\\_\\-\\.\\+]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([a-zA-Z0-9\\-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?))', author) if not emailmatch: return None email = emailmatch.group(0) author = author.replace(email, '') author = author.replace('()', '') author = author.strip() if author and author[0] == '(': author = author[1:] if author and author[-1] == ')': author = author[:-1] author = author.strip() context.setdefault('%s_detail' % key, FeedParserDict()) context['%s_detail' % key]['name'] = author context['%s_detail' % key]['email'] = email def _start_subtitle(self, attrsD): self.pushContent('subtitle', attrsD, 'text/plain', 1) _start_tagline = _start_subtitle _start_itunes_subtitle = _start_subtitle def _end_subtitle(self): self.popContent('subtitle') _end_tagline = _end_subtitle _end_itunes_subtitle = _end_subtitle def _start_rights(self, attrsD): self.pushContent('rights', attrsD, 'text/plain', 1) _start_dc_rights = _start_rights _start_copyright = _start_rights def _end_rights(self): self.popContent('rights') _end_dc_rights = _end_rights _end_copyright = _end_rights def _start_item(self, attrsD): self.entries.append(FeedParserDict()) self.push('item', 0) self.inentry = 1 self.guidislink = 0 id = self._getAttribute(attrsD, 'rdf:about') if id: context = self._getContext() context['id'] = id self._cdf_common(attrsD) _start_entry = _start_item _start_product = _start_item def _end_item(self): self.pop('item') self.inentry = 0 _end_entry = _end_item def _start_dc_language(self, attrsD): self.push('language', 1) _start_language = _start_dc_language def _end_dc_language(self): self.lang = self.pop('language') _end_language = _end_dc_language def _start_dc_publisher(self, attrsD): self.push('publisher', 1) _start_webmaster = _start_dc_publisher def _end_dc_publisher(self): self.pop('publisher') self._sync_author_detail('publisher') _end_webmaster = _end_dc_publisher def _start_published(self, attrsD): self.push('published', 1) _start_dcterms_issued = _start_published _start_issued = _start_published def _end_published(self): value = self.pop('published') self._save('published_parsed', _parse_date(value)) _end_dcterms_issued = _end_published _end_issued = _end_published def _start_updated(self, attrsD): self.push('updated', 1) _start_modified = _start_updated _start_dcterms_modified = _start_updated _start_pubdate = _start_updated _start_dc_date = _start_updated def _end_updated(self): value = self.pop('updated') parsed_value = _parse_date(value) self._save('updated_parsed', parsed_value) _end_modified = _end_updated _end_dcterms_modified = _end_updated _end_pubdate = _end_updated _end_dc_date = _end_updated def _start_created(self, attrsD): self.push('created', 1) _start_dcterms_created = _start_created def _end_created(self): value = self.pop('created') self._save('created_parsed', _parse_date(value)) _end_dcterms_created = _end_created def _start_expirationdate(self, attrsD): self.push('expired', 1) def _end_expirationdate(self): self._save('expired_parsed', _parse_date(self.pop('expired'))) def _start_cc_license(self, attrsD): self.push('license', 1) value = self._getAttribute(attrsD, 'rdf:resource') if value: self.elementstack[-1][2].append(value) self.pop('license') def _start_creativecommons_license(self, attrsD): self.push('license', 1) def _end_creativecommons_license(self): self.pop('license') def _addTag(self, term, scheme, label): context = self._getContext() tags = context.setdefault('tags', []) if not term and not scheme and not label: return None value = FeedParserDict({ 'term': term, 'scheme': scheme, 'label': label }) if value not in tags: tags.append(FeedParserDict({ 'term': term, 'scheme': scheme, 'label': label })) def _start_category(self, attrsD): if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) term = attrsD.get('term') scheme = attrsD.get('scheme', attrsD.get('domain')) label = attrsD.get('label') self._addTag(term, scheme, label) self.push('category', 1) _start_dc_subject = _start_category _start_keywords = _start_category _start_media_category = _start_category def _end_itunes_keywords(self): for term in self.pop('itunes_keywords').split(): self._addTag(term, 'http://www.itunes.com/', None) def _start_itunes_category(self, attrsD): self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) self.push('category', 1) def _end_category(self): value = self.pop('category') if not value: return None context = self._getContext() tags = context['tags'] if value and len(tags) and not tags[-1]['term']: tags[-1]['term'] = value else: self._addTag(value, None, None) _end_dc_subject = _end_category _end_keywords = _end_category _end_itunes_category = _end_category _end_media_category = _end_category def _start_cloud(self, attrsD): self._getContext()['cloud'] = FeedParserDict(attrsD) def _start_link(self, attrsD): attrsD.setdefault('rel', 'alternate') attrsD.setdefault('type', 'text/html') attrsD = self._itsAnHrefDamnIt(attrsD) if attrsD.has_key('href'): attrsD['href'] = self.resolveURI(attrsD['href']) if not self.infeed and self.inentry: pass expectingText = self.insource context = self._getContext() context.setdefault('links', []) context['links'].append(FeedParserDict(attrsD)) if attrsD['rel'] == 'enclosure': self._start_enclosure(attrsD) if attrsD.has_key('href'): expectingText = 0 if attrsD.get('rel') == 'alternate' and self.mapContentType(attrsD.get('type')) in self.html_types: context['link'] = attrsD['href'] else: self.push('link', expectingText) _start_producturl = _start_link def _end_link(self): value = self.pop('link') context = self._getContext() if self.intextinput: context['textinput']['link'] = value if self.inimage: context['image']['link'] = value _end_producturl = _end_link def _start_guid(self, attrsD): self.guidislink = attrsD.get('ispermalink', 'true') == 'true' self.push('id', 1) def _end_guid(self): value = self.pop('id') if self.guidislink: pass self._save('guidislink', not self._getContext().has_key('link')) if self.guidislink: self._save('link', value) def _start_title(self, attrsD): if not self.infeed and self.inentry: pass self.pushContent('title', attrsD, 'text/plain', self.insource) _start_dc_title = _start_title _start_media_title = _start_title def _end_title(self): value = self.popContent('title') context = self._getContext() if self.intextinput: context['textinput']['title'] = value elif self.inimage: context['image']['title'] = value _end_dc_title = _end_title _end_media_title = _end_title def _start_description(self, attrsD): context = self._getContext() if context.has_key('summary'): self._summaryKey = 'content' self._start_content(attrsD) elif not self.infeed and self.inentry: pass self.pushContent('description', attrsD, 'text/html', self.insource) def _start_abstract(self, attrsD): if not self.infeed and self.inentry: pass self.pushContent('description', attrsD, 'text/plain', self.insource) def _end_description(self): if self._summaryKey == 'content': self._end_content() else: value = self.popContent('description') context = self._getContext() if self.intextinput: context['textinput']['description'] = value elif self.inimage: context['image']['description'] = value self._summaryKey = None _end_abstract = _end_description def _start_info(self, attrsD): self.pushContent('info', attrsD, 'text/plain', 1) _start_feedburner_browserfriendly = _start_info def _end_info(self): self.popContent('info') _end_feedburner_browserfriendly = _end_info def _start_generator(self, attrsD): if attrsD: attrsD = self._itsAnHrefDamnIt(attrsD) if attrsD.has_key('href'): attrsD['href'] = self.resolveURI(attrsD['href']) self._getContext()['generator_detail'] = FeedParserDict(attrsD) self.push('generator', 1) def _end_generator(self): value = self.pop('generator') context = self._getContext() if context.has_key('generator_detail'): context['generator_detail']['name'] = value def _start_admin_generatoragent(self, attrsD): self.push('generator', 1) value = self._getAttribute(attrsD, 'rdf:resource') if value: self.elementstack[-1][2].append(value) self.pop('generator') self._getContext()['generator_detail'] = FeedParserDict({ 'href': value }) def _start_admin_errorreportsto(self, attrsD): self.push('errorreportsto', 1) value = self._getAttribute(attrsD, 'rdf:resource') if value: self.elementstack[-1][2].append(value) self.pop('errorreportsto') def _start_summary(self, attrsD): context = self._getContext() if context.has_key('summary'): self._summaryKey = 'content' self._start_content(attrsD) else: self._summaryKey = 'summary' self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) _start_itunes_summary = _start_summary def _end_summary(self): if self._summaryKey == 'content': self._end_content() elif not self._summaryKey: pass self.popContent('summary') self._summaryKey = None _end_itunes_summary = _end_summary def _start_enclosure(self, attrsD): self.inenclosure += 1 attrsD = self._itsAnHrefDamnIt(attrsD) self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD)) href = attrsD.get('href') if href: context = self._getContext() if not context.get('id'): context['id'] = href _start_media_content = _start_enclosure def _end_enclosure(self): self.inenclosure -= 1 _end_media_content = _end_enclosure def _start_media_thumbnail(self, attrsD): self.push('media:thumbnail', 1) if self.inentry: if self.inenclosure: self.entries[-1]['enclosures'][-1]['thumbnail'] = FeedParserDict(attrsD) else: self.entries[-1]['thumbnail'] = FeedParserDict(attrsD) def _end_media_thumbnail(self): self.pop('media:thumbnail') def _start_media_text(self, attrsD): self.push('media:text', 1) def _end_media_text(self): value = self.pop('media:text') if self.inentry: if self.inenclosure: self.entries[-1]['enclosures'][-1]['text'] = value else: self.entries[-1]['text'] = value def _start_media_people(self, attrsD): self.push('media:people', 1) try: self.peoplerole = attrsD['role'] except: self.peoplerole = 'unknown' def _end_media_people(self): value = self.pop('media:people').split('|') if self.inentry: if self.inenclosure: self.entries[-1]['enclosures'][-1].setdefault('roles', { }) self.entries[-1]['enclosures'][-1].roles[self.peoplerole] = value else: self.entries[-1].setdefault('roles', { }) self.entries[-1].roles[self.peoplerole] = value def _start_dtv_startnback(self, attrsD): self.push('dtv:startnback', 1) def _end_dtv_startnback(self): self.feeddata['startnback'] = self.pop('dtv:startnback') def _start_dtv_librarylink(self, attrsD): self.push('dtv:librarylink', 1) def _end_dtv_librarylink(self): self.feeddata['librarylink'] = self.pop('dtv:librarylink') def _start_dtv_releasedate(self, attrsD): self.push('dtv:releasedate', 1) def _end_dtv_releasedate(self): value = self.pop('dtv:releasedate') if self.inentry: if self.inenclosure: self.entries[-1]['enclosures'][-1]['releasedate'] = value self.entries[-1]['enclosures'][-1]['releasedate_parsed'] = _parse_date(value) else: self.entries[-1]['releasedate'] = value self.entries[-1]['releasedate_parsed'] = _parse_date(value) def _start_dtv_paymentlink(self, attrsD): self.incontent += 1 self.contentparams['mode'] = 'xml' self.contentparams['type'] = 'application/xhtml+xml' self.push('dtv:paymentlink', 1) if self.inentry: if attrsD.has_key('url'): if self.inenclosure: self.entries[-1]['enclosures'][-1]['payment_url'] = attrsD['url'] else: self.entries[-1]['payment_url'] = attrsD['url'] def _end_dtv_paymentlink(self): value = sanitizeHTML(self.pop('dtv:paymentlink'), self.encoding) self.incontent -= 1 self.contentparams.clear() if self.inentry: if self.inenclosure: self.entries[-1]['enclosures'][-1]['payment_html'] = value else: self.entries[-1]['payment_html'] = value def _start_source(self, attrsD): self.insource = 1 def _end_source(self): self.insource = 0 self._getContext()['source'] = copy.deepcopy(self.sourcedata) self.sourcedata.clear() def _start_content(self, attrsD): self.pushContent('content', attrsD, 'text/plain', 1) src = attrsD.get('src') if src: self.contentparams['src'] = src self.push('content', 1) def _start_prodlink(self, attrsD): self.pushContent('content', attrsD, 'text/html', 1) def _start_body(self, attrsD): self.pushContent('content', attrsD, 'application/xhtml+xml', 1) _start_xhtml_body = _start_body def _start_content_encoded(self, attrsD): self.pushContent('content', attrsD, 'text/html', 1) _start_fullitem = _start_content_encoded def _end_content(self): copyToDescription = self.mapContentType(self.contentparams.get('type')) in [ 'text/plain'] + self.html_types value = self.popContent('content') if copyToDescription: self._save('description', value) _end_body = _end_content _end_xhtml_body = _end_content _end_content_encoded = _end_content _end_fullitem = _end_content _end_prodlink = _end_content def _start_itunes_image(self, attrsD): self.push('itunes_image', 0) self._getContext()['image'] = FeedParserDict({ 'href': attrsD.get('href') }) _start_itunes_link = _start_itunes_image def _end_itunes_block(self): value = self.pop('itunes_block', 0) if not value == 'yes' or 1: pass self._getContext()['itunes_block'] = 0 def _end_itunes_explicit(self): value = self.pop('itunes_explicit', 0) if not value == 'yes' or 1: pass self._getContext()['itunes_explicit'] = 0 if _XML_AVAILABLE: class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): def __init__(self, baseuri, baselang, encoding): if _debug: sys.stderr.write('trying StrictFeedParser\n') xml.sax.handler.ContentHandler.__init__(self) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) self.bozo = 0 self.exc = None def startPrefixMapping(self, prefix, uri): self.trackNamespace(prefix, uri) def startElementNS(self, name, qname, attrs): (namespace, localname) = name if not namespace: pass lowernamespace = str('').lower() if lowernamespace.find('backend.userland.com/rss') != -1: namespace = 'http://backend.userland.com/rss' lowernamespace = namespace if qname and qname.find(':') > 0: givenprefix = qname.split(':')[0] else: givenprefix = None prefix = self._matchnamespaces.get(lowernamespace, givenprefix) if givenprefix: if (prefix == None or prefix == '' or lowernamespace == '') and not self.namespacesInUse.has_key(givenprefix): raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix if prefix: localname = prefix + ':' + localname localname = str(localname).lower() if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) attrsD = { } for namespace, attrlocalname in attrs._attrs.items(): attrvalue = None if not namespace: pass lowernamespace = ''.lower() prefix = self._matchnamespaces.get(lowernamespace, '') if prefix: attrlocalname = prefix + ':' + attrlocalname attrsD[str(attrlocalname).lower()] = attrvalue for qname in attrs.getQNames(): attrsD[str(qname).lower()] = attrs.getValueByQName(qname) self.unknown_starttag(localname, attrsD.items()) def characters(self, text): self.handle_data(text) def endElementNS(self, name, qname): (namespace, localname) = name if not namespace: pass lowernamespace = str('').lower() if qname and qname.find(':') > 0: givenprefix = qname.split(':')[0] else: givenprefix = '' prefix = self._matchnamespaces.get(lowernamespace, givenprefix) if prefix: localname = prefix + ':' + localname localname = str(localname).lower() self.unknown_endtag(localname) def error(self, exc): self.bozo = 1 self.exc = exc def fatalError(self, exc): self.error(exc) raise exc class _BaseHTMLProcessor(sgmllib.SGMLParser): elements_no_end_tag = [ 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param'] def __init__(self, encoding): self.encoding = encoding if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) sgmllib.SGMLParser.__init__(self) def reset(self): self.pieces = [] sgmllib.SGMLParser.reset(self) def _shorttag_replace(self, match): tag = match.group(1) if tag in self.elements_no_end_tag: return '<' + tag + ' />' else: return '<' + tag + '></' + tag + '>' def feed(self, data): data = re.compile('<!((?!DOCTYPE|--|\\[))', re.IGNORECASE).sub('<!\\1', data) data = re.sub('<([^<\\s]+?)\\s*/>', self._shorttag_replace, data) data = data.replace(''', "'") data = data.replace('"', '"') if self.encoding and type(data) == type(u''): data = data.encode(self.encoding) sgmllib.SGMLParser.feed(self, data) def normalize_attrs(self, attrs): attrs = [ (k.lower(), v) for k, v in attrs ] attrs = [ (k, v) for k, v in attrs ] return attrs def parse_starttag(self, i): retval = sgmllib.SGMLParser.parse_starttag(self, i) try: if self.get_starttag_text()[-2:] == '/>': self.finish_endtag(self.lasttag) except: pass return retval def unknown_starttag(self, tag, attrs): if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag) uattrs = [] for key, value in attrs: if type(value) != type(u''): value = unicode(value, self.encoding) uattrs.append((unicode(key, self.encoding), value)) strattrs = []([ u' %s="%s"' % (key, value) for key, value in uattrs ]).encode(self.encoding) def unknown_endtag(self, tag): if tag not in self.elements_no_end_tag: self.pieces.append('</%(tag)s>' % locals()) def handle_charref(self, ref): self.pieces.append('&#%(ref)s;' % locals()) def handle_entityref(self, ref): self.pieces.append('&%(ref)s;' % locals()) def handle_data(self, text): if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) self.pieces.append(text) def handle_comment(self, text): self.pieces.append('' % locals()) def handle_pi(self, text): self.pieces.append('<?%(text)s>' % locals()) def handle_decl(self, text): self.pieces.append('<!%(text)s>' % locals()) _new_declname_match = re.compile('[a-zA-Z][-_.a-zA-Z0-9:]*\\s*').match def _scan_name(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) if i == n: return (None, -1) m = self._new_declname_match(rawdata, i) if m: s = m.group() name = s.strip() if i + len(s) == n: return (None, -1) return (name.lower(), m.end()) else: self.handle_data(rawdata) return (None, -1) def output(self): '''Return processed HTML as a single string''' return []([ str(p) for p in self.pieces ]) class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): def __init__(self, baseuri, baselang, encoding): sgmllib.SGMLParser.__init__(self) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) def decodeEntities(self, element, data): data = data.replace('<', '<') data = data.replace('<', '<') data = data.replace('>', '>') data = data.replace('>', '>') data = data.replace('&', '&') data = data.replace('&', '&') data = data.replace('"', '"') data = data.replace('"', '"') data = data.replace(''', ''') data = data.replace(''', ''') if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): data = data.replace('<', '<') data = data.replace('>', '>') data = data.replace('&', '&') data = data.replace('"', '"') data = data.replace(''', "'") return data class _RelativeURIResolver(_BaseHTMLProcessor): relative_uris = [ ('a', 'href'), ('applet', 'codebase'), ('area', 'href'), ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'), ('form', 'action'), ('frame', 'longdesc'), ('frame', 'src'), ('iframe', 'longdesc'), ('iframe', 'src'), ('head', 'profile'), ('img', 'longdesc'), ('img', 'src'), ('img', 'usemap'), ('input', 'src'), ('input', 'usemap'), ('ins', 'cite'), ('link', 'href'), ('object', 'classid'), ('object', 'codebase'), ('object', 'data'), ('object', 'usemap'), ('q', 'cite'), ('script', 'src')] def __init__(self, baseuri, encoding): _BaseHTMLProcessor.__init__(self, encoding) self.baseuri = baseuri def resolveURI(self, uri): return _urljoin(self.baseuri, uri) def unknown_starttag(self, tag, attrs): attrs = self.normalize_attrs(attrs) attrs = [ (key, value) for key, value in attrs ] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) def _resolveRelativeURIs(htmlSource, baseURI, encoding): if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') p = _RelativeURIResolver(baseURI, encoding) p.feed(htmlSource) return p.output() class _HTMLSanitizer(_BaseHTMLProcessor): acceptable_elements = [ 'a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var'] acceptable_attributes = [ 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'] unacceptable_elements_with_end_tag = [ 'script', 'applet'] def reset(self): _BaseHTMLProcessor.reset(self) self.unacceptablestack = 0 def unknown_starttag(self, tag, attrs): if tag not in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack += 1 return None attrs = self.normalize_attrs(attrs) attrs = _[1] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) def unknown_endtag(self, tag): if tag not in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack -= 1 return None _BaseHTMLProcessor.unknown_endtag(self, tag) def handle_pi(self, text): pass def handle_decl(self, text): pass def handle_data(self, text): if not self.unacceptablestack: _BaseHTMLProcessor.handle_data(self, text) def sanitizeHTML(htmlSource, encoding): p = _HTMLSanitizer(encoding) p.feed(htmlSource) data = p.output() if TIDY_MARKUP: _tidy = None for tidy_interface in PREFERRED_TIDY_INTERFACES: try: if tidy_interface == 'uTidy': _utidy = parseString import tidy def _tidy(data, **kwargs): return str(_utidy(data, **kwargs)) break elif tidy_interface == 'mxTidy': _mxtidy = Tidy import mx.Tidy def _tidy(data, **kwargs): (nerrors, nwarnings, data, errordata) = _mxtidy.tidy(data, **kwargs) return data break continue continue if _tidy: utf8 = type(data) == type(u'') if utf8: data = data.encode('utf-8') data = _tidy(data, output_xhtml = 1, numeric_entities = 1, wrap = 0, char_encoding = 'utf8') if utf8: data = unicode(data, 'utf-8') if data.count('<body'): data = data.split('<body', 1)[1] if data.count('>'): data = data.split('>', 1)[1] if data.count('</body'): data = data.split('</body', 1)[0] data = data.strip().replace('\r\n', '\n') return data class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler): def http_error_default(self, req, fp, code, msg, headers): if code / 100 == 3 and code != 304: return self.http_error_302(req, fp, code, msg, headers) infourl = urllib.addinfourl(fp, headers, req.get_full_url()) infourl.status = code return infourl def http_error_302(self, req, fp, code, msg, headers): if headers.dict.has_key('location'): infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) else: infourl = urllib.addinfourl(fp, headers, req.get_full_url()) if not hasattr(infourl, 'status'): infourl.status = code return infourl def http_error_301(self, req, fp, code, msg, headers): if headers.dict.has_key('location'): infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers) else: infourl = urllib.addinfourl(fp, headers, req.get_full_url()) if not hasattr(infourl, 'status'): infourl.status = code return infourl http_error_300 = http_error_302 http_error_303 = http_error_302 http_error_307 = http_error_302 def http_error_401(self, req, fp, code, msg, headers): host = urlparse.urlparse(req.get_full_url())[1] try: if not sys.version.split()[0] >= '2.3.3': raise AssertionError if not base64 != None: raise AssertionError (user, passw) = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] self.add_password(realm, host, user, passw) retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) self.reset_retry_count() return retry except: return self.http_error_default(req, fp, code, msg, headers) def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): """URL, filename, or string --> stream This function lets you define parsers that take any input source (URL, pathname to local or network file, or actual data as a string) and deal with it in a uniform manner. Returned object is guaranteed to have all the basic stdio read methods (read, readline, readlines). Just .close() the object when you're done with it. If the etag argument is supplied, it will be used as the value of an If-None-Match request header. If the modified argument is supplied, it must be a tuple of 9 integers as returned by gmtime() in the standard Python time module. This MUST be in GMT (Greenwich Mean Time). The formatted date/time will be used as the value of an If-Modified-Since request header. If the agent argument is supplied, it will be used as the value of a User-Agent request header. If the referrer argument is supplied, it will be used as the value of a Referer[sic] request header. If handlers is supplied, it is a list of handlers used to build a urllib2 opener. """ if hasattr(url_file_stream_or_string, 'read'): return url_file_stream_or_string if url_file_stream_or_string == '-': return sys.stdin if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): if not agent: agent = USER_AGENT auth = None if base64: (urltype, rest) = urllib.splittype(url_file_stream_or_string) (realhost, rest) = urllib.splithost(rest) if realhost: (user_passwd, realhost) = urllib.splituser(realhost) if user_passwd: url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) auth = base64.encodestring(user_passwd).strip() request = urllib2.Request(url_file_stream_or_string) request.add_header('User-Agent', agent) if etag: request.add_header('If-None-Match', etag) if modified: short_weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] months = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) if referrer: request.add_header('Referer', referrer) if gzip and zlib: request.add_header('Accept-encoding', 'gzip, deflate') elif gzip: request.add_header('Accept-encoding', 'gzip') elif zlib: request.add_header('Accept-encoding', 'deflate') else: request.add_header('Accept-encoding', '') if auth: request.add_header('Authorization', 'Basic %s' % auth) if ACCEPT_HEADER: request.add_header('Accept', ACCEPT_HEADER) request.add_header('A-IM', 'feed') opener = apply(urllib2.build_opener, tuple([ _FeedURLHandler()] + handlers)) opener.addheaders = [] try: return opener.open(request) finally: opener.close() try: return open(url_file_stream_or_string) except: pass return _StringIO(str(url_file_stream_or_string)) _date_handlers = [] def registerDateHandler(func): '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' _date_handlers.insert(0, func) _iso8601_tmpl = [ 'YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO', 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', '-YY-?MM', '-OOO', '-YY', '--MM-?DD', '--MM', '---DD', 'CC', ''] _iso8601_re = [ tmpl.replace('YYYY', '(?P<year>\\d{4})').replace('YY', '(?P<year>\\d\\d)').replace('MM', '(?P<month>[01]\\d)').replace('DD', '(?P<day>[0123]\\d)').replace('OOO', '(?P<ordinal>[0123]\\d\\d)').replace('CC', '(?P<century>\\d\\d$)') + '(T?(?P<hour>\\d{2}):(?P<minute>\\d{2})' + '(:(?P<second>\\d{2}))?' + '(?P<tz>[+-](?P<tzhour>\\d{2})(:(?P<tzmin>\\d{2}))?|Z)?)?' for tmpl in _iso8601_tmpl ] del tmpl _iso8601_matches = [ re.compile(regex).match for regex in _iso8601_re ] del regex def _parse_date_iso8601(dateString): '''Parse a variety of ISO-8601-compatible formats like 20040105''' m = None for _iso8601_match in _iso8601_matches: m = _iso8601_match(dateString) if m: break continue if not m: return None if m.span() == (0, 0): return None params = m.groupdict() ordinal = params.get('ordinal', 0) if ordinal: ordinal = int(ordinal) else: ordinal = 0 year = params.get('year', '--') if not year or year == '--': year = time.gmtime()[0] elif len(year) == 2: year = 100 * int(time.gmtime()[0] / 100) + int(year) else: year = int(year) month = params.get('month', '-') if not month or month == '-': if ordinal: month = 1 else: month = time.gmtime()[1] month = int(month) day = params.get('day', 0) if not day: if ordinal: day = ordinal elif params.get('century', 0) and params.get('year', 0) or params.get('month', 0): day = 1 else: day = time.gmtime()[2] else: day = int(day) if 'century' in params.keys(): year = (int(params['century']) - 1) * 100 + 1 for field in [ 'hour', 'minute', 'second', 'tzhour', 'tzmin']: if not params.get(field, None): params[field] = 0 continue hour = int(params.get('hour', 0)) minute = int(params.get('minute', 0)) second = int(params.get('second', 0)) weekday = 0 daylight_savings_flag = 0 tm = [ year, month, day, hour, minute, second, weekday, ordinal, daylight_savings_flag] tz = params.get('tz') if tz and tz != 'Z': if tz[0] == '-': tm[3] += int(params.get('tzhour', 0)) tm[4] += int(params.get('tzmin', 0)) elif tz[0] == '+': tm[3] -= int(params.get('tzhour', 0)) tm[4] -= int(params.get('tzmin', 0)) else: return None return time.localtime(time.mktime(tm)) registerDateHandler(_parse_date_iso8601) _korean_year = u'δàä' _korean_month = u'∞¢ö' _korean_day = u'∞¥╝' _korean_am = u'∞ÿñ∞áä' _korean_pm = u'∞ÿñφ¢ä' _korean_onblog_date_re = re.compile('(\\d{4})%s\\s+(\\d{2})%s\\s+(\\d{2})%s\\s+(\\d{2}):(\\d{2}):(\\d{2})' % (_korean_year, _korean_month, _korean_day)) _korean_nate_date_re = re.compile(u'(\\d{4})-(\\d{2})-(\\d{2})\\s+(%s|%s)\\s+(\\d{,2}):(\\d{,2}):(\\d{,2})' % (_korean_am, _korean_pm)) def _parse_date_onblog(dateString): '''Parse a string according to the OnBlog 8-bit date format''' m = _korean_onblog_date_re.match(dateString) if not m: return None w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % { 'year': m.group(1), 'month': m.group(2), 'day': m.group(3), 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6), 'zonediff': '+09:00' } if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_onblog) def _parse_date_nate(dateString): '''Parse a string according to the Nate 8-bit date format''' m = _korean_nate_date_re.match(dateString) if not m: return None hour = int(m.group(5)) ampm = m.group(4) if ampm == _korean_pm: hour += 12 hour = str(hour) if len(hour) == 1: hour = '0' + hour w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % { 'year': m.group(1), 'month': m.group(2), 'day': m.group(3), 'hour': hour, 'minute': m.group(6), 'second': m.group(7), 'zonediff': '+09:00' } if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_nate) _mssql_date_re = re.compile('(\\d{4})-(\\d{2})-(\\d{2})\\s+(\\d{2}):(\\d{2}):(\\d{2})(\\.\\d+)?') def _parse_date_mssql(dateString): '''Parse a string according to the MS SQL date format''' m = _mssql_date_re.match(dateString) if not m: return None w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % { 'year': m.group(1), 'month': m.group(2), 'day': m.group(3), 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6), 'zonediff': '+09:00' } if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_mssql) _greek_months = { u'╬Ö╬▒╬╜': u'Jan', u'╬ª╬╡╬▓': u'Feb', u'╬£╬¼╧Ä': u'Mar', u'╬£╬▒╧Ä': u'Mar', u'╬æ╧Ç╧ü': u'Apr', u'╬£╬¼╬╣': u'May', u'╬£╬▒╧è': u'May', u'╬£╬▒╬╣': u'May', u'╬Ö╬┐╧ì╬╜': u'Jun', u'╬Ö╬┐╬╜': u'Jun', u'╬Ö╬┐╧ì╬╗': u'Jul', u'╬Ö╬┐╬╗': u'Jul', u'╬æ╧ì╬│': u'Aug', u'╬æ╧à╬│': u'Aug', u'╬ú╬╡╧Ç': u'Sep', u'╬ƒ╬║╧ä': u'Oct', u'╬¥╬┐╬¡': u'Nov', u'╬¥╬┐╬╡': u'Nov', u'╬ö╬╡╬║': u'Dec' } _greek_wdays = { u'╬Ü╧à╧ü': u'Sun', u'╬ö╬╡╧à': u'Mon', u'╬ñ╧ü╬╣': u'Tue', u'╬ñ╬╡╧ä': u'Wed', u'╬á╬╡╬╝': u'Thu', u'╬á╬▒╧ü': u'Fri', u'╬ú╬▒╬▓': u'Sat' } _greek_date_format_re = re.compile(u'([^,]+),\\s+(\\d{2})\\s+([^\\s]+)\\s+(\\d{4})\\s+(\\d{2}):(\\d{2}):(\\d{2})\\s+([^\\s]+)') def _parse_date_greek(dateString): '''Parse a string according to a Greek 8-bit date format.''' m = _greek_date_format_re.match(dateString) if not m: return None try: wday = _greek_wdays[m.group(1)] month = _greek_months[m.group(3)] except: return None rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % { 'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4), 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7), 'zonediff': m.group(8) } if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) return _parse_date_rfc822(rfc822date) registerDateHandler(_parse_date_greek) _hungarian_months = { u'janu├ír': u'01', u'febru├íri': u'02', u'm├írcius': u'03', u'├íprilis': u'04', u'm├íujus': u'05', u'j├║nius': u'06', u'j├║lius': u'07', u'augusztus': u'08', u'szeptember': u'09', u'okt├│ber': u'10', u'november': u'11', u'december': u'12' } _hungarian_date_format_re = re.compile(u'(\\d{4})-([^-]+)-(\\d{,2})T(\\d{,2}):(\\d{2})((\\+|-)(\\d{,2}:\\d{2}))') def _parse_date_hungarian(dateString): '''Parse a string according to a Hungarian 8-bit date format.''' m = _hungarian_date_format_re.match(dateString) if not m: return None try: month = _hungarian_months[m.group(2)] day = m.group(3) if len(day) == 1: day = '0' + day hour = m.group(4) if len(hour) == 1: hour = '0' + hour except: return None w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % { 'year': m.group(1), 'month': month, 'day': day, 'hour': hour, 'minute': m.group(5), 'zonediff': m.group(6) } if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_hungarian) def _parse_date_w3dtf(dateString): def __extract_date(m): year = int(m.group('year')) if year < 100: year = 100 * int(time.gmtime()[0] / 100) + int(year) if year < 1000: return (0, 0, 0) julian = m.group('julian') if julian: julian = int(julian) month = julian / 30 + 1 day = julian % 30 + 1 jday = None while jday != julian: t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) jday = time.gmtime(t)[-2] diff = abs(jday - julian) if jday > julian: if diff < day: day = day - diff else: month = month - 1 day = 31 diff < day if jday < julian: if day + diff < 28: day = day + diff else: month = month + 1 day + diff < 28 return (year, month, day) month = m.group('month') day = 1 if month is None: month = 1 else: month = int(month) day = m.group('day') if day: day = int(day) else: day = 1 return (year, month, day) def __extract_time(m): if not m: return (0, 0, 0) hours = m.group('hours') if not hours: return (0, 0, 0) hours = int(hours) minutes = int(m.group('minutes')) seconds = m.group('seconds') if seconds: seconds = int(seconds) else: seconds = 0 return (hours, minutes, seconds) def __extract_tzd(m): '''Return the Time Zone Designator as an offset in seconds from UTC.''' if not m: return 0 tzd = m.group('tzd') if not tzd: return 0 if tzd == 'Z': return 0 hours = int(m.group('tzdhours')) minutes = m.group('tzdminutes') if minutes: minutes = int(minutes) else: minutes = 0 offset = (hours * 60 + minutes) * 60 if tzd[0] == '+': return -offset return offset __date_re = '(?P<year>\\d\\d\\d\\d)(?:(?P<dsep>-|)(?:(?P<julian>\\d\\d\\d)|(?P<month>\\d\\d)(?:(?P=dsep)(?P<day>\\d\\d))?))?' __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\\d\\d)(?::?(?P<tzdminutes>\\d\\d))|Z)' __tzd_rx = re.compile(__tzd_re) __time_re = '(?P<hours>\\d\\d)(?P<tsep>:|)(?P<minutes>\\d\\d)(?:(?P=tsep)(?P<seconds>\\d\\d(?:[.,]\\d+)?))?' + __tzd_re __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) __datetime_rx = re.compile(__datetime_re) m = __datetime_rx.match(dateString) if m is None or m.group() != dateString: return None gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) if gmt[0] == 0: return None return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) registerDateHandler(_parse_date_w3dtf) def _parse_date_rfc822(dateString): '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' data = dateString.split() if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: del data[0] if len(data) == 4: s = data[3] i = s.find('+') if i > 0: data[3:] = [ s[:i], s[i + 1:]] else: data.append('') dateString = ' '.join(data) if len(data) < 5: dateString += ' 00:00:00 GMT' tm = rfc822.parsedate_tz(dateString) if tm: return time.gmtime(rfc822.mktime_tz(tm)) _additional_timezones = { 'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800 } rfc822._timezones.update(_additional_timezones) registerDateHandler(_parse_date_rfc822) def _parse_date(dateString): '''Parses a variety of date formats into a 9-tuple in GMT''' for handler in _date_handlers: try: date9tuple = handler(dateString) if not date9tuple: continue if len(date9tuple) != 9: if _debug: sys.stderr.write('date handler function must return 9-tuple\n') raise ValueError map(int, date9tuple) return date9tuple continue except Exception: e = None if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) _debug return None def _getCharacterEncoding(http_headers, xml_data): """Get the character encoding of the XML document http_headers is a dictionary xml_data is a raw string (not Unicode) This is so much trickier than it sounds, it's not even funny. According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type is application/xml, application/*+xml, application/xml-external-parsed-entity, or application/xml-dtd, the encoding given in the charset parameter of the HTTP Content-Type takes precedence over the encoding given in the XML prefix within the document, and defaults to 'utf-8' if neither are specified. But, if the HTTP Content-Type is text/xml, text/*+xml, or text/xml-external-parsed-entity, the encoding given in the XML prefix within the document is ALWAYS IGNORED and only the encoding given in the charset parameter of the HTTP Content-Type header should be respected, and it defaults to 'us-ascii' if not specified. Furthermore, discussion on the atom-syntax mailing list with the author of RFC 3023 leads me to the conclusion that any document served with a Content-Type of text/* and no charset parameter must be treated as us-ascii. (We now do this.) And also that it must always be flagged as non-well-formed. (We now do this too.) If Content-Type is unspecified (input was local file or non-HTTP source) or unrecognized (server just got it totally wrong), then go by the encoding given in the XML prefix of the document and default to 'iso-8859-1' as per the HTTP specification (RFC 2616). Then, assuming we didn't find a character encoding in the HTTP headers (and the HTTP Content-type allowed us to look in the body), we need to sniff the first few bytes of the XML data and try to determine whether the encoding is ASCII-compatible. Section F of the XML specification shows the way here: http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info If the sniffed encoding is not ASCII-compatible, we need to make it ASCII compatible so that we can sniff further into the XML declaration to find the encoding attribute, which will tell us the true encoding. Of course, none of this guarantees that we will be able to parse the feed in the declared character encoding (assuming it was declared correctly, which many are not). CJKCodecs and iconv_codec help a lot; you should definitely install them if you can. http://cjkpython.i18n.org/ """ def _parseHTTPContentType(content_type): """takes HTTP Content-Type header and returns (content type, charset) If no charset is specified, returns (content type, '') If no content type is specified, returns ('', '') Both return parameters are guaranteed to be lowercase strings """ if not content_type: pass content_type = '' (content_type, params) = cgi.parse_header(content_type) return (content_type, params.get('charset', '').replace("'", '')) sniffed_xml_encoding = '' xml_encoding = '' true_encoding = '' (http_content_type, http_encoding) = _parseHTTPContentType(http_headers.get('content-type')) try: if xml_data[:4] == 'Lo\xa7\x94': xml_data = _ebcdic_to_ascii(xml_data) elif xml_data[:4] == '\x00<\x00?': sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') elif len(xml_data) >= 4 and xml_data[:2] == '\xfe\xff' and xml_data[2:4] != '\x00\x00': sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') elif xml_data[:4] == '<\x00?\x00': sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') elif len(xml_data) >= 4 and xml_data[:2] == '\xff\xfe' and xml_data[2:4] != '\x00\x00': sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') elif xml_data[:4] == '\x00\x00\x00<': sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') elif xml_data[:4] == '<\x00\x00\x00': sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') elif xml_data[:4] == '\x00\x00\xfe\xff': sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') elif xml_data[:4] == '\xff\xfe\x00\x00': sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') elif xml_data[:3] == '\xef\xbb\xbf': sniffed_xml_encoding = 'utf-8' xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') xml_encoding_match = re.compile('^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>').match(xml_data) except: xml_encoding_match = None if xml_encoding_match: xml_encoding = xml_encoding_match.groups()[0].lower() if sniffed_xml_encoding and xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16'): xml_encoding = sniffed_xml_encoding acceptable_content_type = 0 application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') text_content_types = ('text/xml', 'text/xml-external-parsed-entity') if (http_content_type in application_content_types or http_content_type.startswith('application/')) and http_content_type.endswith('+xml'): acceptable_content_type = 1 if not http_encoding and xml_encoding: pass true_encoding = 'utf-8' elif (http_content_type in text_content_types or http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): acceptable_content_type = 1 if not http_encoding: pass true_encoding = 'us-ascii' elif http_content_type.startswith('text/'): if not http_encoding: pass true_encoding = 'us-ascii' elif http_headers and not http_headers.has_key('content-type'): if not xml_encoding: pass true_encoding = 'iso-8859-1' elif not xml_encoding: pass true_encoding = 'utf-8' return (true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type) def _toUTF8(data, encoding): '''Changes an XML data stream on the fly to specify a new encoding data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already encoding is a string recognized by encodings.aliases ''' if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) if len(data) >= 4 and data[:2] == '\xfe\xff' and data[2:4] != '\x00\x00': if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-16be': sys.stderr.write('trying utf-16be instead\n') encoding = 'utf-16be' data = data[2:] elif len(data) >= 4 and data[:2] == '\xff\xfe' and data[2:4] != '\x00\x00': if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-16le': sys.stderr.write('trying utf-16le instead\n') encoding = 'utf-16le' data = data[2:] elif data[:3] == '\xef\xbb\xbf': if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-8': sys.stderr.write('trying utf-8 instead\n') encoding = 'utf-8' data = data[3:] elif data[:4] == '\x00\x00\xfe\xff': if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-32be': sys.stderr.write('trying utf-32be instead\n') encoding = 'utf-32be' data = data[4:] elif data[:4] == '\xff\xfe\x00\x00': if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-32le': sys.stderr.write('trying utf-32le instead\n') encoding = 'utf-32le' data = data[4:] newdata = unicode(data, encoding) if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding) declmatch = re.compile('^<\\?xml[^>]*?>') newdecl = "<?xml version='1.0' encoding='utf-8'?>" if declmatch.search(newdata): newdata = declmatch.sub(newdecl, newdata) else: newdata = newdecl + u'\n' + newdata return newdata.encode('utf-8') def _stripDoctype(data): """Strips DOCTYPE from XML document, returns (rss_version, stripped_data) rss_version may be 'rss091n' or None stripped_data is the same XML document, minus the DOCTYPE """ entity_pattern = re.compile('<!ENTITY([^>]*?)>', re.MULTILINE) data = entity_pattern.sub('', data) doctype_pattern = re.compile('<!DOCTYPE([^>]*?)>', re.MULTILINE) doctype_results = doctype_pattern.findall(data) if not doctype_results or doctype_results[0]: pass doctype = '' if doctype.lower().count('netscape'): version = 'rss091n' else: version = None data = doctype_pattern.sub('', data) return (version, data) def parse(url_file_stream_or_string, etag = None, modified = None, agent = None, referrer = None, handlers = []): '''Parse a feed from a URL, file, stream, or string''' result = FeedParserDict() result['feed'] = FeedParserDict() result['entries'] = [] if _XML_AVAILABLE: result['bozo'] = 0 if type(handlers) == types.InstanceType: handlers = [ handlers] try: f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) data = f.read() except Exception: e = None result['bozo'] = 1 result['bozo_exception'] = e data = '' f = None if f and data and hasattr(f, 'headers'): if gzip and f.headers.get('content-encoding', '') == 'gzip': try: data = gzip.GzipFile(fileobj = _StringIO(data)).read() except Exception: e = None result['bozo'] = 1 result['bozo_exception'] = e data = '' except: None<EXCEPTION MATCH>Exception None<EXCEPTION MATCH>Exception if zlib and f.headers.get('content-encoding', '') == 'deflate': try: data = zlib.decompress(data, -(zlib.MAX_WBITS)) except Exception: e = None result['bozo'] = 1 result['bozo_exception'] = e data = '' except: None<EXCEPTION MATCH>Exception None<EXCEPTION MATCH>Exception if hasattr(f, 'info'): info = f.info() result['etag'] = info.getheader('ETag') last_modified = info.getheader('Last-Modified') if last_modified: result['modified'] = _parse_date(last_modified) if hasattr(f, 'url'): result['href'] = f.url result['status'] = 200 if hasattr(f, 'status'): result['status'] = f.status if hasattr(f, 'headers'): result['headers'] = f.headers.dict if hasattr(f, 'close'): f.close() http_headers = result.get('headers', { }) (result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type) = _getCharacterEncoding(http_headers, data) if http_headers and not acceptable_content_type: if http_headers.has_key('content-type'): bozo_message = '%s is not an XML media type' % http_headers['content-type'] else: bozo_message = 'no Content-type specified' result['bozo'] = 1 result['bozo_exception'] = NonXMLContentType(bozo_message) (result['version'], data) = _stripDoctype(data) baseuri = http_headers.get('content-location', result.get('href')) baselang = http_headers.get('content-language', None) if result.get('status', 0) == 304: result['version'] = '' result['debug_message'] = 'The feed has not changed since you last checked, ' + 'so the server sent no data. This is a feature, not a bug!' return result if not data: return result use_strict_parser = 0 known_encoding = 0 tried_encodings = [] for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding): if not proposed_encoding: continue if proposed_encoding in tried_encodings: continue tried_encodings.append(proposed_encoding) try: data = _toUTF8(data, proposed_encoding) known_encoding = use_strict_parser = 1 continue continue if not known_encoding and chardet: try: proposed_encoding = chardet.detect(data)['encoding'] if proposed_encoding and proposed_encoding not in tried_encodings: tried_encodings.append(proposed_encoding) data = _toUTF8(data, proposed_encoding) known_encoding = use_strict_parser = 1 if not known_encoding and 'utf-8' not in tried_encodings: try: proposed_encoding = 'utf-8' tried_encodings.append(proposed_encoding) data = _toUTF8(data, proposed_encoding) known_encoding = use_strict_parser = 1 if not known_encoding and 'windows-1252' not in tried_encodings: try: proposed_encoding = 'windows-1252' tried_encodings.append(proposed_encoding) data = _toUTF8(data, proposed_encoding) known_encoding = use_strict_parser = 1 if not known_encoding: result['bozo'] = 1 result['bozo_exception'] = CharacterEncodingUnknown('document encoding unknown, I tried ' + '%s, %s, utf-8, and windows-1252 but nothing worked' % (result['encoding'], xml_encoding)) result['encoding'] = '' elif proposed_encoding != result['encoding']: result['bozo'] = 1 result['bozo_exception'] = CharacterEncodingOverride('documented declared as %s, but parsed as %s' % (result['encoding'], proposed_encoding)) result['encoding'] = proposed_encoding if not _XML_AVAILABLE: use_strict_parser = 0 if use_strict_parser: feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8') saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) saxparser.setContentHandler(feedparser) saxparser.setErrorHandler(feedparser) source = xml.sax.xmlreader.InputSource() source.setByteStream(_StringIO(data)) if hasattr(saxparser, '_ns_stack'): saxparser._ns_stack.append({ 'http://www.w3.org/XML/1998/namespace': 'xml' }) try: saxparser.parse(source) except Exception: e = None if _debug: import traceback traceback.print_stack() traceback.print_exc() sys.stderr.write('xml parsing failed\n') result['bozo'] = 1 if not feedparser.exc: pass result['bozo_exception'] = e use_strict_parser = 0 except: None<EXCEPTION MATCH>Exception None<EXCEPTION MATCH>Exception if not use_strict_parser: if not known_encoding or 'utf-8': pass feedparser = _LooseFeedParser(baseuri, baselang, '') feedparser.feed(data) result['feed'] = feedparser.feeddata result['entries'] = feedparser.entries if not result['version']: pass result['version'] = feedparser.version result['namespaces'] = feedparser.namespacesInUse return result if __name__ == '__main__': zopeCompatibilityHack() from pprint import pprint for url in urls: print url print result = parse(url) pprint(result) print