# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)

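"""html2text: Turn HTML into equivalent Markdown-structured text."""
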
__version__ = '2.39'
__author__ = 'Aaron Swartz (me@aaronsw.com)'
__copyright__ = '(C) 2004-2008 Aaron Swartz. GNU GPL 3.'
__contributors__ = [
    "Martin 'Joey' Schulze",
    'Ricardo Reyes',
    'Kevin Jay North']

if not hasattr(__builtins__, 'True'):
    (True, False) = (1, 0)

import re
import sys
import urllib
import htmlentitydefs
import codecs
import StringIO
import types
import sgmllib
import urlparse
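
# Patch sgmllib's character-reference pattern so that hexadecimal references
# (&#x...;) are recognised as well as decimal ones.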
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')

try:
    from textwrap import wrap
except:
    pass

# Use Unicode characters instead of their ASCII pseudo-replacements.
UNICODE_SNOB = 0
# Put the links after each paragraph instead of at the end of the document.
LINKS_EACH_PARAGRAPH = 0
# Wrap long lines at this column; 0 disables wrapping.
BODY_WIDTH = 78
# Skip internal links (href="#local-anchor"); their targets are not visible
# in the plain-text output anyway.
SKIP_INTERNAL_LINKS = False

def name2cp(k):
    """Return the Unicode code point for the named entity k."""
    if k == 'apos':
        return ord("'")
    if hasattr(htmlentitydefs, 'name2codepoint'):
        return htmlentitydefs.name2codepoint[k]
    k = htmlentitydefs.entitydefs[k]
    if k.startswith('&#') and k.endswith(';'):
        return int(k[2:-1])
    return ord(codecs.latin_1_decode(k)[0])

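# ASCII stand-ins for common named entities; used unless UNICODE_SNOB is set.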
unifiable = {
    'rsquo': "'",
    'lsquo': "'",
    'rdquo': '"',
    'ldquo': '"',
    'copy': '(C)',
    'mdash': '--',
    'nbsp': ' ',
    'rarr': '->',
    'larr': '<-',
    'middot': '*',
    'ndash': '-',
    'oelig': 'oe',
    'aelig': 'ae',
    'agrave': 'a',
    'aacute': 'a',
    'acirc': 'a',
    'atilde': 'a',
    'auml': 'a',
    'aring': 'a',
    'egrave': 'e',
    'eacute': 'e',
    'ecirc': 'e',
    'euml': 'e',
    'igrave': 'i',
    'iacute': 'i',
    'icirc': 'i',
    'iuml': 'i',
    'ograve': 'o',
    'oacute': 'o',
    'ocirc': 'o',
    'otilde': 'o',
    'ouml': 'o',
    'ugrave': 'u',
    'uacute': 'u',
    'ucirc': 'u',
    'uuml': 'u'}

unifiable_n = {}
for k in unifiable.keys():
    unifiable_n[name2cp(k)] = unifiable[k]


def charref(name):
    """Resolve a numeric character reference (the text after '&#')."""
    if name[0] in ('x', 'X'):
        c = int(name[1:], 16)
    else:
        c = int(name)
    if not UNICODE_SNOB and c in unifiable_n.keys():
        return unifiable_n[c]
    return unichr(c)


def entityref(c):
    """Resolve a named entity reference (the name after '&')."""
    if not UNICODE_SNOB and c in unifiable.keys():
        return unifiable[c]
    try:
        name2cp(c)
    except KeyError:
        return '&' + c
    return unichr(name2cp(c))


def replaceEntities(s):
    s = s.group(1)
    if s[0] == '#':
        return charref(s[1:])
    return entityref(s)

r_unescape = re.compile('&(#?[xX]?(?:[0-9a-fA-F]+|\\w{1,8}));')

def unescape(s):
    return r_unescape.sub(replaceEntities, s)


def fixattrs(attrs):
    if not attrs:
        return attrs
    newattrs = []
    for attr in attrs:
        newattrs.append((attr[0], unescape(attr[1])))
    return newattrs


def onlywhite(line):
    """Return true if the line consists only of whitespace characters."""
    for c in line:
        if c != ' ' and c != '\t':
            return c == ' '
    return line


def optwrap(text):
    """Wrap all paragraphs in the provided text to BODY_WIDTH columns."""
    if not BODY_WIDTH:
        return text
    result = ''
    newlines = 0
    for para in text.split('\n'):
        if len(para) > 0:
            if para[0] != ' ' and para[0] != '-' and para[0] != '*':
                for line in wrap(para, BODY_WIDTH):
                    result += line + '\n'
                result += '\n'
                newlines = 2
            elif not onlywhite(para):
                result += para + '\n'
                newlines = 1
        elif newlines < 2:
            result += '\n'
            newlines += 1
    return result


def hn(tag):
    """Return the heading level for tags h1-h9, otherwise a false value."""
    if tag[0] == 'h' and len(tag) == 2:
        try:
            n = int(tag[1])
            if n in range(1, 10):
                return n
        except ValueError:
            return 0


class _html2text(sgmllib.SGMLParser):
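    """SGMLParser subclass that renders parsed HTML as Markdown-formatted text."""
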
    def __init__(self, out=None, baseurl=''):
        sgmllib.SGMLParser.__init__(self)
        if out is None:
            self.out = self.outtextf
        else:
            self.out = out
        self.outtext = u''
        self.quiet = 0
        self.p_p = 0
        self.outcount = 0
        self.start = 1
        self.space = 0
        self.a = []
        self.astack = []
        self.acount = 0
        self.list = []
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.lastWasNL = 0
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None   # last inner HTML (for the abbr being defined)
        self.abbr_list = {}     # abbreviations to write out at the end
        self.baseurl = baseurl

    def outtextf(self, s):
        self.outtext += s

    def close(self):
        sgmllib.SGMLParser.close(self)
        self.pbr()
        self.o('', 0, 'end')
        return self.outtext

    def handle_charref(self, c):
        self.o(charref(c))

    def handle_entityref(self, c):
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """Return the index in self.a of a link with the same attributes,
        or None if no such link has been seen yet."""
        if not attrs.has_key('href'):
            return None
        i = -1
        for a in self.a:
            i += 1
            match = 0
            if a.has_key('href') and a['href'] == attrs['href']:
                if a.has_key('title') or attrs.has_key('title'):
                    if (a.has_key('title') and attrs.has_key('title') and
                            a['title'] == attrs['title']):
                        match = True
                else:
                    match = True
            if match:
                return i

    def handle_tag(self, tag, attrs, start):
        """Translate a single start or end tag into Markdown output."""
        attrs = fixattrs(attrs)

        if hn(tag):
            self.p()
            if start:
                self.o(hn(tag) * '#' + ' ')

        if tag in ('p', 'div'):
            self.p()

        if tag == 'br' and start:
            self.o('  \n')

        if tag == 'hr' and start:
            self.p()
            self.o('* * *')
            self.p()

        if tag in ('head', 'style', 'script'):
            if start:
                self.quiet += 1
            else:
                self.quiet -= 1

        if tag in ('body',):
            self.quiet = 0

        if tag == 'blockquote':
            if start:
                self.p()
                self.o('> ', 0, 1)
                self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ('em', 'i', 'u'):
            self.o('_')

        if tag in ('strong', 'b'):
            self.o('**')

        if tag == 'code' and not self.pre:
            self.o('`')

        if tag == 'abbr':
            if start:
                attrsD = {}
                for x, y in attrs:
                    attrsD[x] = y
                attrs = attrsD
                self.abbr_title = None
                self.abbr_data = ''
                if attrs.has_key('title'):
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title != None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == 'a':
            if start:
                attrsD = {}
                for x, y in attrs:
                    attrsD[x] = y
                attrs = attrsD
                if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.o('[')
                else:
                    self.astack.append(None)
            elif self.astack:
                a = self.astack.pop()
                if a:
                    i = self.previousIndex(a)
                    if i is not None:
                        a = self.a[i]
                    else:
                        self.acount += 1
                        a['count'] = self.acount
                        a['outcount'] = self.outcount
                        self.a.append(a)
                    self.o('][' + `a['count']` + ']')

        if tag == 'img' and start:
            attrsD = {}
            for x, y in attrs:
                attrsD[x] = y
            attrs = attrsD
            if attrs.has_key('src'):
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                i = self.previousIndex(attrs)
                if i is not None:
                    attrs = self.a[i]
                else:
                    self.acount += 1
                    attrs['count'] = self.acount
                    attrs['outcount'] = self.outcount
                    self.a.append(attrs)
                self.o('![')
                self.o(alt)
                self.o('][' + `attrs['count']` + ']')

        if tag == 'dl' and start:
            self.p()

        if tag == 'dt' and not start:
            self.pbr()

        if tag == 'dd' and start:
            self.o('    ')

        if tag == 'dd' and not start:
            self.pbr()

        if tag in ('ol', 'ul'):
            if start:
                self.list.append({
                    'name': tag,
                    'num': 0})
            elif self.list:
                self.list.pop()
            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list:
                    li = self.list[-1]
                else:
                    li = {
                        'name': 'ul',
                        'num': 0}
                self.o('  ' * len(self.list))
                if li['name'] == 'ul':
                    self.o('* ')
                elif li['name'] == 'ol':
                    li['num'] += 1
                    self.o(`li['num']` + '. ')
                self.start = 1
            else:
                self.pbr()

        if tag in ('table', 'tr') and start:
            self.p()

        if tag == 'td':
            self.pbr()

        if tag == 'pre':
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        if self.p_p == 0:
            self.p_p = 1

    def p(self):
        self.p_p = 2

    def o(self, data, puredata=0, force=0):
        """Write data to the output, handling whitespace, blockquotes, pre
        indentation and the deferred link and abbreviation references."""
        if self.abbr_data is not None:
            self.abbr_data += data

        if not self.quiet:
            if puredata and not self.pre:
                data = re.sub('\\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]

            if not data and not force:
                return None

            if self.startpre:
                self.startpre = 0

            bq = '>' * self.blockquote
            if not (force and data and data[0] == '>') and self.blockquote:
                bq += ' '

            if self.pre:
                bq += '    '
                data = data.replace('\n', '\n' + bq)

            if self.start:
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                self.p_p = 0
                self.out('\n')
                self.space = 0

            if self.p_p:
                self.out(('\n' + bq) * self.p_p)
                self.space = 0

            if self.space:
                if not self.lastWasNL:
                    self.out(' ')
                self.space = 0

            if self.a:
                if self.p_p == 2 or LINKS_EACH_PARAGRAPH or force == 'end':
                    if force == 'end':
                        self.out('\n')

                    newa = []
                    for link in self.a:
                        if self.outcount > link['outcount']:
                            self.out(' [' + `link['count']` + ']: ' + urlparse.urljoin(self.baseurl, link['href']))
                            if link.has_key('title'):
                                self.out(' (' + link['title'] + ')')
                            self.out('\n')
                        else:
                            newa.append(link)

                    if self.a != newa:
                        self.out('\n')

                    self.a = newa

            if self.abbr_list and force == 'end':
                for abbr, definition in self.abbr_list.items():
                    self.out(' *[' + abbr + ']: ' + definition + '\n')

            self.p_p = 0
            self.out(data)
            self.lastWasNL = data and data[-1] == '\n'
            self.outcount += 1

    def handle_data(self, data):
        if '\\/script>' in data:
            self.quiet -= 1
        self.o(data, 1)

    def unknown_decl(self, data):
        pass


def wrapwrite(text):
    sys.stdout.write(text.encode('utf8'))


def html2text_file(html, out=wrapwrite, baseurl=''):
    h = _html2text(out, baseurl)
    h.feed(html)
    h.feed('')
    return h.close()


def html2text(html, baseurl=''):
    return optwrap(html2text_file(html, None, baseurl))


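# Command-line interface: convert HTML from a URL, a local file, or stdin to text.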
if __name__ == '__main__':
    baseurl = ''
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://') or arg.startswith('https://'):
            baseurl = arg
            j = urllib.urlopen(baseurl)
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            text = j.read()
            encoding = enc(j.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            data = open(arg, 'r').read().decode(encoding)
    else:
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data, baseurl))