Maximum CD 2010 November

home *** CD-ROM | disk | FTP | other *** search

/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_876 (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2010-08-06 | 18.9 KB | 256 lines

# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)
"""Regex-based clean-up passes run over CSS and HTML markup.

NOTE(review): this text was recovered from a decompiler dump that had been
rendered through an HTML page.  Three kinds of damage were repaired:

* decompiler artifacts (``X = X import pkg``, ``try`` without ``except``,
  ``None % (...)``, lost parentheses in boolean tests);
* string literals whose HTML entities / comments were consumed by the page
  (``'&lt;'``, ``'&nbsp;'``, the pdftohtml marker comment);
* non-ASCII literals shown as CP437 mojibake, now written as escapes.

Each reconstructed literal is flagged where it is used so it can be checked
against the upstream source.
"""

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import functools
import re

from calibre import entity_to_unicode

XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'

# NOTE(review): the dump showed these values already entity-decoded (the
# apostrophe entry was the invalid literal '''), so they are reconstructed
# as the entity references for the five markup-significant characters.
convert_entities = functools.partial(entity_to_unicode, result_exceptions={
    u'<': '&lt;',
    u'>': '&gt;',
    u"'": '&apos;',
    u'"': '&quot;',
    u'&': '&amp;',
})
_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)

# Alphabetic Presentation Forms U+FB00..U+FB06 and their plain spellings.
LIGATURES = {
    u'\ufb00': u'ff',
    u'\ufb01': u'fi',
    u'\ufb02': u'fl',
    u'\ufb03': u'ffi',
    u'\ufb04': u'ffl',
    u'\ufb05': u'ft',
    u'\ufb06': u'st',
}
_ligpat = re.compile(u'|'.join(LIGATURES))


def _replace_with(text):
    """Return a ``re.sub`` callback that ignores the match and yields *text*."""
    def replacement(match):
        return text
    return replacement


def _heading_rewriter(template):
    """Return a callback that fills *template* with (alignment, heading text).

    Group 2 of the match is the alignment ('center' when absent) and group 3
    is the heading text.
    """
    def rewrite(match):
        return template % (match.group(2) or 'center', match.group(3))
    return rewrite


_VOWELS = u'oOuUeEiIaA'
# (detached accent mark, base letters, precomposed replacements in the same
# order as the base letters)
_ACCENT_TABLE = (
    (u'\xa8', _VOWELS, u'\xf6\xd6\xfc\xdc\xeb\xcb\xef\xcf\xe4\xc4'),    # diaeresis
    (u'`', _VOWELS, u'\xf2\xd2\xf9\xd9\xe8\xc8\xec\xcc\xe0\xc0'),       # grave
    (u'\xb4', _VOWELS, u'\xf3\xd3\xfa\xda\xe9\xc9\xed\xcd\xe1\xc1'),    # acute
    (u'\u02c6', _VOWELS, u'\xf4\xd4\xfb\xdb\xea\xca\xee\xce\xe2\xc2'),  # circumflex
    (u'\xb8', u'cC', u'\xe7\xc7'),                                      # cedilla
)


def _accent_rules():
    """Build the rules that merge a detached accent mark with its letter.

    Each rule matches the mark, optional whitespace and ``<br>`` tags, then
    the base letter, and replaces the whole run with the precomposed letter.
    The rules come out in table order (mark by mark, letter by letter).
    """
    rules = []
    for mark, letters, composed in _ACCENT_TABLE:
        for letter, replacement in zip(letters, composed):
            pattern = re.compile(
                mark + u'\\s*(<br.*?>)*\\s*' + letter, re.UNICODE)
            rules.append((pattern, _replace_with(replacement)))
    return rules


def sanitize_head(match):
    """Rebuild a ``<head>`` element with every ``<span>...</span>`` removed."""
    x = match.group(1)
    x = _span_pat.sub('', x)
    return '<head>\n%s\n</head>' % x


def chap_head(match):
    """Turn a matched chapter line (and optional title line) into an ``<h1>``.

    Reads the named groups ``chap`` and ``title`` from *match*.
    """
    chap = match.group('chap')
    title = match.group('title')
    if not title:
        return '<h1>' + chap + '</h1><br/>\n'
    return '<h1>' + chap + '<br/>\n' + title + '</h1><br/>\n'


def wrap_lines(match):
    """Replace a matched paragraph break with a space.

    A closing inline tag captured in the ``ital`` group is kept in front of
    the space.
    """
    ital = match.group('ital')
    if not ital:
        return ' '
    return ital + ' '


def line_length(raw, percent):
    """Pick a representative length of the text runs between ``<br>`` tags.

    The distinct non-zero run lengths are collected, those more than twice
    the (integer) average are dropped, and the remainder is sorted ascending.
    *percent* is clamped to [0, 1] and selects how far into that list to
    look.  Returns 0 when no run is found.
    """
    # NOTE(review): the dump showed replace(' ', ' '), a no-op; the first
    # argument is reconstructed as the entity the page rendering decoded.
    raw = raw.replace('&nbsp;', ' ')
    lines = re.findall('(?<=<br>).*?(?=<br>)', raw, re.DOTALL)
    lengths = set(len(line) for line in lines if line)
    if not lengths:
        return 0

    # Floor division keeps the Python 2 integer average on any interpreter.
    max_line = (sum(lengths) // len(lengths)) * 2
    lengths = sorted(length for length in lengths if length <= max_line)

    percent = min(1, max(0, percent))
    # Clamp at 0: a small percent used to yield index -1, i.e. the LONGEST
    # remaining length instead of the shortest.
    index = max(0, int(len(lengths) * percent) - 1)
    return lengths[index]


class CSSPreProcessor(object):
    """Strip ``@page`` rules from CSS and optionally add the XHTML namespace."""

    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __call__(self, data, add_namespace=False):
        """Return *data* without ``@page`` rules.

        When *add_namespace* is true, ``XHTML_CSS_NAMESPACE`` is inserted
        once, in front of the first line that is not an ``@import`` or
        ``@charset`` rule.
        """
        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
        data = self.PAGE_PAT.sub('', data)
        if not add_namespace:
            return data
        ans = []
        namespaced = False
        for line in data.splitlines():
            ll = line.lstrip()
            # The dump read "not namespaced and A or B", which appends the
            # namespace before every @charset line; the grouping below
            # inserts it exactly once, after the leading @import/@charset
            # lines.
            if not (namespaced or ll.startswith('@import') or
                    ll.startswith('@charset')):
                ans.append(XHTML_CSS_NAMESPACE.strip())
                namespaced = True
            ans.append(line)
        return u'\n'.join(ans)


class HTMLPreProcessor(object):
    """Apply ordered (compiled pattern, replacement) rules to HTML text."""

    # Rules applied to every document.
    PREPROCESS = [
        (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>',
                    re.IGNORECASE | re.DOTALL), sanitize_head),
        (re.compile(r'&(\S+?);'), convert_entities),
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
         _replace_with('')),
    ]

    # Rules applied when is_pdftohtml() recognises the document.
    PDFTOHTML = _accent_rules() + [
        (re.compile(r'<a name=\d+></a>', re.IGNORECASE), _replace_with('')),
        (re.compile(r'<hr.*?>', re.IGNORECASE), _replace_with('<br />')),
        (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE),
         _replace_with('<p>')),
        (re.compile(r'-<br.*?>\n\r?'), _replace_with('')),
        (re.compile(r'<BODY[^<>]+>'), _replace_with('<BODY>')),
        # U+00A0 no-break space -> plain space
        (re.compile(u'\xa0'), _replace_with(' ')),
        # "Chapter 3" style headings, optionally followed by a title line
        (re.compile(
            r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*'
            r'(?P<chap>(<i><b>|<i>|<b>)?'
            r'(Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+)?'
            r'(</i></b>|</i>|</b>)?)'
            r'(</?p[^>]*>|<br[^>]*>)\n?'
            r'((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))'
            r'((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)'
            r'(<br[^>]*>|</?p[^>]*>)))?',
            re.IGNORECASE), chap_head),
        # ALL-CAPS headings, optionally followed by a title line
        (re.compile(
            r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*'
            r'(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)'
            r'(</?p[^>]*>|<br[^>]*>)\n?'
            r'((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))'
            r'((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
        (re.compile(r'<br.*?>'), _replace_with('<p>')),
        # \u201d is the right double quotation mark
        (re.compile(u'(?<=[\\.,;\\?!\u201d"\'])[\\s^ ]*(?=<)'),
         _replace_with(' ')),
        # \u2013 is the en dash
        (re.compile(u'(?<=[^\\s][-\u2013])[\\s]*(</p>)*[\\s]*(<p>)*\\s*'
                    u'(?=[^\\s])'), _replace_with('')),
        # \u201c is the left double quotation mark
        (re.compile(u'(?<!\u201c)<i>'), _replace_with(' <i>')),
        (re.compile(r'</i>(?=\w)'), _replace_with('</i> ')),
    ]

    # Rules applied when is_book_designer() recognises the document.
    # The two heading patterns now capture the heading text as group 3; the
    # replacement always read match.group(3), but the patterns defined only
    # two groups, so any match raised IndexError.
    BOOK_DESIGNER = [
        (re.compile('<hr>', re.IGNORECASE),
         _replace_with('<span style="page-break-after:always"> </span>')),
        (re.compile(
            r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>'
            r'([^><]*?)</h2>', re.IGNORECASE),
         _heading_rewriter('<h1 id="BookTitle" align="%s">%s</h1>')),
        (re.compile(
            r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>'
            r'([^><]*?)</h2>', re.IGNORECASE),
         _heading_rewriter('<h2 id="BookAuthor" align="%s">%s</h2>')),
        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>',
                    re.IGNORECASE | re.DOTALL),
         lambda match: '<h2 class="title">%s</h2>' % (match.group(1),)),
        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>',
                    re.IGNORECASE | re.DOTALL),
         lambda match: '<h3 class="subtitle">%s</h3>' % (match.group(1),)),
    ]

    _BAEN_PAT = re.compile(
        r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"', re.IGNORECASE)

    def __init__(self, input_plugin_preprocess, plugin_preprocess,
                 extra_opts=None):
        """Store the plugin hook and the options object.

        :param input_plugin_preprocess: callable applied to the HTML as the
            last step of ``__call__`` when *plugin_preprocess* is truthy.
        :param plugin_preprocess: flag enabling that final step.
        :param extra_opts: object whose attributes (``keep_ligatures``,
            ``remove_header``, ``header_regex``, ``remove_footer``,
            ``footer_regex``, ``unwrap_factor``, ``asciiize``) tune
            ``__call__``; may be ``None``.
        """
        self.input_plugin_preprocess = input_plugin_preprocess
        self.plugin_preprocess = plugin_preprocess
        self.extra_opts = extra_opts

    def is_baen(self, src):
        """Return True if *src* has a Publisher ``<meta>`` naming Baen."""
        return self._BAEN_PAT.search(src) is not None

    def is_book_designer(self, raw):
        """Return True if *raw* contains an ``<H2 ... id=BookTitle`` tag."""
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        """Return True if the pdftohtml marker is in the first 1000 chars."""
        # NOTE(review): the dump showed `"" in src[:1000]`, which is always
        # True; the marker is an HTML comment that the page rendering
        # swallowed.  Reconstructed literal -- confirm against upstream.
        return '<!-- created with calibre\'s pdftohtml -->' in src[:1000]

    def _removal_rule(self, option, regex_attr):
        """Build the rule deleting matches of a user-supplied regex.

        Returns ``None`` when *option* is unset on ``extra_opts`` or when
        the regex cannot be compiled (the failure is printed, not raised).
        """
        if not getattr(self.extra_opts, option, None):
            return None
        try:
            pattern = re.compile(getattr(self.extra_opts, regex_attr))
        except Exception:
            # Best effort: a bad user regex must not abort the conversion.
            import traceback
            print('Failed to parse %s regexp' % option)
            traceback.print_exc()
            return None
        return (pattern, _replace_with(''))

    def __call__(self, html, remove_special_chars=None):
        """Run every applicable rule over *html* and return the result.

        :param html: markup to clean.
        :param remove_special_chars: optional compiled pattern whose matches
            are deleted before anything else happens.
        """
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\x00', '')

        # Work on copies: the header/footer rules are inserted below, and
        # inserting into the class-level lists would leak them into every
        # later call and instance.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = list(self.BOOK_DESIGNER)
        elif self.is_pdftohtml(html):
            rules = list(self.PDFTOHTML)
        else:
            rules = []

        # getattr so that the documented default extra_opts=None works.
        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)

        # Each rule is inserted at the front, so the footer rule ends up
        # ahead of the header rule.
        for option, regex_attr in (('remove_header', 'header_regex'),
                                   ('remove_footer', 'footer_regex')):
            rule = self._removal_rule(option, regex_attr)
            if rule is not None:
                rules.insert(0, rule)

        end_rules = []
        if getattr(self.extra_opts, 'unwrap_factor', 0) > 0.01:
            length = line_length(html,
                                 getattr(self.extra_opts, 'unwrap_factor'))
            if length:
                # Join a paragraph break that follows at least `length`
                # characters and a mid-sentence character.
                end_rules.append((re.compile(
                    r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*'
                    r'(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length,
                    re.UNICODE), wrap_lines))

        for pattern, replacement in self.PREPROCESS + rules + end_rules:
            html = pattern.sub(replacement, html)

        # Declare namespaces for prefixes that are used but not declared.
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace('<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
        html = XMLDECL_RE.sub('', html)

        if getattr(self.extra_opts, 'asciiize', False):
            from calibre.ebooks.unidecode.unidecoder import Unidecoder
            unidecoder = Unidecoder()
            html = unidecoder.decode(html)

        if self.plugin_preprocess:
            html = self.input_plugin_preprocess(html)
        return html