Maximum CD 2011 January

home *** CD-ROM | disk | FTP | other *** search

/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_869 (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2010-10-31 | 29.8 KB | 456 lines

# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)
#
# NOTE: this module was rebuilt from a decompiled, whitespace-collapsed dump.
# Decompiler artefacts (dangling try blocks, "X = X import ..." lines, stray
# expression statements) and text damaged by the scrape (mis-encoded Unicode
# literals, HTML entities and HTML comments swallowed inside string literals)
# have been repaired.  The file is kept pure ASCII: every non-ASCII character
# is written as a \u escape.

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import functools
import re

from calibre import entity_to_unicode

XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'

# entity_to_unicode specialised with a table of result exceptions: the five
# XML-special characters are mapped to entity strings.
# NOTE(review): how entity_to_unicode applies result_exceptions is defined in
# the calibre package, not here - confirm there.
convert_entities = functools.partial(entity_to_unicode, result_exceptions={
    u'<': '&lt;',
    u'>': '&gt;',
    u"'": '&apos;',
    u'"': '&quot;',
    u'&': '&amp;',
})

_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)

# Typographic ligature code points (U+FB00 - U+FB06) and their plain-letter
# replacements.
LIGATURES = {
    u'\ufb00': u'ff',
    u'\ufb01': u'fi',
    u'\ufb02': u'fl',
    u'\ufb03': u'ffi',
    u'\ufb04': u'ffl',
    u'\ufb05': u'ft',
    u'\ufb06': u'st',
}

_ligpat = re.compile(u'|'.join(LIGATURES))


def _constant(text):
    """Return a regex substitution callback that always yields ``text``."""
    return lambda match: text


def _accent_rules(mark, pairs, mark_first=True, lead=u''):
    """Build ``(compiled_pattern, callback)`` rules that fuse a detached
    accent with its letter.

    Each rule matches ``mark`` and a letter separated only by whitespace
    and/or ``<br...>`` tags and replaces the whole match with the composed
    character.

    :param mark: the stand-alone accent character.
    :param pairs: ordered ``(letter, composed_character)`` tuples; rules are
        returned in this order.
    :param mark_first: True when the accent precedes the letter, False when
        it follows it.
    :param lead: extra pattern text placed before the accent (only used when
        ``mark_first`` is true).
    """
    gap = u'\\s*(<br.*?>)*\\s*'
    rules = []
    for letter, composed in pairs:
        if mark_first:
            pattern = lead + mark + gap + letter
        else:
            pattern = letter + gap + mark
        rules.append((re.compile(pattern, re.UNICODE), _constant(composed)))
    return rules


def sanitize_head(match):
    """Rebuild a ``<head>`` element from group 1 of ``match`` with every
    ``<span>...</span>`` run removed."""
    x = _span_pat.sub('', match.group(1))
    return '<head>\n%s\n</head>' % x


def chap_head(match):
    """Format the ``chap`` / ``title`` groups of ``match`` as headings.

    Returns an ``<h1>`` for the chapter followed by an ``<h3>`` for the
    title, or by ``<br/>`` when the title group is empty or did not match.
    """
    chap = match.group('chap')
    title = match.group('title')
    if not title:
        return '<h1>' + chap + '</h1><br/>\n'
    return '<h1>' + chap + '</h1>\n<h3>' + title + '</h3>\n'


def wrap_lines(match):
    """Replace a matched line break with one space, keeping the closing
    formatting tag captured in the ``ital`` group (if any) before it."""
    ital = match.group('ital')
    if not ital:
        return ' '
    return ital + ' '


class DocAnalysis(object):
    """Collects the "lines" of a document and offers length statistics.

    ``format`` selects how lines are extracted from ``raw``: ``'html'``
    (content of non-empty ``<p`` ... ``</p>``), ``'pdf'`` (text between
    ``<br>`` tags) or ``'spanned_html'`` (content of ``<span`` ...
    ``</span>``).
    """

    LINE_PATTERNS = {
        'html': re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL),
        'pdf': re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL),
        'spanned_html': re.compile(r'(?<=<span).*?(?=</span>)', re.DOTALL),
    }

    def __init__(self, format='html', raw=''):
        """Extract the lines of ``raw`` according to ``format``.

        :raises ValueError: if ``format`` is not one of the known formats
            (previously this surfaced as an UnboundLocalError).
        """
        raw = raw.replace('&nbsp;', ' ')
        try:
            linere = self.LINE_PATTERNS[format]
        except KeyError:
            raise ValueError('Unknown DocAnalysis format: %r' % (format,))
        self.lines = linere.findall(raw)

    def line_length(self, percent):
        """Return a representative line length.

        The distinct non-zero line lengths are collected, values larger than
        twice their (integer) average are dropped, and the remaining sorted
        list is indexed at ``percent`` (clamped to 0..1) of its length.
        Returns 0 when there are no non-empty lines.
        """
        lengths = set(len(line) for line in self.lines if line)
        if not lengths:
            return 0

        max_line = (sum(lengths) // len(lengths)) * 2
        lengths = sorted(n for n in lengths if n <= max_line)

        percent = min(max(percent, 0), 1)
        # Clamp at 0: for small percentages the computed index used to be -1,
        # which silently selected the *longest* length.
        index = max(int(len(lengths) * percent) - 1, 0)
        return lengths[index]

    def line_histogram(self, percent):
        """Return True when at least ``percent`` (a 0..1 fraction) of all
        lines fall into a single 100-character-wide length bucket.

        Only lines strictly between 20 and 1900 characters long are counted
        in a bucket, but the fraction is taken over all lines.  Returns
        False for a document with no lines.
        """
        min_line_length = 20
        max_line_length = 1900
        buckets = 20

        total_lines = len(self.lines)
        if not total_lines:
            # Nothing to measure; avoids a division by zero below.
            return False

        counts = [0] * buckets
        for line in self.lines:
            size = len(line)
            if min_line_length < size < max_line_length:
                counts[size // 100] += 1

        largest_share = float(max(counts)) / total_lines
        return not (largest_share < percent)


class Dehyphenator(object):
    """Decides, word by word, whether a hyphen at a line wrap should be
    kept, using the text of the document itself as the dictionary."""

    # Character class shared by the patterns: anything except regex
    # metacharacters, quotes, whitespace and '>'.  The first variant also
    # excludes the left double quotation mark (U+201C).
    _FIRSTPART = (u'(?P<firstpart>[^\\[\\]\\\\^\\$\\.\\|\\?\\*\\+\\(\\)'
                  u'\u201c"\\s>]+)')
    _FIRSTPART_PLAIN = (u'(?P<firstpart>[^\\[\\]\\\\^\\$\\.\\|\\?\\*\\+\\(\\)'
                        u'"\\s>]+)')
    _SECONDPART = u'(?P<secondpart>[\\w\\d]+)'
    _HTML_WRAPTAGS = (u'(?P<wraptags></span>\\s*(</[iubp]>\\s*<[iubp][^>]*>'
                      u'\\s*)?<span[^>]*>|</[iubp]>\\s*<[iubp][^>]*>)?')
    _PDF_WRAPTAGS = u'(?P<wraptags><p>|</[iub]>\\s*<p>\\s*<[iub]>)'

    def __init__(self):
        # Suffixes stripped from the joined word before it is looked up.
        self.removesuffixes = re.compile(
            r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
            r"(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|"
            r"ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$",
            re.IGNORECASE)
        # A prefix is stripped only when the first half is not itself
        # exactly one of these prefixes.
        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)

    def dehyphenate(self, match):
        """Substitution callback: return the replacement for one
        hyphenated match.

        The two halves are joined, suffixes/prefixes are stripped, and the
        resulting stem is searched for in ``self.html``.  If found the joined
        word is returned; otherwise the hyphenated form is returned, except
        in ``'html_cleanup'`` mode where, if the hyphenated form is not in
        the document either, the halves are joined by an em dash with the
        wrapping tags kept.
        """
        firsthalf = match.group('firstpart')
        secondhalf = match.group('secondpart')
        try:
            # None when the optional group did not take part in the match.
            wraptags = match.group('wraptags') or u''
        except IndexError:
            # The active pattern defines no 'wraptags' group.
            wraptags = u''

        hyphenated = unicode(firsthalf) + '-' + unicode(secondhalf)
        dehyphenated = unicode(firsthalf) + unicode(secondhalf)

        lookupword = self.removesuffixes.sub('', dehyphenated)
        if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)

        try:
            searchresult = self.html.find(lookupword.lower())
        except Exception:
            # Deliberate best effort: if the lookup itself fails, leave the
            # word as it was.
            return hyphenated

        if self.html.find(lookupword) != -1 or searchresult != -1:
            return dehyphenated
        if self.format == 'html_cleanup':
            if self.html.find(hyphenated) != -1:
                return hyphenated
            return firsthalf + u'\u2014' + wraptags + secondhalf
        return hyphenated

    def __call__(self, html, format, length=1):
        """Return ``html`` with wrapped hyphenated words resolved.

        :param html: document text; also used as the lookup dictionary.
        :param format: ``'html'``, ``'pdf'``, ``'individual_words'`` or
            ``'html_cleanup'``.
        :param length: minimum number of characters that must precede a
            match (used by ``'html'`` and ``'pdf'`` only).
        :raises ValueError: for an unknown ``format`` (previously an
            UnboundLocalError).
        """
        self.html = html
        self.format = format
        if format == 'html':
            pattern = (u'(?<=.{%i})' % length + self._FIRSTPART +
                       u'-\\s*(?=<)' + self._HTML_WRAPTAGS + u'\\s*' +
                       self._SECONDPART)
        elif format == 'pdf':
            pattern = (u'(?<=.{%i})' % length + self._FIRSTPART + u'-\\s*' +
                       self._PDF_WRAPTAGS + u'\\s*' + self._SECONDPART)
        elif format == 'individual_words':
            # Fixed: the group was written "(?P<secondpart)" (which does not
            # compile) and "\b" had been interpreted as a backspace.
            pattern = (u'>[^<]*\\b' + self._FIRSTPART_PLAIN +
                       u'-(?P<secondpart>\\w+)\\b[^<]*<')
        elif format == 'html_cleanup':
            pattern = (self._FIRSTPART + u'-\\s*(?=<)' + self._HTML_WRAPTAGS +
                       u'\\s*' + self._SECONDPART)
        else:
            raise ValueError('Unknown Dehyphenator format: %r' % (format,))

        intextmatch = re.compile(pattern)
        return intextmatch.sub(self.dehyphenate, html)


class CSSPreProcessor(object):
    """Strips ``@page`` rules from CSS and optionally inserts the XHTML
    namespace declaration."""

    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __call__(self, data, add_namespace=False):
        """Return ``data`` without ``@page {...}`` rules.

        When ``add_namespace`` is true, ``XHTML_CSS_NAMESPACE`` is inserted
        once, immediately before the first line that does not start with
        ``@import`` or ``@charset``.
        """
        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
        data = self.PAGE_PAT.sub('', data)
        if not add_namespace:
            return data

        ans = []
        namespaced = False
        for line in data.splitlines():
            ll = line.lstrip()
            # The namespace goes after any leading @charset/@import lines
            # and must be emitted only once.
            if not (namespaced or ll.startswith(('@import', '@charset'))):
                ans.append(XHTML_CSS_NAMESPACE.strip())
                namespaced = True
            ans.append(line)
        return u'\n'.join(ans)


class HTMLPreProcessor(object):
    """Applies ordered lists of ``(compiled_regex, replacement_callback)``
    rules to raw HTML before parsing."""

    # Always applied first.
    PREPROCESS = [
        # Strip <span> runs out of <head>.
        (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>',
                    re.IGNORECASE | re.DOTALL), sanitize_head),
        # Convert entities.
        (re.compile(r'&(\S+?);'), convert_entities),
        # Remove <![if ...]> / <![endif]> style tags.
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
         _constant('')),
    ]

    # Applied when is_pdftohtml() recognises the document.
    PDFTOHTML = (
        # Diaeresis (U+00A8) before the letter.
        _accent_rules(u'\u00a8', (
            (u'a', u'\u00e4'), (u'A', u'\u00c4'),
            (u'e', u'\u00eb'), (u'E', u'\u00cb'),
            (u'i', u'\u00ef'), (u'I', u'\u00cf'),
            (u'o', u'\u00f6'), (u'O', u'\u00d6'),
            (u'u', u'\u00fc'), (u'U', u'\u00dc'),
        )) +
        # Grave accent (backtick) before the letter ...
        _accent_rules(u'`', (
            (u'a', u'\u00e0'), (u'A', u'\u00c0'),
            (u'e', u'\u00e8'), (u'E', u'\u00c8'),
            (u'i', u'\u00ec'), (u'I', u'\u00cc'),
            (u'o', u'\u00f2'), (u'O', u'\u00d2'),
            (u'u', u'\u00f9'), (u'U', u'\u00d9'),
        )) +
        # ... and after the letter.
        _accent_rules(u'`', (
            (u'a', u'\u00e0'), (u'A', u'\u00c0'),
            (u'e', u'\u00e8'), (u'E', u'\u00c8'),
            (u'i', u'\u00ec'), (u'I', u'\u00cc'),
            (u'o', u'\u00f2'), (u'O', u'\u00d2'),
            (u'u', u'\u00f9'), (u'U', u'\u00d9'),
        ), mark_first=False) +
        # Acute accent (U+00B4).
        _accent_rules(u'\u00b4', (
            (u'a', u'\u00e1'), (u'A', u'\u00c1'),
            (u'c', u'\u0107'), (u'C', u'\u0106'),
            (u'e', u'\u00e9'), (u'E', u'\u00c9'),
            (u'i', u'\u00ed'), (u'I', u'\u00cd'),
            (u'o', u'\u00f3'), (u'O', u'\u00d3'),
            (u'n', u'\u0144'), (u'N', u'\u0143'),
            (u's', u'\u015b'), (u'S', u'\u015a'),
            (u'u', u'\u00fa'), (u'U', u'\u00da'),
            (u'z', u'\u017a'), (u'Z', u'\u0179'),
        )) +
        # Modifier circumflex (U+02C6).
        _accent_rules(u'\u02c6', (
            (u'a', u'\u00e2'), (u'A', u'\u00c2'),
            (u'e', u'\u00ea'), (u'E', u'\u00ca'),
            (u'i', u'\u00ee'), (u'I', u'\u00ce'),
            (u'o', u'\u00f4'), (u'O', u'\u00d4'),
            (u'u', u'\u00fb'), (u'U', u'\u00db'),
        )) +
        # Cedilla (U+00B8).
        _accent_rules(u'\u00b8', (
            (u'c', u'\u00e7'), (u'C', u'\u00c7'),
        )) +
        # Ogonek (U+02DB); the a/A rules also swallow preceding whitespace.
        _accent_rules(u'\u02db', (
            (u'a', u'\u0105'), (u'A', u'\u0104'),
        ), lead=u'\\s*') +
        _accent_rules(u'\u02db', (
            (u'e', u'\u0119'), (u'E', u'\u0118'),
        )) +
        # Dot above (U+02D9).
        _accent_rules(u'\u02d9', (
            (u'z', u'\u017c'), (u'Z', u'\u017b'),
        )) +
        [
            # Remove "file:///..." lines that follow </a> or precede <hr>.
            (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|'
                        r'file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE),
             _constant('')),
            # Centre separator lines made of '*', '#' or bullets (U+2022).
            (re.compile(u'<br>\\s*(?P<break>([*#\u2022]+\\s*)+)\\s*<br>'),
             (lambda match: '<p>\n<p style="text-align:center">' +
              match.group(1) + '</p>')),
            # Remove numeric anchors.
            (re.compile(r'<a name=\d+></a>', re.IGNORECASE), _constant('')),
            # Turn <hr> into <br>.
            (re.compile(r'<hr.*?>', re.IGNORECASE), _constant('<br>')),
            # Drop attributes from <BODY>.
            (re.compile(r'<BODY[^<>]+>'), _constant('<BODY>')),
            # Chapter headings introduced by a keyword.
            (re.compile(
                r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|'
                r'Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|'
                r'Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*'
                r'(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*'
                r'(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}'
                r'\s*<br>)?', re.IGNORECASE), chap_head),
            # Chapter headings written as space-separated capitals.
            (re.compile(
                r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*'
                r'(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*'
                r'(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
            # Remaining <br> tags become paragraph starts.
            (re.compile(r'<br.*?>'), _constant('<p>')),
            # Collapse whitespace between closing punctuation and a tag.
            (re.compile(u'(?<=[\\.,;\\?!\u201d"\'])[\\s^ ]*(?=<)'),
             _constant(' ')),
            # Space before <i> (unless preceded by U+201C) and after </i>.
            (re.compile(u'(?<!\u201c)<i>'), _constant(' <i>')),
            (re.compile(r'</i>(?=\w)'), _constant('</i> ')),
        ]
    )

    # Applied when is_book_designer() recognises the document.
    BOOK_DESIGNER = [
        (re.compile('<hr>', re.IGNORECASE),
         _constant('<span style="page-break-after:always"> </span>')),
        # Fixed: the heading text is now captured as group 3.  The callbacks
        # always read match.group(3), but the pattern only had two groups.
        (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*'
                    r'[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         (lambda match: '<h1 id="BookTitle" align="%s">%s</h1>' % (
             match.group(2) if match.group(2) else 'center',
             match.group(3)))),
        (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*'
                    r'[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         (lambda match: '<h2 id="BookAuthor" align="%s">%s</h2>' % (
             match.group(2) if match.group(2) else 'center',
             match.group(3)))),
        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>',
                    re.IGNORECASE | re.DOTALL),
         (lambda match: '<h2 class="title">%s</h2>' % (match.group(1),))),
        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>',
                    re.IGNORECASE | re.DOTALL),
         (lambda match: '<h3 class="subtitle">%s</h3>' % (match.group(1),))),
    ]

    def __init__(self, input_plugin_preprocess, plugin_preprocess,
                 extra_opts=None):
        """Store the plugin hooks and the options object.

        ``input_plugin_preprocess`` is called as ``f(extra_opts, html)`` from
        ``__call__`` when ``plugin_preprocess`` is truthy.  ``extra_opts`` is
        only read through attribute access and may be None.
        """
        self.input_plugin_preprocess = input_plugin_preprocess
        self.plugin_preprocess = plugin_preprocess
        self.extra_opts = extra_opts

    def is_baen(self, src):
        """Return True if ``src`` has a Publisher <meta> mentioning Baen."""
        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                          re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        """Return True if ``raw`` contains an <H2 ... id=BookTitle tag."""
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        """Return True if the pdftohtml marker comment occurs within the
        first 1000 characters of ``src``."""
        return "<!-- created by calibre's pdftohtml -->" in src[:1000]

    def _dump(self, raw, where):
        """Debugging aid (not called by default): write ``raw`` as the next
        free ``NNNN.html`` file under ``<debug_pipeline>/input/<where>``.

        Does nothing unless ``extra_opts.debug_pipeline`` and its ``input``
        sub-directory already exist.
        """
        import os
        dp = getattr(self.extra_opts, 'debug_pipeline', None)
        if not dp or not os.path.exists(dp):
            return
        odir = os.path.join(dp, 'input')
        if not os.path.exists(odir):
            return
        odir = os.path.join(odir, where)
        if not os.path.exists(odir):
            os.makedirs(odir)
        name, i = None, 0
        while not name or os.path.exists(os.path.join(odir, name)):
            i += 1
            name = '%04d.html' % i
        with open(os.path.join(odir, name), 'wb') as f:
            f.write(raw.encode('utf-8'))

    def __call__(self, html, remove_special_chars=None,
                 get_preprocess_html=False):
        """Run the preprocessing pipeline over ``html`` and return the result.

        :param html: the document text.
        :param remove_special_chars: optional compiled regex; every match is
            deleted before anything else happens.
        :param get_preprocess_html: when true, return right after the
            PREPROCESS (and non-breaking-space) rules have been applied.
        """
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\x00', '')

        is_pdftohtml = self.is_pdftohtml(html)
        # Work on copies: rules are inserted below, and inserting into the
        # class-level lists would leak header/footer rules into every later
        # call and instance.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = list(self.BOOK_DESIGNER)
        elif is_pdftohtml:
            rules = list(self.PDFTOHTML)
        else:
            rules = []

        start_rules = []
        if is_pdftohtml:
            # Non-breaking spaces become ordinary spaces.
            start_rules.append((re.compile(u'\u00a0'), _constant(' ')))

        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub((lambda m: LIGATURES[m.group()]), html)

        # User supplied header/footer regexps run before all other rules.
        # Header is inserted first, so the footer rule ends up in front.
        for option, regex_attr in (('remove_header', 'header_regex'),
                                   ('remove_footer', 'footer_regex')):
            if not getattr(self.extra_opts, option, None):
                continue
            try:
                rules.insert(0, (
                    re.compile(getattr(self.extra_opts, regex_attr)),
                    _constant('')))
            except Exception:
                # Best effort: a bad user regexp must not abort conversion.
                import traceback
                print('Failed to parse %s regexp' % option)
                traceback.print_exc()

        end_rules = []
        if is_pdftohtml:
            # Delete soft hyphens (U+00AD) at paragraph breaks, without and
            # with surrounding formatting tags.
            end_rules.append((
                re.compile(u'[\u00ad](\\s*<p>)+\\s*(?=[[a-z\\d])'),
                _constant('')))
            end_rules.append((
                re.compile(u'[\u00ad]\\s*(</(i|u|b)>)+(\\s*<p>)+\\s*'
                           u'(<(i|u|b)>)+\\s*(?=[[a-z\\d])'),
                _constant('')))

        if getattr(self.extra_opts, 'preprocess_html', None) and is_pdftohtml:
            # Upper-case chapter headings.
            end_rules.append((re.compile(
                r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*'
                r'([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*'
                r'(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}'
                r'\s*<p>)?'), chap_head))

        length = -1
        unwrap_factor = getattr(self.extra_opts, 'unwrap_factor', 0)
        if unwrap_factor > 0.01:
            docanalysis = DocAnalysis('pdf', html)
            length = docanalysis.line_length(unwrap_factor)
            if length:
                # Unwrap after an en/em dash.
                end_rules.append((
                    re.compile(u'(?<=.{%i}[\u2013\u2014])\\s*<p>\\s*'
                               u'(?=[[a-z\\d])' % length),
                    _constant('')))
                # Unwrap after a lower-case letter or mid-sentence
                # punctuation.
                end_rules.append((
                    re.compile(u'(?<=.{%i}([a-z,:)\\IA\u00df]|'
                               u'(?<!\\&\\w{4});))\\s*'
                               u'(?P<ital></(i|b|u)>)?\\s*(<p.*?>\\s*)+\\s*'
                               u'(?=(<(i|b|u)>)?\\s*[\\w\\d$(])' % length,
                               re.UNICODE),
                    wrap_lines))

        for rule in self.PREPROCESS + start_rules:
            html = rule[0].sub(rule[1], html)

        if get_preprocess_html:
            return html

        for rule in rules + end_rules:
            html = rule[0].sub(rule[1], html)

        if is_pdftohtml and length > -1:
            dehyphenator = Dehyphenator()
            html = dehyphenator(html, 'pdf', length)

        # Declare the svg/xlink namespaces when their prefixes are used but
        # the namespace URI is missing.
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace('<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace('<html', '<html xmlns:xlink="%s"' % XLINK_NS,
                                1)

        html = XMLDECL_RE.sub('', html)

        if getattr(self.extra_opts, 'asciiize', False):
            from calibre.ebooks.unidecode.unidecoder import Unidecoder
            unidecoder = Unidecoder()
            html = unidecoder.decode(html)

        if self.plugin_preprocess:
            html = self.input_plugin_preprocess(self.extra_opts, html)

        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = self.smarten_punctuation(html)

        # extra_opts defaults to None, so read the profile defensively.
        output_profile = getattr(self.extra_opts, 'output_profile', None)
        unsupported_unicode_chars = getattr(
            output_profile, 'unsupported_unicode_chars', None)
        if unsupported_unicode_chars:
            from calibre.ebooks.unidecode.unidecoder import Unidecoder
            unidecoder = Unidecoder()
            for char in unsupported_unicode_chars:
                asciichar = unidecoder.decode(char)
                html = html.replace(char, asciichar)

        return html

    def smarten_punctuation(self, html):
        """Run smartyPants over ``html`` and return the result passed
        through ``substitute_entites``.

        HTML comment delimiters are swapped for unique placeholder tokens
        before the call and restored afterwards.
        """
        from calibre.utils.smartypants import smartyPants
        from calibre.ebooks.chardet import substitute_entites
        from uuid import uuid4
        start = 'calibre-smartypants-' + str(uuid4())
        stop = 'calibre-smartypants-' + str(uuid4())
        html = html.replace('<!--', start)
        html = html.replace('-->', stop)
        html = smartyPants(html)
        html = html.replace(start, '<!--')
        html = html.replace(stop, '-->')
        return substitute_entites(html)