home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- __license__ = 'GPL v3'
- __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
- __docformat__ = 'restructuredtext en'
import functools
import re
import traceback

from calibre import entity_to_unicode
# Matches an XML declaration (``<?xml ... ?>``) at the very start of the
# text, optionally preceded by whitespace.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
# Namespace URIs declared on the root <html> element when the markup uses
# the ``svg:`` / ``xlink:`` prefixes without declaring them.
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
# ``re.sub`` callback that converts entity references to unicode text.
# The five characters that are significant in markup are mapped back to
# their entity references (presumably so decoding never injects raw markup
# characters -- confirm against ``calibre.entity_to_unicode``).
convert_entities = functools.partial(
    entity_to_unicode,
    result_exceptions={
        u'<': '&lt;',
        u'>': '&gt;',
        u"'": '&apos;',
        u'"': '&quot;',
        u'&': '&amp;',
    })
# Matches a complete <span>...</span> element, including its content.
_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)

# Typographic ligature code points (Unicode "Alphabetic Presentation Forms")
# mapped to the plain letter sequences that replace them. The keys are
# written as escapes so the table survives any re-encoding of this file.
LIGATURES = {
    u'\ufb00': u'ff',
    u'\ufb01': u'fi',
    u'\ufb02': u'fl',
    u'\ufb03': u'ffi',
    u'\ufb04': u'ffl',
    # NOTE(review): U+FB05 is the "long s + t" ligature; the table expands
    # it to 'ft' -- confirm whether 'st' was intended.
    u'\ufb05': u'ft',
    u'\ufb06': u'st',
}
# Alternation over every ligature character, used to find them in text.
_ligpat = re.compile(u'|'.join(LIGATURES))
-
def sanitize_head(match):
    """Rebuild a ``<head>`` element with all ``<span>`` elements removed.

    ``re.sub`` callback: group 1 of *match* is the content found between
    the opening and closing head tags.

    :param match: regex match object whose first group is the head content
    :return: ``<head>`` markup wrapping the content with every
        ``<span>...</span>`` stripped out
    """
    content = _span_pat.sub('', match.group(1))
    return '<head>\n%s\n</head>' % content
-
-
def chap_head(match):
    """Format a detected chapter heading as an ``<h1>`` element.

    ``re.sub`` callback: *match* must define the named groups ``chap``
    (the chapter marker) and ``title`` (an optional title line).

    :param match: regex match object with ``chap`` and ``title`` groups
    :return: ``<h1>`` markup followed by a line break; the title, when
        present, is placed on its own line inside the heading
    """
    chap = match.group('chap')
    title = match.group('title')
    if title:
        return '<h1>' + chap + '<br/>\n' + title + '</h1><br/>\n'
    return '<h1>' + chap + '</h1><br/>\n'
-
-
def wrap_lines(match):
    """Replace a paragraph break with a single space.

    ``re.sub`` callback: if the named group ``ital`` captured a closing
    inline tag it is kept in front of the space, otherwise only the space
    is returned.

    :param match: regex match object with an ``ital`` group
    :return: ``' '`` or the captured closing tag followed by ``' '``
    """
    ital = match.group('ital')
    if ital:
        return ital + ' '
    return ' '
-
-
def line_length(raw, percent):
    """Pick a representative line length from ``<br>``-delimited text.

    The distinct, non-zero lengths of all segments found between two
    ``<br>`` tags are collected, lengths greater than twice their average
    are discarded as outliers, and the remaining lengths are sorted in
    ascending order. *percent* selects a position in that list.

    :param raw: markup whose lines are separated by literal ``<br>`` tags
    :param percent: fraction in ``[0, 1]`` (values outside are clamped);
        ``0`` selects the shortest remaining length, ``1`` the longest
    :return: the selected length, or ``0`` if no non-empty line was found
    """
    # Count a non-breaking-space entity as the single character it renders.
    raw = raw.replace('&nbsp;', ' ')
    lines = re.findall('(?<=<br>).*?(?=<br>)', raw, re.DOTALL)

    lengths = sorted(set(len(line) for line in lines if line))
    if not lengths:
        return 0

    # Floor division keeps the integer average on every Python version.
    avg = sum(lengths) // len(lengths)
    max_line = avg * 2
    # The shortest length never exceeds the average, so at least one
    # entry always survives this filter.
    lengths = [length for length in lengths if length <= max_line]

    percent = min(max(percent, 0), 1)
    # Clamp at 0: without it a small percent yields index -1, which would
    # silently select the *longest* length instead of the shortest.
    index = max(int(len(lengths) * percent) - 1, 0)
    return lengths[index]
-
-
class CSSPreProcessor(object):
    """Callable that cleans up a CSS stylesheet before further processing.

    All ``@page`` rules are removed. Optionally the XHTML ``@namespace``
    declaration defined by ``calibre.ebooks.oeb.base`` is inserted.
    """

    # A complete ``@page ... { ... }`` rule (no nested braces).
    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __call__(self, data, add_namespace=False):
        """Return *data* with ``@page`` rules stripped.

        :param data: stylesheet text
        :param add_namespace: when true, also insert the XHTML namespace
            declaration in front of the first line that is neither an
            ``@import`` nor a ``@charset`` rule
        :return: the processed stylesheet text; with *add_namespace* the
            lines are re-joined with ``'\\n'``
        """
        data = self.PAGE_PAT.sub('', data)
        if not add_namespace:
            return data

        # Imported lazily: only this branch needs the constant.
        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE

        ans = []
        namespaced = False
        for line in data.splitlines():
            stripped = line.lstrip()
            # CSS requires @namespace to follow every @charset/@import
            # rule, so those leading lines are passed through first.
            if not (namespaced or
                    stripped.startswith(('@import', '@charset'))):
                ans.append(XHTML_CSS_NAMESPACE.strip())
                namespaced = True
            ans.append(line)

        return u'\n'.join(ans)
-
-
-
# Detached accent marks as they appear in pdftohtml output. Each entry is
# (accent mark, base letters it may precede, matching precomposed letters).
# Everything is written as escapes so the table survives re-encoding.
_DETACHED_ACCENTS = (
    # diaeresis
    (u'\u00a8', u'oOuUeEiIaA',
     u'\u00f6\u00d6\u00fc\u00dc\u00eb\u00cb\u00ef\u00cf\u00e4\u00c4'),
    # grave
    (u'`', u'oOuUeEiIaA',
     u'\u00f2\u00d2\u00f9\u00d9\u00e8\u00c8\u00ec\u00cc\u00e0\u00c0'),
    # acute
    (u'\u00b4', u'oOuUeEiIaA',
     u'\u00f3\u00d3\u00fa\u00da\u00e9\u00c9\u00ed\u00cd\u00e1\u00c1'),
    # circumflex
    (u'\u02c6', u'oOuUeEiIaA',
     u'\u00f4\u00d4\u00fb\u00db\u00ea\u00ca\u00ee\u00ce\u00e2\u00c2'),
    # cedilla
    (u'\u00b8', u'cC', u'\u00e7\u00c7'),
)


def _accent_rule(accent, base, combined):
    """Build one rule that merges a detached *accent* with *base*."""
    pattern = re.compile(accent + u'\\s*(<br.*?>)*\\s*' + base, re.UNICODE)
    return pattern, (lambda match: combined)


def _accent_rules():
    """Expand ``_DETACHED_ACCENTS`` into an ordered list of rules."""
    rules = []
    for accent, bases, combined in _DETACHED_ACCENTS:
        for base, char in zip(bases, combined):
            rules.append(_accent_rule(accent, base, char))
    return rules


def _book_designer_heading(template):
    """Return a callback that fills *template* with (alignment, text).

    Group 2 of the match is the ``align`` value (``'center'`` is used when
    it is absent) and group 3 is the heading text.
    """
    def replace(match):
        return template % (match.group(2) or 'center', match.group(3))
    return replace


class HTMLPreProcessor(object):
    """Callable that applies regex-based clean-up rules to raw HTML.

    Every rule is a ``(compiled pattern, replacement callback)`` pair that
    is applied with ``pattern.sub``. ``PREPROCESS`` always runs;
    ``PDFTOHTML`` or ``BOOK_DESIGNER`` is added when the input is detected
    as coming from the respective tool.
    """

    # Rules applied to every document.
    PREPROCESS = [
        # Strip <span> elements from the document head.
        (re.compile('<head[^>]*>\\n*(.*?)\\n*</head>', re.IGNORECASE | re.DOTALL), sanitize_head),
        # Convert entity references.
        (re.compile('&(\\S+?);'), convert_entities),
        # Remove conditional-comment style ``<![if]>`` / ``<![endif]>`` tags.
        (re.compile('</{0,1}!\\[(end){0,1}if\\]{0,1}>', re.IGNORECASE), (lambda match: '')),
    ]

    # Rules for documents produced by calibre's pdftohtml.
    PDFTOHTML = _accent_rules() + [
        # Remove page anchors.
        (re.compile('<a name=\\d+></a>', re.IGNORECASE), (lambda match: '')),
        # Turn horizontal rules into line breaks.
        (re.compile('<hr.*?>', re.IGNORECASE), (lambda match: '<br />')),
        # Two consecutive line breaks become a paragraph.
        (re.compile('<br.*?>\\s*<br.*?>', re.IGNORECASE), (lambda match: '<p>')),
        # Re-join words hyphenated across a line break.
        (re.compile('-<br.*?>\\n\\r?'), (lambda match: '')),
        # Drop all attributes from the BODY tag.
        (re.compile('<BODY[^<>]+>'), (lambda match: '<BODY>')),
        # Non-breaking space -> ordinary space.
        (re.compile(u'\u00a0'), (lambda match: ' ')),
        # Chapter headings introduced by a keyword (Chapter, Part, ...).
        (re.compile('(?=<(/?br|p))(<(/?br|p)[^>]*)?>\\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\\s*([\\d\\w-]+)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\\n?((?=(<i>)?\\s*\\w+(\\s+\\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\\s*\\w+(\\s+\\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
        # Chapter headings written entirely in capitals.
        (re.compile('(?=<(/?br|p))(<(/?br|p)[^>]*)?>\\s*(?P<chap>([A-Z \\\'"!]{5,})\\s*(\\d+|\\w+)?)(</?p[^>]*>|<br[^>]*>)\\n?((?=(<i>)?\\s*\\w+(\\s+\\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
        # Remaining line breaks become paragraphs.
        (re.compile('<br.*?>'), (lambda match: '<p>')),
        # Collapse whitespace between closing punctuation and a tag.
        (re.compile(u'(?<=[\\.,;\\?!\u201d"\'])[\\s^ ]*(?=<)'), (lambda match: ' ')),
        # Join paragraphs that were split after a hyphen or en dash.
        (re.compile(u'(?<=[^\\s][-\u2013])[\\s]*(</p>)*[\\s]*(<p>)*\\s*(?=[^\\s])'), (lambda match: '')),
        # Put a space before <i> (unless it follows an opening double
        # quote) and after </i> when a word character follows.
        (re.compile(u'(?<!\u201c)<i>'), (lambda match: ' <i>')),
        (re.compile('</i>(?=\\w)'), (lambda match: '</i> ')),
    ]

    # Rules for documents produced by Book Designer.
    BOOK_DESIGNER = [
        # Horizontal rules mark page breaks.
        (re.compile('<hr>', re.IGNORECASE), (lambda match: '<span style="page-break-after:always"> </span>')),
        # Book title / author headings. The heading text is captured as
        # group 3, which the replacement callback reads.
        (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE), _book_designer_heading('<h1 id="BookTitle" align="%s">%s</h1>')),
        (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE), _book_designer_heading('<h2 id="BookAuthor" align="%s">%s</h2>')),
        # Title / subtitle spans become headings.
        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE | re.DOTALL), (lambda match: '<h2 class="title">%s</h2>' % (match.group(1),))),
        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE | re.DOTALL), (lambda match: '<h3 class="subtitle">%s</h3>' % (match.group(1),))),
    ]

    def __init__(self, input_plugin_preprocess, plugin_preprocess, extra_opts=None):
        """Store the plugin hooks and the conversion options.

        :param input_plugin_preprocess: callable applied to the HTML as the
            final step when *plugin_preprocess* is truthy
        :param plugin_preprocess: truthy to enable that final step
        :param extra_opts: options object read via attribute access
            (``keep_ligatures``, ``remove_header``, ``header_regex``,
            ``remove_footer``, ``footer_regex``, ``unwrap_factor``,
            ``asciiize``); may be ``None``
        """
        self.input_plugin_preprocess = input_plugin_preprocess
        self.plugin_preprocess = plugin_preprocess
        self.extra_opts = extra_opts

    def is_baen(self, src):
        """Return True if *src* has a Publisher meta tag mentioning Baen."""
        return re.compile('<meta\\s+name="Publisher"\\s+content=".*?Baen.*?"', re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        """Return True if *raw* contains an ``<H2 ... id=BookTitle`` tag."""
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        """Return True if the pdftohtml marker comment is near the start."""
        return "<!-- created by calibre's pdftohtml -->" in src[:1000]

    def __call__(self, html, remove_special_chars=None):
        """Run all applicable clean-up rules over *html*.

        :param html: the markup to process
        :param remove_special_chars: optional compiled pattern; every match
            is deleted before anything else happens
        :return: the processed markup
        """
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\x00', '')

        # Copy the class-level tables: user rules are inserted below and
        # must not accumulate in the shared lists across calls.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = list(self.BOOK_DESIGNER)
        elif self.is_pdftohtml(html):
            rules = list(self.PDFTOHTML)
        else:
            rules = []

        # getattr keeps this working with the default extra_opts=None.
        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub((lambda m: LIGATURES[m.group()]), html)

        end_rules = []
        # User supplied header/footer removal. Each rule is inserted at the
        # front, so the footer rule ends up ahead of the header rule.
        for option, regex_option in (('remove_header', 'header_regex'),
                                     ('remove_footer', 'footer_regex')):
            if not getattr(self.extra_opts, option, None):
                continue
            try:
                pattern = re.compile(getattr(self.extra_opts, regex_option))
            except Exception:
                # Best effort: a bad user regex must not abort processing.
                print('Failed to parse %s regexp' % option)
                traceback.print_exc()
            else:
                rules.insert(0, (pattern, (lambda match: '')))

        if getattr(self.extra_opts, 'unwrap_factor', 0) > 0.01:
            length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
            if length:
                # Unwrap a paragraph break that follows at least ``length``
                # characters and does not look like the end of a sentence.
                end_rules.append((re.compile('(?<=.{%i}[a-z\\.,;:)\\-IA])\\s*(?P<ital></(i|b|u)>)?\\s*(<p.*?>)\\s*(?=(<(i|b|u)>)?\\s*[\\w\\d(])' % length, re.UNICODE), wrap_lines))

        for pattern, replacement in self.PREPROCESS + rules + end_rules:
            html = pattern.sub(replacement, html)

        # Declare namespaces whose prefixes are used but never declared.
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace('<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace('<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

        html = XMLDECL_RE.sub('', html)

        if getattr(self.extra_opts, 'asciiize', False):
            from calibre.ebooks.unidecode.unidecoder import Unidecoder
            html = Unidecoder().decode(html)

        # NOTE(review): the flag tested and the callable invoked are two
        # different constructor arguments -- confirm this is intended.
        if self.plugin_preprocess:
            html = self.input_plugin_preprocess(html)

        return html
-
-
-