# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
- import functools
- import re
- from calibre import entity_to_unicode
# Matches a leading XML declaration so it can be stripped from HTML input.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'

# Convert entities to unicode, but keep the XML-significant characters in
# their escaped form so the markup stays well-formed.  (The identity
# mappings previously here were entity-decoding damage: the values must be
# the escaped entity strings, not the literal characters.)
convert_entities = functools.partial(entity_to_unicode, result_exceptions={
    u'<': '&lt;',
    u'>': '&gt;',
    u"'": '&apos;',
    u'"': '&quot;',
    u'&': '&amp;',
})
_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)

# Map Unicode ligature glyphs (U+FB00..U+FB06) to their ASCII letter
# sequences.  The keys are the single ligature characters; the previous
# ascii-to-ascii no-op pairs were text-extraction damage (the surviving
# long-s 'ſt' key shows the keys were ligature glyphs).
LIGATURES = {
    u'\ufb00': u'ff',
    u'\ufb01': u'fi',
    u'\ufb02': u'fl',
    u'\ufb03': u'ffi',
    u'\ufb04': u'ffl',
    u'\ufb05': u'ft',
    u'\ufb06': u'st',
}
_ligpat = re.compile(u'|'.join(LIGATURES))
-
def sanitize_head(match):
    """Rebuild a captured ``<head>`` element with stray <span> tags removed."""
    inner = _span_pat.sub('', match.group(1))
    return '<head>\n%s\n</head>' % inner
-
-
def chap_head(match):
    """Regex callback building heading markup for a detected chapter break.

    Emits an <h1> for the 'chap' group, followed by an <h3> subtitle when
    the optional 'title' group matched, otherwise a trailing <br/>.
    """
    chap = match.group('chap')
    title = match.group('title')
    if title:
        return '<h1>' + chap + '</h1>\n<h3>' + title + '</h3>\n'
    return '<h1>' + chap + '</h1><br/>\n'
-
-
def wrap_lines(match):
    """Regex callback joining hard-wrapped lines.

    Keeps any closing italic/bold markup captured in the 'ital' group and
    replaces the wrap itself with a single space.
    """
    ital = match.group('ital')
    return (ital + ' ') if ital else ' '
-
-
class DocAnalysis(object):
    """
    Analyze the lines of a document to derive statistics (typical line
    length, line-length histogram) used to decide how/whether to unwrap
    hard line breaks.

    :param format: which markup the lines are delimited by ('html',
        'pdf' or 'spanned_html')
    :param raw: the raw markup to analyze
    """

    def __init__(self, format='html', raw=''):
        # '&nbsp;' restored: the original no-op replace(' ', ' ') was
        # entity-decoding damage on the source text.
        raw = raw.replace('&nbsp;', ' ')
        if format == 'html':
            linere = re.compile('(?<=<p)(?![^>]*>\\s*</p>).*?(?=</p>)', re.DOTALL)
        elif format == 'pdf':
            linere = re.compile('(?<=<br>)(?!\\s*<br>).*?(?=<br>)', re.DOTALL)
        elif format == 'spanned_html':
            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
        else:
            # Previously an unknown format left linere unbound (NameError).
            raise ValueError('Unknown format: %r' % format)
        self.lines = linere.findall(raw)

    def line_length(self, percent):
        """
        Return the line length at the given percentile (0..1) of the
        distinct line lengths, after discarding outliers longer than
        twice the mean.  Returns 0 when there are no non-empty lines.
        """
        lengths = [len(line) for line in self.lines if line]
        if not lengths:
            return 0
        lengths = list(set(lengths))
        total = sum(lengths)
        avg = total // len(lengths)  # integer mean, matching Py2 int division
        max_line = avg * 2
        # Drop outlier lengths greater than twice the mean.
        lengths = sorted(l for l in lengths if l <= max_line)
        # Clamp percent into [0, 1].
        if percent > 1:
            percent = 1
        if percent < 0:
            percent = 0
        # NOTE(review): percent == 0 yields index -1, i.e. the *longest*
        # surviving length; preserved as existing behavior.
        index = int(len(lengths) * percent) - 1
        return lengths[index]

    def line_histogram(self, percent):
        """
        Bucket line lengths into 100-char bins and return True when some
        bin holds at least *percent* of all lines, i.e. the document has
        a dominant line length and is probably hard-wrapped.
        """
        minLineLength = 20    # ignore very short lines
        maxLineLength = 1900  # ignore very long lines
        buckets = 20
        hRaw = [0] * buckets
        if not self.lines:
            # Guard: previously divided by zero on an empty document.
            return False
        for line in self.lines:
            l = len(line)
            if minLineLength < l < maxLineLength:
                hRaw[l // 100] += 1
        totalLines = len(self.lines)
        h = [float(count) / totalLines for count in hRaw]
        maxValue = max(h)
        return maxValue >= percent
-
-
class Dehyphenator(object):
    """
    Find words broken by a trailing hyphen at line/paragraph breaks and
    decide, by searching the rest of the document for the joined form,
    whether to rejoin them or keep the hyphen.
    """

    def __init__(self):
        # Suffixes stripped from the joined word before searching for the
        # root form elsewhere in the document.
        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
        # Prefixes are only stripped when the first half is not itself
        # just a prefix.
        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)

    def dehyphenate(self, match):
        """Regex callback: return the replacement for one hyphenated pair."""
        firsthalf = match.group('firstpart')
        secondhalf = match.group('secondpart')
        try:
            wraptags = match.group('wraptags')
        except Exception:
            # Pattern has no 'wraptags' group (individual_words mode).
            wraptags = ''
        if wraptags is None:
            # Optional group did not participate in the match.
            wraptags = ''
        # unicode() calls dropped: the groups are already text, and this
        # keeps the code working on Python 3.
        hyphenated = firsthalf + '-' + secondhalf
        dehyphenated = firsthalf + secondhalf
        lookupword = self.removesuffixes.sub('', dehyphenated)
        if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
        try:
            searchresult = self.html.find(lookupword.lower())
        except Exception:
            return hyphenated
        if self.format == 'html_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
                return dehyphenated
            if self.html.find(hyphenated) != -1:
                return hyphenated
            # Word found in neither form: restore an em-dash line break.
            return firsthalf + u'\u2014' + wraptags + secondhalf
        if self.html.find(lookupword) != -1 or searchresult != -1:
            return dehyphenated
        return hyphenated

    def __call__(self, html, format, length=1):
        """Dehyphenate *html*; *length* is the minimum prefix before the
        hyphen for the 'html'/'pdf' modes."""
        self.html = html
        self.format = format
        # \u201c below is the left curly quote that was mojibake'd in the
        # recovered source.
        if format == 'html':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\\[\\]\\\\^\\$\\.\\|\\?\\*\\+\\(\\)\u201c"\\s>]+)-\\s*(?=<)(?P<wraptags></span>\\s*(</[iubp]>\\s*<[iubp][^>]*>\\s*)?<span[^>]*>|</[iubp]>\\s*<[iubp][^>]*>)?\\s*(?P<secondpart>[\\w\\d]+)' % length)
        elif format == 'pdf':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\\[\\]\\\\^\\$\\.\\|\\?\\*\\+\\(\\)\u201c"\\s>]+)-\\s*(?P<wraptags><p>|</[iub]>\\s*<p>\\s*<[iub]>)\\s*(?P<secondpart>[\\w\\d]+)' % length)
        elif format == 'individual_words':
            # Fixed: \x08 (backspace) was an unescaped \b word boundary,
            # and (?P<secondpart) was a malformed group definition.
            intextmatch = re.compile(u'>[^<]*\\b(?P<firstpart>[^\\[\\]\\\\^\\$\\.\\|\\?\\*\\+\\(\\)"\\s>]+)-(?P<secondpart>\\w+)\\b[^<]*<')
        elif format == 'html_cleanup':
            intextmatch = re.compile(u'(?P<firstpart>[^\\[\\]\\\\^\\$\\.\\|\\?\\*\\+\\(\\)\u201c"\\s>]+)-\\s*(?=<)(?P<wraptags></span>\\s*(</[iubp]>\\s*<[iubp][^>]*>\\s*)?<span[^>]*>|</[iubp]>\\s*<[iubp][^>]*>)?\\s*(?P<secondpart>[\\w\\d]+)')
        else:
            # Previously an unknown format crashed with NameError.
            return html
        return intextmatch.sub(self.dehyphenate, html)
-
-
-
class CSSPreProcessor(object):
    """Cleans up raw CSS text before it is handed to the stylesheet parser."""

    # Matches complete @page rules, which are dropped (not useful for
    # reflowable output).
    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __call__(self, data, add_namespace=False):
        """Strip @page rules; when *add_namespace* is True also inject the
        XHTML @namespace rule after any @charset/@import prelude."""
        data = self.PAGE_PAT.sub('', data)
        if not add_namespace:
            return data
        # Imported lazily, and only when actually needed, to avoid a
        # circular import at module load time.
        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
        ans = []
        namespaced = False
        for line in data.splitlines():
            ll = line.lstrip()
            # Insert the @namespace rule exactly once, before the first
            # line that is not @charset/@import (CSS requires @namespace
            # to follow those).  The decompiled condition re-inserted it
            # for every @charset line regardless of the namespaced flag.
            if not (namespaced or ll.startswith('@import') or ll.startswith('@charset')):
                ans.append(XHTML_CSS_NAMESPACE.strip())
                namespaced = True
            ans.append(line)
        return u'\n'.join(ans)
-
-
-
class HTMLPreProcessor(object):
    """
    Runs regex-based cleanup passes over input HTML before conversion.

    Each rule table is a list of ``(compiled_regex, replacement)`` pairs
    applied in order; which tables run depends on the detected source
    (Baen, Book Designer, pdftohtml output, or generic HTML).
    """

    # Rules applied to every document.
    PREPROCESS = [
        # Strip stray <span> tags out of the <head>.
        (re.compile('<head[^>]*>\\n*(.*?)\\n*</head>', re.IGNORECASE | re.DOTALL), sanitize_head),
        # Convert entities, keeping the XML-significant ones escaped.
        (re.compile('&(\\S+?);'), convert_entities),
        # Remove MS Office conditional-comment markers.
        (re.compile('</{0,1}!\\[(end){0,1}if\\]{0,1}>', re.IGNORECASE), (lambda match: '')),
    ]

    # Rules for pdftohtml output.  The first group rejoins combining
    # accents that pdftohtml emits separated from their base letter
    # (possibly across <br> tags).  Characters written as \xNN/\uNNNN
    # escapes below were mojibake in the recovered source and have been
    # restored to the intended letters.
    PDFTOHTML = [
        (re.compile(u'¨\\s*(<br.*?>)*\\s*a', re.UNICODE), (lambda match: u'ä')),
        (re.compile(u'¨\\s*(<br.*?>)*\\s*A', re.UNICODE), (lambda match: u'Ä')),
        (re.compile(u'¨\\s*(<br.*?>)*\\s*e', re.UNICODE), (lambda match: u'ë')),
        (re.compile(u'¨\\s*(<br.*?>)*\\s*E', re.UNICODE), (lambda match: u'Ë')),
        (re.compile(u'¨\\s*(<br.*?>)*\\s*i', re.UNICODE), (lambda match: u'ï')),
        (re.compile(u'¨\\s*(<br.*?>)*\\s*I', re.UNICODE), (lambda match: u'Ï')),
        (re.compile(u'¨\\s*(<br.*?>)*\\s*o', re.UNICODE), (lambda match: u'ö')),
        (re.compile(u'¨\\s*(<br.*?>)*\\s*O', re.UNICODE), (lambda match: u'Ö')),
        (re.compile(u'¨\\s*(<br.*?>)*\\s*u', re.UNICODE), (lambda match: u'ü')),
        (re.compile(u'¨\\s*(<br.*?>)*\\s*U', re.UNICODE), (lambda match: u'Ü')),
        (re.compile(u'`\\s*(<br.*?>)*\\s*a', re.UNICODE), (lambda match: u'à')),
        (re.compile(u'`\\s*(<br.*?>)*\\s*A', re.UNICODE), (lambda match: u'À')),
        (re.compile(u'`\\s*(<br.*?>)*\\s*e', re.UNICODE), (lambda match: u'è')),
        (re.compile(u'`\\s*(<br.*?>)*\\s*E', re.UNICODE), (lambda match: u'È')),
        (re.compile(u'`\\s*(<br.*?>)*\\s*i', re.UNICODE), (lambda match: u'ì')),
        (re.compile(u'`\\s*(<br.*?>)*\\s*I', re.UNICODE), (lambda match: u'Ì')),
        (re.compile(u'`\\s*(<br.*?>)*\\s*o', re.UNICODE), (lambda match: u'\xf2')),   # ò
        (re.compile(u'`\\s*(<br.*?>)*\\s*O', re.UNICODE), (lambda match: u'Ò')),
        (re.compile(u'`\\s*(<br.*?>)*\\s*u', re.UNICODE), (lambda match: u'\xf9')),   # ù
        (re.compile(u'`\\s*(<br.*?>)*\\s*U', re.UNICODE), (lambda match: u'Ù')),
        (re.compile(u'a\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'à')),
        (re.compile(u'A\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'À')),
        (re.compile(u'e\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'è')),
        (re.compile(u'E\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'È')),
        (re.compile(u'i\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'ì')),
        (re.compile(u'I\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'Ì')),
        (re.compile(u'o\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'\xf2')),   # ò
        (re.compile(u'O\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'Ò')),
        (re.compile(u'u\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'\xf9')),   # ù
        (re.compile(u'U\\s*(<br.*?>)*\\s*`', re.UNICODE), (lambda match: u'Ù')),
        (re.compile(u'´\\s*(<br.*?>)*\\s*a', re.UNICODE), (lambda match: u'á')),
        (re.compile(u'´\\s*(<br.*?>)*\\s*A', re.UNICODE), (lambda match: u'Á')),
        (re.compile(u'\xb4\\s*(<br.*?>)*\\s*c', re.UNICODE), (lambda match: u'\u0107')),  # ´c -> ć
        (re.compile(u'\xb4\\s*(<br.*?>)*\\s*C', re.UNICODE), (lambda match: u'\u0106')),  # ´C -> Ć
        (re.compile(u'´\\s*(<br.*?>)*\\s*e', re.UNICODE), (lambda match: u'é')),
        (re.compile(u'´\\s*(<br.*?>)*\\s*E', re.UNICODE), (lambda match: u'É')),
        (re.compile(u'´\\s*(<br.*?>)*\\s*i', re.UNICODE), (lambda match: u'í')),
        (re.compile(u'´\\s*(<br.*?>)*\\s*I', re.UNICODE), (lambda match: u'Í')),
        (re.compile(u'\xb4\\s*(<br.*?>)*\\s*o', re.UNICODE), (lambda match: u'\xf3')),    # ´o -> ó
        (re.compile(u'´\\s*(<br.*?>)*\\s*O', re.UNICODE), (lambda match: u'Ó')),
        (re.compile(u'´\\s*(<br.*?>)*\\s*n', re.UNICODE), (lambda match: u'ń')),
        (re.compile(u'´\\s*(<br.*?>)*\\s*N', re.UNICODE), (lambda match: u'Ń')),
        (re.compile(u'´\\s*(<br.*?>)*\\s*s', re.UNICODE), (lambda match: u'ś')),
        (re.compile(u'´\\s*(<br.*?>)*\\s*S', re.UNICODE), (lambda match: u'Ś')),
        (re.compile(u'\xb4\\s*(<br.*?>)*\\s*u', re.UNICODE), (lambda match: u'\xfa')),    # ´u -> ú
        (re.compile(u'´\\s*(<br.*?>)*\\s*U', re.UNICODE), (lambda match: u'Ú')),
        (re.compile(u'\xb4\\s*(<br.*?>)*\\s*z', re.UNICODE), (lambda match: u'\u017a')),  # ´z -> ź
        (re.compile(u'\xb4\\s*(<br.*?>)*\\s*Z', re.UNICODE), (lambda match: u'\u0179')),  # ´Z -> Ź
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*a', re.UNICODE), (lambda match: u'â')),
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*A', re.UNICODE), (lambda match: u'Â')),
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*e', re.UNICODE), (lambda match: u'ê')),
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*E', re.UNICODE), (lambda match: u'Ê')),
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*i', re.UNICODE), (lambda match: u'î')),
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*I', re.UNICODE), (lambda match: u'Î')),
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*o', re.UNICODE), (lambda match: u'ô')),
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*O', re.UNICODE), (lambda match: u'Ô')),
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*u', re.UNICODE), (lambda match: u'û')),
        (re.compile(u'ˆ\\s*(<br.*?>)*\\s*U', re.UNICODE), (lambda match: u'Û')),
        (re.compile(u'¸\\s*(<br.*?>)*\\s*c', re.UNICODE), (lambda match: u'ç')),
        (re.compile(u'¸\\s*(<br.*?>)*\\s*C', re.UNICODE), (lambda match: u'Ç')),
        (re.compile(u'\\s*˛\\s*(<br.*?>)*\\s*a', re.UNICODE), (lambda match: u'ą')),
        (re.compile(u'\\s*˛\\s*(<br.*?>)*\\s*A', re.UNICODE), (lambda match: u'Ą')),
        (re.compile(u'˛\\s*(<br.*?>)*\\s*e', re.UNICODE), (lambda match: u'ę')),
        (re.compile(u'˛\\s*(<br.*?>)*\\s*E', re.UNICODE), (lambda match: u'Ę')),
        (re.compile(u'˙\\s*(<br.*?>)*\\s*z', re.UNICODE), (lambda match: u'ż')),
        (re.compile(u'˙\\s*(<br.*?>)*\\s*Z', re.UNICODE), (lambda match: u'Ż')),
        # Remove pdftohtml's file:// source links.
        (re.compile('((?<=</a>)\\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\\s*<hr>))', re.IGNORECASE), (lambda match: '')),
        # Center scene-break marker lines (*, #, bullet).
        (re.compile(u'<br>\\s*(?P<break>([*#\u2022]+\\s*)+)\\s*<br>'), (lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>')),
        # Remove page-number anchors and turn <hr> page separators into <br>.
        (re.compile('<a name=\\d+></a>', re.IGNORECASE), (lambda match: '')),
        (re.compile('<hr.*?>', re.IGNORECASE), (lambda match: '<br>')),
        # Strip attributes from the BODY tag.
        (re.compile('<BODY[^<>]+>'), (lambda match: '<BODY>')),
        # Detect chapter headings and titles.
        (re.compile('<br>\\s*(?P<chap>(<[ibu]>){0,2}\\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\\s*([\\d\\w-]+\\s*){0,3}\\s*(</[ibu]>){0,2})\\s*(<br>\\s*){1,3}\\s*(?P<title>(<[ibu]>){0,2}(\\s*\\w+){1,4}\\s*(</[ibu]>){0,2}\\s*<br>)?', re.IGNORECASE), chap_head),
        (re.compile('<br>\\s*(?P<chap>([A-Z]\\s+){4,}\\s*([\\d\\w-]+\\s*){0,3}\\s*)\\s*(<br>\\s*){1,3}\\s*(?P<title>(<[ibu]>){0,2}(\\s*\\w+){1,4}\\s*(</[ibu]>){0,2}\\s*(<br>))?'), chap_head),
        # Have paragraphs show better.
        (re.compile('<br.*?>'), (lambda match: '<p>')),
        # Clean whitespace after sentence punctuation (\u201d is the right
        # curly quote that was mojibake'd in the recovered source).
        (re.compile(u'(?<=[\\.,;\\?!\u201d"\'])[\\s^ ]*(?=<)'), (lambda match: ' ')),
        # Add space before and after italics (\u201c = left curly quote).
        (re.compile(u'(?<!\u201c)<i>'), (lambda match: ' <i>')),
        (re.compile('</i>(?=\\w)'), (lambda match: '</i> ')),
    ]

    # Rules for Book Designer HTML: convert its id-based markup to headings.
    BOOK_DESIGNER = [
        # HR -> page break
        (re.compile('<hr>', re.IGNORECASE), (lambda match: '<span style="page-break-after:always"> </span>')),
        # Create header tags.  The format-string lambdas below were
        # mangled to "None % (...)" by the decompiler; restored to the
        # evident intent.
        # NOTE(review): these patterns define only two groups, yet the
        # replacements read match.group(3) — verify against real Book
        # Designer samples before relying on them.
        (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
         (lambda match: '<h1 id="BookTitle" align="%s">%s</h1>' % (match.group(2) if match.group(2) else 'center', match.group(3)))),
        (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
         (lambda match: '<h2 id="BookAuthor" align="%s">%s</h2>' % (match.group(2) if match.group(2) else 'center', match.group(3)))),
        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE | re.DOTALL),
         (lambda match: '<h2 class="title">%s</h2>' % (match.group(1),))),
        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE | re.DOTALL),
         (lambda match: '<h3 class="subtitle">%s</h3>' % (match.group(1),))),
    ]

    def __init__(self, input_plugin_preprocess, plugin_preprocess, extra_opts=None):
        # Callback supplied by the input plugin, run only when
        # plugin_preprocess is true; extra_opts carries conversion options.
        self.input_plugin_preprocess = input_plugin_preprocess
        self.plugin_preprocess = plugin_preprocess
        self.extra_opts = extra_opts

    def is_baen(self, src):
        """Heuristic: does the HTML declare a Baen publisher meta tag?"""
        return re.compile('<meta\\s+name="Publisher"\\s+content=".*?Baen.*?"', re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        """Heuristic: was this HTML produced by Book Designer?"""
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        """Heuristic: was this HTML produced by calibre's pdftohtml?"""
        return "<!-- created by calibre's pdftohtml -->" in src[:1000]

    def __call__(self, html, remove_special_chars=None, get_preprocess_html=False):
        """Run all applicable cleanup passes over *html* and return it.

        :param remove_special_chars: optional compiled regex of characters
            to delete up front
        :param get_preprocess_html: return early after the initial passes
            (used for debugging the pipeline)
        """
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\x00', '')
        is_pdftohtml = self.is_pdftohtml(html)
        # Copy the class-level rule tables: the header/footer inserts below
        # previously mutated the shared lists, accumulating rules across calls.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER[:]
        elif is_pdftohtml:
            rules = self.PDFTOHTML[:]
        else:
            rules = []
        start_rules = []
        if is_pdftohtml:
            # Replace non-breaking spaces (u'\xa0'; the original literal was
            # decoded to a plain space, making the rule a no-op).
            start_rules.append((re.compile(u'\xa0'), (lambda match: ' ')))
        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub((lambda m: LIGATURES[m.group()]), html)
        end_rules = []
        if getattr(self.extra_opts, 'remove_header', None):
            try:
                rules.insert(0, (re.compile(self.extra_opts.header_regex), (lambda match: '')))
            except Exception:
                # Bad user-supplied regex: report and continue.
                import traceback
                print('Failed to parse remove_header regexp')
                traceback.print_exc()
        if getattr(self.extra_opts, 'remove_footer', None):
            try:
                rules.insert(0, (re.compile(self.extra_opts.footer_regex), (lambda match: '')))
            except Exception:
                import traceback
                print('Failed to parse remove_footer regexp')
                traceback.print_exc()
        if is_pdftohtml:
            # Unwrap soft-hyphen (u'\xad') line breaks.
            end_rules.append((re.compile(u'[\xad](\\s*<p>)+\\s*(?=[[a-z\\d])'), (lambda match: '')))
            end_rules.append((re.compile(u'[\xad]\\s*(</(i|u|b)>)+(\\s*<p>)+\\s*(<(i|u|b)>)+\\s*(?=[[a-z\\d])'), (lambda match: '')))
        if getattr(self.extra_opts, 'preprocess_html', None):
            if is_pdftohtml:
                end_rules.append((re.compile('<p>\\s*(?P<chap>(<[ibu]>){0,2}\\s*([A-Z \\\'"!]{3,})\\s*([\\dA-Z:]+\\s){0,4}\\s*(</[ibu]>){0,2})\\s*<p>\\s*(?P<title>(<[ibu]>){0,2}(\\s*\\w+){1,4}\\s*(</[ibu]>){0,2}\\s*<p>)?'), chap_head))
        length = -1
        if getattr(self.extra_opts, 'unwrap_factor', 0) > 0.01:
            docanalysis = DocAnalysis('pdf', html)
            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
            if length:
                # Unwrap lines ending in en/em dash (\u2013/\u2014).
                end_rules.append((re.compile(u'(?<=.{%i}[\u2013\u2014])\\s*<p>\\s*(?=[[a-z\\d])' % length), (lambda match: '')))
                # Unwrap using punctuation at the detected line length.
                end_rules.append((re.compile(u'(?<=.{%i}([a-z,:)\\IA\xdf]|(?<!\\&\\w{4});))\\s*(?P<ital></(i|b|u)>)?\\s*(<p.*?>\\s*)+\\s*(?=(<(i|b|u)>)?\\s*[\\w\\d$(])' % length, re.UNICODE), wrap_lines))
        for rule in self.PREPROCESS + start_rules:
            html = rule[0].sub(rule[1], html)
        if get_preprocess_html:
            return html

        def dump(raw, where):
            # Debug helper (currently unused): snapshot *raw* into the
            # debug_pipeline directory when one is configured.
            import os
            dp = getattr(self.extra_opts, 'debug_pipeline', None)
            if dp and os.path.exists(dp):
                odir = os.path.join(dp, 'input')
                if os.path.exists(odir):
                    odir = os.path.join(odir, where)
                    if not os.path.exists(odir):
                        os.makedirs(odir)
                    name, i = None, 0
                    while not name or os.path.exists(os.path.join(odir, name)):
                        i += 1
                        name = '%04d.html' % i
                    # Restored from the decompiled "f = _[1]" residue of a
                    # with-statement.
                    with open(os.path.join(odir, name), 'wb') as f:
                        f.write(raw.encode('utf-8'))

        for rule in rules + end_rules:
            html = rule[0].sub(rule[1], html)
        if is_pdftohtml and length > -1:
            # Dehyphenate using the line length detected above.
            dehyphenator = Dehyphenator()
            html = dehyphenator(html, 'pdf', length)
        # Declare SVG/XLink namespaces if the content uses them.
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace('<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace('<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
        html = XMLDECL_RE.sub('', html)
        if getattr(self.extra_opts, 'asciiize', False):
            from calibre.ebooks.unidecode.unidecoder import Unidecoder
            unidecoder = Unidecoder()
            html = unidecoder.decode(html)
        if self.plugin_preprocess:
            html = self.input_plugin_preprocess(self.extra_opts, html)
        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = self.smarten_punctuation(html)
        # Transliterate characters the output profile cannot represent.
        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
        if unsupported_unicode_chars:
            from calibre.ebooks.unidecode.unidecoder import Unidecoder
            unidecoder = Unidecoder()
            for char in unsupported_unicode_chars:
                asciichar = unidecoder.decode(char)
                html = html.replace(char, asciichar)
        return html

    def smarten_punctuation(self, html):
        """Run smartypants over *html*, protecting HTML comments from it."""
        # Restored from decompiled "x = x" import residue.
        from calibre.utils.smartypants import smartyPants
        from calibre.ebooks.chardet import substitute_entites
        from uuid import uuid4
        # Hide comment delimiters behind unique tokens so smartypants does
        # not mangle them, then restore afterwards.
        start = 'calibre-smartypants-' + str(uuid4())
        stop = 'calibre-smartypants-' + str(uuid4())
        html = html.replace('<!--', start)
        html = html.replace('-->', stop)
        html = smartyPants(html)
        html = html.replace(start, '<!--')
        html = html.replace(stop, '-->')
        return substitute_entites(html)
-
-
-