# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)

# Module metadata. The leading "- " list markers left behind by the text
# extraction made these assignments syntactically invalid; they are restored
# here as plain module-level assignments with their values untouched.
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re

from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log

class PreProcessor(object):
    """Regex-based clean-up pass over an HTML string.

    Calling an instance with HTML text runs a fixed sequence of rewrites
    (paragraph-markup repair, nbsp-indent conversion, empty-tag removal,
    chapter-heading detection, line unwrapping / dehyphenation, section-break
    marking) and returns the rewritten text.

    Instance state:

    * ``log`` -- callable that receives progress messages.
    * ``extra_opts`` -- options object (may be ``None``); the attributes
      ``remove_paragraph_spacing`` and ``html_unwrap_factor`` are read with
      ``getattr`` defaults.
    * ``html_preprocess_sections`` -- number of headings found or inserted.
    * ``found_indents`` -- number of indents rewritten by ``insert_indent``.
    """

    # Non-ASCII characters are spelled as escapes so that the patterns do not
    # depend on how this source file happens to be encoded or re-encoded.
    _NBSP = u'\u00a0'
    _SOFT_HYPHEN = u'\u00ad'
    _EN_EM_DASHES = u'\u2013\u2014'
    _SHARP_S = u'\u00df'

    # Heading detection stops escalating once this many sections are known.
    _MIN_SECTIONS = 10
    # Fraction of blank paragraphs above which they are treated as spacing.
    _BLANK_RATIO = 0.4

    _BLANK_PARAGRAPH = re.compile(
        r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)

    _EMPTY_SPAN = (
        r'\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*')
    _EMPTY_IBU = (
        r'\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>')

    # Optional run of closing/opening inline tags that may sit between the
    # two halves of a wrapped line.
    _INLINE_JOIN = (
        r'(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>'
        r'|</[iubp]>\s*<[iubp][^>]*>)?')

    _CHAPTER_LOOKAHEAD = '(?=<(p|div))'
    _CHAPTER_LINE_OPEN = (
        r'<(?P<outer>p|div)[^>]*>\s*'
        r'(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*'
        r'(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*'
        r'(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*')
    # The innermost-opened tag closes first.  The inner1 close used to be
    # written "</(?P=inner1)\s[^>]*>", which demands whitespace inside a
    # closing tag and therefore never matched a normal "</span>".
    _CHAPTER_LINE_CLOSE = (
        r'(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*'
        r'</(?P=outer)>\s*')
    _TITLE_LINE_OPEN = (
        r'<(?P<outer2>p|div)[^>]*>\s*'
        r'(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*'
        r'(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*'
        r'(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*')
    # Same closing-tag repair as _CHAPTER_LINE_CLOSE, for inner4.
    _TITLE_LINE_CLOSE = (
        r'(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*'
        r'</(?P=outer2)>')
    _DEFAULT_TITLE = r'(\s*[\w\'\"-]+){1,5}(?!<)'

    _TYPICAL_CHAPTERS = (
        r'.?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue'
        r'|Volume\s|Prologue|Book\s|Part\s|Dedication)'
        r'\s*([\d\w-]+\:?\s*){0,4}')
    _NUMERIC_CHAPTERS = (
        r'.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*')
    _UPPERCASE_CHAPTERS = r'\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*'

    def __init__(self, extra_opts=None, log=None):
        """Store options and pick the logger.

        :param extra_opts: options object or ``None``.
        :param log: callable taking one message; when ``None`` the
            module-level ``default_log`` is used so that ``self.log`` is
            always callable.
        """
        self.log = default_log if log is None else log
        self.html_preprocess_sections = 0
        self.found_indents = 0
        self.extra_opts = extra_opts

    # ------------------------------------------------------------------
    # re.sub callbacks
    # ------------------------------------------------------------------

    def chapter_head(self, match):
        """Substitution callback: turn a detected chapter line into headings.

        :param match: match object providing the named groups ``chap`` and
            ``title`` (``title`` may be empty or ``None``).
        :return: ``<h2>`` markup for the chapter, followed by ``<h3>`` markup
            for the title when one was captured.
        """
        chap = match.group('chap')
        title = match.group('title')
        self.html_preprocess_sections += 1
        if not title:
            self.log(u'found %s chapters. - %s'
                     % (self.html_preprocess_sections, chap))
            return '<h2>' + chap + '</h2>\n'
        self.log(u'found %s chapters & titles. - %s, %s'
                 % (self.html_preprocess_sections, chap, title))
        return '<h2>' + chap + '</h2>\n<h3>' + title + '</h3>\n'

    def chapter_break(self, match):
        """Substitution callback: add a page break before a section marker.

        :param match: match object providing the named groups ``styles``
            (the opening tag's name and attributes) and ``section`` (the rest
            of the matched element).
        :return: the element with ``page-break-before:always`` added to its
            opening tag.
        """
        chap = match.group('section')
        styles = match.group('styles')
        self.html_preprocess_sections += 1
        self.log(u'marked %s section markers based on punctuation. - %s'
                 % (self.html_preprocess_sections, chap))
        return '<' + styles + ' style="page-break-before:always">' + chap

    def insert_indent(self, match):
        """Substitution callback: replace an nbsp indent with a CSS indent.

        :param match: match object providing the named groups ``formatting``
            (attributes of the ``<p>`` tag) and ``span`` (leading ``<span>``
            tags, possibly ``None``).
        :return: a ``<p>`` opening tag carrying ``text-indent:3%``, followed
            by the captured span tags when present.
        """
        pstyle = match.group('formatting')
        span = match.group('span') or ''
        self.found_indents += 1
        if pstyle:
            return '<p ' + pstyle + ' style="text-indent:3%">' + span
        return '<p style="text-indent:3%">' + span

    # ------------------------------------------------------------------
    # Analysis helpers
    # ------------------------------------------------------------------

    def no_markup(self, raw, percent):
        """Report whether *raw* has too few ``</p>`` tags for its line count.

        :param raw: text to inspect.
        :param percent: fraction of line endings that must be exceeded by the
            number of ``</p>`` tags; values outside ``[0, 1]`` are clamped.
        :return: ``True`` when ``line_endings * percent`` is greater than the
            number of ``</p>`` tags, otherwise ``False``.
        """
        tot_htm_ends = len(re.findall('</p>', raw))
        # "\r\n" must be tried first; otherwise the "\r" and "\n" of a CRLF
        # pair are counted as two separate line endings.
        tot_ln_fds = len(re.findall(r'\r\n|\n|\r', raw))
        self.log(u'There are %s total Line feeds, and %s marked up endings'
                 % (tot_ln_fds, tot_htm_ends))
        if percent > 1:
            percent = 1
        if percent < 0:
            percent = 0
        min_lns = tot_ln_fds * percent
        self.log(u'There must be fewer than %s unmarked lines to add markup'
                 % (min_lns,))
        return min_lns > tot_htm_ends

    def _chapter_marker(self, header, blank_lines):
        """Build the regex source for a chapter line described by *header*,
        optionally followed by *blank_lines* and a title line."""
        return (
            self._CHAPTER_LOOKAHEAD + self._CHAPTER_LINE_OPEN
            + '(?P<chap>' + header + r')\s*' + self._CHAPTER_LINE_CLOSE
            + blank_lines
            + '(' + self._TITLE_LINE_OPEN
            + '(?P<title>' + self._DEFAULT_TITLE + r')\s*'
            + self._TITLE_LINE_CLOSE
            + ')?'
        )

    # ------------------------------------------------------------------
    # Pipeline stages (called in this order by __call__)
    # ------------------------------------------------------------------

    def _ensure_paragraph_markup(self, html):
        """Normalise whitespace around ``<p>``/``</p>`` and add paragraph
        tags when ``no_markup`` reports that too few are present."""
        html = re.sub(r'\s*</p>', '</p>\n', html)
        html = re.sub(r'\s*<p>\s*', '\n<p>', html)
        if not self.no_markup(html, 0.1):
            return html

        self.log('not enough paragraph markers, adding now')
        pre = re.compile('<pre>', re.IGNORECASE)
        if len(pre.findall(html)) == 1:
            self.log('Running Text Processing')
            # Function-scope import (the decompiler had turned it into
            # self-assignments that raise UnboundLocalError).
            from calibre.ebooks.txt.processor import (
                convert_basic, preserve_spaces,
                separate_paragraphs_single_line)
            outerhtml = re.compile(
                r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*',
                re.IGNORECASE | re.DOTALL)
            # Keep only what sits between <pre> and </pre>.
            html = outerhtml.sub(r'\g<text>', html)
            html = separate_paragraphs_single_line(html)
            html = preserve_spaces(html)
            html = convert_basic(html, epub_split_size_kb=0)
        else:
            # Every newline not directly preceded by a tag end becomes a
            # paragraph boundary.
            html = re.sub(r'(?<!>)(\n)', '</p>\n<p>', html)
        return html

    def _replace_nbsp_indents(self, html):
        """Turn runs of two or more non-breaking spaces at the start of a
        paragraph into a CSS indent, then replace every remaining
        non-breaking space with a plain space."""
        txtindent = re.compile(
            r'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*('
            + self._NBSP + r'){2,}',
            re.IGNORECASE)
        html = txtindent.sub(self.insert_indent, html)
        if self.found_indents > 1:
            self.log(u'replaced %s nbsp indents with inline styles'
                     % (self.found_indents,))
        return re.sub(self._NBSP, ' ', html)

    def _strip_empty_tags(self, html):
        """Replace empty ``<o:p>``, ``<span>`` and ``<i>/<b>/<u>`` elements
        with a single space."""
        html = re.sub(r'\s*<o:p>\s*</o:p>', ' ', html)
        html = re.sub(self._EMPTY_SPAN, ' ', html)
        html = re.sub(self._EMPTY_IBU, ' ', html)
        # Second span pass: removing <i>/<b>/<u> can leave spans empty.  The
        # nested-span part previously read "<span[^>]>" (missing "*"), which
        # could not match a nested span that carries attributes.
        html = re.sub(self._EMPTY_SPAN, ' ', html)
        return html

    def _analyse_blank_paragraphs(self, html):
        """Measure the share of blank paragraphs.

        :return: ``(html, blanks_between_paragraphs)``.  When the share is
            above ``_BLANK_RATIO`` the blank paragraphs are deleted if
            ``extra_opts.remove_paragraph_spacing`` is set; otherwise the
            flag is returned as ``True``.
        """
        linereg = re.compile(r'(?<=<p).*?(?=</p>)', re.IGNORECASE | re.DOTALL)
        blanklines = self._BLANK_PARAGRAPH.findall(html)
        lines = linereg.findall(html)
        blanks_between_paragraphs = False
        if len(lines) > 1:
            ratio = float(len(blanklines)) / float(len(lines))
            self.log(u'There are %s blank lines. %s percent blank'
                     % (len(blanklines), ratio))
            if ratio > self._BLANK_RATIO:
                if getattr(self.extra_opts, 'remove_paragraph_spacing', False):
                    self.log('deleting blank lines')
                    html = self._BLANK_PARAGRAPH.sub('', html)
                else:
                    blanks_between_paragraphs = True
        return html, blanks_between_paragraphs

    def _mark_chapters(self, html, blanks_between_paragraphs):
        """Insert ``<h2>``/``<h3>`` headings for lines that look like chapter
        starts, escalating through three patterns while fewer than
        ``_MIN_SECTIONS`` sections are known."""
        if blanks_between_paragraphs:
            blank_lines = r'(\s*<p[^>]*>\s*</p>){0,2}\s*'
        else:
            blank_lines = ''

        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
        self.html_preprocess_sections = len(heading.findall(html))
        self.log(u'found %s pre-existing headings'
                 % (self.html_preprocess_sections,))

        if self.html_preprocess_sections < self._MIN_SECTIONS:
            chapdetect = re.compile(
                self._chapter_marker(self._TYPICAL_CHAPTERS, blank_lines),
                re.IGNORECASE)
            html = chapdetect.sub(self.chapter_head, html)

        if self.html_preprocess_sections < self._MIN_SECTIONS:
            self.log(u'not enough chapters, only %s, trying numeric chapters'
                     % (self.html_preprocess_sections,))
            chapdetect2 = re.compile(
                self._chapter_marker(self._NUMERIC_CHAPTERS, blank_lines),
                re.IGNORECASE)
            html = chapdetect2.sub(self.chapter_head, html)

        if self.html_preprocess_sections < self._MIN_SECTIONS:
            self.log(
                u'not enough chapters, only %s, trying with uppercase words'
                % (self.html_preprocess_sections,))
            # Deliberately not IGNORECASE: this stage keys on capitals.
            chapdetect2 = re.compile(
                self._chapter_marker(self._UPPERCASE_CHAPTERS, blank_lines),
                re.UNICODE)
            html = chapdetect2.sub(self.chapter_head, html)
        return html

    def _unwrap_dashes(self, html, length):
        """Join a line ending in an en/em dash (after at least *length*
        characters) to the next line when that line starts with ``[``, a
        lowercase letter or a digit."""
        pattern = (
            r'(?<=.{%i}[' + self._EN_EM_DASHES + r'])\s*(?=<)'
            + self._INLINE_JOIN
            + r'\s*(?=[\[a-z\d])'
        ) % length
        return re.sub(pattern, '', html)

    def _unwrap_lines(self, html, length):
        """Replace the tags between a line of at least *length* characters
        and the following line with a single space when the first line ends
        in a character from the pattern's non-terminal set."""
        unwrap = re.compile(
            (r'(?<=.{%i}([a-z,:)IA' + self._SHARP_S + r']|(?<!\&\w{4});))'
             r'\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*'
             r'(?P<up2threeblanks><(p|span|div)[^>]*>\s*'
             r'(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)'
             r'</(span|p|div)>\s*){0,3}'
             r'\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*') % length,
            re.UNICODE)
        return unwrap.sub(' ', html)

    def _delete_soft_hyphens(self, html):
        """Remove soft hyphens together with any inline tags and whitespace
        that directly follow them."""
        return re.sub(
            self._SOFT_HYPHEN + r'\s*' + self._INLINE_JOIN + r'\s*', '', html)

    def _unwrap_and_dehyphenate(self, html):
        """Run line-length analysis, then unwrap hard-wrapped lines and/or
        clean up hyphenation using ``DocAnalysis`` and ``Dehyphenator``."""
        paras = len(re.compile('<p[^>]*>', re.IGNORECASE).findall(html))
        spans = len(re.compile('<span[^>]*>', re.IGNORECASE).findall(html))
        if spans > 1 and float(paras) / float(spans) < 0.75:
            doc_format = 'spanned_html'
        else:
            doc_format = 'html'

        docanalysis = DocAnalysis(doc_format, html)
        hardbreaks = docanalysis.line_histogram(0.5)
        self.log(u'Hard line breaks check returned %s' % (hardbreaks,))
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
        self.log(u'*** Median line length is %s, calculated with %s format ***'
                 % (length, doc_format))

        if hardbreaks or unwrap_factor < 0.4:
            self.log('Unwrapping required, unwrapping Lines')
            html = self._unwrap_dashes(html, length)
            self.log('Unwrapping/Removing hyphens')
            html = Dehyphenator()(html, 'html', length)
            self.log('Done dehyphenating')
            html = self._unwrap_lines(html, length)
            html = Dehyphenator()(html, 'html_cleanup', length)
        else:
            self.log('Cleaning up hyphenation')
            html = Dehyphenator()(html, 'html_cleanup', length)
            self.log('Done dehyphenating')

        # NOTE(review): the decompiled text has lost its indentation; the
        # soft-hyphen removal is taken to apply to both branches -- confirm
        # against the upstream source.
        return self._delete_soft_hyphens(html)

    def _mark_punctuation_breaks(self, html):
        """Add page breaks before short paragraphs matched by the
        punctuation/section-marker pattern."""
        self.log(
            u'Looking for more split points based on punctuation, '
            u'currently have %s' % (self.html_preprocess_sections,))
        # NOTE(review): "#-*" inside the second character class is a range
        # (0x23-0x2A), not the three literals the lookahead spells with
        # "\-"; left as found because widening it changes what is matched.
        chapdetect3 = re.compile(
            r'<(?P<styles>(p|div)[^>]*)>\s*'
            r'(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*'
            r'(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
            r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*'
            r'(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*'
            r'(</span>)?\s*</(p|div)>)',
            re.IGNORECASE)
        return chapdetect3.sub(self.chapter_break, html)

    def _demote_second_heading(self, html):
        """When an ``<h1>``/``<h2>`` is directly followed by another one,
        rewrite the second as ``<h3>``."""
        doubleheading = re.compile(
            r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*'
            r'(<(?!h\d)[^>]*>\s*)*)'
            r'<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>',
            re.IGNORECASE)
        return doubleheading.sub(
            r'\g<firsthead>' + '\n<h3' + r'\g<secondhead>' + '</h3>', html)

    def _restore_blank_paragraphs(self, html):
        """Put a filler character inside every blank paragraph."""
        # NOTE(review): the filler is taken to be U+00A0, like the other
        # whitespace-looking literals in the decompiled text (a plain space
        # would leave the paragraph blank) -- confirm against upstream.
        return self._BLANK_PARAGRAPH.sub(
            u'\n' + r'\g<openline>' + self._NBSP + r'\g<closeline>', html)

    def __call__(self, html):
        """Run the full preprocessing pipeline.

        :param html: HTML text to clean up.
        :return: the rewritten HTML text.
        """
        self.log('********* Preprocessing HTML *********')
        html = self._ensure_paragraph_markup(html)
        html = self._replace_nbsp_indents(html)
        html = self._strip_empty_tags(html)
        html, blanks_between_paragraphs = self._analyse_blank_paragraphs(html)
        html = self._mark_chapters(html, blanks_between_paragraphs)
        html = self._unwrap_and_dehyphenate(html)
        if self.html_preprocess_sections < self._MIN_SECTIONS:
            html = self._mark_punctuation_breaks(html)
        html = self._demote_second_heading(html)
        html = self._restore_blank_paragraphs(html)
        return html