Maximum CD 2011 January

home *** CD-ROM | disk | FTP | other *** search

/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_870 (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2010-10-31 | 9.8 KB | 205 lines

# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)
#
# NOTE: the decompiler output was collapsed onto a few lines and several
# literals were mis-decoded.  The code below restores a valid Python 2 layout
# and writes every non-ASCII character as an explicit \uXXXX escape so the
# file no longer depends on the encoding it is stored in.
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re

from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log


class PreProcessor(object):
    """Heuristic clean-up of loosely structured HTML.

    An instance is called with an HTML string and returns a new HTML string
    in which missing paragraph markup has been added, probable chapter
    headings have been turned into ``<h2>``/``<h3>`` elements, hard-wrapped
    lines have been joined and hyphenation has been cleaned up.

    The instance keeps two counters that the regex callbacks update:
    ``html_preprocess_sections`` (headings/section breaks found so far) and
    ``found_indents`` (paragraphs whose nbsp indent was replaced by a style).
    """

    # Chapter detection stops escalating once this many headings exist.
    _ENOUGH_SECTIONS = 10

    # Share of blank paragraphs above which blank lines are treated as the
    # document's paragraph separator.
    _BLANK_RATIO = 0.4

    # A paragraph that contains nothing but whitespace.
    _BLANK_PARAGRAPH = re.compile(
        '\\s*(?P<openline><p[^>]*>)\\s*(?P<closeline></p>)', re.IGNORECASE)

    def __init__(self, extra_opts=None, log=None):
        """Create a preprocessor.

        :param extra_opts: optional options object; only the attributes
            ``remove_paragraph_spacing`` and ``html_unwrap_factor`` are read
            (via ``getattr`` with defaults), so ``None`` is acceptable.
        :param log: callable taking a message; ``default_log`` is used when
            it is ``None``.
        """
        # The decompiled source assigned None here, which made every later
        # self.log(...) call fail; default_log is the imported fallback.
        self.log = default_log if log is None else log
        self.html_preprocess_sections = 0
        self.found_indents = 0
        self.extra_opts = extra_opts

    def chapter_head(self, match):
        """Regex callback: rewrite a matched chapter line as heading markup.

        Expects the named groups ``chap`` and ``title``.  Returns an ``<h2>``
        for the chapter text, followed by an ``<h3>`` when a title was also
        matched, and bumps ``html_preprocess_sections``.
        """
        chap = match.group('chap')
        title = match.group('title')
        self.html_preprocess_sections = self.html_preprocess_sections + 1
        count = unicode(self.html_preprocess_sections)
        if not title:
            self.log('found ' + count + ' chapters. - ' + unicode(chap))
            return '<h2>' + chap + '</h2>\n'
        self.log('found ' + count + ' chapters & titles. - ' +
                 unicode(chap) + ', ' + unicode(title))
        return '<h2>' + chap + '</h2>\n<h3>' + title + '</h3>\n'

    def chapter_break(self, match):
        """Regex callback: force a page break before a matched section line.

        Expects the named groups ``styles`` (tag name plus attributes of the
        opening tag) and ``section`` (everything after the opening tag,
        including the closing tag).  Bumps ``html_preprocess_sections``.
        """
        chap = match.group('section')
        styles = match.group('styles')
        self.html_preprocess_sections = self.html_preprocess_sections + 1
        self.log('marked ' + unicode(self.html_preprocess_sections) +
                 ' section markers based on punctuation. - ' + unicode(chap))
        return '<' + styles + ' style="page-break-before:always">' + chap

    def insert_indent(self, match):
        """Regex callback: replace an nbsp indent by a ``text-indent`` style.

        Expects the named groups ``formatting`` (existing attributes of the
        ``<p>`` tag) and ``span`` (optional opening ``<span>`` tags that
        followed it).  Bumps ``found_indents``.
        """
        pstyle = match.group('formatting')
        span = match.group('span')
        self.found_indents = self.found_indents + 1
        if pstyle:
            opening = '<p ' + pstyle + ' style="text-indent:3%">'
        else:
            opening = '<p style="text-indent:3%">'
        if not span:
            return opening
        return opening + span

    def no_markup(self, raw, percent):
        """Tell whether ``raw`` has too few ``</p>`` tags for its line count.

        :param raw: text to inspect.
        :param percent: fraction of line endings that should be marked up;
            clamped to the range 0..1.
        :return: ``True`` when ``percent`` times the number of line endings
            exceeds the number of ``</p>`` tags, ``False`` otherwise.
        """
        htm_end_ere = re.compile('</p>', re.DOTALL)
        # NOTE(review): '\r\n' is listed last, so a CRLF pair is counted as
        # two line endings ('\r' and '\n' match first) -- confirm intended.
        line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
        tot_htm_ends = len(htm_end_ere.findall(raw))
        tot_ln_fds = len(line_end_ere.findall(raw))
        self.log('There are ' + unicode(tot_ln_fds) +
                 ' total Line feeds, and ' + unicode(tot_htm_ends) +
                 ' marked up endings')
        if percent > 1:
            percent = 1
        if percent < 0:
            percent = 0
        min_lns = tot_ln_fds * percent
        self.log('There must be fewer than ' + unicode(min_lns) +
                 ' unmarked lines to add markup')
        # Explicit boolean instead of the former implicit None.
        return min_lns > tot_htm_ends

    def _add_missing_markup(self, html):
        """Add ``<p>`` markup when the document has hardly any."""
        if not self.no_markup(html, 0.1):
            return html
        self.log('not enough paragraph markers, adding now')
        pre = re.compile('<pre>', re.IGNORECASE)
        if len(pre.findall(html)) == 1:
            # Exactly one <pre> block: hand its content to the text
            # processor.  The import is local, as in the original module.
            self.log('Running Text Processing')
            from calibre.ebooks.txt.processor import (
                convert_basic, preserve_spaces,
                separate_paragraphs_single_line)
            outerhtml = re.compile(
                '.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*',
                re.IGNORECASE | re.DOTALL)
            html = outerhtml.sub('\\g<text>', html)
            html = separate_paragraphs_single_line(html)
            html = preserve_spaces(html)
            html = convert_basic(html, epub_split_size_kb=0)
        else:
            # Naive fallback: every newline not preceded by '>' becomes a
            # paragraph boundary.
            add_markup = re.compile('(?<!>)(\n)')
            html = add_markup.sub('</p>\n<p>', html)
        return html

    def _clean_inline_markup(self, html):
        """Convert nbsp indents to styles and drop empty inline tags."""
        # Two or more non-breaking spaces (U+00A0) at the start of a
        # paragraph are taken to be a hand-made indent.
        txtindent = re.compile(
            u'<p(?P<formatting>[^>]*)>\\s*(?P<span>(<span[^>]*>\\s*)+)?'
            u'\\s*(\u00a0){2,}', re.IGNORECASE)
        html = txtindent.sub(self.insert_indent, html)
        if self.found_indents > 1:
            self.log('replaced ' + unicode(self.found_indents) +
                     ' nbsp indents with inline styles')
        # Remaining non-breaking spaces become ordinary spaces.
        html = re.sub(u'\u00a0', ' ', html)
        # Empty <o:p> tags only get in the way of later patterns.
        html = re.sub(u'\\s*<o:p>\\s*</o:p>', ' ', html)
        # Empty span / bold / italic / underline tags.
        empty_spans = ('\\s*<span[^>]*>\\s*(<span[^>]*>\\s*</span>){0,2}'
                       '\\s*</span>\\s*')
        html = re.sub(empty_spans, ' ', html)
        html = re.sub('\\s*<[ibu][^>]*>\\s*(<[ibu][^>]*>\\s*</[ibu]>\\s*)'
                      '{0,2}\\s*</[ibu]>', ' ', html)
        # Second span pass: removing empty <i>/<b>/<u> can leave new empty
        # spans behind.  The original pattern had '<span[^>]>' (missing
        # '*'), which skipped nested spans carrying attributes.
        html = re.sub(empty_spans, ' ', html)
        return html

    def _handle_blank_paragraphs(self, html):
        """Measure blank paragraphs; delete them if the options ask for it.

        :return: ``(html, blanks_between_paragraphs)`` where the flag is
            ``True`` when more than 40% of the paragraphs are blank and they
            were not deleted.
        """
        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE | re.DOTALL)
        blanklines = self._BLANK_PARAGRAPH.findall(html)
        lines = linereg.findall(html)
        blanks_between_paragraphs = False
        if len(lines) > 1:
            blank_ratio = float(len(blanklines)) / float(len(lines))
            self.log('There are ' + unicode(len(blanklines)) +
                     ' blank lines. ' + unicode(blank_ratio) +
                     ' percent blank')
            if blank_ratio > self._BLANK_RATIO:
                if getattr(self.extra_opts, 'remove_paragraph_spacing',
                           False):
                    self.log('deleting blank lines')
                    html = self._BLANK_PARAGRAPH.sub('', html)
                else:
                    blanks_between_paragraphs = True
        return html, blanks_between_paragraphs

    def _mark_chapters(self, html, blanks_between_paragraphs):
        """Turn probable chapter lines into headings.

        Three heading vocabularies are tried in turn (typical chapter words,
        numeric chapters, upper-case words); escalation stops as soon as
        ``_ENOUGH_SECTIONS`` headings exist.
        """
        lookahead = '(?=<(p|div))'
        chapter_line_open = (
            '<(?P<outer>p|div)[^>]*>\\s*'
            '(<(?P<inner1>font|span|[ibu])[^>]*>)?\\s*'
            '(<(?P<inner2>font|span|[ibu])[^>]*>)?\\s*'
            '(<(?P<inner3>font|span|[ibu])[^>]*>)?\\s*')
        chapter_header_open = '(?P<chap>'
        chapter_header_close = ')\\s*'
        # The inner1 closer used to be '</(?P=inner1)\s[^>]*>', i.e. a
        # closing tag that must contain whitespace, so a heading wrapped in
        # three inline tags could never match.
        chapter_line_close = (
            '(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*'
            '(</(?P=inner1)>)?\\s*</(?P=outer)>\\s*')
        if blanks_between_paragraphs:
            # Allow up to two blank paragraphs between chapter and title.
            blank_lines = '(\\s*<p[^>]*>\\s*</p>){0,2}\\s*'
        else:
            blank_lines = ''
        opt_title_open = '('
        title_line_open = (
            '<(?P<outer2>p|div)[^>]*>\\s*'
            '(<(?P<inner4>font|span|[ibu])[^>]*>)?\\s*'
            '(<(?P<inner5>font|span|[ibu])[^>]*>)?\\s*'
            '(<(?P<inner6>font|span|[ibu])[^>]*>)?\\s*')
        title_header_open = '(?P<title>'
        title_header_close = ')\\s*'
        # Same closing-tag fix as chapter_line_close, for inner4.
        title_line_close = (
            '(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*'
            '(</(?P=inner4)>)?\\s*</(?P=outer2)>')
        opt_title_close = ')?'

        default_title = '(\\s*[\\w\\\'\\"-]+){1,5}(?!<)'
        typical_chapters = '.?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\\s|Prologue|Book\\s|Part\\s|Dedication)\\s*([\\d\\w-]+\\:?\\s*){0,4}'
        numeric_chapters = '.?(\\d+\\.?|(CHAPTER\\s*([\\dA-Z\\-\\\'\\"\\?\\.!#,]+\\s*){1,10}))\\s*'
        uppercase_chapters = '\\s*.?([A-Z#]+(\\s|-){0,3}){1,5}\\s*'

        marker_head = lookahead + chapter_line_open + chapter_header_open
        marker_tail = (chapter_header_close + chapter_line_close +
                       blank_lines + opt_title_open + title_line_open +
                       title_header_open + default_title +
                       title_header_close + title_line_close +
                       opt_title_close)

        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
        self.html_preprocess_sections = len(heading.findall(html))
        self.log('found ' + unicode(self.html_preprocess_sections) +
                 ' pre-existing headings')

        # (header pattern, regex flags, wording for the escalation message).
        # The upper-case pass must stay case sensitive, hence re.UNICODE
        # instead of re.IGNORECASE.
        attempts = (
            (typical_chapters, re.IGNORECASE, None),
            (numeric_chapters, re.IGNORECASE, 'numeric chapters'),
            (uppercase_chapters, re.UNICODE, 'with uppercase words'),
        )
        for header, flags, wording in attempts:
            if self.html_preprocess_sections >= self._ENOUGH_SECTIONS:
                break
            if wording is not None:
                self.log('not enough chapters, only ' +
                         unicode(self.html_preprocess_sections) +
                         ', trying ' + wording)
            chapdetect = re.compile(marker_head + header + marker_tail, flags)
            # chapter_head increments html_preprocess_sections per match.
            html = chapdetect.sub(self.chapter_head, html)
        return html

    def _unwrap_lines(self, html):
        """Join hard-wrapped lines and clean up hyphenation."""
        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
        paras = len(paras_reg.findall(html))
        spans = len(spans_reg.findall(html))
        # Documents dominated by <span> tags are analysed as 'spanned_html'.
        doc_format = 'html'
        if spans > 1 and float(paras) / float(spans) < 0.75:
            doc_format = 'spanned_html'

        docanalysis = DocAnalysis(doc_format, html)
        hardbreaks = docanalysis.line_histogram(0.5)
        self.log('Hard line breaks check returned ' + unicode(hardbreaks))
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        # length is interpolated with %i into fixed-width lookbehinds below.
        length = docanalysis.line_length(unwrap_factor)
        self.log('*** Median line length is ' + unicode(length) +
                 ', calculated with ' + doc_format + ' format ***')

        if hardbreaks or unwrap_factor < 0.4:
            self.log('Unwrapping required, unwrapping Lines')
            # Join lines that end in an en dash (U+2013) or em dash (U+2014).
            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\\s*(?=<)(</span>\\s*(</[iubp]>\\s*<[iubp][^>]*>\\s*)?<span[^>]*>|</[iubp]>\\s*<[iubp][^>]*>)?\\s*(?=[[a-z\\d])' % length, '', html)
            self.log('Unwrapping/Removing hyphens')
            dehyphenator = Dehyphenator()
            html = dehyphenator(html, 'html', length)
            self.log('Done dehyphenating')
            # Join lines of at least `length` characters whose last character
            # is in the class below (includes U+00DF) or is a ';' that does
            # not end a four-letter entity.
            unwrap = re.compile(u'(?<=.{%i}([a-z,:)\\IA\u00df]|(?<!\\&\\w{4});))\\s*</(span|p|div)>\\s*(</(p|span|div)>)?\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*<(span|div|p)[^>]*>\\s*(<(span|div|p)[^>]*>)?\\s*' % length, re.UNICODE)
            html = unwrap.sub(' ', html)
            # Second pass over whatever hyphens are still left.
            dehyphenator = Dehyphenator()
            html = dehyphenator(html, 'html_cleanup', length)
        else:
            self.log('Cleaning up hyphenation')
            dehyphenator = Dehyphenator()
            html = dehyphenator(html, 'html_cleanup', length)
            self.log('Done dehyphenating')

        # Delete soft hyphens (U+00AD) together with any inline tag
        # close/re-open pair that directly follows them.
        html = re.sub(u'\u00ad\\s*(</span>\\s*(</[iubp]>\\s*<[iubp][^>]*>\\s*)?<span[^>]*>|</[iubp]>\\s*<[iubp][^>]*>)?\\s*', '', html)
        return html

    def __call__(self, html):
        """Run all heuristics on ``html`` and return the processed markup.

        :param html: the document as a (unicode) string.
        :return: the rewritten document.
        """
        self.log('********* Preprocessing HTML *********')

        # Put every paragraph on its own line.
        html = re.sub('\\s*</p>', '</p>\n', html)
        html = re.sub('\\s*<p>\\s*', '\n<p>', html)

        html = self._add_missing_markup(html)
        html = self._clean_inline_markup(html)
        html, blanks_between_paragraphs = self._handle_blank_paragraphs(html)
        html = self._mark_chapters(html, blanks_between_paragraphs)
        html = self._unwrap_lines(html)

        if self.html_preprocess_sections < self._ENOUGH_SECTIONS:
            # Still too few sections: treat short paragraphs made only of
            # letters and a few symbols as section breaks.
            self.log('Looking for more split points based on punctuation, currently have ' + unicode(self.html_preprocess_sections))
            chapdetect3 = re.compile('<(?P<styles>(p|div)[^>]*)>\\s*(?P<section>(<span[^>]*>)?\\s*(<[ibu][^>]*>){0,2}\\s*(<span[^>]*>)?\\s*(<[ibu][^>]*>){0,2}\\s*(<span[^>]*>)?\\s*.?(?=[a-z#\\-*\\s]+<)([a-z#-*]+\\s*){1,5}\\s*\\s*(</span>)?(</[ibu]>){0,2}\\s*(</span>)?\\s*(</[ibu]>){0,2}\\s*(</span>)?\\s*</(p|div)>)', re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)

        # An h1/h2 directly followed by another h1/h2: demote the second
        # one to h3.
        doubleheading = re.compile('(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\\s*(<(?!h\\d)[^>]*>\\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
        html = doubleheading.sub('\\g<firsthead>\n<h3' + '\\g<secondhead>' + '</h3>', html)

        # Put a non-breaking space (U+00A0) into empty paragraphs so they
        # keep their height.
        html = self._BLANK_PARAGRAPH.sub(u'\n\\g<openline>\u00a0' + '\\g<closeline>', html)
        return html