home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_870 (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2010-10-31  |  9.8 KB  |  205 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.6)
  3.  
  4. __license__ = 'GPL v3'
  5. __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
  6. __docformat__ = 'restructuredtext en'
  7. import re
  8. from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
  9. from calibre.utils.logging import default_log
  10.  
  11. class PreProcessor(object):
  12.     
  13.     def __init__(self, extra_opts = None, log = None):
  14.         self.log = None if log is None else log
  15.         self.html_preprocess_sections = 0
  16.         self.found_indents = 0
  17.         self.extra_opts = extra_opts
  18.  
  19.     
  20.     def chapter_head(self, match):
  21.         chap = match.group('chap')
  22.         title = match.group('title')
  23.         if not title:
  24.             self.html_preprocess_sections = self.html_preprocess_sections + 1
  25.             self.log('found ' + unicode(self.html_preprocess_sections) + ' chapters. - ' + unicode(chap))
  26.             return '<h2>' + chap + '</h2>\n'
  27.         self.html_preprocess_sections = self.html_preprocess_sections + 1
  28.         self.log('found ' + unicode(self.html_preprocess_sections) + ' chapters & titles. - ' + unicode(chap) + ', ' + unicode(title))
  29.         return '<h2>' + chap + '</h2>\n<h3>' + title + '</h3>\n'
  30.  
  31.     
  32.     def chapter_break(self, match):
  33.         chap = match.group('section')
  34.         styles = match.group('styles')
  35.         self.html_preprocess_sections = self.html_preprocess_sections + 1
  36.         self.log('marked ' + unicode(self.html_preprocess_sections) + ' section markers based on punctuation. - ' + unicode(chap))
  37.         return '<' + styles + ' style="page-break-before:always">' + chap
  38.  
  39.     
  40.     def insert_indent(self, match):
  41.         pstyle = match.group('formatting')
  42.         span = match.group('span')
  43.         self.found_indents = self.found_indents + 1
  44.         if pstyle:
  45.             if not span:
  46.                 return '<p ' + pstyle + ' style="text-indent:3%">'
  47.             return '<p ' + pstyle + ' style="text-indent:3%">' + span
  48.         pstyle
  49.         if not span:
  50.             return '<p style="text-indent:3%">'
  51.         return '<p style="text-indent:3%">' + span
  52.  
  53.     
  54.     def no_markup(self, raw, percent):
  55.         htm_end_ere = re.compile('</p>', re.DOTALL)
  56.         line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
  57.         htm_end = htm_end_ere.findall(raw)
  58.         line_end = line_end_ere.findall(raw)
  59.         tot_htm_ends = len(htm_end)
  60.         tot_ln_fds = len(line_end)
  61.         self.log('There are ' + unicode(tot_ln_fds) + ' total Line feeds, and ' + unicode(tot_htm_ends) + ' marked up endings')
  62.         if percent > 1:
  63.             percent = 1
  64.         
  65.         if percent < 0:
  66.             percent = 0
  67.         
  68.         min_lns = tot_ln_fds * percent
  69.         self.log('There must be fewer than ' + unicode(min_lns) + ' unmarked lines to add markup')
  70.         if min_lns > tot_htm_ends:
  71.             return True
  72.  
  73.     
  74.     def __call__(self, html):
  75.         self.log('*********  Preprocessing HTML  *********')
  76.         html = re.sub('\\s*</p>', '</p>\n', html)
  77.         html = re.sub('\\s*<p>\\s*', '\n<p>', html)
  78.         if self.no_markup(html, 0.1):
  79.             self.log('not enough paragraph markers, adding now')
  80.             pre = re.compile('<pre>', re.IGNORECASE)
  81.             if len(pre.findall(html)) == 1:
  82.                 self.log('Running Text Processing')
  83.                 convert_basic = convert_basic
  84.                 preserve_spaces = preserve_spaces
  85.                 separate_paragraphs_single_line = separate_paragraphs_single_line
  86.                 import calibre.ebooks.txt.processor
  87.                 outerhtml = re.compile('.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE | re.DOTALL)
  88.                 html = outerhtml.sub('\\g<text>', html)
  89.                 html = separate_paragraphs_single_line(html)
  90.                 html = preserve_spaces(html)
  91.                 html = convert_basic(html, epub_split_size_kb = 0)
  92.             else:
  93.                 add_markup = re.compile('(?<!>)(\n)')
  94.                 html = add_markup.sub('</p>\n<p>', html)
  95.         
  96.         txtindent = re.compile(u'<p(?P<formatting>[^>]*)>\\s*(?P<span>(<span[^>]*>\\s*)+)?\\s*(┬á){2,}', re.IGNORECASE)
  97.         html = txtindent.sub(self.insert_indent, html)
  98.         if self.found_indents > 1:
  99.             self.log('replaced ' + unicode(self.found_indents) + ' nbsp indents with inline styles')
  100.         
  101.         html = re.sub(u'┬á', ' ', html)
  102.         html = re.sub(u'\\s*<o:p>\\s*</o:p>', ' ', html)
  103.         html = re.sub('\\s*<span[^>]*>\\s*(<span[^>]*>\\s*</span>){0,2}\\s*</span>\\s*', ' ', html)
  104.         html = re.sub('\\s*<[ibu][^>]*>\\s*(<[ibu][^>]*>\\s*</[ibu]>\\s*){0,2}\\s*</[ibu]>', ' ', html)
  105.         html = re.sub('\\s*<span[^>]*>\\s*(<span[^>]>\\s*</span>){0,2}\\s*</span>\\s*', ' ', html)
  106.         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE | re.DOTALL)
  107.         blankreg = re.compile('\\s*(?P<openline><p[^>]*>)\\s*(?P<closeline></p>)', re.IGNORECASE)
  108.         blanklines = blankreg.findall(html)
  109.         lines = linereg.findall(html)
  110.         blanks_between_paragraphs = False
  111.         if len(lines) > 1:
  112.             self.log('There are ' + unicode(len(blanklines)) + ' blank lines. ' + unicode(float(len(blanklines)) / float(len(lines))) + ' percent blank')
  113.             if float(len(blanklines)) / float(len(lines)) > 0.4 and getattr(self.extra_opts, 'remove_paragraph_spacing', False):
  114.                 self.log('deleting blank lines')
  115.                 html = blankreg.sub('', html)
  116.             elif float(len(blanklines)) / float(len(lines)) > 0.4:
  117.                 blanks_between_paragraphs = True
  118.             else:
  119.                 blanks_between_paragraphs = False
  120.         
  121.         lookahead = '(?=<(p|div))'
  122.         chapter_line_open = '<(?P<outer>p|div)[^>]*>\\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\\s*'
  123.         chapter_header_open = '(?P<chap>'
  124.         chapter_header_close = ')\\s*'
  125.         chapter_line_close = '(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)\\s[^>]*>)?\\s*</(?P=outer)>\\s*'
  126.         if blanks_between_paragraphs:
  127.             blank_lines = '(\\s*<p[^>]*>\\s*</p>){0,2}\\s*'
  128.         else:
  129.             blank_lines = ''
  130.         opt_title_open = '('
  131.         title_line_open = '<(?P<outer2>p|div)[^>]*>\\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\\s*'
  132.         title_header_open = '(?P<title>'
  133.         title_header_close = ')\\s*'
  134.         title_line_close = '(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)\\s[^>]*>)?\\s*</(?P=outer2)>'
  135.         opt_title_close = ')?'
  136.         default_title = '(\\s*[\\w\\\'\\"-]+){1,5}(?!<)'
  137.         typical_chapters = '.?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\\s|Prologue|Book\\s|Part\\s|Dedication)\\s*([\\d\\w-]+\\:?\\s*){0,4}'
  138.         numeric_chapters = '.?(\\d+\\.?|(CHAPTER\\s*([\\dA-Z\\-\\\'\\"\\?\\.!#,]+\\s*){1,10}))\\s*'
  139.         uppercase_chapters = '\\s*.?([A-Z#]+(\\s|-){0,3}){1,5}\\s*'
  140.         chapter_marker = lookahead + chapter_line_open + chapter_header_open + typical_chapters + chapter_header_close + chapter_line_close + blank_lines + opt_title_open + title_line_open + title_header_open + default_title + title_header_close + title_line_close + opt_title_close
  141.         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
  142.         self.html_preprocess_sections = len(heading.findall(html))
  143.         self.log('found ' + unicode(self.html_preprocess_sections) + ' pre-existing headings')
  144.         if self.html_preprocess_sections < 10:
  145.             chapdetect = re.compile('%s' % chapter_marker, re.IGNORECASE)
  146.             html = chapdetect.sub(self.chapter_head, html)
  147.         
  148.         if self.html_preprocess_sections < 10:
  149.             self.log('not enough chapters, only ' + unicode(self.html_preprocess_sections) + ', trying numeric chapters')
  150.             chapter_marker = lookahead + chapter_line_open + chapter_header_open + numeric_chapters + chapter_header_close + chapter_line_close + blank_lines + opt_title_open + title_line_open + title_header_open + default_title + title_header_close + title_line_close + opt_title_close
  151.             chapdetect2 = re.compile('%s' % chapter_marker, re.IGNORECASE)
  152.             html = chapdetect2.sub(self.chapter_head, html)
  153.         
  154.         if self.html_preprocess_sections < 10:
  155.             self.log('not enough chapters, only ' + unicode(self.html_preprocess_sections) + ', trying with uppercase words')
  156.             chapter_marker = lookahead + chapter_line_open + chapter_header_open + uppercase_chapters + chapter_header_close + chapter_line_close + blank_lines + opt_title_open + title_line_open + title_header_open + default_title + title_header_close + title_line_close + opt_title_close
  157.             chapdetect2 = re.compile('%s' % chapter_marker, re.UNICODE)
  158.             html = chapdetect2.sub(self.chapter_head, html)
  159.         
  160.         paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
  161.         spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
  162.         paras = len(paras_reg.findall(html))
  163.         spans = len(spans_reg.findall(html))
  164.         if spans > 1:
  165.             if float(paras) / float(spans) < 0.75:
  166.                 format = 'spanned_html'
  167.             else:
  168.                 format = 'html'
  169.         else:
  170.             format = 'html'
  171.         docanalysis = DocAnalysis(format, html)
  172.         hardbreaks = docanalysis.line_histogram(0.5)
  173.         self.log('Hard line breaks check returned ' + unicode(hardbreaks))
  174.         unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
  175.         length = docanalysis.line_length(unwrap_factor)
  176.         self.log('*** Median line length is ' + unicode(length) + ', calculated with ' + format + ' format ***')
  177.         if hardbreaks or unwrap_factor < 0.4:
  178.             self.log('Unwrapping required, unwrapping Lines')
  179.             html = re.sub(u'(?<=.{%i}[ΓÇôΓÇö])\\s*(?=<)(</span>\\s*(</[iubp]>\\s*<[iubp][^>]*>\\s*)?<span[^>]*>|</[iubp]>\\s*<[iubp][^>]*>)?\\s*(?=[[a-z\\d])' % length, '', html)
  180.             self.log('Unwrapping/Removing hyphens')
  181.             dehyphenator = Dehyphenator()
  182.             html = dehyphenator(html, 'html', length)
  183.             self.log('Done dehyphenating')
  184.             unwrap = re.compile(u'(?<=.{%i}([a-z,:)\\IA├ƒ]|(?<!\\&\\w{4});))\\s*</(span|p|div)>\\s*(</(p|span|div)>)?\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*<(span|div|p)[^>]*>\\s*(<(span|div|p)[^>]*>)?\\s*' % length, re.UNICODE)
  185.             html = unwrap.sub(' ', html)
  186.             dehyphenator = Dehyphenator()
  187.             html = dehyphenator(html, 'html_cleanup', length)
  188.         else:
  189.             self.log('Cleaning up hyphenation')
  190.             dehyphenator = Dehyphenator()
  191.             html = dehyphenator(html, 'html_cleanup', length)
  192.             self.log('Done dehyphenating')
  193.         html = re.sub(u'┬¡\\s*(</span>\\s*(</[iubp]>\\s*<[iubp][^>]*>\\s*)?<span[^>]*>|</[iubp]>\\s*<[iubp][^>]*>)?\\s*', '', html)
  194.         if self.html_preprocess_sections < 10:
  195.             self.log('Looking for more split points based on punctuation, currently have ' + unicode(self.html_preprocess_sections))
  196.             chapdetect3 = re.compile('<(?P<styles>(p|div)[^>]*)>\\s*(?P<section>(<span[^>]*>)?\\s*(<[ibu][^>]*>){0,2}\\s*(<span[^>]*>)?\\s*(<[ibu][^>]*>){0,2}\\s*(<span[^>]*>)?\\s*.?(?=[a-z#\\-*\\s]+<)([a-z#-*]+\\s*){1,5}\\s*\\s*(</span>)?(</[ibu]>){0,2}\\s*(</span>)?\\s*(</[ibu]>){0,2}\\s*(</span>)?\\s*</(p|div)>)', re.IGNORECASE)
  197.             html = chapdetect3.sub(self.chapter_break, html)
  198.         
  199.         doubleheading = re.compile('(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\\s*(<(?!h\\d)[^>]*>\\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
  200.         html = doubleheading.sub('\\g<firsthead>\n<h3' + '\\g<secondhead>' + '</h3>', html)
  201.         html = blankreg.sub(u'\n\\g<openline>┬á' + '\\g<closeline>', html)
  202.         return html
  203.  
  204.  
  205.