home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_885 (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2010-10-31  |  12.2 KB  |  312 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.6)
  3.  
  4. __license__ = 'GPL 3'
  5. __copyright__ = '2009, John Schember <john@nachtimwald.com>'
  6. __docformat__ = 'restructuredtext en'
  7. import cStringIO
  8. from base64 import b64encode
  9. import re
  10.  
  11. try:
  12.     from PIL import Image
  13.     Image
  14. except ImportError:
  15.     import Image
  16.  
  17. from lxml import etree
  18. from calibre import prepare_string_for_xml
  19. from calibre.constants import __appname__, __version__
  20. from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
  21. from calibre.ebooks.oeb.stylizer import Stylizer
  22. from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
  23. TAG_MAP = {
  24.     'b': 'strong',
  25.     'i': 'emphasis',
  26.     'p': 'p',
  27.     'li': 'p',
  28.     'div': 'p',
  29.     'br': 'p' }
  30. TAG_SPACE = []
  31. TAG_IMAGES = [
  32.     'img']
  33. TAG_LINKS = [
  34.     'a']
  35. BLOCK = [
  36.     'p']
  37. STYLES = [
  38.     ('font-weight', {
  39.         'bold': 'strong',
  40.         'bolder': 'strong' }),
  41.     ('font-style', {
  42.         'italic': 'emphasis' })]
  43.  
  44. class FB2MLizer(object):
  45.     
  46.     def __init__(self, log):
  47.         self.log = log
  48.         self.image_hrefs = { }
  49.         self.link_hrefs = { }
  50.  
  51.     
  52.     def extract_content(self, oeb_book, opts):
  53.         self.log.info('Converting XHTML to FB2 markup...')
  54.         self.oeb_book = oeb_book
  55.         self.opts = opts
  56.         return self.fb2mlize_spine()
  57.  
  58.     
  59.     def fb2mlize_spine(self):
  60.         self.image_hrefs = { }
  61.         self.link_hrefs = { }
  62.         output = [
  63.             self.fb2_header()]
  64.         output.append(self.get_cover_page())
  65.         output.append(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk')
  66.         output.append(self.get_text())
  67.         output.append(self.fb2_body_footer())
  68.         output.append(self.fb2mlize_images())
  69.         output.append(self.fb2_footer())
  70.         output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc())
  71.         output = self.clean_text(output)
  72.         if self.opts.sectionize_chapters:
  73.             output = self.sectionize_chapters(output)
  74.         
  75.         return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding = unicode, pretty_print = True)
  76.  
  77.     
  78.     def clean_text(self, text):
  79.         text = re.sub('(?miu)<p>\\s*</p>', '', text)
  80.         text = re.sub('(?miu)\\s+</p>', '</p>', text)
  81.         text = re.sub('(?miu)</p><p>', '</p>\n\n<p>', text)
  82.         return text
  83.  
  84.     
  85.     def fb2_header(self):
  86.         author_first = u''
  87.         author_middle = u''
  88.         author_last = u''
  89.         author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
  90.         if len(author_parts) == 1:
  91.             author_last = author_parts[0]
  92.         elif len(author_parts) == 2:
  93.             author_first = author_parts[0]
  94.             author_last = author_parts[1]
  95.         else:
  96.             author_first = author_parts[0]
  97.             author_middle = ' '.join(author_parts[1:-2])
  98.             author_last = author_parts[-1]
  99.         return u'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.gribuser.ru/xml/fictionbook/2.0">\n<description>\n<title-info>\n <author>\n<first-name>%s</first-name>\n<middle-name>%s</middle-name>\n<last-name>%s</last-name>\n</author>\n<book-title>%s</book-title> </title-info><document-info> <program-used>%s - %s</program-used></document-info>\n</description>\n<body>\n<section>' % tuple(map(prepare_string_for_xml, (author_first, author_middle, author_last, self.oeb_book.metadata.title[0].value, __appname__, __version__)))
  100.  
  101.     
  102.     def get_cover_page(self):
  103.         output = u''
  104.         if 'cover' in self.oeb_book.guide:
  105.             output += '<image xlink:href="#cover.jpg" />'
  106.             self.image_hrefs[self.oeb_book.guide['cover'].href] = 'cover.jpg'
  107.         
  108.         if 'titlepage' in self.oeb_book.guide:
  109.             self.log.debug('Generating cover page...')
  110.             href = self.oeb_book.guide['titlepage'].href
  111.             item = self.oeb_book.manifest.hrefs[href]
  112.             if item.spine_position is None:
  113.                 stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
  114.                 output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item))
  115.             
  116.         
  117.         return output
  118.  
  119.     
  120.     def get_toc(self):
  121.         toc = []
  122.         if self.opts.inline_toc:
  123.             self.log.debug('Generating table of contents...')
  124.             toc.append(u'<p>%s</p>' % _('Table of Contents:'))
  125.             for item in self.oeb_book.toc:
  126.                 if item.href in self.link_hrefs.keys():
  127.                     toc.append('<p><a xlink:href="#%s">%s</a></p>\n' % (self.link_hrefs[item.href], item.title))
  128.                     continue
  129.                 self.oeb.warn('Ignoring toc item: %s not found in document.' % item)
  130.             
  131.         
  132.         return ''.join(toc)
  133.  
  134.     
  135.     def sectionize_chapters(self, text):
  136.         
  137.         def remove_p(t):
  138.             t = t.replace('<p>', '')
  139.             t = t.replace('</p>', '')
  140.             return t
  141.  
  142.         text = re.sub(('(?imsu)(<p>)\\s*(?P<anchor><a\\s+id="calibre_link-\\d+"\\s*/>)\\s*(</p>)\\s*(<p>)\\s*(?P<strong><strong>.+?</strong>)\\s*(</p>)',), (lambda mo: '</section><section>%s<title><p>%s</p></title>' % (mo.group('anchor'), remove_p(mo.group('strong')))), text)
  143.         text = re.sub(('(?imsu)(<p>)\\s*(?P<anchor><a\\s+id="calibre_link-\\d+"\\s*/>)\\s*(</p>)\\s*(?P<strong><strong>.+?</strong>)',), (lambda mo: '</section><section>%s<title><p>%s</p></title>' % (mo.group('anchor'), remove_p(mo.group('strong')))), text)
  144.         text = re.sub(('(?imsu)(?P<anchor><a\\s+id="calibre_link-\\d+"\\s*/>)\\s*(<p>)\\s*(?P<strong><strong>.+?</strong>)\\s*(</p>)',), (lambda mo: '</section><section>%s<title><p>%s</p></title>' % (mo.group('anchor'), remove_p(mo.group('strong')))), text)
  145.         text = re.sub(('(?imsu)(<p>)\\s*(?P<anchor><a\\s+id="calibre_link-\\d+"\\s*/>)\\s*(?P<strong><strong>.+?</strong>)\\s*(</p>)',), (lambda mo: '</section><section>%s<title><p>%s</p></title>' % (mo.group('anchor'), remove_p(mo.group('strong')))), text)
  146.         text = re.sub(('(?imsu)(?P<anchor><a\\s+id="calibre_link-\\d+"\\s*/>)\\s*(?P<strong><strong>.+?</strong>)',), (lambda mo: '</section><section>%s<title><p>%s</p></title>' % (mo.group('anchor'), remove_p(mo.group('strong')))), text)
  147.         return text
  148.  
  149.     
  150.     def get_text(self):
  151.         text = []
  152.         for item in self.oeb_book.spine:
  153.             self.log.debug('Converting %s to FictionBook2 XML' % item.href)
  154.             stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
  155.             text.append(self.add_page_anchor(item))
  156.             text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
  157.         
  158.         return ''.join(text)
  159.  
  160.     
  161.     def fb2_body_footer(self):
  162.         return u'\n</section>\n</body>'
  163.  
  164.     
  165.     def fb2_footer(self):
  166.         return u'</FictionBook>'
  167.  
  168.     
  169.     def add_page_anchor(self, page):
  170.         return self.get_anchor(page, '')
  171.  
  172.     
  173.     def get_anchor(self, page, aid):
  174.         aid = prepare_string_for_xml(aid)
  175.         aid = '%s#%s' % (page.href, aid)
  176.         if aid not in self.link_hrefs.keys():
  177.             self.link_hrefs[aid] = 'calibre_link-%s' % len(self.link_hrefs.keys())
  178.         
  179.         aid = self.link_hrefs[aid]
  180.         return '<a id="%s" />' % aid
  181.  
  182.     
  183.     def fb2mlize_images(self):
  184.         images = []
  185.         for item in self.oeb_book.manifest:
  186.             if item.media_type in OEB_RASTER_IMAGES:
  187.                 
  188.                 try:
  189.                     im = Image.open(cStringIO.StringIO(item.data)).convert('RGB')
  190.                     data = cStringIO.StringIO()
  191.                     im.save(data, 'JPEG')
  192.                     data = data.getvalue()
  193.                     raw_data = b64encode(data)
  194.                     data = ''
  195.                     col = 1
  196.                     for char in raw_data:
  197.                         if col == 72:
  198.                             data += '\n'
  199.                             col = 1
  200.                         
  201.                         col += 1
  202.                         data += char
  203.                     
  204.                     images.append('<binary id="%s" content-type="%s">%s\n</binary>' % (self.image_hrefs.get(item.href, '0000.JPEG'), item.media_type, data))
  205.                 except Exception:
  206.                     e = None
  207.                     self.log.error('Error: Could not include file %s becuase %s.' % (item.href, e))
  208.                 except:
  209.                     None<EXCEPTION MATCH>Exception
  210.                 
  211.  
  212.             None<EXCEPTION MATCH>Exception
  213.         
  214.         return ''.join(images)
  215.  
  216.     
  217.     def dump_text(self, elem, stylizer, page, tag_stack = []):
  218.         if not isinstance(elem.tag, basestring) or namespace(elem.tag) != XHTML_NS:
  219.             return []
  220.         style = stylizer.style(elem)
  221.         if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden':
  222.             return []
  223.         fb2_text = []
  224.         tags = []
  225.         tag = barename(elem.tag)
  226.         if tag in TAG_LINKS:
  227.             href = elem.get('href')
  228.             if href:
  229.                 href = prepare_string_for_xml(page.abshref(href))
  230.                 href = href.replace('"', '"')
  231.                 if '://' in href:
  232.                     fb2_text.append('<a xlink:href="%s">' % href)
  233.                 elif href.startswith('#'):
  234.                     href = href[1:]
  235.                 
  236.                 if href not in self.link_hrefs.keys():
  237.                     self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
  238.                 
  239.                 href = self.link_hrefs[href]
  240.                 fb2_text.append('<a xlink:href="#%s">' % href)
  241.                 tags.append('a')
  242.             
  243.         
  244.         id_name = elem.get('id')
  245.         if id_name:
  246.             fb2_text.append(self.get_anchor(page, id_name))
  247.         
  248.         fb2_tag = TAG_MAP.get(tag, None)
  249.         if fb2_tag == 'p':
  250.             if 'p' in tag_stack + tags:
  251.                 all_tags = tag_stack + tags
  252.                 closed_tags = []
  253.                 all_tags.reverse()
  254.                 for t in all_tags:
  255.                     fb2_text.append('</%s>' % t)
  256.                     closed_tags.append(t)
  257.                     if t == 'p':
  258.                         break
  259.                         continue
  260.                 
  261.                 closed_tags.reverse()
  262.                 for t in closed_tags:
  263.                     fb2_text.append('<%s>' % t)
  264.                 
  265.             else:
  266.                 fb2_text.append('<p>')
  267.                 tags.append('p')
  268.         elif fb2_tag and fb2_tag not in tag_stack + tags:
  269.             fb2_text.append('<%s>' % fb2_tag)
  270.             tags.append(fb2_tag)
  271.         
  272.         for s in STYLES:
  273.             style_tag = s[1].get(style[s[0]], None)
  274.             if style_tag and style_tag not in tag_stack + tags:
  275.                 fb2_text.append('<%s>' % style_tag)
  276.                 tags.append(style_tag)
  277.                 continue
  278.         
  279.         if tag in TAG_SPACE:
  280.             if not fb2_text and fb2_text[-1] != ' ' or not fb2_text[-1].endswith(' '):
  281.                 fb2_text.append(' ')
  282.             
  283.         
  284.         if hasattr(elem, 'text') and elem.text:
  285.             if 'p' not in tag_stack + tags:
  286.                 fb2_text.append('<p>%s</p>' % prepare_string_for_xml(elem.text))
  287.             else:
  288.                 fb2_text.append(prepare_string_for_xml(elem.text))
  289.         
  290.         for item in elem:
  291.             fb2_text += self.dump_text(item, stylizer, page, tag_stack + tags)
  292.         
  293.         tags.reverse()
  294.         fb2_text += self.close_tags(tags)
  295.         if hasattr(elem, 'tail') and elem.tail:
  296.             if 'p' not in tag_stack:
  297.                 fb2_text.append('<p>%s</p>' % prepare_string_for_xml(elem.tail))
  298.             else:
  299.                 fb2_text.append(prepare_string_for_xml(elem.tail))
  300.         
  301.         return fb2_text
  302.  
  303.     
  304.     def close_tags(self, tags):
  305.         text = []
  306.         for tag in tags:
  307.             text.append('</%s>' % tag)
  308.         
  309.         return text
  310.  
  311.  
  312.