home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- __license__ = 'GPL 3'
- __copyright__ = '2009, John Schember <john@nachtimwald.com>'
- __docformat__ = 'restructuredtext en'
- import re
- from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
- from calibre.ebooks.oeb.stylizer import Stylizer
- from calibre.ebooks.pdb.ereader import image_name
- from calibre.ebooks.pml import unipmlcode
# XHTML element name -> PML code written for that element. The code is
# emitted (prefixed with a backslash) when the element opens and again when
# it closes.
TAG_MAP = {
    'b': 'B',
    'strong': 'B',
    'i': 'i',
    'small': 'k',
    'sub': 'Sb',
    'sup': 'Sp',
    'big': 'l',
    'del': 'o',
    'h1': 'x',
    'h2': 'X0',
    'h3': 'X1',
    'h4': 'X2',
    'h5': 'X3',
    'h6': 'X4',
    '!--': 'v',
}

# (CSS property, {property value: PML code}) pairs. They are checked in this
# order against the computed style of every element.
STYLES = [
    ('font-weight', {
        'bold': 'B',
        'bolder': 'B',
    }),
    ('font-style', {
        'italic': 'i',
    }),
    ('text-decoration', {
        'underline': 'u',
    }),
    ('text-align', {
        'right': 'r',
        'center': 'c',
    }),
]

# Element names / CSS display values treated as paragraph-like blocks.
BLOCK_TAGS = [
    'p',
    'div',
]
BLOCK_STYLES = [
    'block',
]

# Element names handled as hyperlinks and as images respectively.
LINK_TAGS = [
    'a',
]
IMAGE_TAGS = [
    'img',
]

# Elements whose output is followed by a blank line.
SEPARATE_TAGS = [
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'p',
    'div',
    'li',
    'tr',
]
-
class PMLMLizer(object):
    """Serialise the XHTML documents of an OEB book as PML markup.

    Call :meth:`extract_content` with a book and the conversion options; it
    returns the whole book as one PML string. While converting, the instance
    fills ``image_hrefs`` (image href -> image name used in the markup) and
    ``link_hrefs`` (link target -> generated anchor name).
    """

    def __init__(self, log):
        """Remember the logger and start with empty image and link tables.

        :param log: logger object; ``info`` and ``debug`` are called on it.
        """
        self.log = log
        # Absolute image href -> name written into the \m="..." code.
        self.image_hrefs = {}
        # "<href>#<fragment>" -> generated "calibre_link-N" name.
        self.link_hrefs = {}

    def extract_content(self, oeb_book, opts):
        """Convert *oeb_book* to PML and return the markup.

        :param oeb_book: book object; its ``toc``, ``guide``, ``manifest``
            and ``spine`` attributes are read.
        :param opts: conversion options; ``remove_paragraph_spacing`` and
            ``output_profile`` are read.
        :return: the cleaned PML text of the whole book.
        """
        self.log.info('Converting XHTML to PML markup...')
        self.oeb_book = oeb_book
        self.opts = opts

        # href -> {fragment: title}; dump_text consults this to emit \C0
        # markers for elements that are table-of-contents targets.
        self.toc = {}
        for item in oeb_book.toc:
            href, _sep, fragment = item.href.partition('#')
            # NOTE(review): this registration lands in self.link_hrefs, which
            # pmlmlize_spine() resets before any text is produced. Kept to
            # preserve the existing behaviour -- confirm whether it is needed.
            self.get_anchor_id(href, fragment)
            self.toc.setdefault(href, {})[fragment] = item.title

        return self.pmlmlize_spine()

    def pmlmlize_spine(self):
        """Reset the image/link tables and build the cleaned PML text.

        :return: cover/title page markup followed by the spine markup, passed
            through :meth:`clean_text`.
        """
        self.image_hrefs = {}
        self.link_hrefs = {}
        output = u''.join([self.get_cover_page(), self.get_text()])
        return self.clean_text(output)

    def get_cover_page(self):
        """Return markup for the guide's cover and title page, if present.

        The title page is only rendered here when its manifest item is not
        part of the spine (``spine_position is None``).
        """
        output = u''
        guide = self.oeb_book.guide

        if 'cover' in guide:
            output += '\\m="cover.png"\n'
            self.image_hrefs[guide['cover'].href] = 'cover.png'

        if 'titlepage' in guide:
            self.log.debug('Generating title page...')
            href = guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            if item.spine_position is None:
                stylizer = Stylizer(item.data, item.href, self.oeb_book,
                                    self.opts, self.opts.output_profile)
                body = item.data.find(XHTML('body'))
                output += ''.join(self.dump_text(body, stylizer, item))

        return output

    def get_text(self):
        """Return the concatenated markup of every spine item.

        Each item is preceded by a page anchor so that links pointing at the
        document itself (no fragment) have a target.
        """
        text = [u'']
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to PML markup...' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book,
                                self.opts, self.opts.output_profile)
            text.append(self.add_page_anchor(item))
            text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
        return ''.join(text)

    def add_page_anchor(self, page):
        """Return the anchor code for the start of *page* (empty fragment)."""
        return self.get_anchor(page, '')

    def _link_id(self, key):
        """Return the link name registered for *key*, creating it on first use.

        Names are numbered in order of first registration.
        """
        if key not in self.link_hrefs:
            self.link_hrefs[key] = 'calibre_link-%s' % len(self.link_hrefs)
        return self.link_hrefs[key]

    def get_anchor_id(self, href, aid):
        """Return the generated link name for fragment *aid* of *href*.

        :param href: document href.
        :param aid: fragment identifier; ``''`` stands for the document start.
        :return: a ``calibre_link-N`` string, stable for a given pair until
            ``link_hrefs`` is reset.
        """
        return self._link_id('%s#%s' % (href, aid))

    def get_anchor(self, page, aid):
        """Return the anchor code for fragment *aid* on *page*.

        :param page: object with an ``href`` attribute.
        :param aid: fragment identifier.
        """
        return u'\\Q="%s"' % self.get_anchor_id(page.href, aid)

    def remove_newlines(self, text):
        """Return *text* with every CRLF, LF and CR replaced by one space."""
        # CRLF must be handled first so that it yields one space, not two.
        for newline in ('\r\n', '\n', '\r'):
            text = text.replace(newline, ' ')
        return text

    def clean_text(self, text):
        """Tidy the raw markup produced by :meth:`dump_text`.

        Drops empty page pairs and anchors nothing links to, encodes
        non-ASCII characters via ``unipmlcode`` and normalises spaces and
        blank lines. Reads ``self.opts.remove_paragraph_spacing``.

        :param text: raw PML text.
        :return: cleaned PML text.
        """
        # Two \p codes separated only by whitespace are removed.
        text = re.sub('\\\\p\\s*\\\\p', '', text)

        # Remove anchors that no internal link refers to.
        anchors = set(re.findall('(?<=\\\\Q=").+?(?=")', text))
        links = set(re.findall('(?<=\\\\q="#).+?(?=")', text))
        for unused in anchors.difference(links):
            text = text.replace('\\Q="%s"' % unused, '')

        # Written as escapes so the source stays pure ASCII.
        text = text.replace(u'\xc2', '')
        # NOTE(review): the decompiled listing shows a blank-for-blank
        # replacement here, which would be a no-op; U+00A0 (no-break space)
        # is assumed to be the intended character -- confirm.
        text = text.replace(u'\xa0', ' ')

        # Everything outside ASCII is replaced by what unipmlcode returns.
        text = re.sub('[^\x00-\x7f]', lambda mo: unipmlcode(mo.group()), text)

        # Strip spaces at line starts and ends, then collapse runs of spaces.
        text = re.sub('(?m)^[ ]+', '', text)
        text = re.sub('(?m)[ ]+$', '', text)
        text = re.sub('[ ]{2,}', ' ', text)

        # Lines holding only spaces count as empty.
        text = re.sub('\n[ ]+\n', '\n\n', text)

        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)

            def indent_line(mo):
                # Lines carrying a \X, \x, \C or \m code are left alone;
                # every other non-empty line gets a leading indent.
                line = mo.group('text')
                if re.search('\\\\[XxCm]', line):
                    return line
                # NOTE(review): indent literal reproduced as it appears in
                # the decompiled listing, which may have collapsed a wider
                # run of spaces -- confirm the intended width.
                return ' %s' % line

            text = re.sub('(?imu)^(?P<text>.+)$', indent_line, text)
        else:
            text = re.sub('\n{3,}', '\n\n', text)
        return text

    def _hr_code(self, width):
        """Return the rule code for an ``hr`` element of the given *width*.

        :param width: value of the ``width`` attribute, or a false value when
            absent (a 50% rule is then used).
        """
        if not width:
            return '\\w="50%"'
        # Append the percent sign only if missing, so width="50%" does not
        # become "50%%".
        if not width.endswith('%'):
            width += '%'
        return '\\w="%s"' % width

    def dump_text(self, elem, stylizer, page, tag_stack=None):
        """Return the PML fragments for *elem*, its children and its tail.

        :param elem: element to serialise; anything that is not an XHTML
            element (non-string tag or foreign namespace) yields ``[]``.
        :param stylizer: object whose ``style(elem)`` returns the computed
            style, indexable by CSS property name.
        :param page: manifest item owning *elem*; ``href`` and ``abshref``
            are used.
        :param tag_stack: PML codes already open in the ancestors; it is
            read but never modified.
        :return: list of strings to be joined by the caller.
        """
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) or namespace(elem.tag) != XHTML_NS:
            return []

        style = stylizer.style(elem)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            return []

        text = []
        tags = []  # codes opened by this element, closed again below
        tag = barename(elem.tag)

        def is_open(code):
            # True when an ancestor or this element already opened `code`.
            return code in tag_stack or code in tags

        # Images: the first image ever seen is named cover.png, later ones
        # get a name from image_name().
        if tag in IMAGE_TAGS and elem.attrib.get('src', None):
            src = page.abshref(elem.attrib['src'])
            if src not in self.image_hrefs:
                if not self.image_hrefs:
                    self.image_hrefs[src] = 'cover.png'
                else:
                    self.image_hrefs[src] = image_name(
                        '%s.png' % len(self.image_hrefs),
                        self.image_hrefs.keys()).strip('\x00')
            text.append('\\m="%s"' % self.image_hrefs[src])

        if tag == 'hr':
            text.append(self._hr_code(elem.get('width')))

        # Chapter markers for table-of-contents targets. Headings are
        # skipped here; they get their own code from TAG_MAP.
        toc_name = elem.attrib.get('name', None)
        toc_id = elem.attrib.get('id', None)
        if (toc_id or toc_name) and tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            page_toc = self.toc.get(page.href, None)
            if page_toc:
                for toc_x in (toc_name, toc_id):
                    toc_title = page_toc.get(toc_x, None)
                    if toc_title:
                        text.append('\\C0="%s"' % toc_title)

        # Plain element -> code mapping.
        pml_tag = TAG_MAP.get(tag, None)
        if pml_tag and not is_open(pml_tag):
            text.append('\\%s' % pml_tag)
            tags.append(pml_tag)

        # Links: internal targets are rewritten to generated names, hrefs
        # containing "://" are written out unchanged. Links never nest.
        if tag in LINK_TAGS and not is_open('q'):
            href = elem.get('href')
            if href:
                href = page.abshref(href)
                if '://' not in href:
                    if '#' not in href:
                        # Same key shape as get_anchor_id(href, '').
                        href += '#'
                    href = '#%s' % self._link_id(href)
                text.append('\\q="%s"' % href)
                tags.append('q')

        # Anchors for the element's own id / name.
        for name_x in (elem.get('id'), elem.get('name')):
            if name_x:
                text.append(self.get_anchor(page, name_x))

        # Codes derived from the computed style.
        for prop, value_map in STYLES:
            style_tag = value_map.get(style[prop], None)
            if style_tag and not is_open(style_tag):
                text.append('\\%s' % style_tag)
                tags.append(style_tag)

        if hasattr(elem, 'text') and elem.text:
            text.append(self.remove_newlines(elem.text))

        for item in elem:
            text += self.dump_text(item, stylizer, page, tag_stack + tags)

        # Close in reverse order of opening.
        tags.reverse()
        text += self.close_tags(tags)

        if tag in SEPARATE_TAGS:
            text.append('\n\n')

        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.remove_newlines(elem.tail))

        return text

    def close_tags(self, tags):
        """Return the closing fragment for each code in *tags*, in order.

        The pseudo code ``'block'`` closes with a blank line; every other
        code closes by repeating the backslash-prefixed code.
        """
        text = []
        for tag in tags:
            if tag == 'block':
                text.append('\n\n')
            else:
                text.append('\\%s' % tag)
        return text
-
-
-