Maximum CD 2010 November

home *** CD-ROM | disk | FTP | other *** search

/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_1142 (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2010-08-06 | 5.3 KB | 162 lines

# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)
"""Transform the XHTML documents of an OEB book's spine into plain text."""

__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

import os
import re

from lxml import etree

from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer

# Tags whose content is emitted as its own paragraph (blank line before/after).
BLOCK_TAGS = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'tr']
# CSS ``display`` values that are treated the same way as BLOCK_TAGS.
BLOCK_STYLES = ['block']
# Tags whose content is separated from the preceding text by a single space.
SPACE_TAGS = ['td']


class TXTMLizer(object):
    """Walk an OEB book's spine and flatten its XHTML into one text string."""

    def __init__(self, log):
        """Store *log*; it is used for ``info`` and ``debug`` messages."""
        self.log = log

    def extract_content(self, oeb_book, opts):
        """Convert *oeb_book* to text according to *opts*.

        Both arguments are stored on the instance for use by the other
        methods.  Returns the cleaned-up text produced by ``mlize_spine``.
        """
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        """Return the text of the optional TOC followed by every spine item."""
        output = [u'']
        output.append(self.get_toc())
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to TXT...' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book,
                                self.opts, self.opts.output_profile)
            # Serialise the <body>, flatten its newlines, then re-parse it so
            # that dump_text never sees source line breaks inside text nodes.
            content = unicode(etree.tostring(item.data.find(XHTML('body')),
                                             encoding=unicode))
            content = self.remove_newlines(content)
            output += self.dump_text(etree.fromstring(content), stylizer)
        output = self.cleanup_text(u''.join(output))
        return output

    def remove_newlines(self, text):
        """Return *text* with every CRLF, LF and CR replaced by one space."""
        self.log.debug('\tRemove newlines for processing...')
        # CRLF must be handled first so it yields one space, not two.
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        return text

    def get_toc(self):
        """Return a bulleted table of contents, or u'' when not requested.

        The TOC is produced only when ``opts.inline_toc`` exists and is
        truthy; each entry of ``oeb_book.toc`` contributes its ``title``.
        """
        toc = [u'']
        if getattr(self.opts, 'inline_toc', None):
            self.log.debug('Generating table of contents...')
            # NOTE(review): ``_`` is not defined in this module; presumably
            # the translation function installed as a builtin -- confirm.
            toc.append(u'%s\n\n' % _(u'Table of Contents:'))
            for item in self.oeb_book.toc:
                toc.append(u'* %s\n\n' % item.title)
        return ''.join(toc)

    def cleanup_text(self, text):
        """Normalise whitespace in *text* and optionally wrap long lines.

        Options read from ``self.opts``:

        * ``remove_paragraph_spacing`` -- collapse blank lines and start every
          non-empty line with a tab instead.
        * ``max_line_length`` -- when truthy, wrap lines to this length
          (raised to 25 unless ``force_max_line_length`` is set).
        * ``force_max_line_length`` -- break inside a word when no space is
          available within the limit.
        """
        self.log.debug('\tClean up text...')

        # Drop stray U+00C2 and turn non-breaking spaces into plain spaces.
        # Written as escapes so the file stays pure ASCII.
        text = text.replace(u'\xc2', '')
        text = text.replace(u'\xa0', ' ')

        # Collapse runs of tab / vertical tab / form feed into one space.
        # This needs a regex: str.replace() would look for a literal '+'.
        text = re.sub(r'[\t\v\f]+', ' ', text)

        # Join a line break that has text on both sides into a single space.
        text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)

        text = re.sub('[ ]{2,}', ' ', text)
        text = re.sub('\n[ ]+\n', '\n\n', text)

        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)
            text = re.sub('(?imu)^(?=.)', '\t', text)
        else:
            text = re.sub('\n{3,}', '\n\n', text)

        # Strip spaces at the start and end of every line.
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        if self.opts.max_line_length:
            force = self.opts.force_max_line_length
            max_length = self.opts.max_line_length
            if max_length < 25 and not force:
                max_length = 25
            # A non-positive forced length would never shorten the line and
            # the wrapping loop would not terminate.
            max_length = max(1, max_length)

            short_lines = []
            for line in text.splitlines():
                short_lines.extend(self._wrap_line(line, max_length, force))
            text = '\n'.join(short_lines)

        return text

    def _wrap_line(self, line, max_length, force):
        """Split *line* into pieces no longer than *max_length* if possible.

        Breaks at the last space before *max_length*.  When there is none,
        either cuts at *max_length* (*force* true) or breaks at the first
        space after it.  A remainder that contains no space at all is
        returned unbroken.  Always returns at least one piece.
        """
        pieces = []
        while len(line) > max_length:
            space = line.rfind(' ', 0, max_length)
            if space == -1 and not force:
                space = line.find(' ', max_length)

            if space != -1:
                pieces.append(line[:space])
                line = line[space + 1:]
            elif force:
                pieces.append(line[:max_length])
                line = line[max_length:]
            else:
                # Cannot be broken; emit as-is without a trailing empty piece.
                pieces.append(line)
                return pieces
        pieces.append(line)
        return pieces

    def dump_text(self, elem, stylizer, end=''):
        """Return the text of *elem* and its descendants as a list of strings.

        :param elem: element of the parsed XHTML tree being processed.
        :param stylizer: object whose ``style(elem)`` result is indexed by
            ``'display'`` and ``'visibility'``.
        :param end: the last (up to two) characters emitted before this
            element; used to avoid duplicating separators.

        Elements outside the XHTML namespace, non-element nodes, and elements
        that are hidden or not displayed yield ``['']``.
        """
        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            return ['']

        text = ['']
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            return ['']

        tag = barename(elem.tag)
        elem_text = getattr(elem, 'text', None)
        in_block = False

        # Block-level element: start a new paragraph unless the preceding
        # text already ended with a blank line.
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            in_block = True
            if not end.endswith(u'\n\n') and elem_text:
                text.append(u'\n\n')

        # Table cells: keep them apart from the previous cell with one space.
        if tag in SPACE_TAGS:
            if not end.endswith(u' ') and elem_text:
                text.append(u' ')

        if elem_text:
            text.append(elem_text)

        for item in elem:
            # Pass the child the tail of what has been emitted so far.
            en = u''
            if len(text) >= 2:
                en = text[-1][-2:]
            text += self.dump_text(item, stylizer, en)

        if in_block:
            text.append(u'\n\n')

        if hasattr(elem, 'tail') and elem.tail:
            text.append(elem.tail)

        return text