# NOTE: stray web-page navigation text removed ("home *** CD-ROM | disk | FTP | other *** search")
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- __license__ = 'GPL v3'
- __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
- import shutil
- import os
- import re
- import struct
- import textwrap
- import cStringIO
- import sys
-
- try:
- from PIL import Image as PILImage
- PILImage
- except ImportError:
- import Image as PILImage
-
- from lxml import html, etree
- from calibre import xml_entity_to_unicode, CurrentDir, entity_to_unicode, replace_entities
- from calibre.utils.filenames import ascii_filename
- from calibre.utils.date import parse_date
- from calibre.ptempfile import TemporaryDirectory
- from calibre.ebooks import DRMError
- from calibre.ebooks.chardet import ENCODING_PATS
- from calibre.ebooks.mobi import MobiError
- from calibre.ebooks.mobi.huffcdic import HuffReader
- from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
- from calibre.ebooks.compression.palmdoc import decompress_doc
- from calibre.ebooks.metadata import MetaInformation
- from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
- from calibre.ebooks.metadata.toc import TOC
-
class EXTHHeader(object):
    """Parser for the EXTH (extended metadata) block of a MOBI header.

    Walks the EXTH records and collects book metadata (author, publisher,
    ISBN, tags, ...) into ``self.mi`` (a MetaInformation instance), and
    records cover/thumbnail section offsets when present.
    """

    def __init__(self, raw, codec, title):
        # raw:   the EXTH block, starting at its 'EXTH' identifier
        # codec: text codec declared by the enclosing MOBI header
        # title: title read from the MOBI header; may be replaced by a
        #        long-title record (id 503) when it looks like a placeholder
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        left = self.num_items

        while left > 0:
            left -= 1
            id, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if id >= 100 and id < 200:
                self.process_metadata(id, content, codec)
            elif id == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif id == 201:
                co, = struct.unpack('>L', content)
                # Absurdly large offsets are clearly bogus; ignore them.
                if co < 1e7:
                    self.cover_offset = co
            elif id == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif id == 501:
                pass  # cdetype
            elif id == 502:
                pass  # last update time
            elif id == 503:  # Long title
                # Use the long title only when the header title is missing
                # or is an obvious placeholder.
                if not title or title == _('Unknown') or \
                        'USER_CONTENT' in title or title.startswith('dtp_'):
                    try:
                        title = content.decode(codec)
                    except:
                        pass

        if title:
            self.mi.title = replace_entities(title)

    def process_metadata(self, id, content, codec):
        """Handle a single EXTH metadata record (ids 100-199)."""
        if id == 100:
            if self.mi.authors == [_('Unknown')]:
                self.mi.authors = []
            au = content.decode(codec, 'ignore').strip()
            self.mi.authors.append(au)
            # A 'Last, First' style entry doubles as the author sort key.
            if re.match(r'\S+?\s*,\s+\S+', au.strip()):
                self.mi.author_sort = au.strip()
        elif id == 101:
            self.mi.publisher = content.decode(codec, 'ignore').strip()
        elif id == 103:
            self.mi.comments = content.decode(codec, 'ignore')
        elif id == 104:
            self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
        elif id == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in
                content.decode(codec, 'ignore').split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif id == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except:
                pass
        elif id == 108:
            pass  # Producer
-
-
-
-
class BookHeader(object):
    """Parsed representation of the PalmDoc/MOBI record-0 header."""

    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        # raw:   contents of PDB section 0 (the book header record)
        # ident: PDB type/creator string, 'BOOKMOBI' or 'TEXTREAD'
        # user_encoding: codec to fall back to for unknown codepages
        # try_extra_data_fix: treat a 0xE4-length header as carrying no
        #   extra data flags (workaround for some broken files)
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == 'TEXTREAD':
            self.codepage = 1252
        if len(raw) <= 16:
            # Ancient PalmDoc file with no MOBI header: assume defaults.
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])

            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                }[self.codepage]
            except (IndexError, KeyError):
                self.codec = 'cp1252' if user_encoding is None else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                    self.codec))

            # Only MOBI headers of length 0xE4-0xE8 carry extra data flags.
            if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length or \
                    (try_extra_data_fix and self.length == 0xE4):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == 'DH':  # Huff/cdic compression
                self.huff_offset, self.huff_number = struct.unpack('>LL',
                        raw[0x70:0x78])

            toff, tlen = struct.unpack('>II', raw[0x54:0x5C])
            tend = toff + tlen
            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6C])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6C:0x70])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
                self.title = self.title.decode(self.codec, 'replace')
            if self.exth_flag & 0x40:
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
                            self.title)
                    self.exth.mi.uid = self.unique_id
                    try:
                        self.exth.mi.language = mobi2iana(langid, sublangid)
                    except:
                        self.log.exception('Unknown language code')
                except:
                    # A corrupt EXTH block should not abort reading the book.
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0
-
-
-
-
-
class MetadataHeader(BookHeader):
    """BookHeader variant that reads lazily from an open stream.

    Used when only the metadata (not the whole book) is required, e.g. by
    ``get_metadata``.
    """

    def __init__(self, stream, log):
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()
        if self.num_sections >= 2:
            header = self.header()
            BookHeader.__init__(self, header, self.ident, None, log)
        else:
            self.exth = None

    def identity(self):
        """Return the PDB type/creator string, validating the book type."""
        self.stream.seek(60)
        ident = self.stream.read(8).upper()
        if ident not in ('BOOKMOBI', 'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % ident)
        return ident

    def section_count(self):
        """Number of PDB sections in the file."""
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]

    def section_offset(self, number):
        """Absolute file offset of PDB section ``number``."""
        self.stream.seek(78 + number * 8)
        return struct.unpack('>LBBBB', self.stream.read(8))[0]

    def header(self):
        """Raw bytes of section 0 (the book header record)."""
        section_headers = []
        # The first section's end is the second section's start.
        section_headers.append(self.section_offset(0))
        section_headers.append(self.section_offset(1))
        end_off = section_headers[1]
        off = section_headers[0]
        self.stream.seek(off)
        return self.stream.read(end_off - off)

    def section_data(self, number):
        """Raw bytes of an arbitrary PDB section."""
        start = self.section_offset(number)
        if number == self.num_sections - 1:
            # Last section runs to the end of the file.
            end = os.stat(self.stream.name).st_size
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        return self.stream.read(end - start)
-
-
-
class MobiReader(object):
    """Extract HTML content, images and metadata from a MOBI/PalmDoc file."""

    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+',
            re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
            try_extra_data_fix=False):
        self.log = log
        self.debug = debug
        self.embedded_mi = None
        # Default stylesheet written out alongside the converted HTML.
        self.base_css_rules = textwrap.dedent('''
                blockquote { margin: 0em 0em 0em 1.25em; text-align: justify }

                p { margin: 0em; text-align: justify }

                .bold { font-weight: bold }

                .italic { font-style: italic }

                .mbp_pagebreak {
                    page-break-after: always; margin: 0; display: block
                }
                ''')
        self.tag_css_rules = {}

        if hasattr(filename_or_stream, 'read'):
            stream = filename_or_stream
            stream.seek(0)
        else:
            stream = open(filename_or_stream, 'rb')

        raw = stream.read()
        if raw.startswith('TPZ'):
            raise ValueError(_('This is an Amazon Topaz book. It cannot be processed.'))

        self.header = raw[0:72]
        self.name = self.header[:32].replace('\x00', '')
        self.num_sections, = struct.unpack('>H', raw[76:78])

        self.ident = self.header[60:68].upper()
        if self.ident not in ('BOOKMOBI', 'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % repr(self.ident))

        self.sections = []
        self.section_headers = []
        for i in range(self.num_sections):
            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB',
                    raw[78 + i * 8:78 + i * 8 + 8])
            flags = a1
            val = a2 << 16 | a3 << 8 | a4
            self.section_headers.append((offset, flags, val))

        def section(section_number):
            # Raw bytes of a PDB section; the last section runs to EOF.
            if section_number == self.num_sections - 1:
                end_off = len(raw)
            else:
                end_off = self.section_headers[section_number + 1][0]
            off = self.section_headers[section_number][0]
            return raw[off:end_off]

        for i in range(self.num_sections):
            self.sections.append((section(i), self.section_headers[i]))

        self.book_header = BookHeader(self.sections[0][0], self.ident,
                user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
        self.name = self.name.decode(self.book_header.codec, 'replace')

    def extract_content(self, output_dir, parse_cache):
        """Convert the book to HTML + CSS + images under ``output_dir``.

        Parsed lxml trees are stored in ``parse_cache`` keyed by file path.
        Raises DRMError for encrypted books.
        """
        output_dir = os.path.abspath(output_dir)
        if self.book_header.encryption_type != 0:
            raise DRMError(self.name)

        processed_records = self.extract_text()
        if self.debug is not None:
            parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
        self.add_anchors()
        self.processed_html = self.processed_html.decode(
                self.book_header.codec, 'ignore')
        # Repair common markup corruption produced by broken MOBI creators.
        self.processed_html = self.processed_html.replace('</</', '</')
        self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
                self.processed_html)
        # Remove embedded encoding declarations; we output utf-8.
        for pat in ENCODING_PATS:
            self.processed_html = pat.sub('', self.processed_html)
        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
                self.processed_html)
        self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()

        self.log.debug('Parsing HTML...')
        try:
            root = html.fromstring(self.processed_html)
            if len(root.xpath('//html')) > 5:
                root = html.fromstring(self.processed_html.replace('\x0c',
                    '').replace('\x14', ''))
        except:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = html.fromstring(self.processed_html)

        if root.xpath('descendant::p/descendant::p'):
            from lxml.html import soupparser
            self.log.warning('Malformed markup, parsing using BeautifulSoup')
            try:
                root = soupparser.fromstring(self.processed_html)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                self.processed_html = self.remove_random_bytes(self.processed_html)
                root = soupparser.fromstring(self.processed_html)

        if root.tag != 'html':
            self.log.warn('File does not have opening <html> tag')
            nroot = html.fromstring('<html><head></head><body></body></html>')
            bod = nroot.find('body')
            for child in list(root):
                child.getparent().remove(child)
                bod.append(child)
            root = nroot

        htmls = list(root.xpath('//html'))
        if len(htmls) > 1:
            # Merge all <head> and <body> sections into a single document.
            self.log.warn('Markup contains multiple <html> tags, merging.')
            for h in htmls:
                p = h.getparent()
                if hasattr(p, 'remove'):
                    p.remove(h)
            bodies, heads = root.xpath('//body'), root.xpath('//head')
            for x in root:
                root.remove(x)
            head, body = map(root.makeelement, ('head', 'body'))
            for h in heads:
                for x in h:
                    h.remove(x)
                    head.append(x)
            for b in bodies:
                for x in b:
                    b.remove(x)
                    body.append(x)
            root.append(head)
            root.append(body)

        for x in root.xpath('//script'):
            x.getparent().remove(x)

        head = root.xpath('//head')
        if head:
            head = head[0]
        else:
            head = root.makeelement('head', {})
            root.insert(0, head)
        head.text = '\n\t'
        link = head.makeelement('link', {
            'type': 'text/css',
            'href': 'styles.css',
            'rel': 'stylesheet'})
        head.insert(0, link)
        link.tail = '\n\t'
        title = head.xpath('descendant::title')
        m = head.makeelement('meta', {
            'http-equiv': 'Content-Type',
            'content': 'text/html; charset=utf-8'})
        head.insert(0, m)
        if not title:
            title = head.makeelement('title', {})
            title.text = self.book_header.title
            title.tail = '\n\t'
            head.insert(0, title)
            head.text = '\n\t'

        self.upshift_markup(root)
        guides = root.xpath('//guide')
        guide = guides[0] if guides else None
        metadata_elems = root.xpath('//metadata')
        if metadata_elems and self.book_header.exth is None:
            self.read_embedded_metadata(root, metadata_elems[0], guide)
        for elem in guides + metadata_elems:
            elem.getparent().remove(elem)

        fname = self.name.encode('ascii', 'replace')
        fname = re.sub(r'[\x08\x15\0]+', '', fname)
        htmlfile = os.path.join(output_dir, ascii_filename(fname) + '.html')
        try:
            # Make guide references point into the generated HTML file.
            for ref in guide.xpath('descendant::reference'):
                if ref.attrib.has_key('href'):
                    ref.attrib['href'] = os.path.basename(htmlfile) + \
                            ref.attrib['href']
        except AttributeError:
            pass  # guide is None

        parse_cache[htmlfile] = root
        self.htmlfile = htmlfile
        ncx = cStringIO.StringIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
        opf.render(open(self.created_opf_path, 'wb'), ncx,
                ncx_manifest_entry=ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
            open(ncx_path, 'wb').write(ncx)

        # Write the collected CSS rules (cwd is output_dir at this point).
        with open('styles.css', 'wb') as s:
            s.write(self.base_css_rules + '\n\n')
            for cls, rule in self.tag_css_rules.items():
                s.write('.%s { %s }\n\n' % (cls, rule))

    def read_embedded_metadata(self, root, elem, guide):
        """Parse an embedded OPF-like <metadata> element into
        ``self.embedded_mi`` and pull out the cover image, if any."""
        raw = '<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \
                html.tostring(elem, encoding='utf-8') + '</package>'
        stream = cStringIO.StringIO(raw)
        opf = OPF(stream)
        self.embedded_mi = MetaInformation(opf)
        if guide is not None:
            for ref in guide.xpath('descendant::reference'):
                if 'cover' in ref.get('type', '').lower():
                    href = ref.get('href', '')
                    if href.startswith('#'):
                        href = href[1:]
                    anchors = root.xpath('//*[@id="%s"]' % href)
                    if anchors:
                        cpos = anchors[0]
                        reached = False
                        # The cover is the first <img> at/after the anchor.
                        for elem in root.iter():
                            if elem is cpos:
                                reached = True
                            if reached and elem.tag == 'img':
                                cover = elem.get('src', None)
                                self.embedded_mi.cover = cover
                                elem.getparent().remove(elem)
                                break
                    break

    def cleanup_html(self):
        """Normalize ``self.processed_html`` before parsing."""
        self.log.debug('Cleaning up HTML...')
        self.processed_html = re.sub(
                r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '',
                self.processed_html)
        if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
            self.processed_html = '<html><p>' + \
                    self.processed_html.replace('\n\n', '<p>') + '</html>'
        self.processed_html = self.processed_html.replace('\r\n', '\n')
        self.processed_html = self.processed_html.replace('> <', '>\n<')
        self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
        self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)

    def remove_random_bytes(self, html):
        """Strip control/garbage bytes that break the HTML parser."""
        return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08', '',
                html)

    def ensure_unit(self, raw, unit='px'):
        """Append ``unit`` to a bare numeric CSS length."""
        if re.search(r'\d+$', raw) is not None:
            raw += unit
        return raw

    def upshift_markup(self, root):
        """Convert MOBI presentational attributes/tags into CSS classes
        collected in ``self.tag_css_rules``."""
        self.log.debug('Converting style information to CSS...')
        size_map = {
            'xx-small': '0.5',
            'x-small': '1',
            'small': '2',
            'medium': '3',
            'large': '4',
            'x-large': '5',
            'xx-large': '6',
        }
        mobi_version = self.book_header.mobi_version
        for x in root.xpath('//ncx'):
            x.getparent().remove(x)
        for i, tag in enumerate(root.iter(etree.Element)):
            tag.attrib.pop('xmlns', '')
            for x in tag.attrib:
                if ':' in x:
                    del tag.attrib[x]
            if tag.tag in ('country-region', 'place', 'placetype', 'placename',
                    'state', 'city', 'street', 'address', 'content', 'form'):
                # Neutralize MS smart-tag style elements.
                tag.tag = 'div' if tag.tag in ('content', 'form') else 'span'
                for key in tag.attrib.keys():
                    tag.attrib.pop(key)
                continue
            styles = []
            attrib = tag.attrib
            if attrib.has_key('style'):
                style = attrib.pop('style').strip()
                if style:
                    styles.append(style)
            if attrib.has_key('height'):
                height = attrib.pop('height').strip()
                if height and '<' not in height and '>' not in height and \
                        re.search(r'\d+', height):
                    if tag.tag in ('table', 'td', 'tr'):
                        pass
                    elif tag.tag == 'img':
                        tag.set('height', height)
                    else:
                        styles.append('margin-top: %s' % self.ensure_unit(height))
            if attrib.has_key('width'):
                width = attrib.pop('width').strip()
                if width and re.search(r'\d+', width):
                    if tag.tag in ('table', 'td', 'tr'):
                        pass
                    elif tag.tag == 'img':
                        tag.set('width', width)
                    else:
                        styles.append('text-indent: %s' % self.ensure_unit(width))
                        if width.startswith('-'):
                            # Negative indent needs a matching margin so the
                            # text does not fall off the left edge.
                            styles.append('margin-left: %s' %
                                    self.ensure_unit(width[1:]))
            if attrib.has_key('align'):
                align = attrib.pop('align').strip()
                if align:
                    align = align.lower()
                    if align == 'baseline':
                        styles.append('vertical-align: ' + align)
                    else:
                        styles.append('text-align: %s' % align)
            if tag.tag == 'hr':
                if mobi_version == 1:
                    # In MOBI 1, <hr> means a page break.
                    tag.tag = 'div'
                    styles.append('page-break-before: always')
                    styles.append('display: block')
                    styles.append('margin: 0')
            elif tag.tag == 'i':
                tag.tag = 'span'
                tag.attrib['class'] = 'italic'
            elif tag.tag == 'b':
                tag.tag = 'span'
                tag.attrib['class'] = 'bold'
            elif tag.tag == 'font':
                sz = tag.get('size', '').lower()
                try:
                    float(sz)
                except ValueError:
                    # Non-numeric size names get mapped to HTML font sizes.
                    if sz in size_map.keys():
                        attrib['size'] = size_map[sz]
            elif tag.tag == 'img':
                recindex = None
                for attr in self.IMAGE_ATTRS:
                    recindex = attrib.pop(attr, None) or recindex
                if recindex is not None:
                    attrib['src'] = 'images/%s.jpg' % recindex
                for attr in ('width', 'height'):
                    if attr in attrib:
                        val = attrib[attr]
                        if val.lower().endswith('em'):
                            try:
                                nval = float(val[:-2])
                                # Assume the em was set using the Kindle profile.
                                nval *= 16 * (168.451 / 72)
                                attrib[attr] = '%dpx' % int(nval)
                            except:
                                del attrib[attr]
                        elif val.lower().endswith('%'):
                            del attrib[attr]
            elif tag.tag == 'pre':
                if not tag.text:
                    tag.tag = 'div'

            if 'filepos-id' in attrib:
                attrib['id'] = attrib.pop('filepos-id')
                if 'name' in attrib and attrib['name'] != attrib['id']:
                    attrib['name'] = attrib['id']
            if 'filepos' in attrib:
                filepos = attrib.pop('filepos')
                try:
                    attrib['href'] = '#filepos%d' % int(filepos)
                except ValueError:
                    pass

            if styles:
                # Deduplicate identical rules into a single CSS class.
                ncls = None
                rule = '; '.join(styles)
                for sel, srule in self.tag_css_rules.items():
                    if srule == rule:
                        ncls = sel
                        break
                if ncls is None:
                    ncls = 'calibre_%d' % i
                    self.tag_css_rules[ncls] = rule
                cls = attrib.get('class', '')
                cls = cls + (' ' if cls else '') + ncls
                attrib['class'] = cls

    def create_opf(self, htmlfile, guide=None, root=None):
        """Build an OPFCreator (plus the NCX manifest entry name, if a TOC
        was found) describing the extracted book."""
        mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
        if mi is None:
            mi = MetaInformation(self.book_header.title, [_('Unknown')])
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
        if hasattr(self.book_header.exth, 'cover_offset'):
            opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
        elif mi.cover is not None:
            opf.cover = mi.cover
        else:
            # Fall back to the first extracted image, if it exists.
            opf.cover = 'images/00001.jpg'
            if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
                    *opf.cover.split('/'))):
                opf.cover = None

        cover = opf.cover
        cover_copied = None
        if cover is not None:
            cover = cover.replace('/', os.sep)
            if os.path.exists(cover):
                # Keep a stable copy so later conversions can find it.
                ncover = 'images' + os.sep + 'calibre_cover.jpg'
                if os.path.exists(ncover):
                    os.remove(ncover)
                shutil.copyfile(cover, ncover)
                cover_copied = os.path.abspath(ncover)
                opf.cover = ncover.replace(os.sep, '/')

        manifest = [
            (htmlfile, 'application/xhtml+xml'),
            (os.path.abspath('styles.css'), 'text/css')]
        bp = os.path.dirname(htmlfile)
        added = set([])
        for i in getattr(self, 'image_names', []):
            path = os.path.join(bp, 'images', i)
            added.add(path)
            manifest.append((path, 'image/jpeg'))
        if cover_copied is not None:
            manifest.append((cover_copied, 'image/jpeg'))

        opf.create_manifest(manifest)
        opf.create_spine([os.path.basename(htmlfile)])
        toc = None
        if guide is not None:
            opf.create_guide(guide)
            for ref in opf.guide:
                if ref.type.lower() == 'toc':
                    toc = ref.href()

        ncx_manifest_entry = None
        if toc:
            ncx_manifest_entry = 'toc.ncx'
            elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
            tocobj = None
            ent_pat = re.compile(r'&(\S+?);')
            if elems:
                tocobj = TOC()
                reached = False
                # TOC entries are the internal links between the TOC anchor
                # and the next page break.
                for x in root.iter():
                    if x == elems[-1]:
                        reached = True
                        continue
                    if reached and x.tag == 'a':
                        href = x.get('href', '')
                        if href and re.match(r'\w+://', href) is None:
                            try:
                                text = ' '.join([t.strip() for t in
                                        x.xpath('descendant::text()')])
                            except:
                                text = ''
                            text = ent_pat.sub(entity_to_unicode, text)
                            tocobj.add_item(toc.partition('#')[0], href[1:],
                                    text)
                    if reached and x.get('class', None) == 'mbp_pagebreak':
                        break
            if tocobj is not None:
                opf.set_toc(tocobj)

        return opf, ncx_manifest_entry

    def sizeof_trailing_entries(self, data):
        """Total size in bytes of the trailing (extra data) entries appended
        to a text record, as selected by the header's extra_flags."""

        def sizeof_trailing_entry(ptr, psize):
            # Each trailing entry ends with a backwards variable-width
            # integer encoding its own total size.
            bitpos, result = 0, 0
            while True:
                v = ord(ptr[psize - 1])
                result |= (v & 0x7F) << bitpos
                bitpos += 7
                psize -= 1
                if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
                    return result

        num = 0
        size = len(data)
        flags = self.book_header.extra_flags >> 1
        while flags:
            if flags & 1:
                num += sizeof_trailing_entry(data, size - num)
            flags >>= 1
        # Bit 0 flags multibyte-character overlap bytes; the low two bits
        # of the final byte give their count.
        if self.book_header.extra_flags & 1:
            num += (ord(data[size - num - 1]) & 0x3) + 1
        return num

    def text_section(self, index):
        """Text record ``index`` with its trailing entries stripped."""
        data = self.sections[index][0]
        trail_size = self.sizeof_trailing_entries(data)
        return data[:len(data) - trail_size]

    def extract_text(self):
        """Decompress all text records into ``self.mobi_html``; return the
        list of section indices consumed."""
        self.log.debug('Extracting text...')
        text_sections = [self.text_section(i) for i in
                range(1, self.book_header.records + 1)]
        processed_records = list(range(0, self.book_header.records + 1))
        self.mobi_html = ''
        if self.book_header.compression_type == 'DH':
            huffs = [self.sections[i][0] for i in
                    range(self.book_header.huff_offset,
                        self.book_header.huff_offset +
                        self.book_header.huff_number)]
            processed_records += list(range(self.book_header.huff_offset,
                self.book_header.huff_offset + self.book_header.huff_number))
            huff = HuffReader(huffs)
            self.mobi_html = huff.decompress(text_sections)
        elif self.book_header.compression_type == '\x00\x02':
            for section in text_sections:
                self.mobi_html += decompress_doc(section)
        elif self.book_header.compression_type == '\x00\x01':
            self.mobi_html = ''.join(text_sections)
        else:
            raise MobiError('Unknown compression algorithm: %s' %
                    repr(self.book_header.compression_type))
        self.mobi_html = self.mobi_html.replace('\x00', '')
        if self.book_header.codec == 'cp1252':
            self.mobi_html = self.mobi_html.replace('\x1e', '')  # record separator
            self.mobi_html = self.mobi_html.replace('\x02', '')  # start of text
        return processed_records

    def replace_page_breaks(self):
        """Replace <mbp:pagebreak> runs with a styled page-break div."""
        self.processed_html = self.PAGE_BREAK_PAT.sub(
                '<div class="mbp_pagebreak" />', self.processed_html)

    def add_anchors(self):
        """Insert <a id="fileposNNN"> anchors at every byte offset that is
        the target of a filepos link, producing ``self.processed_html``."""
        self.log.debug('Adding anchors...')
        positions = set([])
        link_pattern = re.compile(r'<[^<>]+filepos=[\'"]{0,1}(\d+)[^<>]*>',
                re.IGNORECASE)
        for match in link_pattern.finditer(self.mobi_html):
            positions.add(int(match.group(1)))
        pos = 0
        self.processed_html = ''
        end_tag_re = re.compile(r'<\s*/')
        for end in sorted(positions):
            if end == 0:
                continue
            oend = end
            l = self.mobi_html.find('<', end)
            r = self.mobi_html.find('>', end)
            anchor = '<a id="filepos%d"></a>'
            if r > -1 and (r < l or l == end or l == -1):
                # The offset points inside a tag: either tag the element
                # itself via an attribute, or move past the tag.
                p = self.mobi_html.rfind('<', 0, end + 1)
                if pos < end and p > -1 and \
                        not end_tag_re.match(self.mobi_html[p:r]) and \
                        not self.mobi_html[p:r + 1].endswith('/>'):
                    anchor = ' filepos-id="filepos%d"'
                    end = r
                else:
                    end = r + 1
            self.processed_html += self.mobi_html[pos:end] + (anchor % oend)
            pos = end
        self.processed_html += self.mobi_html[pos:]
        # Move anchors that landed inside an entity reference out of it.
        self.processed_html = re.sub(
                r'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);', r'&\1\3;\2',
                self.processed_html)

    def extract_images(self, processed_records, output_dir):
        """Decode image sections into RGB JPEGs under ``output_dir``/images.

        Sections already listed in ``processed_records`` are skipped; every
        section examined here is appended to that list.
        """
        self.log.debug('Extracting images...')
        output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        image_index = 0
        self.image_names = []
        start = getattr(self.book_header, 'first_image_index', -1)
        if start > self.num_sections or start < 0:
            start = 0
        for i in range(start, self.num_sections):
            if i in processed_records:
                continue
            processed_records.append(i)
            data = self.sections[i][0]
            buf = cStringIO.StringIO(data)
            image_index += 1
            try:
                im = PILImage.open(buf)
                im = im.convert('RGB')
            except IOError:
                # Not an image section; skip it.
                continue
            path = os.path.join(output_dir, '%05d.jpg' % image_index)
            self.image_names.append(os.path.basename(path))
            im.save(open(path, 'wb'), format='JPEG')
-
-
-
-
def get_metadata(stream):
    """Read MetaInformation (title plus cover image) from an open MOBI stream.

    A missing or undecodable cover is logged and skipped; the metadata is
    still returned.
    """
    from calibre.utils.logging import Log
    log = Log()
    mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    mh = MetadataHeader(stream, log)
    if mh.title and mh.title != _('Unknown'):
        mi.title = mh.title

    try:
        if hasattr(mh.exth, 'cover_offset'):
            cover_index = mh.first_image_index + mh.exth.cover_offset
            data = mh.section_data(int(cover_index))
        else:
            # No explicit cover record: fall back to the first image.
            data = mh.section_data(mh.first_image_index)
        buf = cStringIO.StringIO(data)
        im = PILImage.open(buf)
    except:
        log.exception('Failed to read MOBI cover')
    else:
        obuf = cStringIO.StringIO()
        im.convert('RGB').save(obuf, format='JPEG')
        mi.cover_data = ('jpg', obuf.getvalue())
    return mi
-
-