Maximum CD 2011 January

home *** CD-ROM | disk | FTP | other *** search

/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_856 (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2010-10-31 | 8.3 KB | 252 lines

# Source Generated with Decompyle++ # File: in.pyc (Python 2.6) from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, and Alex Bramley <a.bramley at gmail.com>.' import os import re from mimetypes import guess_type as guess_mimetype from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString from calibre.constants import iswindows, filesystem_encoding from calibre.utils.chm.chm import CHMFile from calibre.utils.chm.chmlib import CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL, chm_enumerate from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.chardet import xml_to_unicode def match_string(s1, s2_already_lowered): if s1 is not None and s2_already_lowered is not None: if s1.lower() == s2_already_lowered: return True return False def check_all_prev_empty(tag): if tag is None: return True if tag.__class__ == NavigableString and not check_empty(tag): return False return check_all_prev_empty(tag.previousSibling) def check_empty(s, rex = re.compile('\\S')): return rex.search(s) is None class CHMError(Exception): pass class CHMReader(CHMFile): def __init__(self, input, log): CHMFile.__init__(self) if isinstance(input, unicode): input = input.encode(filesystem_encoding) if not self.LoadCHM(input): raise CHMError("Unable to open CHM file '%s'" % (input,)) self.LoadCHM(input) self.log = log self._sourcechm = input self._contents = None self._playorder = 0 self._metadata = False self._extracted = False (self.root, ext) = os.path.splitext(self.topics.lstrip('/')) self.hhc_path = self.root + '.hhc' def _parse_toc(self, ul, basedir = os.getcwdu()): toc = TOC(play_order = self._playorder, base_path = basedir, text = '') self._playorder += 1 for li in ul('li', recursive = False): href = li.object('param', { 'name': 'Local' })[0]['value'] if href.count('#'): (href, frag) = href.split('#') else: frag = None name = self._deentity(li.object('param', { 'name': 'Name' })[0]['value']) toc.add_item(href, frag, name, play_order = self._playorder) self._playorder += 1 if li.ul: child = self._parse_toc(li.ul) child.parent = toc toc.append(child) continue self return toc def GetFile(self, path): if path[0] != '/': path = '/' + path (res, ui) = self.ResolveObject(path) if res != CHM_RESOLVE_SUCCESS: raise CHMError("Unable to locate '%s' within CHM file '%s'" % (path, self.filename)) res != CHM_RESOLVE_SUCCESS (size, data) = self.RetrieveObject(ui) if size == 0: raise CHMError("'%s' is zero bytes in length!" % (path,)) size == 0 return data def ExtractFiles(self, output_dir = os.getcwdu()): html_files = set([]) for path in self.Contents(): lpath = os.path.join(output_dir, path) self._ensure_dir(lpath) try: data = self.GetFile(path) except: self.log.exception('Failed to extract %s from CHM, ignoring' % path) continue if lpath.find(';') != -1: lpath = lpath.split(';')[0] try: try: f = _[1] f.write(data) finally: pass try: if 'html' in guess_mimetype(path)[0]: html_files.add(lpath) except: open(lpath, 'wb').__exit__ open(lpath, 'wb') continue if iswindows and len(lpath) > 250: self.log.warn('%r filename too long, skipping' % path) continue raise for lpath in html_files: try: f = _[2] data = f.read() data = self._reformat(data, lpath) f.seek(0) f.truncate() f.write(data) finally: pass self._extracted = True files = _[3] if self.hhc_path not in files and files: self.hhc_path = files[0] def _reformat(self, data, htmlpath): try: data = xml_to_unicode(data, strip_encoding_pats = True)[0] soup = BeautifulSoup(data) except ValueError: self.log.exception('Unable to parse html for cleaning, leaving it') return data [ s.extract() for s in soup('script') ] t = soup('table') if t: if t[-1].nextSibling is None or t[-1].nextSibling.nextSibling is None: try: alt = t[-1].img['alt'].lower() if alt.find('prev') != -1 and alt.find('next') != -1 or alt.find('team') != -1: t[-1].extract() br = soup('br') if br: if check_all_prev_empty(br[0].previousSibling): br[0].extract() base = os.path.dirname(htmlpath) for img in soup('img', src = True): src = img['src'] ipath = os.path.join(base, *src.split('/')) if os.path.exists(ipath): continue src = src.split(';')[0] if not src: continue ipath = os.path.join(base, *src.split('/')) if not os.path.exists(ipath): while src.startswith('../'): src = src[3:] img['src'] = src try: tables = soup.body.findAll('table', recursive = False) if tables and len(tables) == 1: trs = tables[0].findAll('tr', recursive = False) if trs and len(trs) == 1: tds = trs[0].findAll('td', recursive = False) if tds and len(tds) == 1: tdContents = tds[0].contents tableIdx = soup.body.contents.index(tables[0]) tables[0].extract() while tdContents: soup.body.insert(tableIdx, tdContents.pop()) except: pass return str(soup) def Contents(self): if self._contents is not None: return self._contents paths = [] def get_paths(chm, ui, ctx): if ui.path[-1] != '/': paths.append(ui.path.lstrip('/')) chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None) self._contents = paths return self._contents def _ensure_dir(self, path): dir = os.path.dirname(path) if not os.path.isdir(dir): os.makedirs(dir) def extract_content(self, output_dir = os.getcwdu()): self.ExtractFiles(output_dir = output_dir)