home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- from __future__ import with_statement
- __license__ = 'GPL v3'
- __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, and Alex Bramley <a.bramley at gmail.com>.'
- import os
- import re
- from mimetypes import guess_type as guess_mimetype
- from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
- from calibre.constants import iswindows, filesystem_encoding
- from calibre.utils.chm.chm import CHMFile
- from calibre.utils.chm.chmlib import CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL, chm_enumerate
- from calibre.ebooks.metadata.toc import TOC
- from calibre.ebooks.chardet import xml_to_unicode
-
def match_string(s1, s2_already_lowered):
    """Case-insensitively compare ``s1`` with an already lower-cased string.

    Returns True only when both arguments are not None and the lower-cased
    form of ``s1`` equals ``s2_already_lowered``; otherwise returns False.
    """
    if s1 is None or s2_already_lowered is None:
        return False
    return s1.lower() == s2_already_lowered
-
-
def check_all_prev_empty(tag):
    """Return True if ``tag`` and every sibling before it hold no visible text.

    Starting at ``tag``, the ``previousSibling`` chain is followed until it
    ends (``None``).  A node whose class is exactly ``NavigableString`` and
    which contains a non-whitespace character makes the result False; nodes
    of any other class are skipped over.

    The walk is iterative rather than recursive so that a node preceded by a
    very long run of siblings cannot exhaust the interpreter's recursion
    limit.
    """
    node = tag
    while node is not None:
        # Exact class comparison is kept on purpose: only plain
        # NavigableString nodes are tested, not instances of its subclasses.
        if node.__class__ == NavigableString and not check_empty(node):
            return False
        node = node.previousSibling
    return True
-
-
def check_empty(s, rex=re.compile('\\S')):
    """Return True when ``s`` has no match for ``rex``.

    With the default pattern this means ``s`` contains no non-whitespace
    character (an empty string counts as empty).
    """
    first_match = rex.search(s)
    return first_match is None
-
-
class CHMError(Exception):
    """Raised when a CHM file cannot be opened or an entry cannot be read."""
-
-
class CHMReader(CHMFile):
    """Extract the files stored in a CHM archive and tidy its HTML pages.

    The class builds on ``CHMFile``.  ``LoadCHM``, ``ResolveObject``,
    ``RetrieveObject``, ``topics``, ``filename``, ``file`` and ``_deentity``
    are used below but are not defined here -- presumably they are provided
    by ``CHMFile`` (verify against calibre.utils.chm.chm).

    NOTE(review): this class was recovered from decompiler output in which
    several statements were lost.  Reconstructed spots are marked with
    NOTE(review) comments and should be confirmed against the upstream
    source.
    """

    def __init__(self, input, log):
        """Open the CHM file ``input`` and initialise extraction state.

        ``input`` is the path of the CHM file; a unicode path is encoded with
        ``filesystem_encoding`` before being handed to ``LoadCHM``.  ``log``
        is stored and later used via ``log.exception`` and ``log.warn``.

        Raises CHMError when ``LoadCHM`` reports failure.
        """
        CHMFile.__init__(self)
        if isinstance(input, unicode):
            input = input.encode(filesystem_encoding)
        # Load exactly once; the recovered source repeated this call after
        # the failure check.
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'" % (input,))
        self.log = log
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False
        # The table-of-contents path is derived from the topics entry by
        # swapping its extension for '.hhc'.
        self.root = os.path.splitext(self.topics.lstrip('/'))[0]
        self.hhc_path = self.root + '.hhc'

    def _parse_toc(self, ul, basedir=None):
        """Build a TOC tree from the parsed ``<ul>`` element ``ul``.

        Every direct ``<li>`` child contributes one item whose target comes
        from its ``Local`` param and whose title comes from its ``Name``
        param.  A nested ``<ul>`` is parsed recursively and appended as a
        child TOC.  ``basedir`` becomes the TOC ``base_path``; when omitted
        the current working directory at call time is used.
        """
        if basedir is None:
            # Resolved per call: a default of os.getcwdu() in the signature
            # would be frozen at import time.
            basedir = os.getcwdu()
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if '#' in href:
                # Split on the first '#' only so that an href containing
                # several '#' characters does not break the unpacking.
                href, frag = href.split('#', 1)
            else:
                frag = None
            name = self._deentity(
                li.object('param', {'name': 'Name'})[0]['value'])
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                # Nested levels share the base directory of their parent.
                child = self._parse_toc(li.ul, basedir)
                child.parent = toc
                toc.append(child)
        return toc

    def GetFile(self, path):
        """Return the data stored under ``path`` inside the CHM.

        ``path`` refers to the archive's internal structure; a leading '/'
        is added when missing before the object is resolved.

        Raises CHMError when the object cannot be resolved or when its
        reported size is zero.
        """
        if not path.startswith('/'):
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"
                           % (path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!" % (path,))
        return data

    def ExtractFiles(self, output_dir=None):
        """Write every entry of the CHM below ``output_dir``.

        Entries that cannot be read are logged and skipped.  Files whose
        guessed MIME type contains 'html' are afterwards rewritten in place
        with the result of ``_reformat``.  Finally, if ``hhc_path`` is not
        among the extracted top-level files, it is pointed at the first of
        them.  ``output_dir`` defaults to the current working directory at
        call time.
        """
        if output_dir is None:
            output_dir = os.getcwdu()
        html_files = set()
        # NOTE(review): entry names come from the archive and are joined to
        # output_dir unchecked; a name containing '..' could land outside
        # output_dir.  Consider validating before writing.
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
            try:
                data = self.GetFile(path)
            except Exception:
                self.log.exception(
                    'Failed to extract %s from CHM, ignoring' % path)
                continue
            if lpath.find(';') != -1:
                # Drop everything from the first ';' of the local path.
                lpath = lpath.split(';')[0]
            # NOTE(review): the decompiler mangled this try/except; the
            # handler placement below is reconstructed from the residue.
            try:
                with open(lpath, 'wb') as f:
                    f.write(data)
            except EnvironmentError:
                if iswindows and len(lpath) > 250:
                    self.log.warn('%r filename too long, skipping' % path)
                    continue
                raise
            # guess_type yields None for unknown types; test for that
            # explicitly instead of relying on a swallowed TypeError.
            mime = guess_mimetype(path)[0]
            if mime and 'html' in mime:
                html_files.add(lpath)

        for lpath in html_files:
            # NOTE(review): the open() call was lost in decompilation.  The
            # read / seek / truncate / write sequence implies a read-write
            # binary handle, hence 'r+b' -- confirm.
            with open(lpath, 'r+b') as f:
                data = f.read()
                data = self._reformat(data, lpath)
                f.seek(0)
                f.truncate()
                f.write(data)

        self._extracted = True
        # NOTE(review): the expression that built ``files`` was lost in
        # decompilation; reconstructed as the regular files located directly
        # in output_dir -- confirm.
        files = [x for x in os.listdir(output_dir)
                 if os.path.isfile(os.path.join(output_dir, x))]
        if self.hhc_path not in files and files:
            self.hhc_path = files[0]

    def _reformat(self, data, htmlpath):
        """Clean up one extracted HTML page and return it as a string.

        ``data`` is the raw page content and ``htmlpath`` the location it
        was written to (used to resolve image paths).  Steps, in order:
        remove ``<script>`` elements; remove a trailing table whose image
        ``alt`` text looks like a navigation bar; remove the first ``<br>``
        when nothing but whitespace precedes it; repair ``<img src>`` values
        that do not exist on disk; unwrap a body that consists of a single
        one-cell table.  If parsing raises ValueError the decoded data is
        returned without cleaning.
        """
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
        except ValueError:
            self.log.exception('Unable to parse html for cleaning, leaving it')
            return data

        for script in soup('script'):
            script.extract()

        # NOTE(review): only a check of the last table survived
        # decompilation, and its handler was lost; the condition is kept
        # exactly as recovered.
        tables = soup('table')
        if tables:
            last = tables[-1]
            if last.nextSibling is None or last.nextSibling.nextSibling is None:
                try:
                    alt = last.img['alt'].lower()
                    if ('prev' in alt and 'next' in alt) or 'team' in alt:
                        last.extract()
                except Exception:
                    # Best effort: a table without an image, or an image
                    # without alt text, is simply left in place.
                    pass

        br = soup('br')
        if br and check_all_prev_empty(br[0].previousSibling):
            br[0].extract()

        base = os.path.dirname(htmlpath)
        for img in soup('img', src=True):
            src = img['src']
            if os.path.exists(os.path.join(base, *src.split('/'))):
                continue
            # Retry with everything from the first ';' removed.
            src = src.split(';')[0]
            if not src:
                continue
            if not os.path.exists(os.path.join(base, *src.split('/'))):
                # Still missing: strip leading parent-directory hops.
                while src.startswith('../'):
                    src = src[3:]
            img['src'] = src

        try:
            # When the body holds exactly one table with one row and one
            # cell, replace that table by the cell's contents.
            tables = soup.body.findAll('table', recursive=False)
            if tables and len(tables) == 1:
                trs = tables[0].findAll('tr', recursive=False)
                if trs and len(trs) == 1:
                    tds = trs[0].findAll('td', recursive=False)
                    if tds and len(tds) == 1:
                        tdContents = tds[0].contents
                        tableIdx = soup.body.contents.index(tables[0])
                        tables[0].extract()
                        # Popping from the end and inserting at a fixed
                        # index keeps the original order.
                        while tdContents:
                            soup.body.insert(tableIdx, tdContents.pop())
        except Exception:
            # Best effort: e.g. a page without <body> is left as it is.
            pass

        return str(soup)

    def Contents(self):
        """Return the relative paths of all file entries in the CHM.

        Entries whose path ends in '/' are skipped and leading slashes are
        stripped.  The list is computed once and cached in ``_contents``.
        """
        if self._contents is not None:
            return self._contents
        paths = []

        def get_paths(chm, ui, ctx):
            # endswith() also copes with an empty path, unlike path[-1].
            if not ui.path.endswith('/'):
                paths.append(ui.path.lstrip('/'))

        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(path)
        # An empty dirname means "current directory"; makedirs('') would fail.
        if parent and not os.path.isdir(parent):
            os.makedirs(parent)

    def extract_content(self, output_dir=None):
        """Extract the whole archive into ``output_dir``.

        Delegates to ``ExtractFiles``, which falls back to the current
        working directory when ``output_dir`` is None.
        """
        self.ExtractFiles(output_dir=output_dir)
-
-
-