home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_856 (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2010-10-31  |  8.3 KB  |  252 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.6)
  3.  
  4. from __future__ import with_statement
  5. __license__ = 'GPL v3'
  6. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, and Alex Bramley <a.bramley at gmail.com>.'
  7. import os
  8. import re
  9. from mimetypes import guess_type as guess_mimetype
  10. from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
  11. from calibre.constants import iswindows, filesystem_encoding
  12. from calibre.utils.chm.chm import CHMFile
  13. from calibre.utils.chm.chmlib import CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL, chm_enumerate
  14. from calibre.ebooks.metadata.toc import TOC
  15. from calibre.ebooks.chardet import xml_to_unicode
  16.  
  17. def match_string(s1, s2_already_lowered):
  18.     if s1 is not None and s2_already_lowered is not None:
  19.         if s1.lower() == s2_already_lowered:
  20.             return True
  21.     
  22.     return False
  23.  
  24.  
  25. def check_all_prev_empty(tag):
  26.     if tag is None:
  27.         return True
  28.     if tag.__class__ == NavigableString and not check_empty(tag):
  29.         return False
  30.     return check_all_prev_empty(tag.previousSibling)
  31.  
  32.  
  33. def check_empty(s, rex = re.compile('\\S')):
  34.     return rex.search(s) is None
  35.  
  36.  
  37. class CHMError(Exception):
  38.     pass
  39.  
  40.  
  41. class CHMReader(CHMFile):
  42.     
  43.     def __init__(self, input, log):
  44.         CHMFile.__init__(self)
  45.         if isinstance(input, unicode):
  46.             input = input.encode(filesystem_encoding)
  47.         
  48.         if not self.LoadCHM(input):
  49.             raise CHMError("Unable to open CHM file '%s'" % (input,))
  50.         self.LoadCHM(input)
  51.         self.log = log
  52.         self._sourcechm = input
  53.         self._contents = None
  54.         self._playorder = 0
  55.         self._metadata = False
  56.         self._extracted = False
  57.         (self.root, ext) = os.path.splitext(self.topics.lstrip('/'))
  58.         self.hhc_path = self.root + '.hhc'
  59.  
  60.     
  61.     def _parse_toc(self, ul, basedir = os.getcwdu()):
  62.         toc = TOC(play_order = self._playorder, base_path = basedir, text = '')
  63.         self._playorder += 1
  64.         for li in ul('li', recursive = False):
  65.             href = li.object('param', {
  66.                 'name': 'Local' })[0]['value']
  67.             if href.count('#'):
  68.                 (href, frag) = href.split('#')
  69.             else:
  70.                 frag = None
  71.             name = self._deentity(li.object('param', {
  72.                 'name': 'Name' })[0]['value'])
  73.             toc.add_item(href, frag, name, play_order = self._playorder)
  74.             self._playorder += 1
  75.             if li.ul:
  76.                 child = self._parse_toc(li.ul)
  77.                 child.parent = toc
  78.                 toc.append(child)
  79.                 continue
  80.             self
  81.         
  82.         return toc
  83.  
  84.     
  85.     def GetFile(self, path):
  86.         if path[0] != '/':
  87.             path = '/' + path
  88.         
  89.         (res, ui) = self.ResolveObject(path)
  90.         if res != CHM_RESOLVE_SUCCESS:
  91.             raise CHMError("Unable to locate '%s' within CHM file '%s'" % (path, self.filename))
  92.         res != CHM_RESOLVE_SUCCESS
  93.         (size, data) = self.RetrieveObject(ui)
  94.         if size == 0:
  95.             raise CHMError("'%s' is zero bytes in length!" % (path,))
  96.         size == 0
  97.         return data
  98.  
  99.     
  100.     def ExtractFiles(self, output_dir = os.getcwdu()):
  101.         html_files = set([])
  102.         for path in self.Contents():
  103.             lpath = os.path.join(output_dir, path)
  104.             self._ensure_dir(lpath)
  105.             
  106.             try:
  107.                 data = self.GetFile(path)
  108.             except:
  109.                 self.log.exception('Failed to extract %s from CHM, ignoring' % path)
  110.                 continue
  111.  
  112.             if lpath.find(';') != -1:
  113.                 lpath = lpath.split(';')[0]
  114.             
  115.             
  116.             try:
  117.                 
  118.                 try:
  119.                     f = _[1]
  120.                     f.write(data)
  121.                 finally:
  122.                     pass
  123.  
  124.                 
  125.                 try:
  126.                     if 'html' in guess_mimetype(path)[0]:
  127.                         html_files.add(lpath)
  128.                 except:
  129.                     open(lpath, 'wb').__exit__
  130.                     open(lpath, 'wb')
  131.  
  132.             continue
  133.             if iswindows and len(lpath) > 250:
  134.                 self.log.warn('%r filename too long, skipping' % path)
  135.                 continue
  136.             
  137.  
  138.             raise 
  139.         
  140.         for lpath in html_files:
  141.             
  142.             try:
  143.                 f = _[2]
  144.                 data = f.read()
  145.                 data = self._reformat(data, lpath)
  146.                 f.seek(0)
  147.                 f.truncate()
  148.                 f.write(data)
  149.             finally:
  150.                 pass
  151.  
  152.         
  153.         self._extracted = True
  154.         files = _[3]
  155.         if self.hhc_path not in files and files:
  156.             self.hhc_path = files[0]
  157.         
  158.  
  159.     
  160.     def _reformat(self, data, htmlpath):
  161.         
  162.         try:
  163.             data = xml_to_unicode(data, strip_encoding_pats = True)[0]
  164.             soup = BeautifulSoup(data)
  165.         except ValueError:
  166.             self.log.exception('Unable to parse html for cleaning, leaving it')
  167.             return data
  168.  
  169.         [ s.extract() for s in soup('script') ]
  170.         t = soup('table')
  171.         if t:
  172.             if t[-1].nextSibling is None or t[-1].nextSibling.nextSibling is None:
  173.                 
  174.                 try:
  175.                     alt = t[-1].img['alt'].lower()
  176.                     if alt.find('prev') != -1 and alt.find('next') != -1 or alt.find('team') != -1:
  177.                         t[-1].extract()
  178.  
  179.             
  180.         
  181.         br = soup('br')
  182.         if br:
  183.             if check_all_prev_empty(br[0].previousSibling):
  184.                 br[0].extract()
  185.             
  186.         
  187.         base = os.path.dirname(htmlpath)
  188.         for img in soup('img', src = True):
  189.             src = img['src']
  190.             ipath = os.path.join(base, *src.split('/'))
  191.             if os.path.exists(ipath):
  192.                 continue
  193.             
  194.             src = src.split(';')[0]
  195.             if not src:
  196.                 continue
  197.             
  198.             ipath = os.path.join(base, *src.split('/'))
  199.             if not os.path.exists(ipath):
  200.                 while src.startswith('../'):
  201.                     src = src[3:]
  202.             
  203.             img['src'] = src
  204.         
  205.         
  206.         try:
  207.             tables = soup.body.findAll('table', recursive = False)
  208.             if tables and len(tables) == 1:
  209.                 trs = tables[0].findAll('tr', recursive = False)
  210.                 if trs and len(trs) == 1:
  211.                     tds = trs[0].findAll('td', recursive = False)
  212.                     if tds and len(tds) == 1:
  213.                         tdContents = tds[0].contents
  214.                         tableIdx = soup.body.contents.index(tables[0])
  215.                         tables[0].extract()
  216.                         while tdContents:
  217.                             soup.body.insert(tableIdx, tdContents.pop())
  218.                     
  219.                 
  220.         except:
  221.             pass
  222.  
  223.         return str(soup)
  224.  
  225.     
  226.     def Contents(self):
  227.         if self._contents is not None:
  228.             return self._contents
  229.         paths = []
  230.         
  231.         def get_paths(chm, ui, ctx):
  232.             if ui.path[-1] != '/':
  233.                 paths.append(ui.path.lstrip('/'))
  234.             
  235.  
  236.         chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
  237.         self._contents = paths
  238.         return self._contents
  239.  
  240.     
  241.     def _ensure_dir(self, path):
  242.         dir = os.path.dirname(path)
  243.         if not os.path.isdir(dir):
  244.             os.makedirs(dir)
  245.         
  246.  
  247.     
  248.     def extract_content(self, output_dir = os.getcwdu()):
  249.         self.ExtractFiles(output_dir = output_dir)
  250.  
  251.  
  252.