home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_1011 (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2010-08-06  |  16.1 KB  |  521 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.6)
  3.  
  4. from __future__ import with_statement
  5. __license__ = 'GPL v3'
  6. __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
  7. __docformat__ = 'restructuredtext en'
  8. import os
  9. import math
  10. import functools
  11. import collections
  12. import re
  13. import copy
  14. from lxml.etree import XPath as _XPath
  15. from lxml import etree
  16. from lxml.cssselect import CSSSelector
  17. from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, urldefrag, rewrite_links, urlunquote, barename, XHTML
  18. from calibre.ebooks.epub import rules
# XPath compiler pre-bound to the OEB namespace map, so expressions in this
# module can use the prefixes defined there (e.g. ``h:`` in '//h:body').
XPath = functools.partial(_XPath, namespaces = NAMESPACES)
# Name of the marker attribute FlowSplitter.commit() strips from every element
# before the split trees are added to the manifest.
SPLIT_POINT_ATTR = 'csp'
  21.  
  22. def tostring(root):
  23.     return etree.tostring(root, encoding = 'utf-8')
  24.  
  25.  
  26. class SplitError(ValueError):
  27.     
  28.     def __init__(self, path, root):
  29.         size = len(tostring(root)) / 1024
  30.         ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB') % (path, size))
  31.  
  32.  
  33.  
  34. class Split(object):
  35.     
  36.     def __init__(self, split_on_page_breaks = True, page_breaks_xpath = None, max_flow_size = 0):
  37.         self.split_on_page_breaks = split_on_page_breaks
  38.         self.page_breaks_xpath = page_breaks_xpath
  39.         self.max_flow_size = max_flow_size
  40.         self.page_break_selectors = None
  41.         if self.page_breaks_xpath is not None:
  42.             self.page_break_selectors = [
  43.                 (XPath(self.page_breaks_xpath), False)]
  44.         
  45.  
  46.     
  47.     def __call__(self, oeb, opts):
  48.         self.oeb = oeb
  49.         self.log = oeb.log
  50.         self.opts = opts
  51.         self.map = { }
  52.         for item in list(self.oeb.manifest.items):
  53.             if item.spine_position is not None and etree.iselement(item.data):
  54.                 self.split_item(item)
  55.                 continue
  56.         
  57.         self.fix_links()
  58.  
  59.     
  60.     def split_item(self, item):
  61.         page_breaks = []
  62.         page_break_ids = []
  63.         if self.split_on_page_breaks:
  64.             (page_breaks, page_break_ids) = self.find_page_breaks(item)
  65.         
  66.         splitter = FlowSplitter(item, page_breaks, page_break_ids, self.max_flow_size, self.oeb, self.opts)
  67.         if splitter.was_split:
  68.             am = splitter.anchor_map
  69.             self.map[item.href] = collections.defaultdict(am.default_factory, **am)
  70.         
  71.  
  72.     
  73.     def find_page_breaks(self, item):
  74.         if self.page_break_selectors is None:
  75.             self.page_break_selectors = set([])
  76.             stylesheets = _[1]
  77.             for rule in rules(stylesheets):
  78.                 before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
  79.                 after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
  80.                 
  81.                 try:
  82.                     if before and before != 'avoid':
  83.                         self.page_break_selectors.add((CSSSelector(rule.selectorText), True))
  84.                 except:
  85.                     []
  86.                     []
  87.  
  88.                 
  89.                 try:
  90.                     if after and after != 'avoid':
  91.                         self.page_break_selectors.add((CSSSelector(rule.selectorText), False))
  92.                 continue
  93.                 []
  94.                 continue
  95.  
  96.             
  97.         
  98.         page_breaks = set([])
  99.         for selector, before in self.page_break_selectors:
  100.             body = item.data.xpath('//h:body', namespaces = NAMESPACES)
  101.             if not body:
  102.                 continue
  103.             
  104.             for elem in selector(body[0]):
  105.                 if elem not in body:
  106.                     if before:
  107.                         elem.set('pb_before', '1')
  108.                     
  109.                     page_breaks.add(elem)
  110.                     continue
  111.             
  112.         
  113.         for i, elem in enumerate(item.data.iter()):
  114.             
  115.             try:
  116.                 elem.set('pb_order', str(i))
  117.             continue
  118.             except TypeError:
  119.                 continue
  120.                 continue
  121.             
  122.  
  123.         
  124.         page_breaks = list(page_breaks)
  125.         page_breaks.sort(cmp = (lambda x, y: cmp(int(x.get('pb_order')), int(y.get('pb_order')))))
  126.         page_break_ids = []
  127.         page_breaks_ = []
  128.         for i, x in enumerate(page_breaks):
  129.             x.set('id', x.get('id', 'calibre_pb_%d' % i))
  130.             id = x.get('id')
  131.             page_breaks_.append((XPath('//*[@id=%r]' % id), x.get('pb_before', False)))
  132.             page_break_ids.append(id)
  133.         
  134.         for elem in item.data.iter():
  135.             elem.attrib.pop('pb_order', False)
  136.             if elem.get('pb_before', False):
  137.                 elem.attrib.pop('pb_before')
  138.                 continue
  139.             None<EXCEPTION MATCH>TypeError
  140.         
  141.         return (page_breaks_, page_break_ids)
  142.  
  143.     
  144.     def fix_links(self):
  145.         for item in self.oeb.manifest:
  146.             if etree.iselement(item.data):
  147.                 self.current_item = item
  148.                 rewrite_links(item.data, self.rewrite_links)
  149.                 continue
  150.         
  151.  
  152.     
  153.     def rewrite_links(self, url):
  154.         (href, frag) = urldefrag(url)
  155.         href = self.current_item.abshref(href)
  156.         if href in self.map:
  157.             anchor_map = self.map[href]
  158.             nhref = None[anchor_map if frag else None]
  159.             nhref = self.current_item.relhref(nhref)
  160.             if frag:
  161.                 nhref = '#'.join((urlunquote(nhref), frag))
  162.             
  163.             return nhref
  164.         return url
  165.  
  166.  
  167.  
  168. class FlowSplitter(object):
  169.     
  170.     def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb, opts):
  171.         self.item = item
  172.         self.oeb = oeb
  173.         self.opts = opts
  174.         self.log = oeb.log
  175.         self.page_breaks = page_breaks
  176.         self.page_break_ids = page_break_ids
  177.         self.max_flow_size = max_flow_size
  178.         self.base = item.href
  179.         self.csp_counter = 0
  180.         (base, ext) = os.path.splitext(self.base)
  181.         self.base = base.replace('%', '%%') + '_split_%.3d' + ext
  182.         self.trees = [
  183.             self.item.data.getroottree()]
  184.         self.splitting_on_page_breaks = True
  185.         if self.page_breaks:
  186.             self.split_on_page_breaks(self.trees[0])
  187.         
  188.         self.splitting_on_page_breaks = False
  189.         if self.max_flow_size > 0:
  190.             lt_found = False
  191.             self.log('\tLooking for large trees in %s...' % item.href)
  192.             trees = list(self.trees)
  193.             self.tree_map = { }
  194.             for i, tree in enumerate(trees):
  195.                 size = len(tostring(tree.getroot()))
  196.                 if size > self.max_flow_size:
  197.                     self.log('\tFound large tree #%d' % i)
  198.                     lt_found = True
  199.                     self.split_trees = []
  200.                     self.split_to_size(tree)
  201.                     self.tree_map[tree] = self.split_trees
  202.                     continue
  203.             
  204.             if not lt_found:
  205.                 self.log('\tNo large trees found')
  206.             
  207.             self.trees = []
  208.             for x in trees:
  209.                 self.trees.extend(self.tree_map.get(x, [
  210.                     x]))
  211.             
  212.         
  213.         self.was_split = len(self.trees) > 1
  214.         if self.was_split:
  215.             self.log('\tSplit into %d parts' % len(self.trees))
  216.         
  217.         self.commit()
  218.  
  219.     
  220.     def split_on_page_breaks(self, orig_tree):
  221.         ordered_ids = []
  222.         for elem in orig_tree.xpath('//*[@id]'):
  223.             id = elem.get('id')
  224.             if id in self.page_break_ids:
  225.                 ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
  226.                 continue
  227.         
  228.         self.trees = []
  229.         tree = orig_tree
  230.         for pattern, before in ordered_ids:
  231.             elem = pattern(tree)
  232.             if elem:
  233.                 self.log.debug('\t\tSplitting on page-break')
  234.                 (before, after) = self.do_split(tree, elem[0], before)
  235.                 self.trees.append(before)
  236.                 tree = after
  237.                 continue
  238.         
  239.         self.trees.append(tree)
  240.         trees = []
  241.         ids = set([])
  242.         for tree in self.trees:
  243.             root = tree.getroot()
  244.             if self.is_page_empty(root):
  245.                 discarded_ids = root.xpath('//*[@id]')
  246.                 for x in discarded_ids:
  247.                     x = x.get('id')
  248.                     if not x.startswith('calibre_'):
  249.                         ids.add(x)
  250.                         continue
  251.                 
  252.             if ids:
  253.                 body = self.get_body(root)
  254.                 if body is not None:
  255.                     for x in ids:
  256.                         body.insert(0, body.makeelement(XHTML('div'), id = x, style = 'height:0pt'))
  257.                     
  258.                 
  259.             
  260.             ids = set([])
  261.             trees.append(tree)
  262.         
  263.         self.trees = trees
  264.  
  265.     
  266.     def get_body(self, root):
  267.         body = root.xpath('//h:body', namespaces = NAMESPACES)
  268.         if not body:
  269.             return None
  270.         return body[0]
  271.  
  272.     
  273.     def adjust_split_point(self, root, path):
  274.         sp = root.xpath(path)[0]
  275.         while True:
  276.             parent = sp.getparent()
  277.             if barename(parent.tag) in ('body', 'html'):
  278.                 break
  279.             
  280.             if parent.text and parent.text.strip():
  281.                 break
  282.             
  283.             if parent.index(sp) > 0:
  284.                 break
  285.             
  286.             sp = parent
  287.         npath = sp.getroottree().getpath(sp)
  288.         if self.opts.verbose > 3 and npath != path:
  289.             self.log.debug('\t\t\tMoved split point %s to %s' % (path, npath))
  290.         
  291.         return npath
  292.  
  293.     
  294.     def do_split(self, tree, split_point, before):
  295.         path = tree.getpath(split_point)
  296.         tree = copy.deepcopy(tree)
  297.         tree2 = copy.deepcopy(tree)
  298.         root = tree.getroot()
  299.         root2 = tree2.getroot()
  300.         (body, body2) = map(self.get_body, (root, root2))
  301.         path = self.adjust_split_point(root, path)
  302.         split_point = root.xpath(path)[0]
  303.         split_point2 = root2.xpath(path)[0]
  304.         
  305.         def nix_element(elem, top = True):
  306.             parent = elem.getparent()
  307.             index = parent.index(elem)
  308.             if top:
  309.                 parent.remove(elem)
  310.             else:
  311.                 index = parent.index(elem)
  312.                 parent[index:index + 1] = list(elem.iterchildren())
  313.  
  314.         hit_split_point = False
  315.         for elem in list(body.iterdescendants()):
  316.             if elem is split_point:
  317.                 hit_split_point = True
  318.                 if before:
  319.                     nix_element(elem)
  320.                     continue
  321.                 continue
  322.             
  323.             if hit_split_point:
  324.                 nix_element(elem)
  325.                 continue
  326.         
  327.         hit_split_point = False
  328.         for elem in list(body2.iterdescendants()):
  329.             if elem is split_point2:
  330.                 hit_split_point = True
  331.                 if not before:
  332.                     nix_element(elem, top = False)
  333.                     continue
  334.                 continue
  335.             
  336.             if not hit_split_point:
  337.                 nix_element(elem, top = False)
  338.                 continue
  339.         
  340.         body2.text = '\n'
  341.         return (tree, tree2)
  342.  
  343.     
  344.     def is_page_empty(self, root):
  345.         body = self.get_body(root)
  346.         if body is None:
  347.             return False
  348.         txt = re.sub('\\s+', '', etree.tostring(body, method = 'text', encoding = unicode))
  349.         if len(txt) > 4:
  350.             return False
  351.         for img in root.xpath('//h:img', namespaces = NAMESPACES):
  352.             if img.get('style', '') != 'display:none':
  353.                 return False
  354.         
  355.         return True
  356.  
  357.     
  358.     def split_text(self, text, root, size):
  359.         self.log.debug('\t\t\tSplitting text of length: %d' % len(text))
  360.         rest = text.replace('\r', '')
  361.         parts = re.split('\n\n', rest)
  362.         self.log.debug('\t\t\t\tFound %d parts' % len(parts))
  363.         if max(map(len, parts)) > size:
  364.             raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
  365.         max(map(len, parts)) > size
  366.         ans = []
  367.         buf = ''
  368.         for part in parts:
  369.             if len(buf) + len(part) < size:
  370.                 buf += '\n\n' + part
  371.                 continue
  372.             ans.append(buf)
  373.             buf = part
  374.         
  375.         return ans
  376.  
  377.     
  378.     def split_to_size(self, tree):
  379.         self.log.debug('\t\tSplitting...')
  380.         root = tree.getroot()
  381.         for pre in list(XPath('//h:pre')(root)):
  382.             text = u''.join(pre.xpath('descendant::text()'))
  383.             pre.text = text
  384.             for child in list(pre.iterchildren()):
  385.                 pre.remove(child)
  386.             
  387.             if len(pre.text) > self.max_flow_size * 0.5:
  388.                 self.log.debug('\t\tSplitting large <pre> tag')
  389.                 frags = self.split_text(pre.text, root, int(0.2 * self.max_flow_size))
  390.                 new_pres = []
  391.                 for frag in frags:
  392.                     pre2 = copy.copy(pre)
  393.                     pre2.text = frag
  394.                     pre2.tail = u''
  395.                     new_pres.append(pre2)
  396.                 
  397.                 new_pres[-1].tail = pre.tail
  398.                 p = pre.getparent()
  399.                 i = p.index(pre)
  400.                 p[i:i + 1] = new_pres
  401.                 continue
  402.         
  403.         (split_point, before) = self.find_split_point(root)
  404.         if split_point is None:
  405.             raise SplitError(self.item.href, root)
  406.         split_point is None
  407.         self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))
  408.         trees = self.do_split(tree, split_point, before)
  409.         sizes = [ len(tostring(t.getroot())) for t in trees ]
  410.         if min(sizes) < 5120:
  411.             self.log.debug('\t\t\tSplit tree too small')
  412.             self.split_to_size(tree)
  413.             return None
  414.         for t, size in zip(trees, sizes):
  415.             r = t.getroot()
  416.             if self.is_page_empty(r):
  417.                 continue
  418.                 continue
  419.             min(sizes) < 5120
  420.             if size <= self.max_flow_size:
  421.                 self.split_trees.append(t)
  422.                 self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)' % (len(self.split_trees), size / 1024))
  423.                 continue
  424.             []
  425.             self.log.debug('\t\t\tSplit tree still too large: %d KB' % size / 1024)
  426.             self.split_to_size(t)
  427.         
  428.  
  429.     
  430.     def find_split_point(self, root):
  431.         
  432.         def pick_elem(elems):
  433.             pass
  434.  
  435.         for path in ('//*[re:match(name(), "h[1-6]", "i")]', '/h:html/h:body/h:div', '//h:pre', '//h:hr', '//h:p', '//h:div', '//h:br', '//h:li'):
  436.             elems = root.xpath(path, namespaces = NAMESPACES)
  437.             elem = pick_elem(elems)
  438.             if elem is not None:
  439.                 
  440.                 try:
  441.                     XPath(elem.getroottree().getpath(elem))
  442.                 except:
  443.                     continue
  444.  
  445.                 return (elem, True)
  446.         
  447.         return (None, True)
  448.  
  449.     
  450.     def commit(self):
  451.         if not self.was_split:
  452.             return None
  453.         self.anchor_map = (collections.defaultdict,)((lambda : self.base % 0))
  454.         self.files = []
  455.         for i, tree in enumerate(self.trees):
  456.             root = tree.getroot()
  457.             self.files.append(self.base % i)
  458.             for elem in root.xpath('//*[@id or @name]'):
  459.                 for anchor in (elem.get('id', ''), elem.get('name', '')):
  460.                     if anchor != '' and anchor not in self.anchor_map:
  461.                         self.anchor_map[anchor] = self.files[-1]
  462.                         continue
  463.                     self.was_split
  464.                 
  465.             
  466.             for elem in root.xpath('//*[@%s]' % SPLIT_POINT_ATTR):
  467.                 elem.attrib.pop(SPLIT_POINT_ATTR, '0')
  468.             
  469.         
  470.         spine_pos = self.item.spine_position
  471.         for current, tree in zip(*map(reversed, (self.files, self.trees))):
  472.             for a in tree.getroot().xpath('//h:a[@href]', namespaces = NAMESPACES):
  473.                 href = a.get('href').strip()
  474.                 if href.startswith('#'):
  475.                     anchor = href[1:]
  476.                     file = self.anchor_map[anchor]
  477.                     file = self.item.relhref(file)
  478.                     if file != current:
  479.                         a.set('href', file + href)
  480.                     
  481.                 file != current
  482.             
  483.             new_id = self.oeb.manifest.generate(id = self.item.id)[0]
  484.             new_item = self.oeb.manifest.add(new_id, current, self.item.media_type, data = tree.getroot())
  485.             self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
  486.         
  487.         if self.oeb.guide:
  488.             for ref in self.oeb.guide.values():
  489.                 (href, frag) = urldefrag(ref.href)
  490.                 if href == self.item.href:
  491.                     nhref = None[self.anchor_map if frag else None]
  492.                     if frag:
  493.                         nhref = '#'.join((nhref, frag))
  494.                     
  495.                     ref.href = nhref
  496.                     continue
  497.             
  498.         
  499.         
  500.         def fix_toc_entry(toc):
  501.             if toc.href:
  502.                 (href, frag) = urldefrag(toc.href)
  503.                 if href == self.item.href:
  504.                     nhref = None[self.anchor_map if frag else None]
  505.                     if frag:
  506.                         nhref = '#'.join((nhref, frag))
  507.                     
  508.                     toc.href = nhref
  509.                 
  510.             
  511.             for x in toc:
  512.                 fix_toc_entry(x)
  513.             
  514.  
  515.         if self.oeb.toc:
  516.             fix_toc_entry(self.oeb.toc)
  517.         
  518.         self.oeb.manifest.remove(self.item)
  519.  
  520.  
  521.