home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_916 (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2010-08-06  |  62.9 KB  |  2,140 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.6)
  3.  
  4. __license__ = 'GPL v3'
  5. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
  6. import os
  7. import re
  8. import sys
  9. import copy
  10. import glob
  11. import tempfile
  12. from collections import deque
  13. from urllib import unquote
  14. from urlparse import urlparse
  15. from math import ceil, floor
  16. from functools import partial
  17.  
  18. try:
  19.     from PIL import Image as PILImage
  20.     PILImage
  21. except ImportError:
  22.     import Image as PILImage
  23.  
  24. from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString, Declaration, ProcessingInstruction
  25. from calibre.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, ImageBlock, JumpButton, CharButton, Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, LrsError, Sup, Sub, EmpLine
  26. from calibre.ebooks.lrf.pylrs.pylrs import Span
  27. from calibre.ebooks.lrf import Book
  28. from calibre.ebooks import ConversionError
  29. from calibre.ebooks.lrf.html.table import Table
  30. from calibre import filename_to_utf8, __appname__, fit_image, preferred_encoding, entity_to_unicode
  31. from calibre.ptempfile import PersistentTemporaryFile
  32. from calibre.devices.interface import DevicePlugin as Device
  33. from calibre.ebooks.lrf.html.color_map import lrs_color
  34. from calibre.ebooks.chardet import xml_to_unicode
  35.  
  36. def update_css(ncss, ocss):
  37.     for key in ncss.keys():
  38.         if ocss.has_key(key):
  39.             ocss[key].update(ncss[key])
  40.             continue
  41.         ocss[key] = ncss[key]
  42.     
  43.  
  44.  
  45. def munge_paths(basepath, url):
  46.     purl = urlparse(unquote(url))
  47.     path = purl[2]
  48.     fragment = purl[5]
  49.     if path:
  50.         path = path.replace('/', os.sep)
  51.     
  52.     if not path:
  53.         path = basepath
  54.     elif not os.path.isabs(path):
  55.         if isinstance(path, unicode):
  56.             path = path.encode(sys.getfilesystemencoding())
  57.         
  58.         dn = os.path.dirname(basepath)
  59.         if isinstance(dn, unicode):
  60.             dn = dn.encode(sys.getfilesystemencoding())
  61.         
  62.         path = os.path.join(dn, path)
  63.     
  64.     return (os.path.normpath(path), fragment)
  65.  
  66.  
  67. def strip_style_comments(match):
  68.     src = match.group()
  69.     while True:
  70.         lindex = src.find('/*')
  71.         if lindex < 0:
  72.             break
  73.         
  74.         rindex = src.find('*/', lindex)
  75.         if rindex < 0:
  76.             src = src[:lindex]
  77.             break
  78.         
  79.         src = src[:lindex] + src[rindex + 2:]
  80.     return src
  81.  
  82.  
  83. def tag_regex(tagname):
  84.     return dict(open = '(?:<\\s*%(t)s\\s+[^<>]*?>|<\\s*%(t)s\\s*>)' % dict(t = tagname), close = '</\\s*%(t)s\\s*>' % dict(t = tagname))
  85.  
  86.  
  87. class HTMLConverter(object):
  88.     SELECTOR_PAT = re.compile('([A-Za-z0-9\\-\\_\\:\\.]+[A-Za-z0-9\\-\\_\\:\\.\\s\\,]*)\\s*\\{([^\\}]*)\\}')
  89.     PAGE_BREAK_PAT = re.compile('page-break-(?:after|before)\\s*:\\s*(\\w+)', re.IGNORECASE)
  90.     IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
  91.     MARKUP_MASSAGE = [
  92.         (re.compile('<a(\\s[^>]*)?/>', re.IGNORECASE), (lambda match: '<a' + match.group(1) + '></a>')),
  93.         (re.compile('<\\s*style.*?>(.*?)<\\/\\s*style\\s*>', re.DOTALL | re.IGNORECASE), (lambda match: match.group().replace('<!--', '').replace('-->', ''))),
  94.         (re.compile('<\\s*a\\s+[^<>]*href\\s*=[^<>]*>(.*?)<\\s*/\\s*a\\s*>', re.DOTALL | re.IGNORECASE), (lambda match: re.compile('%(open)s|%(close)s' % tag_regex('p'), re.IGNORECASE).sub('', match.group()))),
  95.         (re.compile('<p>( |\\s)*</p>', re.IGNORECASE), (lambda m: '<br />')),
  96.         (re.compile('<h[0-5]?>( |\\s)*</h[0-5]?>', re.IGNORECASE), (lambda m: '<br />')),
  97.         (re.compile(u'&(\\S+?);'), partial(entity_to_unicode, exceptions = [
  98.             'lt',
  99.             'gt',
  100.             'amp'])),
  101.         (re.compile('(<style.*?</style>)', re.IGNORECASE | re.DOTALL), strip_style_comments),
  102.         (re.compile('(?i)<script[^<>]+?/>'), (lambda match: '')),
  103.         (re.compile('(?i)<\\s*div([^>]*)/\\s*>'), (lambda match: '<div%s></div>' % match.group(1)))]
  104.     BAEN = [
  105.         (re.compile('page-break-before:\\s*\\w+([\\s;\\}])', re.IGNORECASE), (lambda match: match.group(1))),
  106.         (re.compile('<p>\\s*(<a id.*?>\\s*</a>)\\s*</p>', re.IGNORECASE), (lambda match: match.group(1))),
  107.         (re.compile('<\\s*a\\s+id="p[0-9]+"\\s+name="p[0-9]+"\\s*>\\s*</a>', re.IGNORECASE), (lambda match: ''))]
  108.     PDFTOHTML = [
  109.         (re.compile('<hr.*?>', re.IGNORECASE), (lambda match: '<br />')),
  110.         (re.compile('\\d+<br>', re.IGNORECASE), (lambda match: '')),
  111.         (re.compile('<br.*?>\\s*<br.*?>', re.IGNORECASE), (lambda match: '<p>')),
  112.         (re.compile('(.*)<br.*?>', re.IGNORECASE), (lambda match: if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40:
  113. match.group()match.group(1))),
  114.         (re.compile('-\\n\\r?'), (lambda match: ''))]
  115.     BOOK_DESIGNER = [
  116.         (re.compile('<hr>', re.IGNORECASE), (lambda match: '<span style="page-break-after:always"> </span>')),
  117.         (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), (lambda match: None % ('<h1 id="BookTitle" align="%s">%s</h1>' if match.group(2) else 'center', match.group(3)))),
  118.         (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), (lambda match: None % ('<h2 id="BookAuthor" align="%s">%s</h2>' if match.group(2) else 'center', match.group(3)))),
  119.         (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE | re.DOTALL), (lambda match: '<h2 class="title">%s</h2>' % (match.group(1),))),
  120.         (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE | re.DOTALL), (lambda match: '<h3 class="subtitle">%s</h3>' % (match.group(1),))),
  121.         (re.compile('<div[^><]*?>( ){4}</div>', re.IGNORECASE), (lambda match: '<p></p>'))]
  122.     
  123.     def __hasattr__(self, attr):
  124.         if hasattr(self.options, attr):
  125.             return True
  126.         return object.__hasattr__(self, attr)
  127.  
  128.     
  129.     def __getattr__(self, attr):
  130.         if hasattr(self.options, attr):
  131.             return getattr(self.options, attr)
  132.         return object.__getattribute__(self, attr)
  133.  
  134.     
  135.     def __setattr__(self, attr, val):
  136.         if hasattr(self.options, attr):
  137.             setattr(self.options, attr, val)
  138.         else:
  139.             object.__setattr__(self, attr, val)
  140.  
  141.     CSS = {
  142.         'h1': {
  143.             'font-size': 'xx-large',
  144.             'font-weight': 'bold',
  145.             'text-indent': '0pt' },
  146.         'h2': {
  147.             'font-size': 'x-large',
  148.             'font-weight': 'bold',
  149.             'text-indent': '0pt' },
  150.         'h3': {
  151.             'font-size': 'large',
  152.             'font-weight': 'bold',
  153.             'text-indent': '0pt' },
  154.         'h4': {
  155.             'font-size': 'large',
  156.             'text-indent': '0pt' },
  157.         'h5': {
  158.             'font-weight': 'bold',
  159.             'text-indent': '0pt' },
  160.         'b': {
  161.             'font-weight': 'bold' },
  162.         'strong': {
  163.             'font-weight': 'bold' },
  164.         'i': {
  165.             'font-style': 'italic' },
  166.         'cite': {
  167.             'font-style': 'italic' },
  168.         'em': {
  169.             'font-style': 'italic' },
  170.         'small': {
  171.             'font-size': 'small' },
  172.         'pre': {
  173.             'font-family': 'monospace',
  174.             'white-space': 'pre' },
  175.         'code': {
  176.             'font-family': 'monospace' },
  177.         'tt': {
  178.             'font-family': 'monospace' },
  179.         'center': {
  180.             'text-align': 'center' },
  181.         'th': {
  182.             'font-size': 'large',
  183.             'font-weight': 'bold' },
  184.         'big': {
  185.             'font-size': 'large',
  186.             'font-weight': 'bold' },
  187.         '.libprs500_dropcaps': {
  188.             'font-size': 'xx-large' },
  189.         'u': {
  190.             'text-decoration': 'underline' },
  191.         'sup': {
  192.             'vertical-align': 'super',
  193.             'font-size': '60%' },
  194.         'sub': {
  195.             'vertical-align': 'sub',
  196.             'font-size': '60%' } }
  197.     
  198.     def __init__(self, book, fonts, options, logger, paths):
               # Set up the conversion state for `book`, parse the optional
               # override CSS and, in the lines that survived decompilation,
               # normalise `paths` and register the extra TOC entries.
               #
               # `options` is stored with object.__setattr__ because this class's
               # own __setattr__/__getattr__ consult self.options and would
               # otherwise recurse before it exists.
  199.         object.__setattr__(self, 'options', options)
  200.         self.log = logger
  201.         self.fonts = fonts
  202.         self.scaled_images = { }
  203.         self.rotated_images = { }
  204.         self.text_styles = []
  205.         self.block_styles = []
  206.         self.images = { }
  207.         self.targets = { }
  208.         self.links = deque()
  209.         self.processed_files = []
  210.         self.extra_toc_entries = []
  211.         self.image_memory = []
  212.         self.id_counter = 0
  213.         self.unused_target_blocks = []
  214.         self.link_level = 0
  215.         self.memory = []
  216.         self.tops = { }
  217.         self.previous_text = ''
  218.         self.stripped_space = ''
  219.         self.preserve_block_style = False
  220.         self.avoid_page_break = False
  221.         self.current_page = book.create_page()
  222.         self.blockquote_style = book.create_block_style(sidemargin = 60, topskip = 20, footskip = 20)
  223.         self.unindented_style = book.create_text_style(parindent = 0)
  224.         self.in_table = False
  225.         self.list_level = 0
  226.         self.list_indent = 20
  227.         self.list_counter = 1
  228.         self.book = book
  229.         self.override_css = { }
  230.         self.override_pcss = { }
               # _override_css is not assigned in this method; it is resolved by
               # __getattr__ (presumably from the options object -- confirm).
               # It is treated as a file to read when readable, else as CSS text.
  231.         if self._override_css is not None:
  232.             if os.access(self._override_css, os.R_OK):
                       # NOTE(review): the file object is never closed explicitly.
  233.                 src = open(self._override_css, 'rb').read()
  234.             else:
  235.                 src = self._override_css
                   # Only the first page-break rule is inspected; any value that
                   # does not start with "avoid" sets page_break_found.
  236.             match = self.PAGE_BREAK_PAT.search(src)
  237.             if match and not re.match('avoid', match.group(1), re.IGNORECASE):
  238.                 self.page_break_found = True
  239.             
  240.             (ncss, npcss) = self.parse_css(src)
  241.             if ncss:
  242.                 update_css(ncss, self.override_css)
  243.             
  244.             if npcss:
  245.                 update_css(npcss, self.override_pcss)
  246.             
  247.         
  248.         paths = [ os.path.abspath(path) for path in paths ]
               # NOTE(review): `_[2]` and `_[3]` below are decompiler placeholders
               # for list-comprehension temporaries, not real code.  The original
               # statements (including whatever processed `paths`) cannot be
               # recovered from this listing; restore them from the real source.
  249.         paths = [ _[2] if not isinstance(path, unicode) else path for path in paths ]
  250.         for link in self.links:
  251.             paths = _[3][link['path']]
  252.             _[3]
               # Register the (text, target block) pairs collected in extra_toc_entries.
  253.         for text, tb in self.extra_toc_entries:
  254.             self.book.addTocEntry(text, tb)
  255.         
  256.  
  257.     
  258.     def is_baen(self, soup):
  259.         return bool(soup.find('meta', attrs = {
  260.             'name': 'Publisher',
  261.             'content': re.compile('Baen', re.IGNORECASE) }))
  262.  
  263.     
  264.     def is_book_designer(self, raw):
  265.         return bool(re.search('<H2[^><]*id=BookTitle', raw))
  266.  
  267.     
  268.     def preprocess(self, raw):
               # Parse the HTML text `raw` with BeautifulSoup and return the soup.
               # The markup massage list is BeautifulSoup's own rules plus
               # MARKUP_MASSAGE, extended with BAEN / PDFTOHTML / BOOK_DESIGNER
               # when the corresponding flag is set.
  269.         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
  270.         nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
               # Auto-detect Book Designer output unless the flag is already set.
  271.         if not (self.book_designer) and self.is_book_designer(raw):
  272.             self.book_designer = True
  273.             self.log.info(_('\tBook Designer file detected.'))
  274.         
  275.         self.log.info(_('\tParsing HTML...'))
  276.         if self.baen:
  277.             nmassage.extend(HTMLConverter.BAEN)
  278.         
  279.         if self.pdftohtml:
  280.             nmassage.extend(HTMLConverter.PDFTOHTML)
  281.         
  282.         if self.book_designer:
  283.             nmassage.extend(HTMLConverter.BOOK_DESIGNER)
  284.         
  285.         
  286.         try:
  287.             soup = BeautifulSoup(raw, convertEntities = BeautifulSoup.XHTML_ENTITIES, markupMassage = nmassage)
  288.         except ConversionError:
                   # NOTE(review): `err = None` looks like a decompiler stand-in for
                   # binding the caught exception (`except ConversionError, err`).
                   # As written str(err) is 'None', so the retry branch can never
                   # run and the exception is always re-raised.
  289.             err = None
  290.             if 'Failed to coerce to unicode' in str(err):
                       # Retry after decoding as UTF-8, replacing undecodable bytes.
  291.                 raw = unicode(raw, 'utf8', 'replace')
  292.                 soup = BeautifulSoup(raw, convertEntities = BeautifulSoup.XHTML_ENTITIES, markupMassage = nmassage)
  293.             else:
  294.                 raise 
                   # NOTE(review): the next line is a no-op decompiler artifact.
  295.             'Failed to coerce to unicode' in str(err)
  296.  
               # Baen detection needs the parsed document; once detected, parse
               # again so the BAEN rules are applied.
  297.         if not (self.baen) and self.is_baen(soup):
  298.             self.baen = True
  299.             self.log.info(_('\tBaen file detected. Re-parsing...'))
  300.             return self.preprocess(raw)
               # For Book Designer files, title and author come from the elements
               # with id BookTitle / BookAuthor.
  301.         if self.book_designer:
  302.             t = soup.find(id = 'BookTitle')
  303.             if t:
  304.                 self.book.set_title(self.get_text(t))
  305.             
  306.             a = soup.find(id = 'BookAuthor')
  307.             if a:
  308.                 self.book.set_author(self.get_text(a))
  309.             
  310.         
               # In verbose mode, dump the preprocessed HTML to
               # <tempdir>/html2lrf-verbose.html.
  311.         if self.verbose:
  312.             tdir = tempfile.gettempdir()
  313.             if not os.path.exists(tdir):
  314.                 os.makedirs(tdir)
  315.             
  316.             
                   # NOTE(review): this `try` has no except/finally clause in the
                   # decompiled output, which is a syntax error; the original
                   # handler was lost and must be restored from the real source.
  317.             try:
  318.                 dump = open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb')
  319.                 dump.write(unicode(soup).encode('utf-8'))
  320.                 self.log.info(_('Written preprocessed HTML to ') + dump.name)
  321.                 dump.close()
  322.  
  323.         
  324.         return soup
  325.  
  326.     
  327.     def add_file(self, path):
               # Read the HTML file at `path`, preprocess it and convert it.  The
               # block returned by parse_file() is recorded in self.tops[path] and
               # `path` is appended to self.processed_files.
               #
               # NOTE(review): CSS.copy() is shallow, so the update() calls in the
               # loop below mutate the class-level HTMLConverter.CSS sub-dicts and
               # the overrides persist across files and converter instances.
  328.         self.css = HTMLConverter.CSS.copy()
  329.         self.pseudo_css = self.override_pcss.copy()
  330.         for selector in self.override_css:
  331.             if self.css.has_key(selector):
  332.                 self.css[selector].update(self.override_css[selector])
  333.                 continue
  334.             self.css[selector] = self.override_css[selector]
  335.         
               # NOTE(review): the next statement is decompiler-mangled.  As written
               # upath is None for unicode paths, and the following line then fails.
               # Presumably the original encoded unicode paths with the filesystem
               # encoding -- confirm against the real source.
  336.         upath = None if isinstance(path, unicode) else path
  337.         self.file_name = os.path.basename(upath.decode(sys.getfilesystemencoding()))
               # NOTE(review): also mangled ("None(...)" is not callable).  It looks
               # like a self.log.info(_('Processing %s') % ...) call whose argument
               # depended on self.verbose -- confirm against the real source.
  338.         None(self.log.info % _('Processing %s') if self.verbose else repr(self.file_name))
               # If the path does not exist, retry with '&' percent-encoded.
  339.         if not os.path.exists(upath):
  340.             upath = upath.replace('&', '%26')
  341.         
               # NOTE(review): f is not closed if read() or a decode below raises.
  342.         f = open(upath, 'rb')
  343.         raw = f.read()
               # Decoding: UTF-8 when self.pdftohtml is set, else self.encoding when
               # given, else whatever xml_to_unicode returns first.
  344.         if self.pdftohtml:
  345.             raw = raw.decode('utf-8', 'ignore')
  346.         elif self.encoding is not None:
  347.             raw = raw.decode(self.encoding, 'ignore')
  348.         else:
  349.             raw = xml_to_unicode(raw, self.verbose)[0]
  350.         f.close()
  351.         soup = self.preprocess(raw)
  352.         self.log.info(_('\tConverting to BBeB...'))
  353.         self.current_style = { }
  354.         self.page_break_found = False
               # From here on `path` is unicode; it is used as the key into
               # self.tops and as the entry in self.processed_files.
  355.         if not isinstance(path, unicode):
  356.             path = path.decode(sys.getfilesystemencoding())
  357.         
  358.         self.target_prefix = path
  359.         self.previous_text = '\n'
  360.         self.tops[path] = self.parse_file(soup)
  361.         self.processed_files.append(path)
  362.  
  363.     
  364.     def parse_css(self, style):
  365.         sdict = { }
  366.         pdict = { }
  367.         style = re.sub('/\\*.*?\\*/', '', style)
  368.         for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
  369.             for key in sel[0].split(','):
  370.                 val = self.parse_style_properties(sel[1])
  371.                 key = key.strip().lower()
  372.                 if '+' in key:
  373.                     continue
  374.                 
  375.                 if ':' in key:
  376.                     (key, sep, pseudo) = key.partition(':')
  377.                     if key in pdict:
  378.                         if pseudo in pdict[key]:
  379.                             pdict[key][pseudo].update(val)
  380.                         else:
  381.                             pdict[key][pseudo] = val
  382.                     else:
  383.                         pdict[key] = {
  384.                             pseudo: val }
  385.                 key in pdict
  386.                 if key in sdict:
  387.                     sdict[key].update(val)
  388.                     continue
  389.                 sdict[key] = val
  390.             
  391.         
  392.         return (sdict, pdict)
  393.  
  394.     
  395.     def parse_style_properties(self, props):
  396.         prop = dict()
  397.         for s in props.split(';'):
  398.             l = s.split(':', 1)
  399.             if len(l) == 2:
  400.                 key = l[0].strip().lower()
  401.                 val = l[1].strip()
  402.                 prop[key] = val
  403.                 continue
  404.         
  405.         return prop
  406.  
  407.     
  408.     def tag_css(self, tag, parent_css = { }):
  409.         
  410.         def merge_parent_css(prop, pcss):
  411.             inherited = [
  412.                 'text-align',
  413.                 'float',
  414.                 'white-space',
  415.                 'color',
  416.                 'line-height',
  417.                 'vertical-align']
  418.             temp = { }
  419.             for key in pcss.keys():
  420.                 chk = key.lower()
  421.                 if chk.startswith('font') or chk in inherited:
  422.                     temp[key] = pcss[key]
  423.                     continue
  424.             
  425.             prop.update(temp)
  426.  
  427.         prop = { }
  428.         pprop = { }
  429.         tagname = tag.name.lower()
  430.         if parent_css:
  431.             merge_parent_css(prop, parent_css)
  432.         
  433.         if tag.has_key('align'):
  434.             al = tag['align'].lower()
  435.             if al in ('left', 'right', 'center', 'justify'):
  436.                 prop['text-align'] = al
  437.             
  438.         
  439.         if self.css.has_key(tagname):
  440.             prop.update(self.css[tagname])
  441.         
  442.         if self.pseudo_css.has_key(tagname):
  443.             pprop.update(self.pseudo_css[tagname])
  444.         
  445.         if tag.has_key('class'):
  446.             cls = tag['class'].lower()
  447.             for cls in cls.split():
  448.                 for classname in [
  449.                     '.' + cls,
  450.                     tagname + '.' + cls]:
  451.                     if self.css.has_key(classname):
  452.                         prop.update(self.css[classname])
  453.                     
  454.                     if self.pseudo_css.has_key(classname):
  455.                         pprop.update(self.pseudo_css[classname])
  456.                         continue
  457.                 
  458.             
  459.         
  460.         if tag.has_key('id') and self.css.has_key(tag['id']):
  461.             prop.update(self.css[tag['id']])
  462.         
  463.         if tag.has_key('style'):
  464.             prop.update(self.parse_style_properties(tag['style']))
  465.         
  466.         return (prop, pprop)
  467.  
  468.     
  469.     def parse_file(self, soup):
  470.         
  471.         def get_valid_block(page):
  472.             for item in page.contents:
  473.                 if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
  474.                     if isinstance(item, TextBlock) and not (item.contents):
  475.                         continue
  476.                     
  477.                     return item
  478.             
  479.  
  480.         if not self.current_page:
  481.             self.current_page = self.book.create_page()
  482.         
  483.         self.current_block = self.book.create_text_block()
  484.         self.current_para = Paragraph()
  485.         if self.cover:
  486.             self.add_image_page(self.cover)
  487.             self.cover = None
  488.         
  489.         top = self.current_block
  490.         self.current_block.must_append = True
  491.         self.soup = soup
  492.         self.process_children(soup, { }, { })
  493.         self.soup = None
  494.         if self.current_para and self.current_block:
  495.             self.current_para.append_to(self.current_block)
  496.         
  497.         if self.current_block and self.current_page:
  498.             self.current_block.append_to(self.current_page)
  499.         
  500.         if self.avoid_page_break:
  501.             self.avoid_page_break = False
  502.         elif self.current_page and self.current_page.has_text():
  503.             self.book.append(self.current_page)
  504.             self.current_page = None
  505.         
  506.         if top not in top.parent.contents:
  507.             top = top.parent.contents[0]
  508.         
  509.         if not top.has_text() and top.parent.contents.index(top) == len(top.parent.contents) - 1:
  510.             opage = top.parent
  511.             top.parent.contents.remove(top)
  512.             if self.book.last_page() is opage:
  513.                 if self.current_page and self.current_page.has_text():
  514.                     for c in self.current_page.contents:
  515.                         if isinstance(c, (TextBlock, ImageBlock)):
  516.                             return c
  517.                     
  518.                 
  519.                 raise ConversionError(_('Could not parse file: %s') % self.file_name)
  520.             self.book.last_page() is opage
  521.             
  522.             try:
  523.                 index = self.book.pages().index(opage)
  524.             except ValueError:
  525.                 self.log.warning(_('%s is an empty file') % self.file_name)
  526.                 tb = self.book.create_text_block()
  527.                 self.current_page.append(tb)
  528.                 return tb
  529.  
  530.             for page in list(self.book.pages()[index + 1:]):
  531.                 for c in page.contents:
  532.                     if isinstance(c, (TextBlock, ImageBlock, Canvas)):
  533.                         return c
  534.                 
  535.             
  536.             raise ConversionError(_('Could not parse file: %s') % self.file_name)
  537.         top.parent.contents.index(top) == len(top.parent.contents) - 1
  538.         return top
  539.  
  540.     
  541.     def create_link(self, children, tag):
               # Build the link record (a dict with the keys 'para', 'text', 'path',
               # 'fragment' and 'in toc') for the anchor `tag`.
               # Raises ConversionError when `children` holds no Span or EmpLine.
  542.         para = None
               # Scan backwards so the last Span/EmpLine in `children` is chosen.
  543.         for i in range(len(children) - 1, -1, -1):
  544.             if isinstance(children[i], (Span, EmpLine)):
  545.                 para = children[i]
  546.                 break
  547.                 continue
  548.         
  549.         if para is None:
  550.             raise ConversionError(_('Failed to parse link %s %s') % (tag, children))
               # NOTE(review): the next line is a no-op decompiler artifact.
  551.         para is None
               # Link text: the tag's text (limit 1000), else 'Link', else the alt
               # text of the first <img> inside the tag when it has one.
  552.         text = self.get_text(tag, 1000)
  553.         if not text:
  554.             text = 'Link'
  555.             img = tag.find('img')
  556.             if img:
  557.                 
  558.                 try:
  559.                     text = img['alt']
  560.                 except KeyError:
  561.                     pass
                       # NOTE(review): the bare except and both
                       # "None<EXCEPTION MATCH>KeyError" lines are decompiler
                       # residue of the KeyError handler above, not real code.
  562.                 except:
  563.                     None<EXCEPTION MATCH>KeyError
  564.                 
  565.  
  566.             None<EXCEPTION MATCH>KeyError
  567.         
  568.         (path, fragment) = munge_paths(self.target_prefix, tag['href'])
               # NOTE(review): this test has no effect as written.  The decompiler
               # probably folded it out of the 'in toc' expression below -- confirm
               # against the real source.
  569.         if self.link_level == 0 and not (self.use_spine):
  570.             pass
  571.         return {
  572.             'para': para,
  573.             'text': text,
  574.             'path': os.path.abspath(path),
  575.             'fragment': fragment,
  576.             'in toc': not (self.options.no_links_in_toc) }
  577.  
  578.     
  579.     def get_text(self, tag, limit = None):
  580.         css = self.tag_css(tag)[0]
  581.         if (css.has_key('display') or css['display'].lower() == 'none' or css.has_key('visibility')) and css['visibility'].lower() == 'hidden':
  582.             return ''
  583.         (text, alt_text) = (u'', u'')
  584.         for c in tag.contents:
  585.             if limit != None and len(text) > limit:
  586.                 break
  587.             
  588.             if isinstance(c, HTMLConverter.IGNORED_TAGS):
  589.                 continue
  590.             
  591.             if isinstance(c, NavigableString):
  592.                 text += unicode(c)
  593.                 continue
  594.             if isinstance(c, Tag):
  595.                 if c.name.lower() == 'img' and c.has_key('alt'):
  596.                     alt_text += c['alt']
  597.                     continue
  598.                 
  599.                 text += self.get_text(c)
  600.                 continue
  601.         
  602.         if text.strip():
  603.             return text
  604.         return alt_text
  605.  
  606.     
  607.     def process_links(self):
               # Drain self.links.  Links whose path is in self.processed_files are
               # turned into JumpButton/CharButton pairs; the deque `outside_links`
               # is returned.
  608.         
  609.         def add_toc_entry(text, target):
                   # NOTE(review): the parameters `text` and `target` are used only
                   # for the parent check; the TOC call and the log message read
                   # `ascii_text` and `tb` from the enclosing loop instead.
  610.             if target.parent != None and hasattr(target.parent, 'objId'):
  611.                 self.book.addTocEntry(ascii_text, tb)
  612.             else:
  613.                 self.log.debug(_('Cannot add link %s to TOC') % ascii_text)
  614.  
  615.         
  616.         def get_target_block(fragment, targets):
                   # Return the block registered for `fragment`.  A BlockSpace
                   # placeholder is swapped for a real block from its page: the
                   # first TextBlock/RuledLine/ImageBlock after it, else the last
                   # such block on the page, else a new text block holding a
                   # single-space paragraph.
  617.             bs = targets[fragment]
  618.             if not isinstance(bs, BlockSpace):
  619.                 return bs
  620.             ans = None
  621.             found = False
  622.             page = bs.parent
  623.             for item in page.contents:
  624.                 if found:
  625.                     if isinstance(item, (TextBlock, RuledLine, ImageBlock)):
  626.                         ans = item
  627.                         break
  628.                     
  629.                 
  630.                 if item == bs:
  631.                     found = True
  632.                     continue
  633.                     continue
  634.             
  635.             if not ans:
  636.                 for i in range(len(page.contents) - 1, -1, -1):
  637.                     if isinstance(page.contents[i], (TextBlock, RuledLine, ImageBlock)):
  638.                         ans = page.contents[i]
  639.                         break
  640.                         continue
  641.                 
  642.             
  643.             if not ans:
  644.                 ntb = self.book.create_text_block()
  645.                 ntb.Paragraph(' ')
  646.                 page.append(ntb)
  647.                 ans = ntb
  648.             
                   # Remember the replacement and remove the placeholder from the page.
  649.             if found:
  650.                 targets[fragment] = ans
  651.                 page.contents.remove(bs)
  652.             
  653.             return ans
  654.  
  655.         outside_links = deque()
  656.         while len(self.links) > 0:
  657.             link = self.links.popleft()
  658.             (para, text, path, fragment) = (link['para'], link['text'], link['path'], link['fragment'])
  659.             ascii_text = text
  660.             if not isinstance(path, unicode):
  661.                 path = path.decode(sys.getfilesystemencoding())
  662.             
  663.             if path in self.processed_files:
                       # Targets are keyed by path + fragment; without a matching
                       # target the link points at the top block of the file.
  664.                 if path + fragment in self.targets.keys():
  665.                     tb = get_target_block(path + fragment, self.targets)
  666.                 else:
  667.                     tb = self.tops[path]
  668.                 if link['in toc']:
  669.                     add_toc_entry(ascii_text, tb)
  670.                 
                       # Replace the paragraph contents with a button jumping to tb.
  671.                 jb = JumpButton(tb)
  672.                 self.book.append(jb)
  673.                 cb = CharButton(jb, text = text)
  674.                 para.contents = []
  675.                 para.append(cb)
  676.                 
  677.                 try:
  678.                     self.unused_target_blocks.remove(tb)
  679.                 except ValueError:
  680.                     pass
                       # NOTE(review): the bare except and both
                       # "None<EXCEPTION MATCH>ValueError" lines are decompiler
                       # residue of the ValueError handler above, not real code.
  681.                 except:
  682.                     None<EXCEPTION MATCH>ValueError
  683.                 
  684.  
  685.             None<EXCEPTION MATCH>ValueError
                   # NOTE(review): as decompiled this append runs for every link.
                   # It was presumably the else-branch of the processed_files test
                   # (links to files not yet processed) -- confirm against the
                   # real source.
  686.             outside_links.append(link)
  687.         return outside_links
  688.  
  689.     
  690.     def create_toc(self, toc):
  691.         for item in toc.top_level_items():
  692.             ascii_text = item.text
  693.             if not (item.fragment) and item.abspath in self.tops:
  694.                 self.book.addTocEntry(ascii_text, self.tops[item.abspath])
  695.                 continue
  696.             if item.abspath:
  697.                 url = None + item.abspath if item.fragment else ''
  698.                 if url in self.targets:
  699.                     self.book.addTocEntry(ascii_text, self.targets[url])
  700.                 
  701.             url in self.targets
  702.         
  703.  
  704.     
  705.     def end_page(self):
  706.         if self.current_para.has_text():
  707.             self.current_para.append_to(self.current_block)
  708.             self.current_para = Paragraph()
  709.         
  710.         if self.current_block.has_text() or self.current_block.must_append:
  711.             self.current_block.append_to(self.current_page)
  712.             self.current_block = self.book.create_text_block()
  713.         
  714.         if self.current_page.has_text():
  715.             self.book.append(self.current_page)
  716.             self.current_page = self.book.create_page()
  717.         
  718.  
  719.     
  720.     def add_image_page(self, path):
  721.         if os.access(path, os.R_OK):
  722.             self.end_page()
  723.             pwidth = self.profile.screen_width
  724.             pheight = self.profile.screen_height - self.profile.fudge
  725.             page = self.book.create_page(evensidemargin = 0, oddsidemargin = 0, topmargin = 0, textwidth = pwidth, headheight = 0, headsep = 0, footspace = 0, footheight = 0, textheight = pheight)
  726.             if not self.images.has_key(path):
  727.                 self.images[path] = ImageStream(path)
  728.             
  729.             im = PILImage.open(path)
  730.             (width, height) = im.size
  731.             canvas = Canvas(pwidth, pheight)
  732.             ib = ImageBlock(self.images[path], x1 = width, y1 = height, xsize = width, ysize = height, blockwidth = width, blockheight = height)
  733.             canvas.put_object(ib, int((pwidth - width) / 2), int((pheight - height) / 2))
  734.             page.append(canvas)
  735.             self.book.append(page)
  736.         
  737.  
  738.     
  739.     def process_children(self, ptag, pcss, ppcss = { }):
  740.         for c in copy.copy(ptag.contents):
  741.             if isinstance(c, HTMLConverter.IGNORED_TAGS):
  742.                 continue
  743.                 continue
  744.             if isinstance(c, Tag):
  745.                 self.parse_tag(c, pcss)
  746.                 continue
  747.             if isinstance(c, NavigableString):
  748.                 self.add_text(c, pcss, ppcss)
  749.                 continue
  750.         
  751.         if not self.in_table:
  752.             
  753.             try:
  754.                 if self.minimize_memory_usage:
  755.                     ptag.extract()
  756.             except AttributeError:
  757.                 print ptag, type(ptag)
  758.             except:
  759.                 None<EXCEPTION MATCH>AttributeError
  760.             
  761.  
  762.         None<EXCEPTION MATCH>AttributeError
  763.  
  764.     
  765.     def get_alignment(self, css):
  766.         val = None if css.has_key('text-align') else None
  767.         align = 'head'
  768.         if val is not None:
  769.             if val in ('right', 'foot'):
  770.                 align = 'foot'
  771.             elif val == 'center':
  772.                 align = 'center'
  773.             
  774.         
  775.         if css.has_key('float'):
  776.             val = css['float'].lower()
  777.             if val == 'left':
  778.                 align = 'head'
  779.             
  780.             if val == 'right':
  781.                 align = 'foot'
  782.             
  783.             css.pop('float')
  784.         
  785.         return align
  786.  
  787.     
  788.     def process_alignment(self, css):
  789.         align = self.get_alignment(css)
  790.         if align != self.current_block.textStyle.attrs['align']:
  791.             self.current_para.append_to(self.current_block)
  792.             self.current_block.append_to(self.current_page)
  793.             ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
  794.             ts.attrs['align'] = align
  795.             
  796.             try:
  797.                 index = self.text_styles.index(ts)
  798.                 ts = self.text_styles[index]
  799.             except ValueError:
  800.                 self.text_styles.append(ts)
  801.  
  802.             self.current_block = self.book.create_text_block(blockStyle = self.current_block.blockStyle, textStyle = ts)
  803.             self.current_para = Paragraph()
  804.             return True
  805.         return False
  806.  
  807.     
  808.     def add_text(self, tag, css, pseudo_css, force_span_use = False):
  809.         src = None if hasattr(tag, 'string') else tag
  810.         if len(src) > 32760:
  811.             pos = 0
  812.             while pos < len(src):
  813.                 self.add_text(src[pos:pos + 32760], css, pseudo_css, force_span_use)
  814.                 pos += 32760
  815.             return None
  816.         src = src.replace('\r\n', '\n').replace('\r', '\n')
  817.         if not not css.has_key('white-space'):
  818.             pass
  819.         collapse_whitespace = css['white-space'] != 'pre'
  820.         if self.process_alignment(css) and collapse_whitespace:
  821.             src = src.lstrip()
  822.         
  823.         
  824.         def append_text(src):
  825.             (fp, key, variant) = self.font_properties(css)
  826.             for x, y in [
  827.                 (u'┬¡', ''),
  828.                 (u'┬á', ' '),
  829.                 (u'∩¼Ç', 'ff'),
  830.                 (u'∩¼ü', 'fi'),
  831.                 (u'∩¼é', 'fl'),
  832.                 (u'∩¼â', 'ffi'),
  833.                 (u'∩¼ä', 'ffl')]:
  834.                 src = src.replace(x, y)
  835.             
  836.             
  837.             valigner = lambda x: x
  838.             if 'vertical-align' in css:
  839.                 valign = css['vertical-align']
  840.                 if valign in ('sup', 'super', 'sub'):
  841.                     fp['fontsize'] = int(int(fp['fontsize']) * 5 / 3)
  842.                     valigner = None if valign == 'sub' else Sup
  843.                 
  844.             
  845.             normal_font_size = int(fp['fontsize'])
  846.             if variant == 'small-caps':
  847.                 dump = Span(fontsize = normal_font_size - 30)
  848.                 temp = []
  849.                 for c in src:
  850.                     if c.isupper():
  851.                         if temp:
  852.                             dump.append(valigner(''.join(temp)))
  853.                             temp = []
  854.                         
  855.                         dump.append(Span(valigner(c), fontsize = normal_font_size))
  856.                         continue
  857.                     temp.append(c.upper())
  858.                 
  859.                 src = dump
  860.                 if temp:
  861.                     src.append(valigner(''.join(temp)))
  862.                 
  863.             else:
  864.                 src = valigner(src)
  865.             if key in ('italic', 'bi'):
  866.                 already_italic = False
  867.                 for fonts in self.fonts.values():
  868.                     it = None if fonts.has_key('italic') else ''
  869.                     bi = None if fonts.has_key('bi') else ''
  870.                     if fp['fontfacename'] in (it, bi):
  871.                         already_italic = True
  872.                         break
  873.                         continue
  874.                 
  875.                 if not already_italic:
  876.                     src = Italic(src)
  877.                 
  878.             
  879.             unneeded = []
  880.             for prop in fp:
  881.                 if fp[prop] == self.current_block.textStyle.attrs[prop]:
  882.                     unneeded.append(prop)
  883.                     continue
  884.             
  885.             for prop in unneeded:
  886.                 fp.pop(prop)
  887.             
  888.             attrs = { }
  889.             if 'color' in css and not (self.ignore_colors):
  890.                 attrs['textcolor'] = lrs_color(css['color'])
  891.             
  892.             attrs.update(fp)
  893.             elem = None if attrs or force_span_use else src
  894.             if css.has_key('text-decoration'):
  895.                 dec = css['text-decoration'].lower()
  896.                 if dec == 'underline':
  897.                     pass
  898.                 elif dec == 'overline':
  899.                     pass
  900.                 
  901.                 linepos = None
  902.                 if linepos is not None:
  903.                     elem = EmpLine(elem, emplineposition = linepos)
  904.                 
  905.             
  906.             self.current_para.append(elem)
  907.  
  908.         if collapse_whitespace:
  909.             src = re.sub('\\s{1,}', ' ', src)
  910.             if self.stripped_space and len(src) == len(src.lstrip(u' \n\r\t')):
  911.                 src = self.stripped_space + src
  912.             
  913.             src = src.rstrip(u' \n\r\t')
  914.             orig = src
  915.             self.stripped_space = orig[len(src):]
  916.             if len(self.previous_text) != len(self.previous_text.rstrip(u' \n\r\t')):
  917.                 src = src.lstrip(u' \n\r\t')
  918.             
  919.             if len(src):
  920.                 self.previous_text = src
  921.                 append_text(src)
  922.             
  923.         else:
  924.             srcs = src.split('\n')
  925.             for src in srcs[:-1]:
  926.                 append_text(src)
  927.                 self.line_break()
  928.             
  929.             last = srcs[-1]
  930.             if len(last):
  931.                 append_text(last)
  932.             
  933.  
  934.     
  935.     def line_break(self):
  936.         self.current_para.append(CR())
  937.         self.previous_text = '\n'
  938.  
  939.     
  940.     def end_current_para(self):
  941.         if self.current_para.contents:
  942.             self.current_block.append(self.current_para)
  943.         
  944.         self.current_block.append(CR())
  945.         self.current_para = Paragraph()
  946.  
  947.     
  948.     def end_current_block(self):
  949.         if self.current_para.contents:
  950.             self.current_block.append(self.current_para)
  951.             self.current_para = Paragraph()
  952.         
  953.         if self.current_block.contents or self.current_block.must_append:
  954.             self.current_page.append(self.current_block)
  955.             self.current_block = self.book.create_text_block(textStyle = self.current_block.textStyle, blockStyle = self.current_block.blockStyle)
  956.         
  957.  
  958.     
  959.     def process_image(self, path, tag_css, width = None, height = None, dropcaps = False, rescale = False):
  960.         
  961.         def detect_encoding(im):
  962.             fmt = im.format
  963.             if fmt == 'JPG':
  964.                 fmt = 'JPEG'
  965.             
  966.             return fmt
  967.  
  968.         original_path = path
  969.         if self.rotated_images.has_key(path):
  970.             path = self.rotated_images[path].name
  971.         
  972.         if self.scaled_images.has_key(path):
  973.             path = self.scaled_images[path].name
  974.         
  975.         
  976.         try:
  977.             im = PILImage.open(path)
  978.         except IOError:
  979.             err = None
  980.             self.log.warning('Unable to process image: %s\n%s' % (original_path, err))
  981.             return None
  982.  
  983.         encoding = detect_encoding(im)
  984.         
  985.         def scale_image(width, height):
  986.             if width <= 0:
  987.                 width = 1
  988.             
  989.             if height <= 0:
  990.                 height = 1
  991.             
  992.             pt = PersistentTemporaryFile(suffix = '_html2lrf_scaled_image_.' + encoding.lower())
  993.             self.image_memory.append(pt)
  994.             
  995.             try:
  996.                 im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(pt, encoding)
  997.                 pt.close()
  998.                 self.scaled_images[path] = pt
  999.                 return pt.name
  1000.             except (IOError, SystemError):
  1001.                 err = None
  1002.                 self.log.warning(_('Unable to process image %s. Error: %s') % (path, err))
  1003.  
  1004.  
  1005.         if width == None or height == None:
  1006.             (width, height) = im.size
  1007.         elif rescale:
  1008.             if width < im.size[0] or height < im.size[1]:
  1009.                 path = scale_image(width, height)
  1010.                 if not path:
  1011.                     return None
  1012.             
  1013.         factor = 720 / self.profile.dpi
  1014.         pheight = int(self.current_page.pageStyle.attrs['textheight'])
  1015.         pwidth = int(self.current_page.pageStyle.attrs['textwidth'])
  1016.         if dropcaps:
  1017.             scale = False
  1018.             if width > 0.75 * pwidth:
  1019.                 width = int(0.75 * pwidth)
  1020.                 scale = True
  1021.             
  1022.             if height > 0.75 * pheight:
  1023.                 height = int(0.75 * pheight)
  1024.                 scale = True
  1025.             
  1026.             if scale:
  1027.                 path = scale_image(width, height)
  1028.             
  1029.             if not self.images.has_key(path):
  1030.                 self.images[path] = ImageStream(path)
  1031.             
  1032.             im = Image(self.images[path], x0 = 0, y0 = 0, x1 = width, y1 = height, xsize = width, ysize = height)
  1033.             line_height = (int(self.current_block.textStyle.attrs['baselineskip']) + int(self.current_block.textStyle.attrs['linespace'])) // 10
  1034.             line_height *= self.profile.dpi / 72
  1035.             lines = int(ceil(float(height) / line_height))
  1036.             dc = DropCaps(lines)
  1037.             dc.append(Plot(im, xsize = ceil(width * factor), ysize = ceil(height * factor)))
  1038.             self.current_para.append(dc)
  1039.             return None
  1040.         if self.autorotation and width > pwidth and width > height:
  1041.             pt = PersistentTemporaryFile(suffix = '_html2lrf_rotated_image_.' + encoding.lower())
  1042.             
  1043.             try:
  1044.                 im = im.rotate(90)
  1045.                 im.save(pt, encoding)
  1046.                 path = pt.name
  1047.                 self.rotated_images[path] = pt
  1048.                 (width, height) = im.size
  1049.             except IOError:
  1050.                 dropcaps
  1051.                 dropcaps
  1052.                 self.log.debug(_('Unable to process interlaced PNG %s') % original_path)
  1053.             except:
  1054.                 dropcaps
  1055.             finally:
  1056.                 pt.close()
  1057.  
  1058.         
  1059.         (scaled, width, height) = fit_image(width, height, pwidth, pheight)
  1060.         if scaled:
  1061.             path = scale_image(width, height)
  1062.         
  1063.         if not path:
  1064.             return None
  1065.         if not self.images.has_key(path):
  1066.             
  1067.             try:
  1068.                 self.images[path] = ImageStream(path, encoding = encoding)
  1069.             except LrsError:
  1070.                 path
  1071.                 err = path
  1072.                 self.log.warning(_('Could not process image: %s\n%s') % (original_path, err))
  1073.                 return None
  1074.             
  1075.  
  1076.         path<EXCEPTION MATCH>LrsError
  1077.         im = Image(self.images[path], x0 = 0, y0 = 0, x1 = width, y1 = height, xsize = width, ysize = height)
  1078.         self.process_alignment(tag_css)
  1079.         if max(width, height) <= min(pwidth, pheight) / 5:
  1080.             self.current_para.append(Plot(im, xsize = ceil(width * factor), ysize = ceil(height * factor)))
  1081.         elif height <= int(floor((2 / 3) * pheight)):
  1082.             pb = self.current_block
  1083.             self.end_current_para()
  1084.             self.process_alignment(tag_css)
  1085.             self.current_para.append(Plot(im, xsize = width * factor, ysize = height * factor))
  1086.             self.current_block.append(self.current_para)
  1087.             self.current_page.append(self.current_block)
  1088.             self.current_block = self.book.create_text_block(textStyle = pb.textStyle, blockStyle = pb.blockStyle)
  1089.             self.current_para = Paragraph()
  1090.         else:
  1091.             self.end_page()
  1092.             if len(self.current_page.contents) == 1 and not self.current_page.has_text():
  1093.                 self.current_page.contents[0:1] = []
  1094.             
  1095.             self.current_page.append(Canvas(width = pwidth, height = height))
  1096.             left = int(floor((pwidth - width) / 2))
  1097.             self.current_page.contents[-1].put_object(ImageBlock(self.images[path], xsize = width, ysize = height, x1 = width, y1 = height, blockwidth = width, blockheight = height), left, 0)
  1098.  
  1099.     
  1100.     def process_page_breaks(self, tag, tagname, tag_css):
  1101.         if 'page-break-before' in tag_css.keys():
  1102.             if tag_css['page-break-before'].lower() != 'avoid':
  1103.                 self.end_page()
  1104.             
  1105.             tag_css.pop('page-break-before')
  1106.         
  1107.         end_page = False
  1108.         if 'page-break-after' in tag_css.keys():
  1109.             if tag_css['page-break-after'].lower() == 'avoid':
  1110.                 self.avoid_page_break = True
  1111.             else:
  1112.                 end_page = True
  1113.             tag_css.pop('page-break-after')
  1114.         
  1115.         if self.force_page_break_attr[0].match(tagname) and tag.has_key(self.force_page_break_attr[1]) or self.force_page_break_attr[2].match(tag[self.force_page_break_attr[1]]) or self.force_page_break.match(tagname):
  1116.             self.end_page()
  1117.             self.page_break_found = True
  1118.         
  1119.         return end_page
  1120.  
  1121.     
  1122.     def block_properties(self, tag_css):
  1123.         
  1124.         def get(what):
  1125.             src = [ None for i in range(4) ]
  1126.             for i, c in enumerate(('-top', '-right', '-bottom', '-left')):
  1127.                 if tag_css.has_key(what + c):
  1128.                     src[i] = tag_css[what + c]
  1129.                     continue
  1130.                 None if tag_css.has_key(what) else []
  1131.             
  1132.             return src
  1133.  
  1134.         s1 = get('margin')
  1135.         s2 = get('padding')
  1136.         bl = str(self.current_block.blockStyle.attrs['blockwidth']) + 'px'
  1137.         
  1138.         def set(default, one, two):
  1139.             fval = None
  1140.             if one is not None:
  1141.                 val = None(self.unit_convert, one = 'base_length' if 'em' in one else bl)
  1142.                 if val is not None:
  1143.                     fval = val
  1144.                 
  1145.             
  1146.             if two is not None:
  1147.                 val = None(self.unit_convert, two = 'base_length' if 'em' in two else bl)
  1148.                 if val is not None:
  1149.                     fval = None if fval is None else fval + val
  1150.                 
  1151.             
  1152.             if fval is None:
  1153.                 fval = default
  1154.             
  1155.             return fval
  1156.  
  1157.         ans = { }
  1158.         ans['topskip'] = set(self.book.defaultBlockStyle.attrs['topskip'], s1[0], s2[0])
  1159.         ans['footskip'] = set(self.book.defaultBlockStyle.attrs['footskip'], s1[2], s2[2])
  1160.         ans['sidemargin'] = set(self.book.defaultBlockStyle.attrs['sidemargin'], s1[3], s2[3])
  1161.         factor = 0.7
  1162.         if 2 * int(ans['sidemargin']) >= factor * int(self.current_block.blockStyle.attrs['blockwidth']):
  1163.             val = int(ans['sidemargin'])
  1164.             ans['sidemargin'] = set(self.book.defaultBlockStyle.attrs['sidemargin'], s1[1], s2[1])
  1165.             val += int(ans['sidemargin'])
  1166.             val /= 2
  1167.             ans['sidemargin'] = int(val)
  1168.         
  1169.         if 2 * int(ans['sidemargin']) >= factor * int(self.current_block.blockStyle.attrs['blockwidth']):
  1170.             ans['sidemargin'] = int(factor * int(self.current_block.blockStyle.attrs['blockwidth']) / 2)
  1171.         
  1172.         for prop in ('topskip', 'footskip', 'sidemargin'):
  1173.             if ans[prop] < 0:
  1174.                 ans[prop] = 0
  1175.                 continue
  1176.         
  1177.         return ans
  1178.  
  1179.     
  1180.     def font_properties(self, css):
  1181.         t = { }
  1182.         for key in ('fontwidth', 'fontsize', 'wordspace', 'fontfacename', 'fontweight', 'baselineskip'):
  1183.             t[key] = self.book.defaultTextStyle.attrs[key]
  1184.         
  1185.         
  1186.         def font_weight(val):
  1187.             ans = 0
  1188.             m = re.search('([0-9]+)', val)
  1189.             if m:
  1190.                 ans = int(m.group(1))
  1191.             elif val.find('bold') >= 0 or val.find('strong') >= 0:
  1192.                 ans = 700
  1193.             
  1194.             if ans >= 700:
  1195.                 return 'bold'
  1196.             return 'normal'
  1197.  
  1198.         
  1199.         def font_style(val):
  1200.             ans = 'normal'
  1201.             if 'italic' in val or 'oblique' in val:
  1202.                 ans = 'italic'
  1203.             
  1204.             return ans
  1205.  
  1206.         
  1207.         def font_family(val):
  1208.             ans = 'serif'
  1209.             if max(val.find('courier'), val.find('mono'), val.find('fixed'), val.find('typewriter')) >= 0:
  1210.                 ans = 'mono'
  1211.             elif max(val.find('arial'), val.find('helvetica'), val.find('verdana'), val.find('trebuchet'), val.find('sans')) >= 0:
  1212.                 ans = 'sans'
  1213.             
  1214.             return ans
  1215.  
  1216.         
  1217.         def font_variant(val):
  1218.             ans = None
  1219.             if 'small-caps' in val.lower():
  1220.                 ans = 'small-caps'
  1221.             
  1222.             return ans
  1223.  
  1224.         
  1225.         def font_key(family, style, weight):
  1226.             key = 'normal'
  1227.             if style == 'italic' and weight == 'normal':
  1228.                 key = 'italic'
  1229.             elif style == 'normal' and weight == 'bold':
  1230.                 key = 'bold'
  1231.             elif style == 'italic' and weight == 'bold':
  1232.                 key = 'bi'
  1233.             
  1234.             return key
  1235.  
  1236.         
  1237.         def font_size(val):
  1238.             normal = 100
  1239.             ans = self.unit_convert(val, pts = True, base_length = '10pt')
  1240.             if ans:
  1241.                 if ans <= 0:
  1242.                     ans += normal
  1243.                     if ans == 0:
  1244.                         ans = int(font_size('smaller'))
  1245.                     
  1246.                     if ans < 0:
  1247.                         ans = normal
  1248.                     
  1249.                 
  1250.             elif ans == 0:
  1251.                 ans = int(font_size('smaller'))
  1252.             elif 'smaller' in val:
  1253.                 ans = normal - 20
  1254.             elif 'xx-small' in val:
  1255.                 ans = 40
  1256.             elif 'x-small' in val:
  1257.                 ans = 60
  1258.             elif 'small' in val:
  1259.                 ans = 80
  1260.             elif 'medium' in val:
  1261.                 ans = 100
  1262.             elif 'larger' in val:
  1263.                 ans = normal + 20
  1264.             elif 'xx-large' in val:
  1265.                 ans = 180
  1266.             elif 'x-large' in val:
  1267.                 ans = 140
  1268.             elif 'large' in val:
  1269.                 ans = 120
  1270.             
  1271.             if ans is not None:
  1272.                 ans += int(self.font_delta * 20)
  1273.                 ans = str(ans)
  1274.             
  1275.             return ans
  1276.  
  1277.         (family, weight, style, variant) = ('serif', 'normal', 'normal', None)
  1278.         for key in css.keys():
  1279.             val = css[key].lower()
  1280.             if key == 'font':
  1281.                 vals = val.split()
  1282.                 for val in vals:
  1283.                     family = font_family(val)
  1284.                     if family != 'serif':
  1285.                         break
  1286.                         continue
  1287.                     (None, None)
  1288.                 
  1289.                 for val in vals:
  1290.                     weight = font_weight(val)
  1291.                     if weight != 'normal':
  1292.                         break
  1293.                         continue
  1294.                 
  1295.                 for val in vals:
  1296.                     style = font_style(val)
  1297.                     if style != 'normal':
  1298.                         break
  1299.                         continue
  1300.                 
  1301.                 for val in vals:
  1302.                     sz = font_size(val)
  1303.                     if sz:
  1304.                         t['fontsize'] = sz
  1305.                         break
  1306.                         continue
  1307.                 
  1308.                 for val in vals:
  1309.                     variant = font_variant(val)
  1310.                     if variant:
  1311.                         t['fontvariant'] = variant
  1312.                         break
  1313.                         continue
  1314.                 
  1315.             if key in ('font-family', 'font-name'):
  1316.                 family = font_family(val)
  1317.                 continue
  1318.             if key == 'font-size':
  1319.                 ans = font_size(val)
  1320.                 if ans:
  1321.                     t['fontsize'] = ans
  1322.                 
  1323.             ans
  1324.             if key == 'font-weight':
  1325.                 weight = font_weight(val)
  1326.                 continue
  1327.             if key == 'font-style':
  1328.                 style = font_style(val)
  1329.                 continue
  1330.             if key == 'font-variant':
  1331.                 variant = font_variant(val)
  1332.                 continue
  1333.         
  1334.         if variant:
  1335.             css['font-variant'] = variant
  1336.         
  1337.         key = font_key(family, style, weight)
  1338.         if self.fonts[family].has_key(key):
  1339.             t['fontfacename'] = self.fonts[family][key][1]
  1340.         else:
  1341.             t['fontfacename'] = self.fonts[family]['normal'][1]
  1342.         if key in ('bold', 'bi'):
  1343.             t['fontweight'] = 700
  1344.         
  1345.         fs = int(t['fontsize'])
  1346.         if fs > 120:
  1347.             t['wordspace'] = int(fs / 4)
  1348.         
  1349.         t['baselineskip'] = fs + 20
  1350.         return (t, key, variant)
  1351.  
  1352.     
  1353.     def unit_convert(self, val, pts = False, base_length = '10pt'):
  1354.         dpi = self.profile.dpi
  1355.         result = None
  1356.         
  1357.         try:
  1358.             result = int(val)
  1359.         except ValueError:
  1360.             pass
  1361.  
  1362.         m = re.search('\\s*(-*[0-9]*\\.?[0-9]*)\\s*(%|em|px|mm|cm|in|dpt|pt|pc)', val)
  1363.         if m is not None and m.group(1):
  1364.             unit = float(m.group(1))
  1365.             if m.group(2) == '%':
  1366.                 normal = self.unit_convert(base_length)
  1367.                 result = (unit / 100) * normal
  1368.             elif m.group(2) == 'px':
  1369.                 result = unit
  1370.             elif m.group(2) == 'in':
  1371.                 result = unit * dpi
  1372.             elif m.group(2) == 'pt':
  1373.                 result = unit * dpi / 72
  1374.             elif m.group(2) == 'dpt':
  1375.                 result = unit * dpi / 720
  1376.             elif m.group(2) == 'em':
  1377.                 normal = self.unit_convert(base_length)
  1378.                 result = unit * normal
  1379.             elif m.group(2) == 'pc':
  1380.                 result = unit * (dpi / 72) * 12
  1381.             elif m.group(2) == 'mm':
  1382.                 result = unit * 0.04 * dpi
  1383.             elif m.group(2) == 'cm':
  1384.                 result = unit * 0.4 * dpi
  1385.             
  1386.         
  1387.         if result is not None:
  1388.             if pts:
  1389.                 result = int(round(result * (720 / dpi)))
  1390.             else:
  1391.                 result = int(round(result))
  1392.         
  1393.         return result
  1394.  
  1395.     
  1396.     def text_properties(self, tag_css):
  1397.         indent = self.book.defaultTextStyle.attrs['parindent']
  1398.         if tag_css.has_key('text-indent'):
  1399.             bl = str(self.current_block.blockStyle.attrs['blockwidth']) + 'px'
  1400.             if 'em' in tag_css['text-indent']:
  1401.                 bl = '10pt'
  1402.             
  1403.             indent = self.unit_convert(unicode(tag_css['text-indent']), pts = True, base_length = bl)
  1404.             if not indent:
  1405.                 indent = 0
  1406.             
  1407.             if indent > 0 and indent < 10 * self.minimum_indent:
  1408.                 indent = int(10 * self.minimum_indent)
  1409.             
  1410.         
  1411.         fp = self.font_properties(tag_css)[0]
  1412.         fp['parindent'] = indent
  1413.         if tag_css.has_key('line-height'):
  1414.             bls = int(self.book.defaultTextStyle.attrs['baselineskip'])
  1415.             ls = int(self.book.defaultTextStyle.attrs['linespace'])
  1416.             
  1417.             try:
  1418.                 val = int(float(tag_css['line-height'].strip()) * ls)
  1419.                 fp['linespace'] = val
  1420.             except ValueError:
  1421.                 val = self.unit_convert(tag_css['line-height'], pts = True, base_length = '1pt')
  1422.  
  1423.             if val is not None:
  1424.                 val -= bls
  1425.                 if val >= 0:
  1426.                     fp['linespace'] = val
  1427.                 
  1428.             
  1429.         
  1430.         return fp
  1431.  
  1432.     
  1433.     def process_block(self, tag, tag_css):
  1434.         text_properties = self.text_properties(tag_css)
  1435.         block_properties = self.block_properties(tag_css)
  1436.         indent = (float(text_properties['parindent']) / 10) * (self.profile.dpi / 72)
  1437.         margin = float(block_properties['sidemargin'])
  1438.         if indent < 0 and margin + indent < 0:
  1439.             text_properties['parindent'] = int(-margin * (72 / self.profile.dpi) * 10)
  1440.         
  1441.         align = self.get_alignment(tag_css)
  1442.         
  1443.         def fill_out_properties(props, default):
  1444.             for key in default.keys():
  1445.                 if not props.has_key(key):
  1446.                     props[key] = default[key]
  1447.                     continue
  1448.             
  1449.  
  1450.         fill_out_properties(block_properties, self.book.defaultBlockStyle.attrs)
  1451.         fill_out_properties(text_properties, self.book.defaultTextStyle.attrs)
  1452.         
  1453.         def properties_different(dict1, dict2):
  1454.             for key in dict1.keys():
  1455.                 if dict1[key] != dict2[key]:
  1456.                     return True
  1457.             
  1458.             return False
  1459.  
  1460.         if properties_different(self.current_block.blockStyle.attrs, block_properties) and properties_different(self.current_block.textStyle.attrs, text_properties) or align != self.current_block.textStyle.attrs['align']:
  1461.             ts = self.current_block.textStyle.copy()
  1462.             ts.attrs.update(text_properties)
  1463.             ts.attrs['align'] = align
  1464.             bs = self.current_block.blockStyle.copy()
  1465.             if not self.preserve_block_style:
  1466.                 bs.attrs.update(block_properties)
  1467.             
  1468.             self.current_block.append_to(self.current_page)
  1469.             
  1470.             try:
  1471.                 index = self.text_styles.index(ts)
  1472.                 ts = self.text_styles[index]
  1473.             except ValueError:
  1474.                 self.text_styles.append(ts)
  1475.  
  1476.             
  1477.             try:
  1478.                 index = self.block_styles.index(bs)
  1479.                 bs = self.block_styles[index]
  1480.             except ValueError:
  1481.                 self.block_styles.append(bs)
  1482.  
  1483.             self.current_block = self.book.create_text_block(blockStyle = bs, textStyle = ts)
  1484.             return True
  1485.         return False
  1486.  
  1487.     
  1488.     def process_anchor(self, tag, tag_css, tag_pseudo_css):
  1489.         if not self.in_table:
  1490.             key = None if tag.has_key('name') else 'id'
  1491.             name = tag[key].replace('#', '')
  1492.             previous = self.current_block
  1493.             self.process_children(tag, tag_css, tag_pseudo_css)
  1494.             target = None
  1495.             if self.current_block == previous:
  1496.                 self.current_block.must_append = True
  1497.                 target = self.current_block
  1498.             else:
  1499.                 found = False
  1500.                 for item in self.current_page.contents:
  1501.                     if item == previous:
  1502.                         found = True
  1503.                         continue
  1504.                     
  1505.                     if found:
  1506.                         target = item
  1507.                         break
  1508.                         continue
  1509.                 
  1510.                 if target and not isinstance(target, (TextBlock, ImageBlock)):
  1511.                     if isinstance(target, RuledLine):
  1512.                         target = self.book.create_text_block(textStyle = self.current_block.textStyle, blockStyle = self.current_block.blockStyle)
  1513.                         target.Paragraph(' ')
  1514.                         self.current_page.append(target)
  1515.                     else:
  1516.                         target = BlockSpace()
  1517.                         self.current_page.append(target)
  1518.                 
  1519.                 if target == None:
  1520.                     if self.current_block.has_text():
  1521.                         target = self.current_block
  1522.                     else:
  1523.                         target = self.current_block
  1524.                         self.current_block.must_append = True
  1525.                 
  1526.             self.targets[self.target_prefix + name] = target
  1527.         else:
  1528.             self.process_children(tag, tag_css, tag_pseudo_css)
  1529.  
  1530.     
  1531.     def parse_tag(self, tag, parent_css):
  1532.         
  1533.         try:
  1534.             tagname = tag.name.lower()
  1535.         except AttributeError:
  1536.             if not isinstance(tag, HTMLConverter.IGNORED_TAGS):
  1537.                 self.add_text(tag, parent_css, { })
  1538.             
  1539.             return None
  1540.  
  1541.         (tag_css, tag_pseudo_css) = self.tag_css(tag, parent_css = parent_css)
  1542.         
  1543.         try:
  1544.             if tag_css['display'].lower() == 'none' or tag_css['visibility'].lower() == 'hidden':
  1545.                 return None
  1546.         except KeyError:
  1547.             pass
  1548.  
  1549.         if not (self.disable_chapter_detection) and self.chapter_attr[0].match(tagname):
  1550.             if (self.chapter_attr[1].lower() == 'none' or tag.has_key(self.chapter_attr[1])) and self.chapter_attr[2].match(tag[self.chapter_attr[1]]):
  1551.                 self.log.debug('Detected chapter %s' % tagname)
  1552.                 self.end_page()
  1553.                 self.page_break_found = True
  1554.                 if self.options.add_chapters_to_toc:
  1555.                     self.current_block.must_append = True
  1556.                     self.extra_toc_entries.append((self.get_text(tag, limit = 1000), self.current_block))
  1557.                 
  1558.             
  1559.         end_page = self.process_page_breaks(tag, tagname, tag_css)
  1560.         
  1561.         try:
  1562.             if tagname in ('title', 'script', 'meta', 'del', 'frameset'):
  1563.                 pass
  1564.             elif tagname == 'a' and self.link_levels >= 0:
  1565.                 if tag.has_key('href') and not self.link_exclude.match(tag['href']):
  1566.                     if urlparse(tag['href'])[0] not in ('', 'file'):
  1567.                         self.process_children(tag, tag_css, tag_pseudo_css)
  1568.                     else:
  1569.                         path = munge_paths(self.target_prefix, tag['href'])[0]
  1570.                         ext = os.path.splitext(path)[1]
  1571.                         if ext:
  1572.                             ext = ext[1:].lower()
  1573.                         
  1574.                         enc = sys.getfilesystemencoding()
  1575.                         if not enc:
  1576.                             enc = 'utf8'
  1577.                         
  1578.                         if isinstance(path, unicode):
  1579.                             path = path.encode(enc, 'replace')
  1580.                         
  1581.                         if os.access(path, os.R_OK) and os.path.isfile(path):
  1582.                             if ext in ('png', 'jpg', 'bmp', 'jpeg'):
  1583.                                 self.process_image(path, tag_css)
  1584.                             else:
  1585.                                 text = self.get_text(tag, limit = 1000)
  1586.                                 if not text.strip():
  1587.                                     text = 'Link'
  1588.                                 
  1589.                                 self.add_text(text, tag_css, { }, force_span_use = True)
  1590.                                 self.links.append(self.create_link(self.current_para.contents, tag))
  1591.                                 if tag.has_key('id') or tag.has_key('name'):
  1592.                                     key = None if tag.has_key('name') else 'id'
  1593.                                     self.targets[self.target_prefix + tag[key]] = self.current_block
  1594.                                     self.current_block.must_append = True
  1595.                                 
  1596.                         else:
  1597.                             self.log.debug('Could not follow link to ' + tag['href'])
  1598.                             self.process_children(tag, tag_css, tag_pseudo_css)
  1599.                 elif tag.has_key('name') or tag.has_key('id'):
  1600.                     self.process_anchor(tag, tag_css, tag_pseudo_css)
  1601.                 else:
  1602.                     self.process_children(tag, tag_css, tag_pseudo_css)
  1603.             elif tagname == 'img':
  1604.                 if tag.has_key('src'):
  1605.                     path = munge_paths(self.target_prefix, tag['src'])[0]
  1606.                     if not os.path.exists(path):
  1607.                         path = path.replace('&', '%26')
  1608.                     
  1609.                     if os.access(path, os.R_OK) and os.path.isfile(path):
  1610.                         (width, height) = (None, None)
  1611.                         
  1612.                         try:
  1613.                             width = int(tag['width'])
  1614.                             height = int(tag['height'])
  1615.                         except:
  1616.                             pass
  1617.  
  1618.                         if tag.has_key('class'):
  1619.                             pass
  1620.                         dropcaps = tag['class'] == 'libprs500_dropcaps'
  1621.                         self.process_image(path, tag_css, width, height, dropcaps = dropcaps, rescale = True)
  1622.                     elif not urlparse(tag['src'])[0]:
  1623.                         self.log.warn('Could not find image: ' + tag['src'])
  1624.                     
  1625.                 else:
  1626.                     self.log.debug('Failed to process: %s' % str(tag))
  1627.             elif tagname in ('style', 'link'):
  1628.                 ncss = { }
  1629.                 npcss = { }
  1630.                 if tagname == 'style':
  1631.                     text = []([ unicode(i) for i in tag.findAll(text = True) ])
  1632.                     (css, pcss) = self.parse_css(text)
  1633.                     ncss.update(css)
  1634.                     npcss.update(pcss)
  1635.                 elif tag.has_key('type') and tag['type'] in ('text/css', 'text/x-oeb1-css') and tag.has_key('href'):
  1636.                     path = munge_paths(self.target_prefix, tag['href'])[0]
  1637.                     
  1638.                     try:
  1639.                         f = open(path, 'rb')
  1640.                         src = f.read()
  1641.                         f.close()
  1642.                         match = self.PAGE_BREAK_PAT.search(src)
  1643.                         if match and not re.match('avoid', match.group(1), re.IGNORECASE):
  1644.                             self.page_break_found = True
  1645.                         
  1646.                         (ncss, npcss) = self.parse_css(src)
  1647.                     except IOError:
  1648.                         ''.join
  1649.                         ''.join
  1650.                         self.log.warn('Could not read stylesheet: ' + tag['href'])
  1651.                     except:
  1652.                         ''.join<EXCEPTION MATCH>IOError
  1653.                     
  1654.  
  1655.                 ''.join
  1656.                 if ncss:
  1657.                     update_css(ncss, self.css)
  1658.                     self.css.update(self.override_css)
  1659.                 
  1660.                 if npcss:
  1661.                     update_css(npcss, self.pseudo_css)
  1662.                     self.pseudo_css.update(self.override_pcss)
  1663.                 
  1664.             elif tagname == 'pre':
  1665.                 self.end_current_para()
  1666.                 self.end_current_block()
  1667.                 self.current_block = self.book.create_text_block()
  1668.                 ts = self.current_block.textStyle.copy()
  1669.                 self.current_block.textStyle = ts
  1670.                 self.current_block.textStyle.attrs['parindent'] = '0'
  1671.                 if tag.contents:
  1672.                     c = tag.contents[0]
  1673.                     if isinstance(c, NavigableString):
  1674.                         c = unicode(c).replace('\r\n', '\n').replace('\r', '\n')
  1675.                         if c.startswith('\n'):
  1676.                             c = c[1:]
  1677.                             tag.contents[0] = NavigableString(c)
  1678.                             tag.contents[0].setup(tag)
  1679.                         
  1680.                     
  1681.                 
  1682.                 self.process_children(tag, tag_css, tag_pseudo_css)
  1683.                 self.end_current_block()
  1684.             elif tagname in ('ul', 'ol', 'dl'):
  1685.                 self.list_level += 1
  1686.                 prev_bs = self.current_block.blockStyle
  1687.                 self.end_current_block()
  1688.                 attrs = self.current_block.blockStyle.attrs
  1689.                 attrs = attrs.copy()
  1690.                 attrs['sidemargin'] = self.list_indent * self.list_level
  1691.                 bs = self.book.create_block_style(**attrs)
  1692.                 self.current_block = self.book.create_text_block(blockStyle = bs, textStyle = self.unindented_style)
  1693.                 self.process_children(tag, tag_css, tag_pseudo_css)
  1694.                 self.end_current_block()
  1695.                 self.current_block.blockStyle = prev_bs
  1696.                 self.list_level -= 1
  1697.                 if tagname == 'ol':
  1698.                     self.list_counter = old_counter
  1699.                 
  1700.             elif tagname in ('li', 'dt', 'dd'):
  1701.                 margin = self.list_indent * self.list_level
  1702.                 if tagname == 'dd':
  1703.                     margin += 80
  1704.                 
  1705.                 if int(self.current_block.blockStyle.attrs['sidemargin']) != margin:
  1706.                     self.end_current_block()
  1707.                     attrs = self.current_block.blockStyle.attrs
  1708.                     attrs = attrs.copy()
  1709.                     attrs['sidemargin'] = margin
  1710.                     attrs['blockwidth'] = int(attrs['blockwidth']) + margin
  1711.                     bs = self.book.create_block_style(**attrs)
  1712.                     self.current_block = self.book.create_text_block(blockStyle = bs, textStyle = self.unindented_style)
  1713.                 
  1714.                 if self.current_para.has_text():
  1715.                     self.line_break()
  1716.                     self.current_block.append(self.current_para)
  1717.                 
  1718.                 self.current_para = Paragraph()
  1719.                 self.previous_text = '\n'
  1720.                 if tagname == 'li':
  1721.                     in_ol = True
  1722.                     parent = tag.parent
  1723.                     while parent:
  1724.                         if parent.name and parent.name.lower() in ('ul', 'ol'):
  1725.                             in_ol = parent.name.lower() == 'ol'
  1726.                             break
  1727.                         
  1728.                         parent = parent.parent
  1729.                     prepend = None if in_ol else u'ΓÇó '
  1730.                     self.current_para.append(Span(prepend))
  1731.                     self.process_children(tag, tag_css, tag_pseudo_css)
  1732.                     if in_ol:
  1733.                         self.list_counter += 1
  1734.                     
  1735.                 else:
  1736.                     self.process_children(tag, tag_css, tag_pseudo_css)
  1737.             elif tagname == 'blockquote':
  1738.                 self.current_para.append_to(self.current_block)
  1739.                 self.current_block.append_to(self.current_page)
  1740.                 pb = self.current_block
  1741.                 self.current_para = Paragraph()
  1742.                 ts = self.book.create_text_style()
  1743.                 ts.attrs['parindent'] = 0
  1744.                 
  1745.                 try:
  1746.                     index = self.text_styles.index(ts)
  1747.                     ts = self.text_styles[index]
  1748.                 except ValueError:
  1749.                     self.text_styles.append(ts)
  1750.  
  1751.                 bs = self.book.create_block_style()
  1752.                 (bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip']) = (60, 20, 20)
  1753.                 
  1754.                 try:
  1755.                     index = self.block_styles.index(bs)
  1756.                     bs = self.block_styles[index]
  1757.                 except ValueError:
  1758.                     self.block_styles.append(bs)
  1759.  
  1760.                 self.current_block = self.book.create_text_block(blockStyle = bs, textStyle = ts)
  1761.                 self.previous_text = '\n'
  1762.                 self.preserve_block_style = True
  1763.                 self.process_children(tag, tag_css, tag_pseudo_css)
  1764.                 self.preserve_block_style = False
  1765.                 self.current_para.append_to(self.current_block)
  1766.                 self.current_block.append_to(self.current_page)
  1767.                 self.current_para = Paragraph()
  1768.                 self.current_block = self.book.create_text_block(textStyle = pb.textStyle, blockStyle = pb.blockStyle)
  1769.             elif tagname in ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
  1770.                 new_block = self.process_block(tag, tag_css)
  1771.                 if (self.anchor_ids or tag.has_key('id') or self.book_designer) and tag.has_key('class') and tag['class'] == 'title':
  1772.                     if not tag.has_key('id'):
  1773.                         tag['id'] = __appname__ + '_id_' + str(self.id_counter)
  1774.                         self.id_counter += 1
  1775.                     
  1776.                     tkey = self.target_prefix + tag['id']
  1777.                     if not new_block:
  1778.                         self.end_current_block()
  1779.                     
  1780.                     self.current_block.must_append = True
  1781.                     self.targets[tkey] = self.current_block
  1782.                     if self.book_designer and tag.has_key('class') and tag['class'] == 'title':
  1783.                         self.extra_toc_entries.append((self.get_text(tag, 100), self.current_block))
  1784.                     
  1785.                 
  1786.                 src = self.get_text(tag, limit = 1000)
  1787.                 if not (self.disable_chapter_detection) and tagname.startswith('h'):
  1788.                     if self.chapter_regex.search(src):
  1789.                         self.log.debug('Detected chapter %s' % src)
  1790.                         self.end_page()
  1791.                         self.page_break_found = True
  1792.                         if self.options.add_chapters_to_toc:
  1793.                             self.current_block.must_append = True
  1794.                             self.extra_toc_entries.append((self.get_text(tag, limit = 1000), self.current_block))
  1795.                         
  1796.                     
  1797.                 
  1798.                 if self.current_para.has_text():
  1799.                     self.current_para.append_to(self.current_block)
  1800.                 
  1801.                 self.current_para = Paragraph()
  1802.                 self.previous_text = '\n'
  1803.                 if not tag.contents:
  1804.                     self.current_block.append(CR())
  1805.                     return None
  1806.                 if self.current_block.contents:
  1807.                     self.current_block.append(CR())
  1808.                 
  1809.                 self.process_children(tag, tag_css, tag_pseudo_css)
  1810.                 if self.current_para.contents:
  1811.                     self.current_block.append(self.current_para)
  1812.                 
  1813.                 self.current_para = Paragraph()
  1814.                 if tagname.startswith('h') or self.blank_after_para:
  1815.                     self.current_block.append(CR())
  1816.                 
  1817.             elif tagname in ('b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite', 'sup', 'sub'):
  1818.                 self.process_children(tag, tag_css, tag_pseudo_css)
  1819.             elif tagname == 'font':
  1820.                 if tag.has_key('face'):
  1821.                     tag_css['font-family'] = tag['face']
  1822.                 
  1823.                 if tag.has_key('color'):
  1824.                     tag_css['color'] = tag['color']
  1825.                 
  1826.                 self.process_children(tag, tag_css, tag_pseudo_css)
  1827.             elif tagname in ('br',):
  1828.                 self.line_break()
  1829.                 self.previous_text = '\n'
  1830.             elif tagname in ('hr', 'tr'):
  1831.                 self.end_current_block()
  1832.                 if tagname == 'hr' and not tag_css.get('width', '').strip().startswith('0'):
  1833.                     self.current_page.RuledLine(linelength = int(self.current_page.pageStyle.attrs['textwidth']))
  1834.                 
  1835.                 self.previous_text = '\n'
  1836.                 self.process_children(tag, tag_css, tag_pseudo_css)
  1837.             elif tagname == 'td':
  1838.                 if not self.in_table:
  1839.                     self.current_para.append(' ')
  1840.                     self.previous_text = ' '
  1841.                 
  1842.                 self.process_children(tag, tag_css, tag_pseudo_css)
  1843.             elif tagname == 'table' and not (self.ignore_tables) and not (self.in_table):
  1844.                 if self.render_tables_as_images:
  1845.                     print 'Rendering table...'
  1846.                     render_table = render_table
  1847.                     import calibre.ebooks.lrf.html.table_as_image
  1848.                     pheight = int(self.current_page.pageStyle.attrs['textheight'])
  1849.                     pwidth = int(self.current_page.pageStyle.attrs['textwidth'])
  1850.                     images = render_table(self.soup, tag, tag_css, os.path.dirname(self.target_prefix), pwidth, pheight, self.profile.dpi, self.text_size_multiplier_for_rendered_tables)
  1851.                     for path, width, height in images:
  1852.                         stream = ImageStream(path, encoding = 'PNG')
  1853.                         im = Image(stream, x0 = 0, y0 = 0, x1 = width, y1 = height, xsize = width, ysize = height)
  1854.                         pb = self.current_block
  1855.                         self.end_current_para()
  1856.                         self.process_alignment(tag_css)
  1857.                         self.current_para.append(Plot(im, xsize = width * 720 / self.profile.dpi, ysize = height * 720 / self.profile.dpi))
  1858.                         self.current_block.append(self.current_para)
  1859.                         self.current_page.append(self.current_block)
  1860.                         self.current_block = self.book.create_text_block(textStyle = pb.textStyle, blockStyle = pb.blockStyle)
  1861.                         self.current_para = Paragraph()
  1862.                     
  1863.                 else:
  1864.                     tag_css = self.tag_css(tag)[0]
  1865.                     
  1866.                     try:
  1867.                         self.process_table(tag, tag_css)
  1868.                     except Exception:
  1869.                         err = None
  1870.                         self.log.warning(_('An error occurred while processing a table: %s. Ignoring table markup.') % repr(err))
  1871.                         self.log.exception('')
  1872.                         self.log.debug(_('Bad table:\n%s') % unicode(tag)[:300])
  1873.                         self.in_table = False
  1874.                         self.process_children(tag, tag_css, tag_pseudo_css)
  1875.                     finally:
  1876.                         if self.minimize_memory_usage:
  1877.                             tag.extract()
  1878.                         
  1879.  
  1880.             else:
  1881.                 self.process_children(tag, tag_css, tag_pseudo_css)
  1882.         finally:
  1883.             if end_page:
  1884.                 self.end_page()
  1885.             
  1886.  
  1887.  
  1888.     
  1889.     def process_table(self, tag, tag_css):
  1890.         self.end_current_block()
  1891.         self.current_block = self.book.create_text_block()
  1892.         rowpad = 10
  1893.         table = Table(self, tag, tag_css, rowpad = rowpad, colpad = 10)
  1894.         canvases = []
  1895.         ps = self.current_page.pageStyle.attrs
  1896.         for block, xpos, ypos, delta, targets in table.blocks(int(ps['textwidth']), int(ps['textheight'])):
  1897.             if not block:
  1898.                 if ypos > int(ps['textheight']):
  1899.                     raise Exception, _('Table has cell that is too large')
  1900.                 ypos > int(ps['textheight'])
  1901.                 canvases.append(Canvas(int(self.current_page.pageStyle.attrs['textwidth']), ypos + rowpad, blockrule = 'block-fixed'))
  1902.                 for name in targets:
  1903.                     self.targets[self.target_prefix + name] = canvases[-1]
  1904.                 
  1905.             if xpos > 65535:
  1906.                 xpos = 65535
  1907.             
  1908.             canvases[-1].put_object(block, xpos + int(delta / 2), ypos)
  1909.         
  1910.         for canvas in canvases:
  1911.             self.current_page.append(canvas)
  1912.         
  1913.         self.end_current_block()
  1914.  
  1915.     
  1916.     def remove_unused_target_blocks(self):
  1917.         for block in self.unused_target_blocks:
  1918.             block.parent.contents.remove(block)
  1919.             block.parent = None
  1920.         
  1921.  
  1922.     
  1923.     def writeto(self, path, lrs = False):
  1924.         self.remove_unused_target_blocks()
  1925.         None if lrs else self.book.renderLrf(path)
  1926.  
  1927.     
  1928.     def cleanup(self):
  1929.         for _file in self.scaled_images.values() + self.rotated_images.values():
  1930.             _file.__del__()
  1931.         
  1932.  
  1933.  
  1934.  
  1935. def process_file(path, options, logger):
  1936.     if not isinstance(path, unicode):
  1937.         path = path.decode(sys.getfilesystemencoding())
  1938.     
  1939.     path = os.path.abspath(path)
  1940.     default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
  1941.     dirpath = os.path.dirname(path)
  1942.     tpath = ''
  1943.     try_opf(path, options, logger)
  1944.     if getattr(options, 'cover', None):
  1945.         options.cover = os.path.expanduser(options.cover)
  1946.         if not os.path.isabs(options.cover):
  1947.             options.cover = os.path.join(dirpath, options.cover)
  1948.         
  1949.         if os.access(options.cover, os.R_OK):
  1950.             th = Device.THUMBNAIL_HEIGHT
  1951.             im = PILImage.open(options.cover)
  1952.             pwidth = options.profile.screen_width
  1953.             pheight = options.profile.screen_height - options.profile.fudge
  1954.             (width, height) = im.size
  1955.             if width < pwidth:
  1956.                 corrf = float(pwidth) / width
  1957.                 width = pwidth
  1958.                 height = int(corrf * height)
  1959.             
  1960.             (scaled, width, height) = fit_image(width, height, pwidth, pheight)
  1961.             
  1962.             try:
  1963.                 cim = None if scaled else im
  1964.                 cf = PersistentTemporaryFile(prefix = __appname__ + '_', suffix = '.jpg')
  1965.                 cf.close()
  1966.                 cim.convert('RGB').save(cf.name)
  1967.                 options.cover = cf.name
  1968.                 tim = im.resize((int(0.75 * th), th), PILImage.ANTIALIAS).convert('RGB')
  1969.                 tf = PersistentTemporaryFile(prefix = __appname__ + '_', suffix = '.jpg')
  1970.                 tf.close()
  1971.                 tim.save(tf.name)
  1972.                 tpath = tf.name
  1973.             except IOError:
  1974.                 err = None
  1975.                 logger.warn(_('Could not read cover image: %s'), err)
  1976.                 options.cover = None
  1977.             except:
  1978.                 None<EXCEPTION MATCH>IOError
  1979.             
  1980.  
  1981.         None<EXCEPTION MATCH>IOError
  1982.         raise ConversionError, _('Cannot read from: %s') % (options.cover,)
  1983.     getattr(options, 'cover', None)
  1984.     if not options.title:
  1985.         options.title = default_title
  1986.     
  1987.     for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
  1988.         val = getattr(options, prop, None)
  1989.         if val and not isinstance(val, unicode):
  1990.             soup = BeautifulSoup(val)
  1991.             setattr(options, prop, unicode(soup))
  1992.             continue
  1993.     
  1994.     title = (options.title, options.title_sort)
  1995.     author = (options.author, options.author_sort)
  1996.     args = dict(font_delta = options.font_delta, title = title, author = author, sourceencoding = 'utf8', freetext = options.freetext, category = options.category, publisher = options.publisher, booksetting = BookSetting(dpi = 10 * options.profile.dpi, screenheight = options.profile.screen_height, screenwidth = options.profile.screen_width))
  1997.     if tpath:
  1998.         args['thumbnail'] = tpath
  1999.     
  2000.     header = None
  2001.     if options.header:
  2002.         header = Paragraph()
  2003.         fheader = options.headerformat
  2004.         if not options.title:
  2005.             options.title = _('Unknown')
  2006.         
  2007.         if not options.author:
  2008.             options.author = _('Unknown')
  2009.         
  2010.         if not fheader:
  2011.             fheader = '%t by %a'
  2012.         
  2013.         fheader = re.sub('(?<!%)%t', options.title, fheader)
  2014.         fheader = re.sub('(?<!%)%a', options.author, fheader)
  2015.         fheader = re.sub('%%a', '%a', fheader)
  2016.         fheader = re.sub('%%t', '%t', fheader)
  2017.         header.append(fheader + '  ')
  2018.     
  2019.     (book, fonts) = Book(options, logger, header = header, **args)
  2020.     le = None if options.link_exclude else re.compile('$')
  2021.     pb = None if options.page_break else re.compile('$')
  2022.     fpb = None if options.force_page_break else re.compile('$')
  2023.     cq = options.chapter_attr.split(',')
  2024.     if len(cq) < 3:
  2025.         raise ValueError('The --chapter-attr setting must have 2 commas.')
  2026.     len(cq) < 3
  2027.     options.chapter_attr = [
  2028.         re.compile(cq[0], re.IGNORECASE),
  2029.         cq[1],
  2030.         re.compile(cq[2], re.IGNORECASE)]
  2031.     options.force_page_break = fpb
  2032.     options.link_exclude = le
  2033.     options.page_break = pb
  2034.     if not isinstance(options.chapter_regex, unicode):
  2035.         options.chapter_regex = options.chapter_regex.decode(preferred_encoding)
  2036.     
  2037.     options.chapter_regex = re.compile(options.chapter_regex, re.IGNORECASE)
  2038.     fpba = options.force_page_break_attr.split(',')
  2039.     if len(fpba) != 3:
  2040.         fpba = [
  2041.             '$',
  2042.             '',
  2043.             '$']
  2044.     
  2045.     options.force_page_break_attr = [
  2046.         re.compile(fpba[0], re.IGNORECASE),
  2047.         fpba[1],
  2048.         re.compile(fpba[2], re.IGNORECASE)]
  2049.     if not hasattr(options, 'anchor_ids'):
  2050.         options.anchor_ids = True
  2051.     
  2052.     files = None if options.use_spine and hasattr(options, 'spine') else [
  2053.         path]
  2054.     conv = HTMLConverter(book, fonts, options, logger, files)
  2055.     if options.use_spine and hasattr(options, 'toc') and options.toc is not None:
  2056.         conv.create_toc(options.toc)
  2057.     
  2058.     oname = options.output
  2059.     if not oname:
  2060.         suffix = None if options.lrs else '.lrf'
  2061.         name = os.path.splitext(os.path.basename(path))[0] + suffix
  2062.         oname = os.path.join(os.getcwd(), name)
  2063.     
  2064.     oname = os.path.abspath(os.path.expanduser(oname))
  2065.     conv.writeto(oname, lrs = options.lrs)
  2066.     conv.cleanup()
  2067.     return oname
  2068.  
  2069.  
  2070. def try_opf(path, options, logger):
  2071.     if hasattr(options, 'opf'):
  2072.         opf = options.opf
  2073.     else:
  2074.         files = glob.glob(os.path.join(os.path.dirname(path), '*'))
  2075.         opf = None
  2076.         for f in files:
  2077.             ext = f.rpartition('.')[-1].lower()
  2078.             if ext == 'opf':
  2079.                 opf = f
  2080.                 break
  2081.                 continue
  2082.         
  2083.     if opf is None:
  2084.         return None
  2085.     dirpath = os.path.dirname(os.path.abspath(opf))
  2086.     OPF2 = OPF
  2087.     import calibre.ebooks.metadata.opf2
  2088.     opf = OPF2(open(opf, 'rb'), dirpath)
  2089.     
  2090.     try:
  2091.         title = opf.title
  2092.         if title and not getattr(options, 'title', None):
  2093.             options.title = title
  2094.         
  2095.         if getattr(options, 'author', 'Unknown') == 'Unknown':
  2096.             if opf.authors:
  2097.                 options.author = ', '.join(opf.authors)
  2098.             
  2099.             if opf.author_sort:
  2100.                 options.author_sort = opf.author_sort
  2101.             
  2102.         
  2103.         if options.publisher == 'Unknown':
  2104.             publisher = opf.publisher
  2105.             if publisher:
  2106.                 options.publisher = publisher
  2107.             
  2108.         
  2109.         if not getattr(options, 'cover', None) or options.use_metadata_cover:
  2110.             orig_cover = getattr(options, 'cover', None)
  2111.             options.cover = None
  2112.             cover = opf.cover
  2113.             if cover:
  2114.                 cover = cover.replace('/', os.sep)
  2115.                 if not os.path.isabs(cover):
  2116.                     cover = os.path.join(dirpath, cover)
  2117.                 
  2118.                 if os.access(cover, os.R_OK):
  2119.                     
  2120.                     try:
  2121.                         PILImage.open(cover)
  2122.                         options.cover = cover
  2123.  
  2124.                 
  2125.             
  2126.             if not getattr(options, 'cover', None) and orig_cover is not None:
  2127.                 options.cover = orig_cover
  2128.             
  2129.         
  2130.         if not getattr(options, 'toc', None):
  2131.             options.toc = opf.toc
  2132.     except Exception:
  2133.         opf is None
  2134.         opf is None
  2135.         logger.exception(_('Failed to process opf file'))
  2136.     except:
  2137.         opf is None
  2138.  
  2139.  
  2140.