
# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)

from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

import sys
import socket
import os
import urlparse
import re
import time
import copy
import urllib2
import threading
import traceback
from urllib import url2pathname, quote
from httplib import responses
from PIL import Image
from cStringIO import StringIO

from calibre import browser, relpath, unicode_path
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
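
# This module fetches a web page and the resources it references (images,
# stylesheets and, recursively, linked pages) into a local directory tree,
# rewriting links so that the saved copy can be browsed offline.
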
class FetchError(Exception):
    pass


class closing(object):
    'Context manager that closes its argument on exit, ignoring errors.'

    def __init__(self, thing):
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, *exc_info):
        try:
            self.thing.close()
        except Exception:
            pass


bad_url_counter = 0

def basename(url):
    global bad_url_counter
    try:
        parts = urlparse.urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except:
        bad_url_counter += 1
        return 'bad_url_%d.html' % bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res


def save_soup(soup, target):
    # Ensure the saved document declares a UTF-8 charset, since that is the
    # encoding it is written in.
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    # Rewrite absolute local file paths as paths relative to the saved file.
    selfdir = os.path.dirname(target)
    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))


class response(str):
    'A str subclass that also carries the final (post-redirect) URL.'

    def __new__(cls, *args):
        obj = super(response, cls).__new__(cls, *args)
        obj.newurl = None
        return obj


class DummyLock(object):
    'A no-op context manager usable in place of a real lock.'

    def __enter__(self, *args):
        return self

    def __exit__(self, *args):
        pass


def default_is_link_wanted(url, tag):
    raise NotImplementedError()


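# RecursiveFetcher downloads a page and everything it references: images and
# stylesheets always, and linked pages up to max_recursions levels deep.
# The image_map/css_map dictionaries (guarded by RLocks) can be shared between
# fetchers so that each resource is only downloaded once.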
class RecursiveFetcher(object):
    # Links matching any of these patterns are never followed.
    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
            ('.exe\\s*$', '.mp3\\s*$', '.ogg\\s*$', '^\\s*mailto:', '^\\s*$'))
    CSS_IMPORT_PATTERN = re.compile('\\@import\\s+url\\((.*?)\\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout()
    DUMMY_LOCK = DummyLock()

    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
        self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
        self.log = log
        self.verbose = options.verbose
        self.timeout = options.timeout
        self.encoding = options.encoding
        self.browser = options.browser if hasattr(options, 'browser') else browser()
        self.max_recursions = options.max_recursions
        self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
        self.max_files = options.max_files
        self.delay = options.delay
        self.last_fetch_at = 0
        self.filemap = {}
        self.imagemap = image_map
        self.imagemap_lock = threading.RLock()
        self.stylemap = css_map
        self.image_url_processor = None
        self.stylemap_lock = threading.RLock()
        self.downloaded_paths = []
        self.current_dir = self.base_dir
        self.files = 0
        self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
        self.remove_tags = getattr(options, 'remove_tags', [])
        self.remove_tags_after = getattr(options, 'remove_tags_after', None)
        self.remove_tags_before = getattr(options, 'remove_tags_before', None)
        self.keep_only_tags = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
        self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
        self._is_link_wanted = getattr(options, 'is_link_wanted', default_is_link_wanted)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True
        self.failed_links = []
        self.job_info = job_info

    def get_soup(self, src):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Strip doctype declarations and HTML comments before parsing.
        nmassage += [(re.compile('<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
        nmassage.append((re.compile('<!--.*?-->', re.DOTALL), lambda m: ''))
        soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0],
                markupMassage=nmassage)
        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0],
                    markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()

        return self.preprocess_html_ext(soup)

    def fetch_url(self, url):
        data = None
        self.log.debug('Fetching', url)
        # Enforce the minimum delay between consecutive fetches.
        delta = time.time() - self.last_fetch_at
        if delta < self.delay:
            time.sleep(delta)
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        # Quote URLs that contain whitespace.
        if re.search('\\s+', url) is not None:
            purl = list(urlparse.urlparse(url))
            for i in range(2, 6):
                purl[i] = quote(purl[i])
            url = urlparse.urlunparse(purl)
        open_func = getattr(self.browser, 'open_novisit', self.browser.open)
        try:
            with closing(open_func(url, timeout=self.timeout)) as f:
                data = response(f.read() + f.read())
                data.newurl = f.geturl()
        except urllib2.URLError, err:
            if hasattr(err, 'code') and responses.has_key(err.code):
                raise FetchError, responses[err.code]
            if getattr(err, 'reason', [0])[0] == 104 or \
                    getattr(getattr(err, 'args', [None])[0], 'errno', None) in (-2, -3):
                # Connection reset by peer or name resolution failure: retry once.
                self.log.debug('Temporary error, retrying in 1 second')
                time.sleep(1)
                with closing(open_func(url, timeout=self.timeout)) as f:
                    data = response(f.read() + f.read())
                    data.newurl = f.geturl()
            else:
                raise err
        finally:
            self.last_fetch_at = time.time()
        return data

    def start_fetch(self, url):
        soup = BeautifulSoup(u'<a href="' + url + '" />')
        self.log.debug('Downloading')
        res = self.process_links(soup, url, 0, into_dir='')
        self.log.debug(url, 'saved to', res)
        return res

    def is_link_ok(self, url):
        for i in self.__class__.LINK_FILTER:
            if i.search(url):
                return False
        return True

    def is_link_wanted(self, url, tag):
        try:
            return self._is_link_wanted(url, tag)
        except NotImplementedError:
            pass
        except:
            return False

        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
                    return False

        if self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True

    def process_stylesheets(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        for c, tag in enumerate(soup.findAll(lambda tag:
                tag.name.lower() in ('link', 'style') and tag.has_key('type')
                and tag['type'].lower() == 'text/css')):
            if tag.has_key('href'):
                # External stylesheet: download it and point the link at the local copy.
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                with self.stylemap_lock:
                    if self.stylemap.has_key(iurl):
                        tag['href'] = self.stylemap[iurl]
                        continue
                try:
                    data = self.fetch_url(iurl)
                except Exception:
                    self.log.exception('Could not fetch stylesheet ', iurl)
                    continue
                stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
                with open(stylepath, 'wb') as x:
                    x.write(data)
                tag['href'] = stylepath
            else:
                # Inline <style> block: download any @import'ed stylesheets and
                # rewrite the import URLs to point at the local copies.
                for ns in tag.findAll(text=True):
                    src = str(ns)
                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                    if m:
                        iurl = m.group(1)
                        if not urlparse.urlsplit(iurl).scheme:
                            iurl = urlparse.urljoin(baseurl, iurl, False)
                        with self.stylemap_lock:
                            if self.stylemap.has_key(iurl):
                                ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                                continue
                        try:
                            data = self.fetch_url(iurl)
                        except Exception:
                            self.log.exception('Could not fetch stylesheet ', iurl)
                            continue
                        c += 1
                        stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
                        with self.stylemap_lock:
                            self.stylemap[iurl] = stylepath
                        with open(stylepath, 'wb') as x:
                            x.write(data)
                        ns.replaceWith(src.replace(m.group(1), stylepath))

    def process_images(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag:
                tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):
                    tag['src'] = self.imagemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
                if data == 'GIF89a\x01':
                    # Skip empty GIF files, PIL errors on them anyway.
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
            c += 1
            fname = ascii_filename('img' + str(c))
            imgpath = os.path.join(diskpath, fname + '.jpg')
            try:
                # Convert the image to JPEG and point the tag at the local file.
                im = Image.open(StringIO(data)).convert('RGBA')
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    im.save(x, 'JPEG')
                tag['src'] = imgpath
            except:
                traceback.print_exc()
                continue

    def absurl(self, baseurl, tag, key, filter=True):
        iurl = tag[key]
        parts = urlparse.urlsplit(iurl)
        if not parts.netloc and not parts.path:
            return None
        if not parts.scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
            self.log.debug('Skipping invalid link:', iurl)
            return None
        if filter and not self.is_link_wanted(iurl, tag):
            self.log.debug('Filtered link: ' + iurl)
            return None
        return iurl

    def normurl(self, url):
        # Normalize a URL by dropping its fragment.
        parts = list(urlparse.urlsplit(url))
        parts[4] = ''
        return urlparse.urlunsplit(parts)

    def localize_link(self, tag, key, path):
        parts = urlparse.urlsplit(tag[key])
        suffix = ('#' + parts.fragment) if parts.fragment else ''
        tag[key] = path + suffix

    def process_return_links(self, soup, baseurl):
        for tag in soup.findAll(lambda tag:
                tag.name.lower() == 'a' and tag.has_key('href')):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if self.filemap.has_key(nurl):
                self.localize_link(tag, 'href', self.filemap[nurl])

    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))
            for c, tag in enumerate(tags):
                if self.show_progress:
                    print '.',
                    sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if self.filemap.has_key(nurl):
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link' + str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                            len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r' % iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]
                    soup = self.get_soup(dsrc)
                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']

                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)

                    _fname = basename(iurl)
                    if not isinstance(_fname, unicode):
                        _fname = _fname.decode('latin1', 'replace')
                    _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl, recursion_level + 1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug('Recursion limit reached. Skipping links in', iurl)

                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(soup,
                                c == 0 and recursion_level == 0 and not getattr(self, 'called_first', False),
                                self.job_info)
                        if c == 0 and recursion_level == 0:
                            self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception:
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
        if self.show_progress:
            print
        return res


def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')):
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--base-dir',
            help=_('Base directory into which URL is saved. Default is %default'),
            default='.', type='string', dest='dir')
    parser.add_option('-t', '--timeout',
            help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),
            default=10, type='float', dest='timeout')
    parser.add_option('-r', '--max-recursions', default=1,
            help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'),
            type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
            help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
    parser.add_option('--delay', default=0, dest='delay', type='int',
            help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
    parser.add_option('--encoding', default=None,
            help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
    parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
            help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
            help=_('Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.'))
    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
            help=_('Do not download CSS stylesheets.'), dest='no_stylesheets')
    parser.add_option('--verbose', help=_('Show detailed output information. Useful for debugging'),
            default=False, action='store_true', dest='verbose')
    return parser

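# Example invocation, assuming this module is exposed as calibre's web2disk
# command-line tool (the entry-point name is an assumption, not shown here):
#
#   web2disk -d /tmp/site -r 2 --delay 1 http://example.com
#
# saves http://example.com, plus the pages it links to up to two levels deep,
# under /tmp/site, waiting at least one second between fetches.
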
def create_fetcher(options, image_map={}, log=None):
    if log is None:
        log = Log()
    return RecursiveFetcher(options, log, image_map=image_map)


def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        return 1
    fetcher = create_fetcher(options)
    fetcher.start_fetch(args[1])


if __name__ == '__main__':
    sys.exit(main())
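
# A minimal programmatic usage sketch (placeholder URL and directory; relies
# only on the functions defined above):
#
#   parser = option_parser()
#   opts, args = parser.parse_args(['prog', 'http://example.com'])
#   opts.dir = '/tmp/site'                  # where the files will be saved
#   fetcher = create_fetcher(opts)
#   index = fetcher.start_fetch(args[1])    # path of the saved root page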