# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)

from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname, quote
from httplib import responses
from PIL import Image
from cStringIO import StringIO

from calibre import browser, relpath, unicode_path
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log


class FetchError(Exception):
    pass


class closing(object):
    'Context manager that closes the wrapped object on exit, ignoring errors.'

    def __init__(self, thing):
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, *exc_info):
        try:
            self.thing.close()
        except Exception:
            pass


bad_url_counter = 0

def basename(url):
    # Derive a local file name from a URL, falling back to a generated name
    # when the URL cannot be parsed.
    global bad_url_counter
    try:
        parts = urlparse.urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except:
        bad_url_counter += 1
        return 'bad_url_%d.html' % bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res


def save_soup(soup, target):
    # Force a UTF-8 charset meta tag, rewrite absolute local file links as
    # relative ones, and write the document out encoded as UTF-8.
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))


class response(str):
    'A str subclass that also carries the final (post-redirect) URL of a fetch.'

    def __new__(cls, *args):
        obj = super(response, cls).__new__(cls, *args)
        obj.newurl = None
        return obj


class DummyLock(object):

    def __enter__(self, *args):
        return self

    def __exit__(self, *args):
        pass


def default_is_link_wanted(url, tag):
    raise NotImplementedError()


class RecursiveFetcher(object):
    # Downloads a page and, up to max_recursions levels deep, the pages,
    # images and stylesheets it links to, rewriting links to the local copies.

    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                        (r'.exe\s*$', r'.mp3\s*$', r'.ogg\s*$', r'^\s*mailto:', r'^\s*$'))
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout()
    DUMMY_LOCK = DummyLock()

    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
        self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
        self.log = log
        self.verbose = options.verbose
        self.timeout = options.timeout
        self.encoding = options.encoding
        self.browser = options.browser if hasattr(options, 'browser') else browser()
        self.max_recursions = options.max_recursions
        self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
        self.max_files = options.max_files
        self.delay = options.delay
        self.last_fetch_at = 0
        self.filemap = {}
        self.imagemap = image_map
        self.imagemap_lock = threading.RLock()
        self.stylemap = css_map
        self.image_url_processor = None
        self.stylemap_lock = threading.RLock()
        self.downloaded_paths = []
        self.current_dir = self.base_dir
        self.files = 0
        self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
        self.remove_tags = getattr(options, 'remove_tags', [])
        self.remove_tags_after = getattr(options, 'remove_tags_after', None)
        self.remove_tags_before = getattr(options, 'remove_tags_before', None)
        self.keep_only_tags = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
        self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
        self._is_link_wanted = getattr(options, 'is_link_wanted', default_is_link_wanted)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True
        self.failed_links = []
        self.job_info = job_info

    def get_soup(self, src):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Strip the DOCTYPE and comments, as they can confuse BeautifulSoup
        nmassage += [(re.compile('<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
        nmassage.append((re.compile('<!--.*?-->', re.DOTALL), lambda m: ''))
        soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0],
                             markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0],
                                 markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            # Remove all siblings of tag in the given direction, walking up
            # the tree until the body element is reached.
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()

        return self.preprocess_html_ext(soup)

    def fetch_url(self, url):
        data = None
        self.log.debug('Fetching', url)
        delta = time.time() - self.last_fetch_at
        if delta < self.delay:
            time.sleep(delta)
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        # Quote URLs that contain whitespace
        if re.search(r'\s+', url) is not None:
            purl = list(urlparse.urlparse(url))
            for i in range(2, 6):
                purl[i] = quote(purl[i])
            url = urlparse.urlunparse(purl)
        open_func = getattr(self.browser, 'open_novisit', self.browser.open)
        try:
            with closing(open_func(url, timeout=self.timeout)) as f:
                data = response(f.read() + f.read())
                data.newurl = f.geturl()
        except urllib2.URLError as err:
            if hasattr(err, 'code') and responses.has_key(err.code):
                raise FetchError(responses[err.code])
            if getattr(err, 'reason', [0])[0] == 104 or \
                    getattr(getattr(err, 'args', [None])[0], 'errno', None) in (-2, -3):
                # Connection reset by peer or temporary name-resolution failure: retry once
                self.log.debug('Temporary error, retrying in 1 second')
                time.sleep(1)
                with closing(open_func(url, timeout=self.timeout)) as f:
                    data = response(f.read() + f.read())
                    data.newurl = f.geturl()
            else:
                raise err
        finally:
            self.last_fetch_at = time.time()
        return data

    def start_fetch(self, url):
        soup = BeautifulSoup(u'<a href="' + url + '" />')
        self.log.debug('Downloading')
        res = self.process_links(soup, url, 0, into_dir='')
        self.log.debug(url, 'saved to', res)
        return res

    def is_link_ok(self, url):
        for i in self.__class__.LINK_FILTER:
            if i.search(url):
                return False
        return True

    def is_link_wanted(self, url, tag):
        try:
            return self._is_link_wanted(url, tag)
        except NotImplementedError:
            pass
        except:
            return False
        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
                    return False
        if self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True

    def process_stylesheets(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower() in ('link', 'style') and
                                             tag.has_key('type') and tag['type'].lower() == 'text/css')):
            if tag.has_key('href'):
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                with self.stylemap_lock:
                    if self.stylemap.has_key(iurl):
                        tag['href'] = self.stylemap[iurl]
                        continue
                try:
                    data = self.fetch_url(iurl)
                except Exception:
                    self.log.exception('Could not fetch stylesheet ', iurl)
                    continue
                stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
                with open(stylepath, 'wb') as x:
                    x.write(data)
                tag['href'] = stylepath
            else:
                # Inline <style> element: also fetch any @import-ed stylesheets
                for ns in tag.findAll(text=True):
                    src = str(ns)
                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                    if m:
                        iurl = m.group(1)
                        if not urlparse.urlsplit(iurl).scheme:
                            iurl = urlparse.urljoin(baseurl, iurl, False)
                        with self.stylemap_lock:
                            if self.stylemap.has_key(iurl):
                                ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                                continue
                        try:
                            data = self.fetch_url(iurl)
                        except Exception:
                            self.log.exception('Could not fetch stylesheet ', iurl)
                            continue
                        c += 1
                        stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
                        with self.stylemap_lock:
                            self.stylemap[iurl] = stylepath
                        with open(stylepath, 'wb') as x:
                            x.write(data)
                        ns.replaceWith(src.replace(m.group(1), stylepath))

    def process_images(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):
                    tag['src'] = self.imagemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
                if data == 'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
            c += 1
            fname = ascii_filename('img' + str(c))
            imgpath = os.path.join(diskpath, fname + '.jpg')
            try:
                # Convert all images to JPEG for predictable output
                im = Image.open(StringIO(data)).convert('RGBA')
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    im.save(x, 'JPEG')
                tag['src'] = imgpath
            except:
                traceback.print_exc()
                continue

    def absurl(self, baseurl, tag, key, filter=True):
        iurl = tag[key]
        parts = urlparse.urlsplit(iurl)
        if not parts.netloc and not parts.path:
            return None
        if not parts.scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
            self.log.debug('Skipping invalid link:', iurl)
            return None
        if filter and not self.is_link_wanted(iurl, tag):
            self.log.debug('Filtered link: ' + iurl)
            return None
        return iurl

    def normurl(self, url):
        parts = list(urlparse.urlsplit(url))
        parts[4] = ''  # drop the fragment
        return urlparse.urlunsplit(parts)

    def localize_link(self, tag, key, path):
        parts = urlparse.urlsplit(tag[key])
        suffix = ('#' + parts.fragment) if parts.fragment else ''
        tag[key] = path + suffix

    def process_return_links(self, soup, baseurl):
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'a' and tag.has_key('href')):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if self.filemap.has_key(nurl):
                self.localize_link(tag, 'href', self.filemap[nurl])

    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))

            for c, tag in enumerate(tags):
                if self.show_progress:
                    print '.',
                    sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if self.filemap.has_key(nurl):
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link' + str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                            len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r' % iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                    soup = self.get_soup(dsrc)

                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']
                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)

                    _fname = basename(iurl)
                    if not isinstance(_fname, unicode):
                        _fname = _fname.decode('latin1', 'replace')
                    _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl, recursion_level + 1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug('Recursion limit reached. Skipping links in', iurl)

                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(soup,
                                c == 0 and recursion_level == 0 and not getattr(self, 'called_first', False),
                                self.job_info)
                        if c == 0 and recursion_level == 0:
                            self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception:
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
            if self.show_progress:
                print
        return res


def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')):
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--base-dir',
                      help=_('Base directory into which URL is saved. Default is %default'),
                      default='.', type='string', dest='dir')
    parser.add_option('-t', '--timeout',
                      help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),
                      default=10, type='float', dest='timeout')
    parser.add_option('-r', '--max-recursions', default=1,
                      help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'),
                      type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                      help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
    parser.add_option('--delay', default=0, dest='delay', type='int',
                      help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
    parser.add_option('--encoding', default=None,
                      help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
    parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                      help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                      help=_('Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.'))
    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
                      help=_('Do not download CSS stylesheets.'), dest='no_stylesheets')
    parser.add_option('--verbose', help=_('Show detailed output information. Useful for debugging'),
                      default=False, action='store_true', dest='verbose')
    return parser


def create_fetcher(options, image_map={}, log=None):
    if log is None:
        log = Log()
    return RecursiveFetcher(options, log, image_map=image_map)


def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        return 1

    fetcher = create_fetcher(options)
    fetcher.start_fetch(args[1])


if __name__ == '__main__':
    sys.exit(main())
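

# Usage sketch (an illustrative addition, not recovered from the .pyc): one way
# to drive the fetcher from Python instead of the command line, using only the
# names defined above (option_parser, create_fetcher, start_fetch, failed_links).
# The helper name, output directory, URL and recursion depth are placeholder
# assumptions; everything else falls back to the option_parser() defaults.
def example_fetch(url='http://example.com', dest='/tmp/fetched'):  # hypothetical helper, never called by this module
    parser = option_parser()
    options, rest = parser.parse_args(['-d', dest, '--max-recursions', '1'])
    fetcher = create_fetcher(options)
    index = fetcher.start_fetch(url)  # path of the locally saved root page
    print 'Saved to', index
    if fetcher.failed_links:
        print 'Failed to fetch:', [u for u, tb in fetcher.failed_links]
    return index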