home *** CD-ROM | disk | FTP | other *** search
/ H4CK3R 14 / hacker14.iso / programacao / pythonwin / python.exe / WEBSUCKER.PY < prev    next >
Encoding:
Python Source  |  2002-09-11  |  3.4 KB  |  126 lines

  1. #! /usr/bin/env python
  2.  
  3. """A variant on webchecker that creates a mirror copy of a remote site."""
  4.  
  5. __version__ = "$Revision: 1.10 $"
  6.  
  7. import os
  8. import sys
  9. import urllib
  10. import getopt
  11.  
  12. import webchecker
  13.  
  14. # Extract real version number if necessary
  15. if __version__[0] == '$':
  16.     _v = __version__.split()
  17.     if len(_v) == 3:
  18.         __version__ = _v[1]
  19.  
  20. def main():
  21.     verbose = webchecker.VERBOSE
  22.     try:
  23.         opts, args = getopt.getopt(sys.argv[1:], "qv")
  24.     except getopt.error, msg:
  25.         print msg
  26.         print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
  27.         return 2
  28.     for o, a in opts:
  29.         if o == "-q":
  30.             verbose = 0
  31.         if o == "-v":
  32.             verbose = verbose + 1
  33.     c = Sucker()
  34.     c.setflags(verbose=verbose)
  35.     c.urlopener.addheaders = [
  36.             ('User-agent', 'websucker/%s' % __version__),
  37.         ]
  38.     for arg in args:
  39.         print "Adding root", arg
  40.         c.addroot(arg)
  41.     print "Run..."
  42.     c.run()
  43.  
  44. class Sucker(webchecker.Checker):
  45.  
  46.     checkext = 0
  47.     nonames = 1
  48.  
  49.     # SAM 11/13/99: in general, URLs are now URL pairs.
  50.     # Since we've suppressed name anchor checking,
  51.     # we can ignore the second dimension.
  52.  
  53.     def readhtml(self, url_pair):
  54.         url = url_pair[0]
  55.         text = None
  56.         path = self.savefilename(url)
  57.         try:
  58.             f = open(path, "rb")
  59.         except IOError:
  60.             f = self.openpage(url_pair)
  61.             if f:
  62.                 info = f.info()
  63.                 nurl = f.geturl()
  64.                 if nurl != url:
  65.                     url = nurl
  66.                     path = self.savefilename(url)
  67.                 text = f.read()
  68.                 f.close()
  69.                 self.savefile(text, path)
  70.                 if not self.checkforhtml(info, url):
  71.                     text = None
  72.         else:
  73.             if self.checkforhtml({}, url):
  74.                 text = f.read()
  75.             f.close()
  76.         return text, url
  77.  
  78.     def savefile(self, text, path):
  79.         dir, base = os.path.split(path)
  80.         makedirs(dir)
  81.         try:
  82.             f = open(path, "wb")
  83.             f.write(text)
  84.             f.close()
  85.             self.message("saved %s", path)
  86.         except IOError, msg:
  87.             self.message("didn't save %s: %s", path, str(msg))
  88.  
  89.     def savefilename(self, url):
  90.         type, rest = urllib.splittype(url)
  91.         host, path = urllib.splithost(rest)
  92.         path = path.lstrip("/")
  93.         user, host = urllib.splituser(host)
  94.         host, port = urllib.splitnport(host)
  95.         host = host.lower()
  96.         if not path or path[-1] == "/":
  97.             path = path + "index.html"
  98.         if os.sep != "/":
  99.             path = os.sep.join(path.split("/"))
  100.             if os.name == "mac":
  101.                 path = os.sep + path
  102.         path = os.path.join(host, path)
  103.         return path
  104.  
  105. def makedirs(dir):
  106.     if not dir:
  107.         return
  108.     if os.path.exists(dir):
  109.         if not os.path.isdir(dir):
  110.             try:
  111.                 os.rename(dir, dir + ".bak")
  112.                 os.mkdir(dir)
  113.                 os.rename(dir + ".bak", os.path.join(dir, "index.html"))
  114.             except os.error:
  115.                 pass
  116.         return
  117.     head, tail = os.path.split(dir)
  118.     if not tail:
  119.         print "Huh?  Don't know how to make dir", dir
  120.         return
  121.     makedirs(head)
  122.     os.mkdir(dir, 0777)
  123.  
  124. if __name__ == '__main__':
  125.     sys.exit(main() or 0)
  126.