home *** CD-ROM | disk | FTP | other *** search
/ MacHack 2000 / MacHack 2000.toast / pc / The Hacks / MacHacksBug / Python 1.5.2c1 / Tools / webchecker / robotparser.py < prev    next >
Encoding:
Python Source  |  2000-06-23  |  3.2 KB  |  98 lines

  1. """
  2.  
  3. Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
  4. input, builds a set of rules from that list, then answers questions about
  5. fetchability of other URLs.
  6.  
  7. """
  8.  
  9. class RobotFileParser:
  10.  
  11.     def __init__(self):
  12.         self.rules = {}
  13.         self.debug = 0
  14.         self.url = ''
  15.         self.last_checked = 0
  16.  
  17.     def mtime(self):
  18.         return self.last_checked
  19.  
  20.     def modified(self):
  21.         import time
  22.         self.last_checked = time.time()
  23.  
  24.     def set_url(self, url):
  25.         self.url = url
  26. ##      import urlmisc
  27. ##      self.url = urlmisc.canonical_url(url)
  28.  
  29.     def read(self):
  30.         import urllib
  31.         self.parse(urllib.urlopen(self.url).readlines())
  32.  
  33.     def parse(self, lines):
  34.         import regsub, string, regex
  35.         active = []
  36.         for line in lines:
  37.             if self.debug: print '>', line,
  38.             # blank line terminates current record
  39.             if not line[:-1]:
  40.                 active = []
  41.                 continue
  42.             # remove optional comment and strip line
  43.             line = string.strip(line[:string.find(line, '#')])
  44.             if not line:
  45.                 continue
  46.             line = regsub.split(line, ' *: *')
  47.             if len(line) == 2:
  48.                 line[0] = string.lower(line[0])
  49.                 if line[0] == 'user-agent':
  50.                     # this record applies to this user agent
  51.                     if self.debug: print '>> user-agent:', line[1]
  52.                     active.append(line[1])
  53.                     if not self.rules.has_key(line[1]):
  54.                         self.rules[line[1]] = []
  55.                 elif line[0] == 'disallow':
  56.                     if line[1]:
  57.                         if self.debug: print '>> disallow:', line[1]
  58.                         for agent in active:
  59.                             self.rules[agent].append(regex.compile(line[1]))
  60.                     else:
  61.                         pass
  62.                         for agent in active:
  63.                             if self.debug: print '>> allow', agent
  64.                             self.rules[agent] = []
  65.                 else:
  66.                     if self.debug: print '>> unknown:', line
  67.  
  68.         self.modified()
  69.  
  70.     # returns true if agent is allowed to fetch url
  71.     def can_fetch(self, agent, url):
  72.         import urlparse
  73.         ag = agent
  74.         if not self.rules.has_key(ag): ag = '*'
  75.         if not self.rules.has_key(ag):
  76.             if self.debug: print '>> allowing', url, 'fetch by', agent
  77.             return 1
  78.         path = urlparse.urlparse(url)[2]
  79.         for rule in self.rules[ag]:
  80.             if rule.match(path) != -1:
  81.                 if self.debug: print '>> disallowing', url, 'fetch by', agent
  82.                 return 0
  83.         if self.debug: print '>> allowing', url, 'fetch by', agent
  84.         return 1
  85.  
  86. def test():
  87.     rp = RobotFileParser()
  88.     rp.debug = 1
  89.     rp.set_url('http://www.automatrix.com/robots.txt')
  90.     rp.read()
  91.     print rp.rules
  92.     print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
  93.     print rp.can_fetch('Musi-Cal-Robot',
  94.                        'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
  95.  
  96.     print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
  97.     print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
  98.