# lib/robotparser.py from python-core-2.2.2 (text file, 2004-10-09)

  1. """ robotparser.py
  2.  
  3.     Copyright (C) 2000  Bastian Kleineidam
  4.  
  5.     You can choose between two licenses when using this package:
  6.     1) GNU GPLv2
  7.     2) PYTHON 2.0 OPEN SOURCE LICENSE
  8.  
  9.     The robots.txt Exclusion Protocol is implemented as specified in
  10.     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
  11. """
import re, urlparse, urllib

__all__ = ["RobotFileParser"]

debug = 0

def _debug(msg):
    if debug: print msg


class RobotFileParser:
    def __init__(self, url=''):
        self.entries = []
        self.disallow_all = 0
        self.allow_all = 0
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        return self.last_checked

    def modified(self):
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        opener = URLopener()
        f = opener.open(self.url)
        lines = f.readlines()
        self.errcode = opener.errcode
        if self.errcode == 401 or self.errcode == 403:
            self.disallow_all = 1
            _debug("disallow all")
        elif self.errcode >= 400:
            self.allow_all = 1
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We are lenient here: a user-agent: line does not have to be
           preceded by one or more blank lines."""
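        # Parser state:
        #   0 -- expecting a user-agent: line (start of file, or after a blank line)
        #   1 -- user-agent: line(s) collected, no rule lines yet
        #   2 -- at least one allow:/disallow: rule collected for the current entry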
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            line = line.strip()
            linenumber = linenumber + 1
            if not line:
                if state==1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state==2:
                    self.entries.append(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i>=0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = line[1].strip()
                if line[0] == "user-agent":
                    if state==2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self.entries.append(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                        state = 2
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                               line[0]))
            else:
                _debug("line %d: error: malformed line %s"%(linenumber, line))
        if state==2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))


    def can_fetch(self, useragent, url):
        """using the parsed robots.txt, decide if useragent can fetch url"""
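        # disallow_all/allow_all are set by read() from the HTTP status of the
        # robots.txt fetch (401/403 disallows everything, any other status
        # >= 400 allows everything); they take precedence over parsed rules.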
        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
               (useragent, url))
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # agent not found ==> access granted
        return 1


    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
       (allowance==0) followed by a path."""
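    # The (URL-quoted) path is used by applies_to() as a regular expression
    # that re.match anchors at the start of the checked path, so a rule such
    # as "/cgi-bin/" applies to everything under that prefix.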
    def __init__(self, path, allowance):
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path=="*" or re.match(self.path, filename)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow")+": "+self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
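    # For example, a robots.txt record such as
    #
    #     User-agent: CherryPickerSE
    #     User-agent: ExtractorPro
    #     Disallow: /
    #
    # parses into one Entry with two useragents and one disallowing RuleLine.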
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: "+agent+"\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
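        # The caller's agent string is reduced to its name token (the part
        # before any "/version"), lowered, and searched for as a substring of
        # each agent named in this entry, so 'CherryPickerSE/1.0' matches an
        # entry for 'CherryPickerSE'.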
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent=='*':
                # we have the catch-all agent
                return 1
            agent = agent.lower()
            # don't forget to re.escape
            if re.search(re.escape(useragent), agent):
                return 1
        return 0

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL quoted (as done by can_fetch)"""
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return 1

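# FancyURLopener handles most HTTP errors itself instead of raising them;
# this subclass records the last HTTP status code in self.errcode so read()
# can act on it, and gives up after maxtries consecutive redirects to avoid
# redirect loops.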
class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        apply(urllib.FancyURLopener.__init__, (self,) + args)
        self.errcode = 200
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        self.tries += 1
        if self.tries >= self.maxtries:
            return self.http_error_default(url, fp, 500,
                                           "Internal Server Error: Redirect Recursion",
                                           headers)
        result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
                                                      errmsg, headers, data)
        self.tries = 0
        return result

def _check(a, b):
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a != b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print

def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)

if __name__ == '__main__':
    _test()