home *** CD-ROM | disk | FTP | other *** search
/ Chip 2006 June / CHIP 2006-06.2.iso / program / freeware / Democracy-0.8.2.exe / xulrunner / python / download_utils.py < prev    next >
Encoding:
Python Source  |  2006-04-10  |  17.4 KB  |  464 lines

  1. import config
  2. import re
  3. import socket
  4. from urlparse import urlparse,urljoin
  5. from threading import RLock
  6. from time import time
  7. from httplib import HTTPConnection, HTTPSConnection,HTTPException
  8.  
  9. # Filter invalid URLs with duplicated ports (http://foo.bar:123:123/baz) which
  10. # seem to be part of #441.
  11. def parseURL(url):
  12.     (scheme, host, path, params, query, fragment) = urlparse(url)
  13.     if host.count(':') > 1:
  14.         host = host[0:host.rfind(':')]
  15.     return (scheme, host, path, params, query, fragment)
  16.  
  17. # Returns a filename minus nasty characters
  18. def cleanFilename(filename):
  19.     return filename.replace("\\","").replace("/","").replace(":","").replace("*","").replace("?","").replace("\"","").replace("<","").replace(">","").replace("|","")
  20.  
  21. # FIXME: Currently, returns a None object in the case where it can't
  22. # download the file. In the future, we should probably raise
  23. # exceptions for each possible failure case and catch those everywhere
  24. # this is used.
  25.  
  26. # Given a URL returns an info object which may contain the following
  27. # keys: content-length, accept-ranges, server, last-modified, date,
  28. # etag, content-type, redirected-url, updated-url, file-handle
  29. #
  30. # redirected-url, updated-url, filename, and file-handle are generated by
  31. # getURLInfo. All of the other information is grabbed from the actual
  32. # HTTP headers.
  33. #
  34. # Currently, only GET and HEAD requests are supported
  35. #
  36. # File handle is passed when a GET request is made. Call read() on it
  37. # until read() returns '', then call close(). If you do not call
  38. # close(), the connection will never be freed up.
  39. #
  40. # Redirected URL is the URL actually loaded after all of the redirects.
  41. # Updated-url is the URL of the last permanent redirect
  42. def grabURL(url, type="GET",start = 0, etag=None,modified=None,findHTTPAuth=None):
  43.     if findHTTPAuth is None:
  44.         import downloader
  45.         findHTTPAuth = downloader.findHTTPAuth
  46.     maxDepth = 10
  47.     maxAuthAttempts = 5
  48.     redirURL = url
  49.     userAgent = "%s/%s (%s)" % \
  50.         (config.get(config.SHORT_APP_NAME),
  51.          config.get(config.APP_VERSION),
  52.          config.get(config.PROJECT_URL))
  53.     myHeaders = {"User-Agent": userAgent}
  54.  
  55.     (scheme, host, path, params, query, fragment) = parseURL(url)
  56.     #print "grab URL called for "+host
  57.  
  58.     auth = findHTTPAuth(host,path)
  59.     if not auth is None:
  60.         #print " adding auth header"
  61.         myHeaders["Authorization"] = auth.getAuthScheme()+' '+auth.getAuthToken()
  62.  
  63.     if len(params):
  64.         path += ';'+params
  65.     if len(query):
  66.         path += '?'+query
  67.  
  68.     if start > 0:
  69.         myHeaders["Range"] = "bytes="+str(start)+"-"
  70.  
  71.     if not etag is None:
  72.         myHeaders["If-None-Match"] = etag
  73.  
  74.     if not modified is None:
  75.         myHeaders["If-Modified-Since"] = modified
  76.  
  77.     download = connectionPool.getRequest(scheme,host,type,path, headers = myHeaders)
  78.  
  79.     if download is None:
  80.         return None
  81.  
  82.     #print "Got it!"
  83.     depth = 0
  84.     authAttempts = 0
  85.     while ((download.status != 304) and
  86.            ((start == 0 and download.status != 200) or
  87.             (start > 0 and download.status != 206)) and 
  88.            (depth < maxDepth and authAttempts < maxAuthAttempts)):
  89.         if download.status == 302 or download.status == 307 or download.status == 301:
  90.             #print " redirect"
  91.             depth += 1
  92.             info = download.msg
  93.             download.close()
  94.             if info.has_key('location'):
  95.                 redirURL = urljoin(redirURL,info['location'])
  96.             if download.status == 301:
  97.                 url = redirURL
  98.             (scheme, host, path, params, query, fragment) = parseURL(redirURL)
  99.  
  100.             try:
  101.                 del myHeaders["Authorization"]
  102.             except KeyError:
  103.                 pass
  104.             auth = findHTTPAuth(host,path)
  105.             if not auth is None:
  106.                 #print " adding auth header"
  107.                 myHeaders["Authorization"] = auth.getAuthScheme()+' '+auth.getAuthToken()
  108.  
  109.             if len(params):
  110.                 path += ';'+params
  111.             if len(query):
  112.                 path += '?'+query
  113.             #print "getURLInfo Redirected to "+host
  114.             download = connectionPool.getRequest(scheme,host,type,path, headers=myHeaders)
  115.             if download is None:
  116.                 return None
  117.         elif download.status == 401:
  118.             if download.msg.has_key('WWW-Authenticate'):
  119.                 authAttempts += 1
  120.                 info = download.msg
  121.                 download.close()
  122.                 regExp = re.compile("^(.*?)\s+realm\s*=\s*\"(.*?)\"$").search(info['WWW-Authenticate'])
  123.                 authScheme = regExp.expand("\\1")
  124.                 realm = regExp.expand("\\2")
  125.                 #print "Trying to authenticate "+host+" realm:"+realm
  126.                 result = delegate.getHTTPAuth(host,realm)
  127.                 if not result is None:
  128.                     auth = HTTPAuthPassword(result[0],result[1],host, realm, path, authScheme)
  129.                     myHeaders["Authorization"] = auth.getAuthScheme()+' '+auth.getAuthToken()
  130.                     download = connectionPool.getRequest(scheme,host,type,path, headers=myHeaders)
  131.                 else:
  132.                     return None #The user hit Cancel
  133.  
  134.                 #This is where we would do our magic to prompt for a password
  135.                 #If we get a good password, we save it
  136.             else:
  137.                 break
  138.         else: #Some state we don't handle
  139.             break
  140.  
  141.     # Valid or cached pages
  142.     if not download.status in [200,206,304]:
  143.         return None
  144.  
  145.     #print "processing request"
  146.     info = download.msg
  147.     myInfo = {}
  148.     for key in info.keys():
  149.         myInfo[key] = info[key]
  150.     info = myInfo
  151.     if type == 'GET':
  152.         info['file-handle'] = download
  153.     else:
  154.         download.close()
  155.     #print "closed request"
  156.  
  157.     info['filename'] = 'unknown'
  158.     try:
  159.         disposition = info['content-disposition']
  160.         info['filename'] = re.compile("^.*filename\s*=\s*\"(.*?)\"$").search(disposition).expand("\\1")
  161.         info['filename'] = cleanFilename(info['filename'])
  162.     except:
  163.         try:
  164.             info['filename'] = re.compile("^.*?([^/]+)/?$").search(path).expand("\\1")
  165.             info['filename'] = cleanFilename(info['filename'])
  166.         except:
  167.             pass
  168.  
  169.     info['redirected-url'] = redirURL
  170.     info['updated-url'] = url
  171.     info['status'] = download.status
  172.     try:
  173.         info['charset'] = re.compile("^.*charset\s*=\s*(\S+)/?$").search(info['content-type']).expand("\\1")
  174.     except (AttributeError, KeyError):
  175.         pass
  176.     return info
  177.  
  178. # An HTTP response that tells the connection pool when it is free,
  179. # so that the connection can be reused
  180. class PooledHTTPResponse:
  181.     def __init__(self,conn,response,connPool):
  182.         self.conn = conn
  183.         self.response = response
  184.         self.connPool = connPool
  185.         self.beenRead = False
  186.  
  187.     def read(self,amt=None):
  188.         if amt is None:
  189.             ret = ''
  190.             next = self.response.read()
  191.             while len(next) > 0:
  192.                 ret += next
  193.                 next = self.response.read()
  194.             self.beenRead = True
  195.         else:
  196.             ret = self.response.read(amt)
  197.         if ret == '':
  198.             self.beenRead = True
  199.         return ret
  200.  
  201.     def getheader(self,name,default = None):
  202.         if isinstance(default, None):
  203.             return self.response.getheader(name)
  204.         else:
  205.             return self.response.getheader(name,default)
  206.         
  207.     def getheaders(self):
  208.         return self.response.getheaders()
  209.  
  210.     def __getattr__(self,key):
  211.         return getattr(self.response, key)
  212.  
  213.     #
  214.     # Use like close(), but in the middle of a download
  215.     def kill(self):
  216.         if not self.beenRead:
  217.             self.connPool.removeConn(self.conn)
  218.         else:
  219.             self.connPool.freeConn(self.conn)
  220.  
  221.     def close(self):
  222.         if not self.beenRead:
  223.             #print "Closing unread response..."+str(self.response)
  224.             try:
  225.                 out = self.response.read(8192)
  226.                 while len(out)>0:
  227.                      #print "still closing "+str(self.response)
  228.                     out = self.response.read(8192)
  229.                 #print "done closing"
  230.                 self.connPool.freeConn(self.conn)
  231.             except ValueError:
  232.                 print "Caught error in httplib"
  233.                 self.connPool.removeConn(self.conn)
  234.  
  235.     ##
  236.     # Called by pickle during serialization
  237.     def __getstate__(self):
  238.         assert(0) #This should never be serialized
  239.  
  240. #
  241. # This class manages a set of HTTP connections so that we always do the
  242. # optimal thing when we need a new connection. Generally, if there's a
  243. # free existing connection, we use that, otherwise we create a new one
  244. #
  245. # FIXME: add certificate validation for HTTPS
  246. class HTTPConnectionPool:
  247.  
  248.     # The maximum number of connections we keep active. The total
  249.     # number may exceed this, but free connections that bring the
  250.     # total number above maxConns will be closed
  251.     maxConns = 30
  252.     maxConnsPerServer = 8
  253.     connTimeout = 300
  254.     def __init__(self):
  255.         self.conns={'free':{'http':{},'https':{}},
  256.                     'inuse':{'http':{},'https':{}}}
  257.         self.lock = RLock()
  258.  
  259.     def __len__(self):
  260.         self.lock.acquire()
  261.         try:
  262.             length = 0
  263.             for state in self.conns:
  264.                 for protocol in self.conns[state]:
  265.                     for host in self.conns[state][protocol]:
  266.                         length += len(self.conns[state][protocol][host])
  267.         finally:
  268.             self.lock.release()
  269.         return length
  270.  
  271.     # moves connection from inuse to free
  272.     #
  273.     # get your freeConn!
  274.     def freeConn(self,conn):
  275.         freed = False
  276.         #print "Trying to free connection..."
  277.         self.lock.acquire()
  278.         try:
  279.             for prot in self.conns['inuse']:
  280.                 for h in self.conns['inuse'][prot]:
  281.                     try:
  282.                         index = self.conns['inuse'][prot][h].index(conn)
  283.                         del self.conns['inuse'][prot][h][index]
  284.                         protocol = prot
  285.                         host = h
  286.                         freed = True
  287.                     except ValueError:
  288.                         pass
  289.             if freed:
  290.                 #print "Connection to "+host+ " is idle"
  291.                 if not self.conns['free'][protocol].has_key(host):
  292.                     self.conns['free'][protocol][host] = []
  293.                 self.conns['free'][protocol][host].append((conn, time()+self.connTimeout))
  294.             #else:
  295.                 #print "Not freed!"
  296.         finally:
  297.             self.lock.release()
  298.  
  299.     #
  300.     # Removes a connection from the pool
  301.     def removeConn(self,conn):
  302.         self.lock.acquire()
  303.         try:
  304.             for protocol in self.conns['free']:
  305.                 for host in self.conns['free'][protocol]:
  306.                     for pair in self.conns['free'][protocol][host]:
  307.                         if pair[0] is conn:
  308.                             #print "Removing connection to "+host
  309.                             self.conns['free'][protocol][host].remove(pair)
  310.             for protocol in self.conns['inuse']:
  311.                 for host in self.conns['inuse'][protocol]:
  312.                     for uConn in self.conns['inuse'][protocol][host]:
  313.                         if uConn is conn:
  314.                             #print "Removing connection to "+host
  315.                             self.conns['inuse'][protocol][host].remove(uConn)
  316.         finally:
  317.             self.lock.release()
  318.         conn.close()
  319.  
  320.     def removeOldestFreeConnection(self):
  321.         #print "Removing oldest connection..."
  322.         self.lock.acquire()
  323.         try:
  324.             conn = None
  325.             oldest = -1
  326.             for protocol in self.conns['free']:
  327.                 for host in self.conns['free'][protocol]:
  328.                     for (newConn, newExp) in self.conns['free'][protocol][host]:
  329.                         if newExp > oldest:
  330.                             oldest = newExp
  331.                             conn = newConn
  332.             if not (conn is None):
  333.                 self.removeConn(conn)
  334.         finally:
  335.             self.lock.release()
  336.         #print "...done"
  337.  
  338.     def removeOldestFreeByHost(self,protocol,host):
  339.         #print "Removing oldest connection to "+host+"..."
  340.         self.lock.acquire()
  341.         try:
  342.             conn = None
  343.             oldest = -1
  344.             for (newConn, newExp) in self.conns['free'][protocol][host]:
  345.                 if newExp > oldest:
  346.                     oldest = newExp
  347.                     conn = newConn
  348.             if not (conn is None):
  349.                 self.removeConn(conn)
  350.         finally:
  351.             self.lock.release()
  352.         #print "...done"
  353.  
  354.     def expireOldConnections(self):
  355.         now = time()
  356.         self.lock.acquire()
  357.         try:
  358.             for protocol in self.conns['free']:
  359.                 for host in self.conns['free'][protocol]:
  360.                     for pair in self.conns['free'][protocol][host]:
  361.                         if pair[1] <= now:
  362.                             #print "Expiring connection to "+host
  363.                             pair[0].close()
  364.                             self.conns['free'][protocol][host].remove(pair)
  365.         finally:
  366.             self.lock.release()
  367.  
  368.     def getNumConnsByHost(self,protocol,host):
  369.         self.lock.acquire()
  370.         try:
  371.             if not self.conns['free'][protocol].has_key(host):
  372.                 self.conns['free'][protocol][host] = []
  373.             if not self.conns['inuse'][protocol].has_key(host):
  374.                 self.conns['inuse'][protocol][host] = []
  375.             ret = (len(self.conns['inuse'][protocol][host])+
  376.                    len(self.conns['free'][protocol][host]))
  377.         finally:
  378.             self.lock.release()
  379.         return ret
  380.  
  381.     def getRequest(self,protocol,host,method,url,*args,**keywords):
  382.         #print "Making "+protocol+" connection to "+host+"..."
  383.         madeNewConn = False
  384.         self.lock.acquire()
  385.         try:
  386.             conn = None
  387.             self.expireOldConnections()
  388.             if (self.conns['free'][protocol].has_key(host) and
  389.                         len(self.conns['free'][protocol][host]) > 0):
  390.                 (conn, expiration) = self.conns['free'][protocol][host].pop(0)
  391.                 if not self.conns['inuse'][protocol].has_key(host):
  392.                     self.conns['inuse'][protocol][host] = []
  393.                 self.conns['inuse'][protocol][host].append(conn)
  394.                 #print "Using existing connection"
  395.         finally:
  396.             self.lock.release()
  397.         
  398.         # We don't already have a connection -- get one
  399.         if conn is None:
  400.             madeNewConn = True
  401.             #print "Making new connection..."
  402.             if protocol.lower() == 'http':
  403.                 conn = HTTPConnection(host)
  404.             elif protocol.lower() == 'https':
  405.                 conn = HTTPSConnection(host)
  406.  
  407.             #Save our newly created connection
  408.             self.lock.acquire()
  409.             try:   
  410.                 if not self.conns['free'][protocol].has_key(host):
  411.                     self.conns['free'][protocol][host] = []
  412.                 if not self.conns['inuse'][protocol].has_key(host):
  413.                     self.conns['inuse'][protocol][host] = []
  414.             
  415.                 if (self.getNumConnsByHost(protocol,host) == 
  416.                                                  self.maxConnsPerServer):
  417.                     self.removeOldestFreeByHost(protocol, host)
  418.  
  419.                 if len(self) == self.maxConns:
  420.                     self.removeOldestFreeConnection()
  421.                 if (len(self) < self.maxConns and 
  422.                 self.getNumConnsByHost(protocol,host) < self.maxConnsPerServer):
  423.                     if not self.conns['inuse'][protocol].has_key(host):
  424.                         self.conns['inuse'][protocol][host] = []
  425.                     self.conns['inuse'][protocol][host].append(conn)
  426.                     #print "...saving connection"
  427.                 #else:
  428.                     #print "...not saving connection"
  429.             finally:
  430.                 self.lock.release()
  431.  
  432.         #print "Making request..."
  433.         try:
  434.             conn.request(method,url,*args,**keywords)
  435.         except socket.error:
  436.             if madeNewConn:
  437.                 return None
  438.             else: # We had a connection before. Maybe the connection
  439.                   # just timed out...
  440.                 #print "An old connection may have timed out. Trying again."
  441.                 self.removeConn(conn)
  442.                 return self.getRequest(protocol,host,method,url,*args,**keywords)
  443.  
  444.         #print "Getting response..."
  445.         try:
  446.             response = conn.getresponse()
  447.         except (HTTPException, socket.timeout):
  448.             if madeNewConn:
  449.                 return None
  450.             else: # We had a connection before. Maybe the connection
  451.                   # just timed out...
  452.                 #print "An old connection may have timed out. Trying again."
  453.                 self.removeConn(conn)
  454.                 return self.getRequest(protocol,host,method,url,*args,**keywords)
  455.         #print "Leaving connectionPool"
  456.         return PooledHTTPResponse(conn,response,self)
  457.  
  458.     ##
  459.     # Called by pickle during serialization
  460.     def __getstate__(self):
  461.         assert(0) #This should never be serialized
  462.  
# Module-level pool instance; grabURL sends all of its requests through it.
connectionPool = HTTPConnectionPool()
  464.