#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

import mechanize, string, urllib, time, re


class Economist(BasicNewsRecipe):

    title = 'The Economist'
    language = 'en'

    __author__ = "Kovid Goyal"
    INDEX = 'http://www.economist.com/printedition'
    description = ('Global news and current affairs from a European perspective.'
            ' Needs a subscription from ') + INDEX

    oldest_article = 7.0
    cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
    remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
            dict(attrs={'class': ['dblClkTrk', 'ec-article-info']})]
    keep_only_tags = [dict(id='ec-article-body')]
    needs_subscription = True
    no_stylesheets = True
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x: '</html>')]

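    # Log in before fetching anything: opening the homepage first picks up
    # session cookies, then the credentials are POSTed to the members login
    # handler. The field names (logging_in, fakepword, pword, x, y) mirror
    # the economist.com login form this recipe was written against and are
    # site-specific.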
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www.economist.com')
        req = mechanize.Request(
                'http://www.economist.com/members/members.cfm?act=exec_login',
                headers={
                    'Referer': 'http://www.economist.com/',
                    },
                data=urllib.urlencode({
                    'logging_in'   : 'Y',
                    'returnURL'    : '/',
                    'email_address': self.username,
                    'fakepword'    : 'Password',
                    'pword'        : self.password,
                    'x'            : '0',
                    'y'            : '0',
                    }))
        br.open(req).read()
        return br

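    # The printed-edition index sometimes fails to load on the first try,
    # so retry once after a 30 second pause before letting the error
    # propagate.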
    def parse_index(self):
        try:
            return self.economist_parse_index()
        except Exception:
            self.log.warn(
                'Initial attempt to parse index failed, retrying in 30 seconds')
            time.sleep(30)
            return self.economist_parse_index()

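    # The print-edition index lists the issue as alternating headings:
    # each <h1> names a section and the <h2> headings that follow it are
    # that section's articles. Walk the headings in document order,
    # starting at 'The world this week'/'The world this year' and
    # stopping at 'Classified ads'.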
    def economist_parse_index(self):
        soup = BeautifulSoup(self.browser.open(self.INDEX).read(),
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        index_started = False
        feeds = {}
        ans = []
        key = None
        for tag in soup.findAll(['h1', 'h2']):
            text = ''.join(tag.findAll(text=True))
            if 'Classified ads' in text:
                break
            if tag.name == 'h1':
                if 'The world this week' in text or 'The world this year' in text:
                    index_started = True
                if not index_started:
                    continue
                text = string.capwords(text)
                if text not in feeds:
                    feeds[text] = []
                if text not in ans:
                    ans.append(text)
                key = text
                continue
            if key is None:
                continue
            a = tag.find('a', href=True)
            if a is not None:
                url = a['href']
                match = re.search(r'story_id=(\d+)', url)
                if match is None:
                    continue
                url = 'http://www.economist.com/node/%s/print' % match.group(1)
                try:
                    subtitle = tag.previousSibling.contents[0].contents[0]
                    text = subtitle + ': ' + text
                except Exception:
                    pass
                article = dict(title=text,
                    url=url,
                    description='', content='', date='')
                feeds[key].append(article)

        ans = [(key, feeds[key]) for key in ans if key in feeds]
        if not ans:
            raise Exception('Could not find any articles. Has your subscription expired?')
        return ans

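    # Heuristic for finding captioned images on the print pages: they are
    # laid out as small aligned tables containing exactly one <img> and
    # one or two <font> caption cells.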
    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x

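    # Flatten each image table found above into a plain <div>: caption
    # first (at 70% font size), then a line break, then the image itself.
    # Presentational attributes are stripped from <body> along the way.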
    def postprocess_html(self, soup, first):
        body = soup.find('body')
        for name, val in list(body.attrs):
            del body[name]

        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup
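# Usage sketch, assuming a standard calibre install (the credentials below
# are placeholders): save this file as economist.recipe and build the issue
# with calibre's command line tools, e.g.
#
#   ebook-convert economist.recipe economist.epub \
#       --username you@example.com --password secret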