home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3792 < prev    next >
Encoding:
Text File  |  2010-02-08  |  4.8 KB  |  124 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. import re
  3.  
  4. class HBR(BasicNewsRecipe):
  5.  
  6.     title = 'Harvard Business Review'
  7.     description = 'To subscribe go to http://hbr.harvardbusiness.org'
  8.     needs_subscription = True
  9.     __author__ = 'Kovid Goyal and Sujata Raman'
  10.     timefmt                = ' [%B %Y]'
  11.     language = 'en'
  12.     no_stylesheets = True
  13.  
  14.     LOGIN_URL = 'http://hbr.org/login?request_url=/'
  15.     INDEX = 'http://hbr.org/current'
  16.  
  17.     keep_only_tags = [dict(name='div', id='pageContainer')]
  18.     remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
  19.         'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
  20.         'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
  21.         'mailingListTout', 'partnerCenter', 'pageFooter',
  22.         'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
  23.         dict(name='iframe')]
  24.     extra_css = '''
  25.                 a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
  26.                 .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
  27.                 h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
  28.                 h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;  }
  29.                 #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
  30.                 #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
  31.                 '''
  32.  
  33.     def get_browser(self):
  34.         br = BasicNewsRecipe.get_browser(self)
  35.         br.open(self.LOGIN_URL)
  36.         br.select_form(name='signInForm')
  37.         br['signInForm:username'] = self.username
  38.         br['signInForm:password'] = self.password
  39.         raw = br.submit().read()
  40.         if 'My Account' not in raw:
  41.             raise Exception('Failed to login, are you sure your username and password are correct?')
  42.         self.logout_url = None
  43.         link = br.find_link(text='Sign out')
  44.         if link:
  45.             self.logout_url = link.absolute_url
  46.         return br
  47.  
  48.     def cleanup(self):
  49.         if self.logout_url is not None:
  50.             self.browser.open(self.logout_url)
  51.  
  52.     def map_url(self, url):
  53.         if url.endswith('/ar/1'):
  54.             return url[:-1]+'pr'
  55.  
  56.  
  57.     def hbr_get_toc(self):
  58.         soup = self.index_to_soup(self.INDEX)
  59.         url = soup.find('a', text=lambda t:'Full Table of Contents' in t).parent.get('href')
  60.         return self.index_to_soup('http://hbr.org'+url)
  61.  
  62.     def hbr_parse_section(self, container, feeds):
  63.         current_section = None
  64.         current_articles = []
  65.         for x in container.findAll(name=['li', 'h3', 'h4']):
  66.             if x.name in ['h3', 'h4'] and not x.findAll(True):
  67.                 if current_section and current_articles:
  68.                     feeds.append((current_section, current_articles))
  69.                 current_section = self.tag_to_string(x)
  70.                 current_articles = []
  71.                 self.log('\tFound section:', current_section)
  72.             if x.name == 'li':
  73.                 a = x.find('a', href=True)
  74.                 if a is not None:
  75.                     title = self.tag_to_string(a)
  76.                     url = a.get('href')
  77.                     if '/ar/' not in url:
  78.                         continue
  79.                     if url.startswith('/'):
  80.                         url = 'http://hbr.org'+url
  81.                     url = self.map_url(url)
  82.                     p = x.find('p')
  83.                     desc = ''
  84.                     if p is not None:
  85.                         desc = self.tag_to_string(p)
  86.                     if not title or not url:
  87.                         continue
  88.                     self.log('\t\tFound article:', title)
  89.                     self.log('\t\t\t', url)
  90.                     self.log('\t\t\t', desc)
  91.                     current_articles.append({'title':title, 'url':url,
  92.                         'description':desc, 'date':''})
  93.         if current_section and current_articles:
  94.             feeds.append((current_section, current_articles))
  95.  
  96.  
  97.  
  98.     def hbr_parse_toc(self, soup):
  99.         feeds = []
  100.         features = soup.find(id='issueFeaturesContent')
  101.         self.hbr_parse_section(features, feeds)
  102.         departments = soup.find(id='issueDepartments')
  103.         self.hbr_parse_section(departments, feeds)
  104.         return feeds
  105.  
  106.  
  107.     def parse_index(self):
  108.         soup = self.hbr_get_toc()
  109.         feeds = self.hbr_parse_toc(soup)
  110.         return feeds
  111.  
  112.     def get_cover_url(self):
  113.         cover_url = None
  114.         index = 'http://hbr.org/current'
  115.         soup = self.index_to_soup(index)
  116.         link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)
  117.  
  118.         if link_item:
  119.            cover_url = 'http://hbr.org' + link_item['src']
  120.  
  121.         return cover_url
  122.  
  123.  
  124.