from calibre.web.feeds.news import BasicNewsRecipe
import re

class HBR(BasicNewsRecipe):

    title = 'Harvard Business Review Blogs'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman, enhanced by BrianG'
    language = 'en'
    no_stylesheets = True

    LOGIN_URL = 'http://hbr.org/login?request_url=/'
    INDEX = 'http://hbr.org/current'

    #
    # Blog Stuff
    #

    # Toggle which content is downloaded: the HBR blog RSS feed, the
    # current magazine issue's articles, or both.
    INCLUDE_BLOGS = True
    INCLUDE_ARTICLES = False

    # Option-specific settings.
    if INCLUDE_BLOGS:
        remove_tags_after = dict(id='articleBody')
        remove_tags_before = dict(id='pageFeature')
        feeds = [('Blog', 'http://feeds.harvardbusiness.org/harvardbusiness')]
        oldest_article = 30
        max_articles_per_feed = 100
    else:
        timefmt = ' [%B %Y]'

    keep_only_tags = [dict(name='div', id='pageContainer')]

    remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
        'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
        'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
        'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD',
        'mailingListTout', 'partnerCenter', 'pageFooter']),
        dict(name='iframe')]

    extra_css = '''
                a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
                .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
                h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
                h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;  }
                #articleBody{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
                #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
                '''
#-------------------------------------------------------------------------------------------------

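    # Sign in through HBR's login form so subscriber-only pages can be
    # fetched, and remember the sign-out link so cleanup() can end the
    # session when the download finishes.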
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.LOGIN_URL)
        br.select_form(name='signInForm')
        br['signInForm:username'] = self.username
        br['signInForm:password'] = self.password
        raw = br.submit().read()
        if 'My Account' not in raw:
            raise Exception('Failed to login, are you sure your username and password are correct?')
        # mechanize raises LinkNotFoundError rather than returning None when
        # a link is absent, so guard the lookup with try/except.
        self.logout_url = None
        try:
            link = br.find_link(text='Sign out')
            if link:
                self.logout_url = link.absolute_url
        except Exception:
            pass
        return br
#-------------------------------------------------------------------------------------------------
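    # Called after the download completes; logs out of the subscription
    # session if a sign-out URL was found at login time.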
    def cleanup(self):
        if self.logout_url is not None:
            self.browser.open(self.logout_url)
#-------------------------------------------------------------------------------------------------
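    # Map a multi-page article URL ('.../ar/1') to '.../ar/pr', which appears
    # to be its single-page variant, so each article downloads as one page.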
    def map_url(self, url):
        if url.endswith('/ar/1'):
            return url[:-1] + 'pr'
        # Leave all other URLs untouched.
        return url
#-------------------------------------------------------------------------------------------------

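    # Load the current-issue page and follow its 'Full Table of Contents'
    # link to get the complete TOC for the issue.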
    def hbr_get_toc(self):
        soup = self.index_to_soup(self.INDEX)
        url = soup.find('a', text=lambda t: 'Full Table of Contents' in t).parent.get('href')
        return self.index_to_soup('http://hbr.org' + url)

#-------------------------------------------------------------------------------------------------

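    # Walk one TOC container: bare h3/h4 tags open a new section and li tags
    # hold the article links. Collects (section, article-list) tuples into
    # the feeds list passed in.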
    def hbr_parse_section(self, container, feeds):
        current_section = None
        current_articles = []
        for x in container.findAll(name=['li', 'h3', 'h4']):
            if x.name in ['h3', 'h4'] and not x.findAll(True):
                if current_section and current_articles:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(x)
                current_articles = []
                self.log('\tFound section:', current_section)
            elif x.name == 'li':
                a = x.find('a', href=True)
                if a is not None:
                    title = self.tag_to_string(a)
                    url = a.get('href')
                    if '/ar/' not in url:
                        continue
                    if url.startswith('/'):
                        url = 'http://hbr.org' + url
                    url = self.map_url(url)
                    p = x.find('p')
                    desc = ''
                    if p is not None:
                        desc = self.tag_to_string(p)
                    if not title or not url:
                        continue
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    self.log('\t\t\t', desc)
                    current_articles.append({'title': title, 'url': url,
                        'description': desc, 'date': ''})
        if current_section and current_articles:
            feeds.append((current_section, current_articles))

#-------------------------------------------------------------------------------------------------

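    # The issue TOC is split into two containers, features and departments;
    # parse both into a single feed list.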
    def hbr_parse_toc(self, soup):
        feeds = []
        features = soup.find(id='issueFeaturesContent')
        self.hbr_parse_section(features, feeds)
        departments = soup.find(id='issueDepartments')
        self.hbr_parse_section(departments, feeds)
        return feeds
#-------------------------------------------------------------------------------------------------
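    # parse_index() must return plain (title, list-of-article-dicts) tuples,
    # while parse_feeds() yields Feed objects; convert each Feed here.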
    def feed_to_index_append(self, feedObject, masterFeed):
        # Loop through the feed object and build the correct type of article list
        for feed in feedObject:
            # Build the correct structure from the feed object
            newArticles = []
            for article in feed.articles:
                newArt = {
                    'title': article.title,
                    'url': article.url,
                    'date': article.date,
                    'description': article.text_summary,
                }
                newArticles.append(newArt)

            # Append the earliest/latest dates of the feed to the feed title
            startDate, endDate = self.get_feed_dates(feed, '%d-%b')
            newFeedTitle = feed.title + '  (' + startDate + ' thru ' + endDate + ')'

            # Append the newly built list to the index object passed in
            # as masterFeed. This must happen inside the loop so every feed,
            # not just the last one, ends up in the index.
            masterFeed.append((newFeedTitle, newArticles))

#-------------------------------------------------------------------------------------------------
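    # Articles in a feed are ordered newest first, so the last article holds
    # the earliest date and the first article the latest.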
    def get_feed_dates(self, feedObject, dateMask):
        startDate = feedObject.articles[-1].localtime.strftime(dateMask)
        endDate = feedObject.articles[0].localtime.strftime(dateMask)

        return startDate, endDate

#-------------------------------------------------------------------------------------------------
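    # Download the blog RSS feed through the standard parse_feeds() machinery,
    # then fold the result into the index that parse_index() is building.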
    def hbr_parse_blogs(self, feeds):
        # Do the "official" parse_feeds first to get Feed objects for the
        # blog articles
        rssFeeds = BasicNewsRecipe.parse_feeds(self)

        # Convert the Feed objects and append them to the existing feeds list
        self.feed_to_index_append(rssFeeds[:], feeds)

#-------------------------------------------------------------------------------------------------
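    # Build the index: the magazine TOC (if INCLUDE_ARTICLES), the blog feed
    # (if INCLUDE_BLOGS), or both.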
    def parse_index(self):
        if self.INCLUDE_ARTICLES:
            soup = self.hbr_get_toc()
            feeds = self.hbr_parse_toc(soup)
        else:
            feeds = []

        # Blog stuff
        if self.INCLUDE_BLOGS:
            self.hbr_parse_blogs(feeds)

        return feeds
#-------------------------------------------------------------------------------------------------
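    # Scrape the current-issue page for the cover image; returns None if the
    # image cannot be found.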
    def get_cover_url(self):
        cover_url = None
        index = 'http://hbr.org/current'
        soup = self.index_to_soup(index)
        link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)

        if link_item:
            cover_url = 'http://hbr.org' + link_item['src']

        return cover_url