#!/usr/bin/env python
__license__   = 'GPL v3'
'''
www.rd.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed


class ReadersDigest(BasicNewsRecipe):

    title       = 'Readers Digest'
    __author__  = 'BrianG'
    language = 'en'
    description = 'Readers Digest Feeds'
    no_stylesheets        = True
    use_embedded_content  = False
    oldest_article = 60
    max_articles_per_feed = 200
    remove_javascript     = True

    extra_css      = ''' h1 {font-family:georgia,serif;color:#000000;}
                        .mainHd{font-family:georgia,serif;color:#000000;}
                         h2 {font-family:Arial,Sans-serif;}
                        .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
                        .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
                        .byline{font-family:Arial,Sans-serif; font-size:x-small ;}
                        .photoBkt{ font-size:x-small ;}
                        .vertPhoto{font-size:x-small ;}
                        .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
                        .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
                        .artTxt{font-family:georgia,serif;}
                        .caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
                        .credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
                        a:link{color:#CC0000;}
                        .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
                        '''

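    # Strip sponsor headers, ads, image counters, and comment/print widgets
    # from the article pages.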
    remove_tags = [
        dict(name='h4', attrs={'class':'close'}),
        dict(name='div', attrs={'class':'fromLine'}),
        dict(name='img', attrs={'class':'colorTag'}),
        dict(name='div', attrs={'id':'sponsorArticleHeader'}),
        dict(name='div', attrs={'class':'horizontalAd'}),
        dict(name='div', attrs={'id':'imageCounterLeft'}),
        dict(name='div', attrs={'id':'commentsPrint'})
        ]


    feeds = [
            ('New in RD', 'http://feeds.rd.com/ReadersDigest'),
            ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
            ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
            ('Blogs','http://feeds.rd.com/ReadersDigestBlogs')
        ]

    cover_url = 'http://www.rd.com/images/logo-main-rd.gif'


#-------------------------------------------------------------------------------------------------

    def print_version(self, url):

        # Get the identity number of the current article and append it to the root print URL

        if url.find('/article') > 0:
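            # Pull the content id out of the article URL: +8 skips past
            # '/article', and -4 stops four characters short of '.html?'.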
            ident = url[url.find('/article')+8:url.find('.html?')-4]
            url = 'http://www.rd.com/content/printContent.do?contentId=' + ident

        elif url.find('/post') > 0:

            # In this case we have to fetch the page itself to derive the print URL.
            soup = self.index_to_soup(url)
            newsoup = soup.find('ul',attrs={'class':'printBlock'})
            url = 'http://www.rd.com' + newsoup('a')[0]['href']
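            # Trim the print link at '&Keep', dropping the extra query
            # parameters (this assumes '&Keep' appears in the href).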
            url = url[0:url.find('&Keep')]

        return url

#-------------------------------------------------------------------------------------------------

    def parse_index(self):

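        # Sections come from two sources: the channel landing pages scraped
        # below, plus the RSS-derived feeds merged in afterwards.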
        pages = [
                ('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}),
                # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
                ('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'})
            ]

        feeds = []

        for page in pages:
            section, url, divider, attrList = page
            newArticles = self.page_parse(url, divider, attrList)
            feeds.append((section,newArticles))

        # After the pages of the site have been processed, parse several RSS feeds for additional sections.
        newfeeds = self.parse_rss()

        # parse_rss() returns calibre Feed objects.  Convert each feed/article
        # combination into the dictionary form that parse_index must return.
        for feed in newfeeds:
            newArticles = []
            for article in feed.articles:
                newArt = {
                            'title' : article.title,
                            'url'   : article.url,
                            'date'  : article.date,
                            'description' : article.text_summary
                        }
                newArticles.append(newArt)

            # 'New in RD' and 'Blogs' should be the first two sections.
            if feed.title == 'New in RD':
                feeds.insert(0,(feed.title,newArticles))
            elif feed.title == 'Blogs':
                feeds.insert(1,(feed.title,newArticles))
            else:
                feeds.append((feed.title,newArticles))

        return feeds

#-------------------------------------------------------------------------------------------------

    def page_parse(self, mainurl, divider, attrList):

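        # Scrape one channel landing page: each block matching attrList yields
        # an article, titled from its image's alt text and linked via its
        # first anchor.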
        articles = []
        mainsoup = self.index_to_soup(mainurl)
        for item in mainsoup.findAll(attrs=attrList):
            newArticle = {
                        'title' : item('img')[0]['alt'],
                        'url'   : 'http://www.rd.com'+item('a')[0]['href'],
                        'date'  : '',
                        'description' : ''
                    }
            articles.append(newArticle)

        return articles


#-------------------------------------------------------------------------------------------------

    def parse_rss(self):

        # Do the "official" parse_feeds first.
        feeds = BasicNewsRecipe.parse_feeds(self)

        # Loop through the articles in all feeds and pull out any article with
        # "recipe" in its title.
        recipeArticles = []
        for curfeed in feeds:
            delList = []
            for curarticle in curfeed.articles:
                if curarticle.title.upper().find('RECIPE') >= 0:
                    recipeArticles.append(curarticle)
                    delList.append(curarticle)
            for d in delList:
                index = curfeed.articles.index(d)
                curfeed.articles[index:index+1] = []

        # If any recipes were found, collect them into a new 'Recipes' feed and
        # append it to the "official" list of feeds.
        if len(recipeArticles) > 0:
            pfeed = Feed()
            pfeed.title = 'Recipes'
            pfeed.description = 'Recipe Feed (Virtual)'
            pfeed.image_url  = None
            pfeed.oldest_article = 30
            pfeed.id_counter = len(recipeArticles)
            pfeed.articles = recipeArticles[:]
            feeds.append(pfeed)

        return feeds
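
# To try a recipe like this from the command line, calibre's conversion tool
# can run it directly (file names here are hypothetical):
#
#   ebook-convert readers_digest.recipe output.epub --test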