
__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
msnbc.msn.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class MsNBC(BasicNewsRecipe):
    title                  = 'msnbc.com'
    __author__             = 'Darko Miletic'
    description            = 'A Fuller Spectrum of News'
    oldest_article         = 2     # ignore feed items older than two days
    max_articles_per_feed  = 100
    no_stylesheets         = True  # strip the site's stylesheets; extra_css below supplies the styling
    use_embedded_content   = False # download the full article pages rather than the HTML embedded in the feed
    encoding               = 'utf8'
    publisher              = 'msnbc.com'
    category               = 'news, USA, world'
    language               = 'en'
    extra_css              = ' body{ font-family: sans-serif } .head{font-family: serif; font-size: xx-large; font-weight: bold; color: #CC0000} .abstract{font-weight: bold} .source{font-size: small} .updateTime{font-size: small} '

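    # Pass the recipe metadata through to the e-book conversion step.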
    conversion_options = {
                             'comments' : description
                            ,'tags'     : category
                            ,'language' : language
                            ,'publisher': publisher
                         }

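    # Raw-HTML fixups applied before parsing: drop the original </head> and
    # reopen <body> at the article's "head" div, so any markup before the
    # headline ends up inside <head> (it is removed in preprocess_html below).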
    preprocess_regexps = [
        (re.compile(r'</style></head>', re.DOTALL|re.IGNORECASE),lambda match: '</style>')
       ,(re.compile(r'<div class="head">', re.DOTALL|re.IGNORECASE),lambda match: '</head><body><div class="head">'),
    ]

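    # Keep only the markup between the headline div and the copyright notice,
    # and drop embedded media, scripts and forms.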
    remove_tags_before = dict(name='div', attrs={'class':'head'})
    remove_tags_after  = dict(name='div', attrs={'class':'copyright'})
    remove_tags        = [dict(name=['iframe','object','link','script','form'])]

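    # Section feeds, as (title, RSS URL) pairs.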
    feeds = [
               (u'US News'       , u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml'      )
              ,(u'World News'    , u'http://rss.msnbc.msn.com/id/3032506/device/rss/rss.xml'      )
              ,(u'Politics'      , u'http://rss.msnbc.msn.com/id/3032552/device/rss/rss.xml'      )
              ,(u'Business'      , u'http://rss.msnbc.msn.com/id/3032071/device/rss/rss.xml'      )
              ,(u'Sports'        , u'http://rss.nbcsports.msnbc.com/id/3032112/device/rss/rss.xml')
              ,(u'Entertainment' , u'http://rss.msnbc.msn.com/id/3032083/device/rss/rss.xml'      )
              ,(u'Health'        , u'http://rss.msnbc.msn.com/id/3088327/device/rss/rss.xml'      )
              ,(u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml'      )
            ]

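    # Fetch the printer-friendly, single-page version of each article
    # (assumes the article URL already ends with a slash).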
    def print_version(self, url):
        return url + 'print/1/displaymode/1098/'

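    # The regexes above leave any pre-headline divs stranded inside <head>;
    # remove them before conversion.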
    def preprocess_html(self, soup):
        for item in soup.head.findAll('div'):
            item.extract()
        return soup