home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_4441 < prev    next >
Encoding:
Text File  |  2010-10-02  |  2.5 KB  |  60 lines

  1. import re
  2. from calibre.web.feeds.news import BasicNewsRecipe
  3.  
  4.  
  5. class WashingtonPost(BasicNewsRecipe):
  6.  
  7.     title = 'Washington Post'
  8.     description = 'US political news'
  9.     __author__ = 'Kovid Goyal and Sujata Raman'
  10.     use_embedded_content   = False
  11.     max_articles_per_feed = 20
  12.     language = 'en'
  13.  
  14.  
  15.     remove_javascript = True
  16.     no_stylesheets = True
  17.  
  18.     extra_css       = '''
  19.                         #articleCopyright { font-family:Arial,helvetica,sans-serif ; font-weight:bold ; font-size:x-small ;}
  20.                         p { font-family:"Times New Roman",times,serif ; font-weight:normal ; font-size:small ;}
  21.                         body{font-family:arial,helvetica,sans-serif}
  22.                             '''
  23.  
  24.     feeds = [   ('Today\'s Highlights', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/03/24/LI2005032400102.xml'),
  25.                 ('Politics', 'http://www.washingtonpost.com/wp-dyn/rss/politics/index.xml'),
  26.                 ('Nation', 'http://www.washingtonpost.com/wp-dyn/rss/nation/index.xml'),
  27.                 ('World', 'http://www.washingtonpost.com/wp-dyn/rss/world/index.xml'),
  28.                 ('Business', 'http://www.washingtonpost.com/wp-dyn/rss/business/index.xml'),
  29.                 ('Technology', 'http://www.washingtonpost.com/wp-dyn/rss/technology/index.xml'),
  30.                 ('Health', 'http://www.washingtonpost.com/wp-dyn/rss/health/index.xml'),
  31.                 ('Education', 'http://www.washingtonpost.com/wp-dyn/rss/education/index.xml'),
  32.                 ('Style',
  33.                      'http://www.washingtonpost.com/wp-dyn/rss/print/style/index.xml'),
  34.                 ('Sports',
  35.                      'http://feeds.washingtonpost.com/wp-dyn/rss/linkset/2010/08/19/LI2010081904067_xml'),
  36.                 ('Editorials', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/05/30/LI2005053000331.xml'),
  37.     ]
  38.  
  39.     remove_tags = [{'id':['pfmnav', 'ArticleCommentsWrapper']}]
  40.  
  41.  
  42.     def get_article_url(self, article):
  43.         return article.get('guid', article.get('link', None))
  44.  
  45.     def print_version(self, url):
  46.         return url.rpartition('.')[0] + '_pf.html'
  47.  
  48.     def postprocess_html(self, soup, first):
  49.         for div in soup.findAll(name='div', style=re.compile('margin')):
  50.             div['style'] = ''
  51.         return soup
  52.  
  53.     def preprocess_html(self, soup):
  54.         for tag in soup.findAll('font'):
  55.             if tag.has_key('size'):
  56.                 if tag['size'] == '+2':
  57.                     if tag.b:
  58.                         return soup
  59.         return None
  60.