home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3692 < prev    next >
Encoding:
Text File  |  2009-12-31  |  1.9 KB  |  58 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  3.  
  4. class DenverPost(BasicNewsRecipe):
  5.     title          = u'Denver Post'
  6.     language       = 'en'
  7.     __author__     = 'Krittika Goyal'
  8.     oldest_article = 1 #days
  9.     max_articles_per_feed = 20
  10.  
  11.     conversion_options = {'linearize_tables':True}
  12.  
  13.     no_stylesheets = True
  14.     #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
  15.     #remove_tags_after  = dict(name='td', attrs={'class':'newptool1'})
  16.     remove_tags = [
  17.        dict(name='iframe'),
  18.        dict(name='img', src=lambda x: not x or '/tracking/' in x),
  19.        dict(name='span', attrs={'fd-id':True}),
  20.        dict(name='div', attrs={'class':['articleOptions', 'articlePosition2']}),
  21.        #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
  22.        #dict(name='ul', attrs={'class':'article-tools'}),
  23.        #dict(name='ul', attrs={'class':'articleTools'}),
  24.     ]
  25.  
  26.     feeds          = [
  27. ('Top Stories',
  28.  'http://feeds.denverpost.com/dp-news-topstories'),
  29. ('Business',
  30.  'http://feeds.denverpost.com/dp-business'),
  31. ('Sports',
  32.  'http://feeds.denverpost.com/dp-sports'),
  33. ('Lifestyles',
  34.  'http://feeds.denverpost.com/dp-lifestyles'),
  35. ('Politics',
  36.  'http://feeds.denverpost.com/dp-politics'),
  37. ('Entertainment',
  38.  'http://feeds.denverpost.com/dp-entertainment'),
  39.  
  40. ]
  41.  
  42.     def preprocess_html(self, soup):
  43.         story = soup.find(name='td', attrs={'class':'articleBox'})
  44.         #td = heading.findParent(name='td')
  45.         #td.extract()
  46.         story.extract()
  47.         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
  48.         body = soup.find(name='body')
  49.         body.insert(0, story)
  50.         story.name = 'div'
  51.  
  52.         for img in soup.findAll(name='img', style='visibility:hidden;'):
  53.             del img['style']
  54.  
  55.         for div in soup.findAll(id='caption', style=True):
  56.             del  div['style']
  57.         return soup
  58.