home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_4042 < prev    next >
Encoding:
Text File  |  2010-03-03  |  1.9 KB  |  53 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  3.  
  4. class SmithsonianMagazine(BasicNewsRecipe):
  5.     title          = u'Smithsonian Magazine'
  6.     language       = 'en'
  7.     __author__     = 'Krittika Goyal'
  8.     oldest_article = 31#days
  9.     max_articles_per_feed = 50
  10.     #encoding = 'latin1'
  11.     recursions = 1
  12.     match_regexps = ['&page=[2-9]$']
  13.  
  14.     remove_stylesheets = True
  15.     #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
  16.     remove_tags_after  = dict(name='p', attrs={'id':'articlePaginationWrapper'})
  17.     remove_tags = [
  18.        dict(name='iframe'),
  19.        dict(name='div', attrs={'class':'article_sidebar_border'}),
  20.        dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}),
  21.        #dict(name='ul', attrs={'class':'article-tools'}),
  22.        dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
  23.     ]
  24.  
  25.  
  26.     feeds          = [
  27. ('History and Archeology',
  28.  'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
  29. ('People and Places',
  30.  'http://feeds.feedburner.com/smithsonianmag/people-places'),
  31. ('Science and Nature',
  32.  'http://feeds.feedburner.com/smithsonianmag/science-nature'),
  33. ('Arts and Culture',
  34.  'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
  35. ('Travel',
  36.  'http://feeds.feedburner.com/smithsonianmag/travel'),
  37. ]
  38.  
  39.     def preprocess_html(self, soup):
  40.         story = soup.find(name='div', attrs={'id':'article-left'})
  41.         #td = heading.findParent(name='td')
  42.         #td.extract()
  43.         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
  44.         body = soup.find(name='body')
  45.         body.insert(0, story)
  46.         return soup
  47.  
  48.     def postprocess_html(self, soup, first):
  49.         for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
  50.         if not first:
  51.              for div in soup.findAll(id='article-head'): div.extract()
  52.         return soup
  53.