home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3931 < prev    next >
Encoding:
Text File  |  2010-06-01  |  2.7 KB  |  77 lines

  1. import string
  2. from calibre.web.feeds.news import BasicNewsRecipe
  3.  
  4. class Newsweek(BasicNewsRecipe):
  5.  
  6.     title          = 'Newsweek'
  7.     __author__     = 'Kovid Goyal'
  8.     description    = 'Weekly news and current affairs in the US'
  9.     language       = 'en'
  10.     encoding       = 'utf-8'
  11.     no_stylesheets = True
  12.  
  13.     BASE_URL = 'http://www.newsweek.com'
  14.     INDEX = BASE_URL+'/topics.html'
  15.  
  16.     keep_only_tags = dict(name='article', attrs={'class':'article-text'})
  17.     remove_tags = [dict(attrs={'data-dartad':True})]
  18.     remove_attributes = ['property']
  19.  
  20.     def postprocess_html(self, soup, first):
  21.         for tag in soup.findAll(name=['article', 'header']):
  22.             tag.name = 'div'
  23.         return soup
  24.  
  25.     def newsweek_sections(self):
  26.         soup = self.index_to_soup(self.INDEX)
  27.         for a in soup.findAll('a', title='Primary tag', href=True):
  28.             yield (string.capitalize(self.tag_to_string(a)),
  29.                     self.BASE_URL+a['href'])
  30.  
  31.  
  32.     def newsweek_parse_section_page(self, soup):
  33.         for article in soup.findAll('article', about=True,
  34.                 attrs={'class':'stream-item'}):
  35.             title = article.find(attrs={'property': 'dc:title'})
  36.             if title is None: continue
  37.             title = self.tag_to_string(title)
  38.             url = self.BASE_URL + article['about']
  39.             desc = ''
  40.             author = article.find({'property':'dc:creator'})
  41.             if author:
  42.                 desc = u'by %s. '%self.tag_to_string(author)
  43.             p = article.find(attrs={'property':'dc:abstract'})
  44.             if p is not None:
  45.                 for a in p.find('a'): a.extract()
  46.                 desc += self.tag_to_string(p)
  47.             t = article.find('time', attrs={'property':'dc:created'})
  48.             date = ''
  49.             if t is not None:
  50.                 date = u' [%s]'%self.tag_to_string(t)
  51.             self.log('\tFound article:', title, 'at', url)
  52.             self.log('\t\t', desc)
  53.             yield {'title':title, 'url':url, 'description':desc, 'date':date}
  54.  
  55.  
  56.     def parse_index(self):
  57.         sections = []
  58.         for section, shref in self.newsweek_sections():
  59.             self.log('Processing section', section, shref)
  60.             articles = []
  61.             soups = [self.index_to_soup(shref)]
  62.             na = soups[0].find('a', rel='next')
  63.             if na:
  64.                 soups.append(self.index_to_soup(self.BASE_URL+na['href']))
  65.             for soup in soups:
  66.                 articles.extend(self.newsweek_parse_section_page(soup))
  67.                 if self.test and len(articles) > 1:
  68.                     break
  69.             if articles:
  70.                 sections.append((section, articles))
  71.             if self.test and len(sections) > 1:
  72.                 break
  73.         return sections
  74.  
  75.  
  76.  
  77.