
#!/usr/bin/env python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
nybooks.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

class NewYorkReviewOfBooks(BasicNewsRecipe):

    title = u'New York Review of Books (no subscription)'
    description = u'Book reviews'
    language = 'en'

    __author__ = 'Kovid Goyal'

    no_stylesheets = True
    remove_javascript = True

    # Keep only the article body and the page title; drop toolbars,
    # related-article links and advertisements
    keep_only_tags = [dict(id=['article-body', 'page-title'])]
    remove_tags = [dict(attrs={'class':['article-tools', 'article-links',
        'center advertisement']})]

    # Discard the site's <head> so its own styles and scripts are not used
    preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL),
        lambda m: '<head></head>')]

    def print_version(self, url):
        # Request the single-page (unpaginated) version of the article
        return url + '?pagination=false'

    def parse_index(self):
        soup = self.index_to_soup('http://www.nybooks.com/current-issue')

        # Find cover
        sidebar = soup.find(id='sidebar')
        if sidebar is not None:
            a = sidebar.find('a', href=lambda x: x and 'view-photo' in x)
            if a is not None:
                psoup = self.index_to_soup('http://www.nybooks.com'+a['href'])
                cover = psoup.find('img', src=True)
                if cover is not None:
                    self.cover_url = cover['src']
                    self.log('Found cover at:', self.cover_url)

        # Find date
        div = soup.find(id='page-title')
        if div is not None:
            h5 = div.find('h5')
            if h5 is not None:
                text = self.tag_to_string(h5)
                # The issue date precedes the bullet (\u2022) separator
                date = text.partition(u'\u2022')[0].strip()
                self.timefmt = u' [%s]'%date
                self.log('Issue date:', date)

        # Find TOC
        toc = soup.find('ul', attrs={'class':'issue-article-list'})
        articles = []
        for li in toc.findAll('li'):
            h3 = li.find('h3')
            title = self.tag_to_string(h3)
            author = self.tag_to_string(li.find('h4'))
            title = title + u' (%s)'%author
            url = 'http://www.nybooks.com' + h3.find('a', href=True)['href']
            desc = ''
            for p in li.findAll('p'):
                desc += self.tag_to_string(p)
            self.log('Found article:', title)
            self.log('\t', url)
            self.log('\t', desc)
            articles.append({'title':title, 'url':url, 'date':'',
                'description':desc})

        return [('Current Issue', articles)]
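
# Usage note: recipes like this one can typically be test-built from the
# command line with calibre's ebook-convert tool, which accepts .recipe files
# directly. Assuming the file is saved as nybooks.recipe, a quick test run
# that fetches only a couple of articles and logs what parse_index() found
# would look roughly like:
#
#   ebook-convert nybooks.recipe .epub --test -vv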