home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3936 < prev    next >
Encoding:
Text File  |  2010-05-17  |  2.7 KB  |  88 lines

  1.  
  2. #!/usr/bin/env  python
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
  5. __docformat__ = 'restructuredtext en'
  6.  
  7. '''
  8. nybooks.com
  9. '''
  10. import re
  11.  
  12. from calibre.web.feeds.news import BasicNewsRecipe
  13.  
  14. class NewYorkReviewOfBooks(BasicNewsRecipe):
  15.  
  16.     title = u'New York Review of Books'
  17.     description = u'Book reviews'
  18.     language = 'en'
  19.  
  20.     __author__ = 'Kovid Goyal'
  21.  
  22.     no_stylesheets = True
  23.     no_javascript = True
  24.     needs_subscription = True
  25.  
  26.     keep_only_tags = [dict(id=['article-body','page-title'])]
  27.     remove_tags = [dict(attrs={'class':['article-tools', 'article-links',
  28.         'center advertisement']})]
  29.  
  30.     preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
  31.         m:'<head></head>')]
  32.  
  33.     def get_browser(self):
  34.         br = BasicNewsRecipe.get_browser()
  35.         br.open('http://www.nybooks.com/account/signin/')
  36.         br.select_form(nr = 1)
  37.         br['username'] = self.username
  38.         br['password'] = self.password
  39.         br.submit()
  40.         return br
  41.  
  42.     def print_version(self, url):
  43.         return url+'?pagination=false'
  44.  
  45.     def parse_index(self):
  46.         soup = self.index_to_soup('http://www.nybooks.com/current-issue')
  47.  
  48.         # Find cover
  49.         sidebar = soup.find(id='sidebar')
  50.         if sidebar is not None:
  51.             a = sidebar.find('a', href=lambda x: x and 'view-photo' in x)
  52.             if a is not None:
  53.                 psoup = self.index_to_soup('http://www.nybooks.com'+a['href'])
  54.                 cover = psoup.find('img', src=True)
  55.                 self.cover_url = cover['src']
  56.                 self.log('Found cover at:', self.cover_url)
  57.  
  58.         # Find date
  59.         div = soup.find(id='page-title')
  60.         if div is not None:
  61.             h5 = div.find('h5')
  62.             if h5 is not None:
  63.                 text = self.tag_to_string(h5)
  64.                 date = text.partition(u'\u2022')[0].strip()
  65.                 self.timefmt = u' [%s]'%date
  66.                 self.log('Issue date:', date)
  67.  
  68.         # Find TOC
  69.         toc = soup.find('ul', attrs={'class':'issue-article-list'})
  70.         articles = []
  71.         for li in toc.findAll('li'):
  72.             h3 = li.find('h3')
  73.             title = self.tag_to_string(h3)
  74.             author = self.tag_to_string(li.find('h4'))
  75.             title = title + u' (%s)'%author
  76.             url = 'http://www.nybooks.com'+h3.find('a', href=True)['href']
  77.             desc = ''
  78.             for p in li.findAll('p'):
  79.                 desc += self.tag_to_string(p)
  80.             self.log('Found article:', title)
  81.             self.log('\t', url)
  82.             self.log('\t', desc)
  83.             articles.append({'title':title, 'url':url, 'date':'',
  84.                 'description':desc})
  85.  
  86.         return [('Current Issue', articles)]
  87.  
  88.