home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_3851 < prev    next >
Encoding:
Text File  |  2010-10-18  |  4.2 KB  |  115 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
  5. '''
  6. theatlantic.com
  7. '''
  8. import string, re
  9.  
  10. from calibre.web.feeds.news import BasicNewsRecipe
  11. from calibre.ebooks.BeautifulSoup import Tag, NavigableString
  12.  
  13. class TheAtlantic(BasicNewsRecipe):
  14.  
  15.     title      = 'The Atlantic'
  16.     __author__ = 'Kovid Goyal and Sujata Raman'
  17.     description = 'Current affairs and politics focussed on the US'
  18.     INDEX = 'http://www.theatlantic.com/magazine/toc/0/'
  19.     language = 'en'
  20.  
  21.     remove_tags_before = dict(name='div', id='articleHead')
  22.     remove_tags_after  = dict(id='copyright')
  23.     remove_tags        = [dict(id=['header', 'printAds', 'pageControls'])]
  24.     no_stylesheets = True
  25.  
  26.     preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
  27.  
  28.  
  29.     def print_version(self, url):
  30.         return url.replace('/archive/', '/print/')
  31.  
  32.     def parse_index(self):
  33.         articles = []
  34.  
  35.         soup = self.index_to_soup(self.INDEX)
  36.         sectit = soup.find('h1', attrs={'class':'sectionTitle'})
  37.         if sectit is not None:
  38.             texts = self.tag_to_string(sectit).strip().split()[-2:]
  39.             if texts:
  40.                 self.timefmt = ' [%s]'%(' '.join(texts))
  41.  
  42.         cover = soup.find('img', src=True, attrs={'class':'cover'})
  43.         if cover is not None:
  44.             self.cover_url = cover['src']
  45.  
  46.         feeds = []
  47.         for section in soup.findAll('div', attrs={'class':'magazineSection'}):
  48.             section_title = section.find(attrs={'class':'sectionHeader'})
  49.             section_title = string.capwords(self.tag_to_string(section_title))
  50.             self.log('Found section:', section_title)
  51.             articles = []
  52.             for post in section.findAll('div', attrs={'class':'post'}):
  53.                 h = post.find(['h3', 'h4'])
  54.                 title = self.tag_to_string(h)
  55.                 a = post.find('a', href=True)
  56.                 url = a['href']
  57.                 if url.startswith('/'):
  58.                     url = 'http://www.theatlantic.com'+url
  59.                 p = post.find('p', attrs={'class':'dek'})
  60.                 desc = None
  61.                 self.log('\tFound article:', title, 'at', url)
  62.                 if p is not None:
  63.                     desc = self.tag_to_string(p)
  64.                     self.log('\t\t', desc)
  65.                 articles.append({'title':title, 'url':url, 'description':desc,
  66.                     'date':''})
  67.             feeds.append((section_title, articles))
  68.  
  69.         poems = []
  70.         self.log('Found section: Poems')
  71.         for poem in soup.findAll('div', attrs={'class':'poem'}):
  72.             title = self.tag_to_string(poem.find('h4'))
  73.             desc  = self.tag_to_string(poem.find(attrs={'class':'author'}))
  74.             url   = poem.find('a')['href']
  75.             if url.startswith('/'):
  76.                 url = 'http://www.theatlantic.com' + url
  77.             self.log('\tFound article:', title, 'at', url)
  78.             self.log('\t\t', desc)
  79.             poems.append({'title':title, 'url':url, 'description':desc,
  80.                     'date':''})
  81.         if poems:
  82.             feeds.append(('Poems', poems))
  83.  
  84.         div = soup.find(id='advice')
  85.         if div is not None:
  86.             self.log('Found section: Advice')
  87.             title = self.tag_to_string(div.find('h4'))
  88.             url = div.find('a')['href']
  89.             if url.startswith('/'):
  90.                 url = 'http://www.theatlantic.com' + url
  91.             desc = self.tag_to_string(div.find('p'))
  92.             self.log('\tFound article:', title, 'at', url)
  93.             self.log('\t\t', desc)
  94.  
  95.         feeds.append(('Advice', [{'title':title, 'url':url, 'description':desc,
  96.                     'date':''}]))
  97.         return feeds
  98.  
  99.     def postprocess_html(self, soup, first):
  100.         for table in soup.findAll('table', align='right'):
  101.             img = table.find('img')
  102.             if img is not None:
  103.                 img.extract()
  104.                 caption = self.tag_to_string(table).strip()
  105.                 div = Tag(soup, 'div')
  106.                 div['style'] = 'text-align:center'
  107.                 div.insert(0, img)
  108.                 div.insert(1, Tag(soup, 'br'))
  109.                 if caption:
  110.                     div.insert(2, NavigableString(caption))
  111.                 table.replaceWith(div)
  112.  
  113.         return soup
  114.  
  115.