home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3616 < prev    next >
Encoding:
Text File  |  2010-05-21  |  4.1 KB  |  111 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
  5. '''
  6. theatlantic.com
  7. '''
  8. import string, re
  9.  
  10. from calibre.web.feeds.news import BasicNewsRecipe
  11. from calibre.ebooks.BeautifulSoup import Tag, NavigableString
  12.  
  13. class TheAtlantic(BasicNewsRecipe):
  14.  
  15.     title      = 'The Atlantic'
  16.     __author__ = 'Kovid Goyal and Sujata Raman'
  17.     description = 'Current affairs and politics focussed on the US'
  18.     INDEX = 'http://www.theatlantic.com/magazine/toc/0/'
  19.     language = 'en'
  20.  
  21.     remove_tags_before = dict(name='div', id='articleHead')
  22.     remove_tags_after  = dict(id='copyright')
  23.     remove_tags        = [dict(id=['header', 'printAds', 'pageControls'])]
  24.     no_stylesheets = True
  25.  
  26.     preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
  27.  
  28.  
  29.     def print_version(self, url):
  30.         return url.replace('/archive/', '/print/')
  31.  
  32.     def parse_index(self):
  33.         articles = []
  34.  
  35.         soup = self.index_to_soup(self.INDEX)
  36.         sectit = soup.find('h1', attrs={'class':'sectionTitle'})
  37.         if sectit is not None:
  38.             texts = self.tag_to_string(sectit).strip().split()[-2:]
  39.             if texts:
  40.                 self.timefmt = ' [%s]'%(' '.join(texts))
  41.  
  42.         cover = soup.find('img', src=True, attrs={'class':'cover'})
  43.         if cover is not None:
  44.             self.cover_url = cover['src']
  45.  
  46.         feeds = []
  47.         for section in soup.findAll('div', attrs={'class':'magazineSection'}):
  48.             section_title = section.find(attrs={'class':'sectionHeader'})
  49.             section_title = string.capwords(self.tag_to_string(section_title))
  50.             self.log('Found section:', section_title)
  51.             articles = []
  52.             for post in section.findAll('div', attrs={'class':'post'}):
  53.                 h = post.find(['h3', 'h4'])
  54.                 title = self.tag_to_string(h)
  55.                 a = post.find('a', href=True)
  56.                 url = a['href']
  57.                 if url.startswith('/'):
  58.                     url = 'http://www.theatlantic.com'+url
  59.                 p = post.find('p', attrs={'class':'dek'})
  60.                 desc = None
  61.                 self.log('\tFound article:', title, 'at', url)
  62.                 if p is not None:
  63.                     desc = self.tag_to_string(p)
  64.                     self.log('\t\t', desc)
  65.                 articles.append({'title':title, 'url':url, 'description':desc,
  66.                     'date':''})
  67.             feeds.append((section_title, articles))
  68.  
  69.         poems = []
  70.         self.log('Found section: Poems')
  71.         for poem in soup.findAll('div', attrs={'class':'poem'}):
  72.             title = self.tag_to_string(poem.find('h4'))
  73.             desc  = self.tag_to_string(poem.find(attrs={'class':'author'}))
  74.             url   = 'http://www.theatlantic.com'+poem.find('a')['href']
  75.             self.log('\tFound article:', title, 'at', url)
  76.             self.log('\t\t', desc)
  77.             poems.append({'title':title, 'url':url, 'description':desc,
  78.                     'date':''})
  79.         if poems:
  80.             feeds.append(('Poems', poems))
  81.  
  82.         div = soup.find(id='advice')
  83.         if div is not None:
  84.             self.log('Found section: Advice')
  85.             title = self.tag_to_string(div.find('h4'))
  86.             url = 'http://www.theatlantic.com'+div.find('a')['href']
  87.             desc = self.tag_to_string(div.find('p'))
  88.             self.log('\tFound article:', title, 'at', url)
  89.             self.log('\t\t', desc)
  90.  
  91.         feeds.append(('Advice', [{'title':title, 'url':url, 'description':desc,
  92.                     'date':''}]))
  93.         return feeds
  94.  
  95.     def postprocess_html(self, soup, first):
  96.         for table in soup.findAll('table', align='right'):
  97.             img = table.find('img')
  98.             if img is not None:
  99.                 img.extract()
  100.                 caption = self.tag_to_string(table).strip()
  101.                 div = Tag(soup, 'div')
  102.                 div['style'] = 'text-align:center'
  103.                 div.insert(0, img)
  104.                 div.insert(1, Tag(soup, 'br'))
  105.                 if caption:
  106.                     div.insert(2, NavigableString(caption))
  107.                 table.replaceWith(div)
  108.  
  109.         return soup
  110.  
  111.