from calibre.web.feeds.news import BasicNewsRecipe

class IndiaToday(BasicNewsRecipe):

    title = 'India Today'
    __author__ = 'Kovid Goyal'
    language = 'en_IN'
    timefmt = ' [%d %m, %Y]'

    # Articles come from the magazine's issue archive, so use a very wide
    # date window to keep them from being discarded as stale.
    oldest_article = 700
    max_articles_per_feed = 10

    no_stylesheets = True

    # Keep only the story body: strip everything before the title and after
    # the right-hand sidebar, along with the sharing widgets.
    remove_tags_before = dict(id='content_story_title')
    remove_tags_after = dict(id='rightblockdiv')
    remove_tags = [dict(id=['rightblockdiv', 'share_links'])]

    extra_css = '#content_story_title { font-size: 170%; font-weight: bold;}'
    conversion_options = {'linearize_tables': True}

    def it_get_index(self):
        # Locate the most recent issue on the archive page, remember its
        # cover image for use as the e-book cover, and return the parsed
        # issue page.
        soup = self.index_to_soup('http://indiatoday.intoday.in/site/archive')
        a = soup.find('a', href=lambda x: x and 'issueId=' in x)
        url = 'http://indiatoday.intoday.in/site/' + a.get('href')
        img = a.find('img')
        self.cover_url = img.get('src')
        return self.index_to_soup(url)

    def parse_index(self):
        # Walk the issue page: each <h1> starts a new section, and each link
        # whose URL contains 'Story' is an article in the current section.
        soup = self.it_get_index()
        feeds, current_section, current_articles = [], None, []
        for x in soup.findAll(name=['h1', 'a']):
            if x.name == 'h1':
                if current_section and current_articles:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(x)
                current_articles = []
                self.log('\tFound section:', current_section)
            elif x.name == 'a' and 'Story' in x.get('href', ''):
                title = self.tag_to_string(x)
                url = x.get('href').replace(' ', '%20')
                if not url.startswith('/'):
                    url = 'http://indiatoday.intoday.in/site/' + url
                if title and url:
                    # Request the single-page version of the article
                    url += '?complete=1'
                    self.log('\tFound article:', title)
                    self.log('\t\t', url)
                    # Build a description from the byline (<h3>) and summary
                    # (<h4>) that follow the link, when they are present.
                    desc = ''
                    h3 = x.parent.findNextSibling('h3')
                    if h3 is not None:
                        desc = 'By ' + self.tag_to_string(h3)
                        h4 = h3.findNextSibling('h4')
                        if h4 is not None:
                            desc = self.tag_to_string(h4) + ' ' + desc
                    if desc:
                        self.log('\t\t', desc)
                    current_articles.append({'title': title, 'description': desc,
                        'url': url, 'date': ''})

        if current_section and current_articles:
            feeds.append((current_section, current_articles))

        return feeds

    def postprocess_html(self, soup, first):
        # Remove the table row containing the 'Print' link from the
        # downloaded article.
        a = soup.find(text='Print')
        if a is not None:
            tr = a.findParent('tr')
            if tr is not None:
                tr.extract()
        return soup
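
# Usage note (not part of the original recipe): a recipe file like this can
# be test-built from the command line with calibre's ebook-convert tool, for
# example:
#
#   ebook-convert india_today.recipe india_today.epub --test
#
# The --test switch downloads only a couple of articles per feed, which is a
# quick way to check that parse_index() and postprocess_html() behave as
# expected. The file name india_today.recipe is illustrative.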