from calibre.web.feeds.recipes import BasicNewsRecipe

class NewZealandHerald(BasicNewsRecipe):

    title       = 'New Zealand Herald'
    __author__  = 'Krittika Goyal'
    description = 'Daily news'
    timefmt  = ' [%d %b, %Y]'
    language = 'en_NZ'

    no_stylesheets = True
    remove_tags_before = dict(name='div', attrs={'class':'contentContainer left eight'})
    remove_tags_after  = dict(name='div', attrs={'class':'callToAction'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':['sectionHeader', 'tools', 'callToAction',
            'contentContainer right two nopad relatedColumn']}),
        #dict(name='div', attrs={'id':['shareContainer']}),
        #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}),
        #dict(name='table', attrs={'cellspacing':'0'}),
    ]

    def preprocess_html(self, soup):
        # Drop the first table on the page, if present.
        table = soup.find('table')
        if table is not None:
            table.extract()
        return soup

    # Collect the articles listed on a single section page.
    def nz_parse_section(self, url):
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'class':'col-300 categoryList'})
        date = div.find(attrs={'class':'link-list-heading'})

        current_articles = []
        # Walk the link lists that follow the first heading and stop at the
        # next heading, which starts a different group of links.
        for x in date.findAllNext(attrs={'class':['linkList', 'link-list-heading']}):
            if x.get('class') == 'link-list-heading':
                break
            for li in x.findAll('li'):
                a = li.find('a', href=True)
                if a is None:
                    continue
                title = self.tag_to_string(a)
                url = a.get('href', False)
                if not url or not title:
                    continue
                if url.startswith('/'):
                    url = 'http://www.nzherald.co.nz' + url
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                current_articles.append({'title': title, 'url': url,
                    'description': '', 'date': ''})

        return current_articles

    # Build the top-level list of sections (feeds) for the download.
    def parse_index(self):
        feeds = []
        for title, url in [
            ('National',
             'http://www.nzherald.co.nz/nz/news/headlines.cfm?c_id=1'),
            ('World',
             'http://www.nzherald.co.nz/world/news/headlines.cfm?c_id=2'),
            ('Politics',
             'http://www.nzherald.co.nz/politics/news/headlines.cfm?c_id=280'),
            ('Crime',
             'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'),
            ('Environment',
             'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'),
        ]:
            # Only keep sections that actually yielded articles.
            articles = self.nz_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds
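
# A quick way to try a recipe like this is calibre's command-line converter.
# Assuming the file is saved as, say, nzherald.recipe (the filename here is
# only an example), the --test flag downloads just a couple of articles per
# feed so the scraping logic can be checked without a full fetch:
#
#   ebook-convert nzherald.recipe output.epub --test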