home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3926 < prev    next >
Encoding:
Text File  |  2010-01-14  |  3.7 KB  |  99 lines

  1. # -*- coding: utf-8 -*-
  2. from calibre.web.feeds.recipes import BasicNewsRecipe
  3.  
  4. class NYTimes(BasicNewsRecipe):
  5.  
  6.     title       = 'New England Journal of Medicine'
  7.     __author__  = 'Krittika Goyal'
  8.     description = 'Medical news'
  9.     timefmt = ' [%d %b, %Y]'
  10.     needs_subscription = True
  11.     language = 'en'
  12.  
  13.     no_stylesheets = True
  14.     remove_tags_before = dict(name='div', attrs={'align':'center'})
  15.     remove_tags_after  = dict(name='ol', attrs={'compact':'COMPACT'})
  16.     remove_tags = [
  17.        dict(name='iframe'),
  18.        #dict(name='div', attrs={'class':'related-articles'}),
  19.        dict(name='div', attrs={'id':['sidebar']}),
  20.        #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or author')"}),
  21.        dict(name='table', attrs={'align':'RIGHT'}),
  22.     ]
  23.  
  24.  
  25.  
  26.     #TO LOGIN
  27.     def get_browser(self):
  28.         br = BasicNewsRecipe.get_browser()
  29.         br.open('http://content.nejm.org/cgi/login?uri=/')
  30.         br.select_form(nr=0)
  31.         br['username'] = self.username
  32.         br['code'] = self.password
  33.         response = br.submit()
  34.         raw = response.read()
  35.         if '<strong>Welcome' not in raw:
  36.             raise Exception('Login failed. Check your username and password')
  37.         return br
  38.  
  39.     #TO GET ARTICLE TOC
  40.     def nejm_get_index(self):
  41.             return self.index_to_soup('http://content.nejm.org/current.dtl')
  42.  
  43.     # To parse artice toc
  44.     def parse_index(self):
  45.             parse_soup = self.nejm_get_index()
  46.  
  47.             div = parse_soup.find(id='centerTOC')
  48.  
  49.             current_section = None
  50.             current_articles = []
  51.             feeds = []
  52.             for x in div.findAll(True):
  53.                 if x.name == 'img' and '/toc/' in x.get('src', '') and 'uarrow.gif' not in x.get('src', ''):
  54.                     # Section heading found
  55.                     if current_articles and current_section and 'Week in the' not in current_section:
  56.                         feeds.append((current_section, current_articles))
  57.                     current_section = x.get('alt')
  58.                     current_articles = []
  59.                     self.log('\tFound section:', current_section)
  60.                 if current_section is not None and x.name == 'strong':
  61.                     title = self.tag_to_string(x)
  62.                     a = x.parent.find('a', href=lambda x: x and '/full/' in x)
  63.                     if a is None:
  64.                         continue
  65.                     url = a.get('href', False)
  66.                     if not url or not title:
  67.                         continue
  68.                     if url.startswith('/'):
  69.                          url = 'http://content.nejm.org'+url
  70.                     self.log('\t\tFound article:', title)
  71.                     self.log('\t\t\t', url)
  72.                     if url.startswith('/'):
  73.                         url = 'http://online.wsj.com'+url
  74.                     current_articles.append({'title': title, 'url':url,
  75.                         'description':'', 'date':''})
  76.  
  77.             if current_articles and current_section:
  78.                 feeds.append((current_section, current_articles))
  79.  
  80.             return feeds
  81.  
  82.     def preprocess_html(self, soup):
  83.         for a in soup.findAll(text=lambda x: x and '[in this window]' in x):
  84.             a = a.findParent('a')
  85.             url = a.get('href', None)
  86.             if not url:
  87.                 continue
  88.             if url.startswith('/'):
  89.                 url = 'http://content.nejm.org'+url
  90.             isoup = self.index_to_soup(url)
  91.             img = isoup.find('img', src=lambda x: x and
  92.                     x.startswith('/content/'))
  93.             if img is not None:
  94.                 img.extract()
  95.                 table = a.findParent('table')
  96.                 table.replaceWith(img)
  97.         return soup
  98.  
  99.