home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3838 < prev    next >
Encoding:
Text File  |  2010-03-30  |  3.3 KB  |  89 lines

  1. # -*- coding: utf-8 -*-
  2.  
  3. from calibre.web.feeds.recipes import BasicNewsRecipe
  4.  
  5. class JournalofHospitalMedicine(BasicNewsRecipe):
  6.  
  7.     title       = 'Journal of Hospital Medicine'
  8.     __author__  = 'Krittika Goyal'
  9.     description = 'Medical news'
  10.     timefmt = ' [%d %b, %Y]'
  11.     needs_subscription = True
  12.     language = 'en'
  13.  
  14.     no_stylesheets = True
  15.     #remove_tags_before = dict(name='div', attrs={'align':'center'})
  16.     #remove_tags_after  = dict(name='ol', attrs={'compact':'COMPACT'})
  17.     remove_tags = [
  18.        dict(name='iframe'),
  19.        dict(name='div', attrs={'class':'subContent'}),
  20.        dict(name='div', attrs={'id':['contentFrame']}),
  21.        #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or author')"}),
  22.        #dict(name='table', attrs={'align':'RIGHT'}),
  23.     ]
  24.  
  25.  
  26.  
  27.    # TO LOGIN
  28.     def get_browser(self):
  29.         br = BasicNewsRecipe.get_browser()
  30.         br.open('http://www3.interscience.wiley.com/cgi-bin/home')
  31.         br.select_form(name='siteLogin')
  32.         br['LoginName'] = self.username
  33.         br['Password'] = self.password
  34.         response = br.submit()
  35.         raw = response.read()
  36.         if 'userName = ""' in raw:
  37.             raise Exception('Login failed. Check your username and password')
  38.         return br
  39.  
  40.     #TO GET ARTICLE TOC
  41.     def johm_get_index(self):
  42.             return self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home')
  43.  
  44.     # To parse artice toc
  45.     def parse_index(self):
  46.             parse_soup = self.johm_get_index()
  47.  
  48.             div = parse_soup.find(id='contentCell')
  49.  
  50.             current_section = None
  51.             current_articles = []
  52.             feeds = []
  53.             for x in div.findAll(True):
  54.                 if x.name == 'h4':
  55.                     # Section heading found
  56.                     if current_articles and current_section:
  57.                         feeds.append((current_section, current_articles))
  58.                     current_section = self.tag_to_string(x)
  59.                     current_articles = []
  60.                     self.log('\tFound section:', current_section)
  61.                 if current_section is not None and x.name == 'strong':
  62.                     title = self.tag_to_string(x)
  63.                     p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x)
  64.                     if p is None:
  65.                         continue
  66.                     url = p.get('href', False)
  67.                     if not url or not title:
  68.                         continue
  69.                     if url.startswith('/'):
  70.                          url = 'http://www3.interscience.wiley.com'+url
  71.                     url = url.replace('/HTMLSTART', '/main.html,ftx_abs')
  72.                     self.log('\t\tFound article:', title)
  73.                     self.log('\t\t\t', url)
  74.                     #if url.startswith('/'):
  75.                         #url = 'http://online.wsj.com'+url
  76.                     current_articles.append({'title': title, 'url':url,
  77.                         'description':'', 'date':''})
  78.  
  79.             if current_articles and current_section:
  80.                 feeds.append((current_section, current_articles))
  81.  
  82.             return feeds
  83.  
  84.     def preprocess_html(self, soup):
  85.         for img in soup.findAll('img', src=True):
  86.             img['src'] = img['src'].replace('tfig', 'nfig')
  87.         return soup
  88.  
  89.