home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_4219 < prev    next >
Encoding:
Text File  |  2010-10-21  |  4.5 KB  |  90 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. newscientist.com
  5. '''
  6.  
  7. import re
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9.  
  10. class NewScientist(BasicNewsRecipe):
  11.     title                 = 'New Scientist - Online News w. subscription'
  12.     __author__            = 'Darko Miletic'
  13.     description           = 'Science news and science articles from New Scientist.'
  14.     language              = 'en'
  15.     publisher             = 'Reed Business Information Ltd.'
  16.     category              = 'science news, science articles, science jobs, drugs, cancer, depression, computer software'
  17.     oldest_article        = 7
  18.     max_articles_per_feed = 100
  19.     no_stylesheets        = True
  20.     use_embedded_content  = False
  21.     cover_url             = 'http://www.newscientist.com/currentcover.jpg'
  22.     masthead_url          = 'http://www.newscientist.com/img/misc/ns_logo.jpg'
  23.     encoding              = 'utf-8'
  24.     needs_subscription    = 'optional'
  25.     extra_css             = """
  26.                                  body{font-family: Arial,sans-serif}
  27.                                  img{margin-bottom: 0.8em}
  28.                                  .quotebx{font-size: x-large; font-weight: bold; margin-right: 2em; margin-left: 2em}
  29.                             """
  30.  
  31.     conversion_options = {
  32.                           'comment'          : description
  33.                         , 'tags'             : category
  34.                         , 'publisher'        : publisher
  35.                         , 'language'         : language
  36.                         }
  37.     preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')]
  38.  
  39.     keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]
  40.  
  41.     def get_browser(self):
  42.         br = BasicNewsRecipe.get_browser()
  43.         br.open('http://www.newscientist.com/')
  44.         if self.username is not None and self.password is not None:
  45.             br.open('https://www.newscientist.com/user/login?redirectURL=')
  46.             br.select_form(nr=2)
  47.             br['loginId' ] = self.username
  48.             br['password'] = self.password
  49.             br.submit()
  50.         return br
  51.  
  52.     remove_tags = [
  53.                      dict(name='div'  , attrs={'class':['hldBd','adline','pnl','infotext' ]})
  54.                     ,dict(name='div'  , attrs={'id'   :['compnl','artIssueInfo','artTools','comments','blgsocial','sharebtns']})
  55.                     ,dict(name='p'    , attrs={'class':['marker','infotext'               ]})
  56.                     ,dict(name='meta' , attrs={'name' :'description'                       })
  57.                     ,dict(name='a'    , attrs={'rel'  :'tag'                               })
  58.                     ,dict(name=['link','base','meta','iframe','object','embed'])
  59.                   ]
  60.     remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
  61.     remove_attributes = ['height','width','lang']
  62.  
  63.     feeds          = [
  64.                         (u'Latest Headlines'        , u'http://feeds.newscientist.com/science-news'              )
  65.                        ,(u'Magazine'                , u'http://www.newscientist.com/feed/magazine'               )
  66.                        ,(u'Health'                  , u'http://www.newscientist.com/feed/view?id=2&type=channel' )
  67.                        ,(u'Life'                    , u'http://www.newscientist.com/feed/view?id=3&type=channel' )
  68.                        ,(u'Space'                   , u'http://www.newscientist.com/feed/view?id=6&type=channel' )
  69.                        ,(u'Physics and Mathematics' , u'http://www.newscientist.com/feed/view?id=4&type=channel' )
  70.                        ,(u'Environment'             , u'http://www.newscientist.com/feed/view?id=1&type=channel' )
  71.                        ,(u'Science in Society'      , u'http://www.newscientist.com/feed/view?id=5&type=channel' )
  72.                        ,(u'Tech'                    , u'http://www.newscientist.com/feed/view?id=7&type=channel' )
  73.                      ]
  74.  
  75.     def get_article_url(self, article):
  76.         return article.get('guid',  None)
  77.  
  78.     def print_version(self, url):
  79.         return url + '?full=true&print=true'
  80.  
  81.     def preprocess_html(self, soup):
  82.         for item in soup.findAll(['quote','quotetext']):
  83.             item.name='p'
  84.         for tg in soup.findAll('a'):
  85.             if tg.string == 'Home':
  86.                 tg.parent.extract()
  87.                 return self.adeify_images(soup)
  88.         return self.adeify_images(soup)
  89.  
  90.