home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3934 < prev    next >
Encoding:
Text File  |  2010-08-01  |  3.6 KB  |  71 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. newscientist.com
  5. '''
  6.  
  7. import re
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9.  
  10. class NewScientist(BasicNewsRecipe):
  11.     title                 = 'New Scientist - Online News'
  12.     __author__            = 'Darko Miletic'
  13.     description           = 'Science news and science articles from New Scientist.'
  14.     language              = 'en'
  15.     publisher             = 'New Scientist'
  16.     category              = 'science news, science articles, science jobs, drugs, cancer, depression, computer software'
  17.     oldest_article        = 7
  18.     max_articles_per_feed = 100
  19.     no_stylesheets        = True
  20.     use_embedded_content  = False
  21.     cover_url             = 'http://www.newscientist.com/currentcover.jpg'
  22.     masthead_url          = 'http://www.newscientist.com/img/misc/ns_logo.jpg'
  23.     encoding              = 'utf-8'
  24.     extra_css             = ' body{font-family: Arial,sans-serif} img{margin-bottom: 0.8em} '
  25.  
  26.     conversion_options = {
  27.                           'comment'          : description
  28.                         , 'tags'             : category
  29.                         , 'publisher'        : publisher
  30.                         , 'language'         : language
  31.                         }
  32.     preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')]
  33.  
  34.     keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]
  35.  
  36.     remove_tags = [
  37.                      dict(name='div'  , attrs={'class':['hldBd','adline','pnl','infotext' ]})
  38.                     ,dict(name='div'  , attrs={'id'   :['compnl','artIssueInfo','artTools','comments','blgsocial','sharebtns']})
  39.                     ,dict(name='p'    , attrs={'class':['marker','infotext'               ]})
  40.                     ,dict(name='meta' , attrs={'name' :'description'                       })
  41.                     ,dict(name='a'    , attrs={'rel'  :'tag'                                })
  42.                   ]
  43.     remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
  44.     remove_attributes = ['height','width']
  45.  
  46.     feeds          = [
  47.                         (u'Latest Headlines'        , u'http://feeds.newscientist.com/science-news'              )
  48.                        ,(u'Magazine'                , u'http://www.newscientist.com/feed/magazine'               )
  49.                        ,(u'Health'                  , u'http://www.newscientist.com/feed/view?id=2&type=channel' )
  50.                        ,(u'Life'                    , u'http://www.newscientist.com/feed/view?id=3&type=channel' )
  51.                        ,(u'Space'                   , u'http://www.newscientist.com/feed/view?id=6&type=channel' )
  52.                        ,(u'Physics and Mathematics' , u'http://www.newscientist.com/feed/view?id=4&type=channel' )
  53.                        ,(u'Environment'             , u'http://www.newscientist.com/feed/view?id=1&type=channel' )
  54.                        ,(u'Science in Society'      , u'http://www.newscientist.com/feed/view?id=5&type=channel' )
  55.                        ,(u'Tech'                    , u'http://www.newscientist.com/feed/view?id=7&type=channel' )
  56.                      ]
  57.  
  58.     def get_article_url(self, article):
  59.         return article.get('guid',  None)
  60.  
  61.     def print_version(self, url):
  62.         return url + '?full=true&print=true'
  63.  
  64.     def preprocess_html(self, soup):
  65.         for tg in soup.findAll('a'):
  66.             if tg.string == 'Home':
  67.                 tg.parent.extract()
  68.                 return self.adeify_images(soup)
  69.         return self.adeify_images(soup)
  70.  
  71.