
#!/usr/bin/env python
__license__   = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
telegraph.co.uk
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TelegraphUK(BasicNewsRecipe):
    title                 = u'Telegraph.co.uk'
    __author__            = 'Darko Miletic and Sujata Raman'
    description           = 'News from United Kingdom'
    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets        = True
    language              = 'en'

    use_embedded_content  = False

    extra_css           = '''
                        h1{font-family: Arial,Helvetica,sans-serif; font-size: large;}
                        h2{font-family: Arial,Helvetica,sans-serif; font-size: x-small; color: #444444;}
                        .story{font-family: Arial,Helvetica,sans-serif; font-size: x-small;}
                        .byline{color: #666666; font-family: Arial,Helvetica,sans-serif; font-size: xx-small;}
                        a{color: #234B7B;}
                        .imageExtras{color: #666666; font-family: Arial,Helvetica,sans-serif; font-size: xx-small;}
                        '''

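    # Keep only the headline and story containers from each article page;
    # remove_tags then prunes gallery/navigation leftovers and comment widgets
    # that sit inside those containers.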
    keep_only_tags      = [
                           dict(name='div', attrs={'class':'storyHead'})
                          ,dict(name='div', attrs={'class':'story'    })
                          #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ]   })
                          ]
    remove_tags         = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
                          #,dict(name='div', attrs={'class':['toolshideoneQuarter']})
                          ,dict(name='span', attrs={'class':['num','placeComment']})
                          ]

    feeds               = [
                         (u'UK News'           , u'http://www.telegraph.co.uk/news/uknews/rss'                                      )
                        ,(u'World News'        , u'http://www.telegraph.co.uk/news/worldnews/rss'                                   )
                        ,(u'Politics'          , u'http://www.telegraph.co.uk/news/newstopics/politics/rss'                         )
                        ,(u'Technology News'   , u'http://www.telegraph.co.uk/scienceandtechnology/technology/technologynews/rss'   )
                        ,(u'Technology Reviews', u'http://www.telegraph.co.uk/scienceandtechnology/technology/technologyreviews/rss')
                        ,(u'Science News'      , u'http://www.telegraph.co.uk/scienceandtechnology/science/sciencenews/rss'         )
                        ,(u'Sport'             , u'http://www.telegraph.co.uk/sport/rss'                                            )
                        ,(u'Earth News'        , u'http://www.telegraph.co.uk/earth/earthnews/rss'                                  )
                        ,(u'Comment'           , u'http://www.telegraph.co.uk/comment/rss'                                          )
                        ,(u'How about that?'   , u'http://www.telegraph.co.uk/news/newstopics/howaboutthat/rss'                     )
                         ]

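    # Use the feed's <guid> as the article URL and skip picture-gallery links,
    # which are image viewers rather than regular article pages.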
    def get_article_url(self, article):

        url = article.get('guid', None)

        if url and ('picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url):
            url = None

        return url


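    # Tidy the byline block after each page is processed: stray <p> tags inside
    # it (the "Comments" count and similar clutter) are removed so that only the
    # author line remains.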
    def postprocess_html(self, soup, first):

        for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}):
            for pTag in bylineTag.findAll(name='p'):
                if pTag.contents and getattr(pTag.contents[0], "Comments", True):
                    pTag.extract()
        return soup
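
# A quick way to try this recipe outside the calibre GUI (assuming the calibre
# command-line tools are installed) is to save it as, say, telegraph_uk.recipe
# and run:
#
#     ebook-convert telegraph_uk.recipe telegraph_uk.epub --test
#
# The --test switch limits the run to a couple of feeds and a couple of
# articles per feed, which keeps a trial download short.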