home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_4096 < prev    next >
Encoding:
Text File  |  2010-05-21  |  5.0 KB  |  104 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
  5. '''
  6. timesonline.co.uk
  7. '''
  8. import re
  9.  
  10. from calibre.web.feeds.news import BasicNewsRecipe
  11. from calibre.ebooks.BeautifulSoup import Tag
  12.  
  13. class Timesonline(BasicNewsRecipe):
  14.     title                  = 'The Times Online'
  15.     __author__             = 'Darko Miletic and Sujata Raman'
  16.     description            = 'UK news'
  17.     publisher              = 'timesonline.co.uk'
  18.     category               = 'news, politics, UK'
  19.     oldest_article         = 2
  20.     max_articles_per_feed  = 100
  21.     no_stylesheets         = True
  22.     use_embedded_content   = False
  23.     simultaneous_downloads = 1
  24.     encoding               = 'ISO-8859-1'
  25.     remove_javascript = True
  26.     language = 'en_GB'
  27.     recursions = 9
  28.     match_regexps = [r'http://www.timesonline.co.uk/.*page=[2-9]']
  29.  
  30.     preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
  31.  
  32.     keep_only_tags =  [
  33.                         dict(name='div', attrs= {'id':['region-column1and2-layout2']}),
  34.                         {'class' : ['subheading']},
  35.                         dict(name='div', attrs= {'id':['dynamic-image-holder']}),
  36.                         dict(name='div', attrs= {'class':['article-author']}),
  37.                         dict(name='div', attrs= {'id':['related-article-links']}),
  38.                         ]
  39.  
  40.     remove_tags        = [
  41.                           dict(name=['embed','object','form','iframe']),
  42.                           dict(name='span', attrs = {'class':'float-left padding-left-8 padding-top-2'}),
  43.                           dict(name='div', attrs= {'id':['region-footer','region-column2-layout2','grid-column4','login-status','comment-sort-order']}),
  44.                           dict(name='div', attrs= {'class': ['debate-quote-container','clear','your-comment','float-left related-attachements-container','float-left padding-bottom-5 padding-top-8','puff-top']}),
  45.                           dict(name='span', attrs = {'id': ['comment-count']}),
  46.                           dict(name='ul',attrs = {'id': 'read-all-comments'}),
  47.                           dict(name='a', attrs = {'class':'reg-bold'}),
  48.                           ]
  49.  
  50.  
  51.     extra_css = '''
  52.                 .small{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
  53.                 .byline{font-family :Arial,Helvetica,sans-serif; font-size:x-small; background:#F8F1D8;}
  54.                 .color-666{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666; }
  55.                 h1{font-family:Georgia,Times New Roman,Times,serif;font-size:large; }
  56.                 .color-999 {color:#999999;}
  57.                 .x-small {font-size:x-small;}
  58.                 #related-article-links{font-family :Arial,Helvetica,sans-serif; font-size:small;}
  59.                 h2{color:#333333;font-family :Georgia,Times New Roman,Times,serif; font-size:small;}
  60.                 p{font-family :Arial,Helvetica,sans-serif; font-size:small;}
  61.                 '''
  62.  
  63.     feeds          = [
  64.                         (u'Top stories from Times Online', u'http://www.timesonline.co.uk/tol/feeds/rss/topstories.xml'     ),
  65.                         ('Latest Business News', 'http://www.timesonline.co.uk/tol/feeds/rss/business.xml'),
  66.                         ('Economics', 'http://www.timesonline.co.uk/tol/feeds/rss/economics.xml'),
  67.                         ('World News', 'http://www.timesonline.co.uk/tol/feeds/rss/worldnews.xml'),
  68.                         ('UK News', 'http://www.timesonline.co.uk/tol/feeds/rss/uknews.xml'),
  69.                         ('Travel News', 'http://www.timesonline.co.uk/tol/feeds/rss/travel.xml'),
  70.                         ('Sports News', 'http://www.timesonline.co.uk/tol/feeds/rss/sport.xml'),
  71.                         ('Film News', 'http://www.timesonline.co.uk/tol/feeds/rss/film.xml'),
  72.                         ('Tech news', 'http://www.timesonline.co.uk/tol/feeds/rss/tech.xml'),
  73.                         ('Literary Supplement', 'http://www.timesonline.co.uk/tol/feeds/rss/thetls.xml'),
  74.                      ]
  75.  
  76.     def get_cover_url(self):
  77.         cover_url = None
  78.         index = 'http://www.timesonline.co.uk/tol/newspapers/'
  79.         soup = self.index_to_soup(index)
  80.         link_item = soup.find(name = 'div',attrs ={'class': "float-left margin-right-15"})
  81.         if link_item:
  82.            cover_url = link_item.img['src']
  83.         return cover_url
  84.  
  85.     def get_article_url(self, article):
  86.         return article.get('guid',  None)
  87.  
  88.  
  89.     def preprocess_html(self, soup):
  90.         soup.html['xml:lang'] = self.language
  91.         soup.html['lang']     = self.language
  92.         mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.language)])
  93.         mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=ISO-8859-1")])
  94.         soup.head.insert(0,mlang)
  95.         soup.head.insert(1,mcharset)
  96.         return self.adeify_images(soup)
  97.  
  98.     def postprocess_html(self,soup,first):
  99.         for tag in soup.findAll(text = ['Previous Page','Next Page']):
  100.             tag.extract()
  101.         return soup
  102.  
  103.  
  104.