home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_4139 < prev    next >
Encoding:
Text File  |  2010-09-30  |  4.0 KB  |  87 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
  5. '''
  6. latimes.com
  7. '''
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9.  
  10. class LATimes(BasicNewsRecipe):
  11.     title                 = u'The Los Angeles Times'
  12.     __author__            = u'Darko Miletic and Sujata Raman'
  13.     description           = u'News from Los Angeles'
  14.     oldest_article        = 7
  15.     max_articles_per_feed = 100
  16.     language              = 'en'
  17.     no_stylesheets        = True
  18.     use_embedded_content  = False
  19.     encoding              = 'utf-8'
  20.     lang                  = 'en-US'
  21.  
  22.     conversion_options = {
  23.           'comment'          : description
  24.         , 'language'         : lang
  25.     }
  26.  
  27.     extra_css = '''
  28.                 h1{font-family :Georgia,"Times New Roman",Times,serif; font-size:large; }
  29.                 h2{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
  30.                 .story{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
  31.                 .entry-body{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
  32.                 .entry-more{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
  33.                 .credit{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
  34.                 .small{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
  35.                 .byline{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
  36.                 .date{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
  37.                 .time{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
  38.                 .copyright{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; }
  39.                 .subhead{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
  40.                 '''
  41.  
  42.    # recursions = 1
  43.    # match_regexps = [r'http://www.latimes.com/.*page=[2-9]']
  44.  
  45.     keep_only_tags    = [dict(name='div', attrs={'class':["story"  ,"entry"] })]
  46.  
  47.  
  48.     remove_tags      = [   dict(name='div', attrs={'class':['articlerail',"sphereTools","tools","toppaginate","entry-footer-left","entry-footer-right"]}),
  49.                             dict(name='div', attrs={'id':["moduleArticleToolsContainer",]}),
  50.                             dict(name='p', attrs={'class':["entry-footer",]}),
  51.                            dict(name='ul', attrs={'class':"article-nav clearfix"}),
  52.                             dict(name=['iframe'])
  53.                         ]
  54.  
  55.  
  56.     feeds          = [(u'News', u'http://feeds.latimes.com/latimes/news')
  57.                       ,(u'Local','http://feeds.latimes.com/latimes/news/local')
  58.                       ,(u'MostEmailed','http://feeds.latimes.com/MostEmailed')
  59.                       ,(u'Politics','http://feeds.latimes.com/latimes/news/local/politics/cal/')
  60.                       ,('OrangeCounty','http://feeds.latimes.com/latimes/news/local/orange/')
  61.                       ,('National','http://feeds.latimes.com/latimes/news/nationworld/nation')
  62.                       ,('Politics','http://feeds.latimes.com/latimes/news/politics/')
  63.                       ,('Business','http://feeds.latimes.com/latimes/business')
  64.                       ,('Sports','http://feeds.latimes.com/latimes/sports/')
  65.                       ,('Entertainment','http://feeds.latimes.com/latimes/entertainment/')
  66.                       ]
  67.  
  68.  
  69.     def get_article_url(self, article):
  70.         ans = article.get('feedburner_origlink').rpartition('?')[0]
  71.  
  72.         try:
  73.             self.log('Looking for full story link in', ans)
  74.             soup = self.index_to_soup(ans)
  75.             x = soup.find(text="single page")
  76.  
  77.             if x is not None:
  78.                 a = x.parent
  79.                 if a and a.has_key('href'):
  80.                     ans = 'http://www.latimes.com'+a['href']
  81.                     self.log('Found full story link', ans)
  82.         except:
  83.             pass
  84.         return ans
  85.  
  86.  
  87.