home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3882 < prev    next >
Encoding:
Text File  |  2009-10-14  |  6.9 KB  |  153 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008, Mathieu Godlewski <mathieu at godlewski.fr>'
  5. '''
  6. lemonde.fr
  7. '''
  8.  
  9. import re
  10. from calibre.web.feeds.news import BasicNewsRecipe
  11.  
  12.  
  13. class LeMonde(BasicNewsRecipe):
  14.     title          = 'LeMonde.fr'
  15.     __author__ = 'Mathieu Godlewski and Sujata Raman'
  16.     description = 'Global news in french'
  17.     oldest_article = 3
  18.     language = 'fr'
  19.  
  20.     max_articles_per_feed = 30
  21.     no_stylesheets = True
  22.     remove_javascript = True
  23.  
  24.  
  25.    # cover_url='http://abonnes.lemonde.fr/titresdumonde/'+date.today().strftime("%y%m%d")+'/1.jpg'
  26.  
  27.  
  28.     extra_css = '''
  29.                     .dateline{color:#666666;font-family:verdana,sans-serif;font-size:x-small;}
  30.                     .author{font-family:verdana,sans-serif;font-size:x-small;color:#222222;}
  31.                     .articleImage{color:#666666;font-family:verdana,sans-serif;font-size:x-small;}
  32.                     .mainText{font-family:Georgia,serif;color:#222222;}
  33.                     .LM_articleText{font-family:Arial,Helvetica,sans-serif;}
  34.                     .LM_titleZone{font-family:Arial,Helvetica,sans-serif;}
  35.                     .mainContent{font-family:Georgia,serif;}
  36.                     .LM_content{font-family:Georgia,serif;}
  37.                     .LM_caption{font-family:Georgia,serif;font-size:-small;}
  38.                     .LM_imageSource{font-family:Arial,Helvetica,sans-serif;font-size:x-small;color:#666666;}
  39.                     h1{font-family:Arial,Helvetica,sans-serif;font-size:medium;color:#000000;}
  40.                     .post{font-family:Arial,Helvetica,sans-serif;}
  41.                     .mainTitle{font-family:Georgia,serif;}
  42.                     .content{font-family:Georgia,serif;}
  43.                     .entry{font-family:Georgia,serif;}
  44.                     h2{font-family:Arial,Helvetica,sans-serif;font-size:large;}
  45.                     small{font-family:Arial,Helvetica,sans-serif;  color:#ED1B23;}
  46.                 '''
  47.  
  48.     feeds =  [
  49.              ('A la Une', 'http://www.lemonde.fr/rss/une.xml'),
  50.              ('International', 'http://www.lemonde.fr/rss/sequence/0,2-3210,1-0,0.xml'),
  51.              ('Europe', 'http://www.lemonde.fr/rss/sequence/0,2-3214,1-0,0.xml'),
  52.              ('Societe', 'http://www.lemonde.fr/rss/sequence/0,2-3224,1-0,0.xml'),
  53.              ('Economie', 'http://www.lemonde.fr/rss/sequence/0,2-3234,1-0,0.xml'),
  54.              ('Medias', 'http://www.lemonde.fr/rss/sequence/0,2-3236,1-0,0.xml'),
  55.              ('Rendez-vous', 'http://www.lemonde.fr/rss/sequence/0,2-3238,1-0,0.xml'),
  56.              ('Sports', 'http://www.lemonde.fr/rss/sequence/0,2-3242,1-0,0.xml'),
  57.              ('Planete', 'http://www.lemonde.fr/rss/sequence/0,2-3244,1-0,0.xml'),
  58.              ('Culture', 'http://www.lemonde.fr/rss/sequence/0,2-3246,1-0,0.xml'),
  59.              ('Technologies', 'http://www.lemonde.fr/rss/sequence/0,2-651865,1-0,0.xml'),
  60.              ('Cinema', 'http://www.lemonde.fr/rss/sequence/0,2-3476,1-0,0.xml'),
  61.              ('Voyages', 'http://www.lemonde.fr/rss/sequence/0,2-3546,1-0,0.xml'),
  62.              ('Livres', 'http://www.lemonde.fr/rss/sequence/0,2-3260,1-0,0.xml'),
  63.              ('Examens', 'http://www.lemonde.fr/rss/sequence/0,2-3404,1-0,0.xml'),
  64.              ('Opinions', 'http://www.lemonde.fr/rss/sequence/0,2-3232,1-0,0.xml')
  65.              ]
  66.     keep_only_tags = [dict(name='div', attrs={'id':["mainTitle","mainContent","LM_content","content"]}),
  67.                       dict(name='div', attrs={'class':["post"]})
  68.                       ]
  69.  
  70.     remove_tags    = [dict(name='img', attrs={'src':'http://medias.lemonde.fr/mmpub/img/lgo/lemondefr_pet.gif'}),
  71.                                     dict(name='div', attrs={'id':'xiti-logo-noscript'}),
  72.                                     dict(name='br', attrs={}),
  73.                                     dict(name='iframe', attrs={}),
  74.                      dict(name='table', attrs={'id':["toolBox"]}),
  75.                       dict(name='table', attrs={'class':["bottomToolBox"]}),
  76.                       dict(name='div', attrs={'class':["pageNavigation","LM_pagination","fenetreBoxesContainer","breakingNews","LM_toolsBottom","LM_comments","LM_tools","pave_meme_sujet_hidden","boxMemeSujet"]}),
  77.                       dict(name='div', attrs={'id':["miniUne","LM_sideBar"]}),
  78.     ]
  79.  
  80.     preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
  81.         [
  82.             (r'<html.*(<div class="post".*?>.*?</div>.*?<div class="entry">.*?</div>).*You can start editing here.*</html>', lambda match : '<html><body>'+match.group(1)+'</body></html>'),
  83.             (r'<p> </p>', lambda match : ''),
  84.             (r'<img src="http://medias\.lemonde\.fr/mmpub/img/let/(.)\.gif"[^>]*><div class=ar-txt>', lambda match : '<div class=ar-txt>'+match.group(1).upper()),
  85.             (r'<img src="http://medias\.lemonde\.fr/mmpub/img/let/q(.)\.gif"[^>]*><div class=ar-txt>', lambda match : '<div class=ar-txt>"'+match.group(1).upper()),
  86.             (r'(<div class=desc><b>.*</b></div>).*</body>', lambda match : match.group(1)),
  87.         ]
  88.     ]
  89.  
  90.     article_match_regexps = [ (re.compile(i)) for i in
  91.         [
  92.             (r'http://www\.lemonde\.fr/\S+/article/.*'),
  93.             (r'http://www\.lemonde\.fr/\S+/portfolio/.*'),
  94.             (r'http://www\.lemonde\.fr/\S+/article_interactif/.*'),
  95.             (r'http://\S+\.blog\.lemonde\.fr/.*'),
  96.         ]
  97.     ]
  98.  
  99.    # def print_version(self, url):
  100.    #     return re.sub('http://www\.lemonde\.fr/.*_([0-9]+)_[0-9]+\.html.*','http://www.lemonde.fr/web/imprimer_element/0,40-0,50-\\1,0.html' ,url)
  101.  
  102.     # Used to filter duplicated articles
  103.     articles_list = []
  104.  
  105.     def get_cover_url(self):
  106.         cover_url = None
  107.         soup = self.index_to_soup('http://www.lemonde.fr/web/monde_pdf/0,33-0,1-0,0.html')
  108.         link_item = soup.find('div',attrs={'class':'pg-gch'})
  109.  
  110.         if link_item and link_item.img:
  111.            cover_url = link_item.img['src']
  112.  
  113.         return cover_url
  114.  
  115.     def get_article_url(self, article):
  116.         url=article.get('link',  None)
  117.         url=url[0:url.find("#")]
  118.         if url in self.articles_list:
  119.             self.log_debug(_('Skipping duplicated article: %s')%url)
  120.             return False
  121.         if self.is_article_wanted(url):
  122.             self.articles_list.append(url)
  123.             if '/portfolio/' in url or '/video/' in url:
  124.               url = None
  125.         return url
  126.         self.log_debug(_('Skipping filtered article: %s')%url)
  127.         url = article.get('guid', None)
  128.  
  129.  
  130.         return False
  131.  
  132.  
  133.     def is_article_wanted(self, url):
  134.         if self.article_match_regexps:
  135.             for m in self.article_match_regexps:
  136.                 if m.search(url):
  137.                     return True
  138.             return False
  139.         return False
  140.  
  141.     def preprocess_html(self, soup):
  142.  
  143.           for item in soup.findAll(style=True):
  144.               del item['style']
  145.  
  146.           for item in soup.findAll(face=True):
  147.               del item['face']
  148.           for tag in soup.findAll(name=['ul','li']):
  149.                 tag.name = 'div'
  150.  
  151.           return soup
  152.  
  153.