home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3821 < prev    next >
Encoding:
Text File  |  2009-11-11  |  3.3 KB  |  97 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
  5. '''
  6. infobae.com
  7. '''
  8. import re
  9. import urllib, urlparse
  10.  
  11. from calibre.web.feeds.news import BasicNewsRecipe
  12.  
class Infobae(BasicNewsRecipe):
    """Calibre news-download recipe for infobae.com (Argentine news portal).

    Fetches four RSS feeds, strips site chrome/ads, normalizes article
    URLs and markup, and emits clean HTML for e-book conversion.
    Python 2 / old-BeautifulSoup code — runs inside calibre's recipe
    framework only.
    """
    title                 = 'Infobae.com'
    __author__            = 'Darko Miletic and Sujata Raman'
    description           = 'Informacion Libre las 24 horas'
    publisher             = 'Infobae.com'
    category              = 'news, politics, Argentina'
    oldest_article        = 1      # fetch only articles from the last day
    max_articles_per_feed = 100
    no_stylesheets        = True   # drop the site's CSS; extra_css below replaces it
    use_embedded_content  = False  # RSS entries carry summaries only; fetch full pages
    language = 'es'
    lang = 'es-AR'

    encoding              = 'cp1252'
    cover_url             = 'http://www.infobae.com/imgs/header/header.gif'
    remove_javascript     = True
    # Delete the site's Description <meta> tag from the raw HTML before
    # parsing — applied as a regex substitution on the downloaded source.
    preprocess_regexps = [(re.compile(
        r'<meta name="Description" content="[^"]+">'), lambda m:'')]

    # NOTE: evaluated at class-creation time, so it must stay below the
    # publisher/description/category attributes it concatenates.
    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'

    extra_css = '''
                    .col-center{font-family:Arial,Helvetica,sans-serif;}
                    h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
                    .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
                '''

    # Keep only the main article container; everything else is discarded.
    keep_only_tags = [dict(name='div', attrs={'class':['content']})]

    # Inside the kept container, remove sharing widgets, sidebars,
    # banners, "most read" boxes, comment anchors, iframes and the
    # image-gallery link icon.
    remove_tags = [
               dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
               dict(name='a', attrs={'name' : 'comentario',}),
               dict(name='iframe'),
               dict(name='img', alt = "Ver galerias de imagenes"),

                                 ]

    feeds = [
              (u'Noticias'  , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml'       )
             ,(u'Salud'     , u'http://www.infobae.com/adjuntos/html/RSS/salud.xml'     )
             ,(u'Tecnologia', u'http://www.infobae.com/adjuntos/html/RSS/tecnologia.xml')
             ,(u'Deportes'  , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml'  )
            ]

#    def print_version(self, url):
#        main, sep, article_part = url.partition('contenidos/')
#        article_id, rsep, rrest = article_part.partition('-')
#        return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id

    def get_article_url(self, article):
        """Percent-encode the path component of the feed item's link.

        Article URLs may contain non-ASCII characters; quoting the path
        (and only the path — scheme/host/query are left alone) keeps the
        download request valid. Returns a unicode URL string.
        """
        ans = article.get('link').encode('utf-8')
        parts = list(urlparse.urlparse(ans))
        parts[2] = urllib.quote(parts[2])  # index 2 is the path component
        ans = urlparse.urlunparse(parts)
        return ans.decode('utf-8')


    def preprocess_html(self, soup):
        """Clean the parsed article before conversion.

        Drops stray <strong> tags from <head>, strips all existing <meta>
        tags (their content= attribute is deleted first, then the tag is
        removed), injects charset/language meta declarations, and removes
        every inline style attribute so extra_css takes full effect.
        """
        for tag in soup.head.findAll('strong'):
            tag.extract()
        for tag in soup.findAll('meta'):
            del tag['content']
            tag.extract()

        # Old-BeautifulSoup idiom: inserting a raw markup string into the
        # tree; declares the (re-encoded) charset and es-AR language.
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['style']

        return soup

    def postprocess_html(self, soup, first):
        """Normalize emphasis markup: rename every <strong> tag to <b>."""
        for tag in soup.findAll(name='strong'):
             tag.name = 'b'

        return soup
  94.  
  95.  
  96.  
  97.