home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_4299 < prev    next >
Encoding:
Text File  |  2010-10-03  |  4.5 KB  |  111 lines

  1. from calibre.web.feeds.news import re
  2. from calibre.web.feeds.recipes import BasicNewsRecipe
  3. from BeautifulSoup import Tag
  4.  
  5. class RevistaMuyInteresante(BasicNewsRecipe):
  6.  
  7.     title       = 'Revista Muy Interesante'
  8.     __author__  = 'Jefferson Frantz'
  9.     description = 'Revista de divulgacion'
  10.     timefmt = ' [%d %b, %Y]'
  11.     language = 'es'
  12.  
  13.     no_stylesheets = True
  14.     remove_javascript = True
  15.  
  16.     extra_css              = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}'
  17.  
  18.  
  19.     def preprocess_html(self, soup):
  20.             for item in soup.findAll(style=True):
  21.                del item['style']
  22.  
  23.             for img_tag in soup.findAll('img'):
  24.                 imagen = img_tag
  25.                 new_tag = Tag(soup,'p')
  26.                 img_tag.replaceWith(new_tag)
  27.                 div = soup.find(attrs={'class':'article_category'})
  28.                 div.insert(0,imagen)
  29.                 break
  30.             return soup
  31.  
  32.  
  33.     preprocess_regexps = [
  34.         (re.compile(r'<td class="contentheading" width="100%">.*?</td>', re.DOTALL|re.IGNORECASE), lambda match: '<td class="contentheading">' + match.group().replace('<td class="contentheading" width="100%">','').strip().replace('</td>','').strip() + '</td>'),
  35.  
  36.     ]
  37.  
  38.  
  39.     keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]
  40.  
  41.     remove_tags        = [
  42.                              dict(name=['object','link','script','ul'])
  43.                             ,dict(name='div', attrs={'id':['comment']})
  44.                             ,dict(name='td', attrs={'class':['buttonheading']})
  45.                             ,dict(name='div', attrs={'class':['tags_articles']})
  46.                             ,dict(name='table', attrs={'class':['pagenav']})
  47.                          ]
  48.  
  49.     remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})
  50.  
  51.  
  52.     #TO GET ARTICLES IN SECTION
  53.     def nz_parse_section(self, url):
  54.             soup = self.index_to_soup(url)
  55.             div = soup.find(attrs={'class':'contenido'})
  56.             current_articles = []
  57.             for x in div.findAllNext(attrs={'class':['headline']}):
  58.                     a = x.find('a', href=True)
  59.                     if a is None:
  60.                         continue
  61.                     title = self.tag_to_string(a)
  62.                     url = a.get('href', False)
  63.                     if not url or not title:
  64.                         continue
  65.                     if url.startswith('/'):
  66.                          url = 'http://www.muyinteresante.es'+url
  67. #                    self.log('\t\tFound article:', title)
  68. #                    self.log('\t\t\t', url)
  69.                     current_articles.append({'title': title, 'url':url,
  70.                         'description':'', 'date':''})
  71.  
  72.             return current_articles
  73.  
  74.  
  75.     # To GET SECTIONS
  76.     def parse_index(self):
  77.             feeds = []
  78.             for title, url in [
  79.                 ('Historia',
  80.                  'http://www.muyinteresante.es/historia-articulos'),
  81.                 ('Ciencia',
  82.                  'http://www.muyinteresante.es/ciencia-articulos'),
  83.                 ('Naturaleza',
  84.                  'http://www.muyinteresante.es/naturaleza-articulos'),
  85.                 ('Tecnolog├¡a',
  86.                  'http://www.muyinteresante.es/tecnologia-articulos'),
  87.                 ('Salud',
  88.                  'http://www.muyinteresante.es/salud-articulos'),
  89.                 ('M├ís Muy',
  90.                  'http://www.muyinteresante.es/muy'),
  91.                 ('Innova - Automoci├│n',
  92.                  'http://www.muyinteresante.es/articulos-innovacion-autos'),
  93.                 ('Innova - Salud',
  94.                  'http://www.muyinteresante.es/articulos-innovacion-salud'),
  95.                 ('Innova - Medio Ambiente',
  96.                  'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'),
  97.                 ('Innova - Alimentaci├│n',
  98.                  'http://www.muyinteresante.es/articulos-innovacion-alimentacion'),
  99.                 ('Innova - Sociedad',
  100.                  'http://www.muyinteresante.es/articulos-innovacion-sociedad'),
  101.                 ('Innova - Tecnolog├¡a',
  102.                  'http://www.muyinteresante.es/articulos-innovacion-tecnologia'),
  103.                 ('Innova - Ocio',
  104.                  'http://www.muyinteresante.es/articulos-innovacion-ocio'),
  105.              ]:
  106.                articles = self.nz_parse_section(url)
  107.                if articles:
  108.                    feeds.append((title, articles))
  109.             return feeds
  110.  
  111.