home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_4060 < prev    next >
Encoding:
Text File  |  2010-07-27  |  4.0 KB  |  91 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2010 Ingo Paschke <ipaschke@gmail.com>'
  3.  
  4. '''
  5. Fetch Tagesspiegel.
  6. '''
  7. import string, re
  8. from calibre import strftime
  9. from calibre.web.feeds.news import BasicNewsRecipe
  10.  
  11. class TagesspiegelRSS(BasicNewsRecipe):
  12.     title          = u'Der Tagesspiegel'
  13.     __author__ = 'Ingo Paschke'
  14.     language = 'de'
  15.     oldest_article = 7
  16.     max_articles_per_feed = 100
  17.  
  18.     extra_css = '''
  19.                 .hcf-overline{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;display:block}
  20.                 .hcf-teaser{font-family:Verdana,Arial,Helvetica;font-size:x-small;margin-top:0}
  21.                 h1{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;}
  22.                 .hcf-caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
  23.                 .hcf-copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
  24.                 .hcf-article{font-family:Arial,Helvetica;font-size:x-small}
  25.                 .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
  26.                 .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small}
  27.                 .hcf-inline-left{float:left;margin-right:15px;position:relative;}
  28.                 .hcf-inline-right{float:right;margin-right:15px;position:relative;}
  29.                 .hcf-smart-box{font-family: Arial, Helvetica, sans-serif; font-size: xx-small; margin: 0px 15px 8px 0px; width: 300px;}
  30.                 '''
  31.  
  32.     no_stylesheets = True
  33.     no_javascript = True
  34.     remove_empty_feeds = True
  35.     encoding = 'utf-8'
  36.  
  37.     keep_only_tags = dict(name='div', attrs={'class':["hcf-article"]})
  38.     remove_tags = [
  39.                     dict(name='link'), dict(name='iframe'),dict(name='style'),dict(name='meta'),dict(name='button'),
  40.                     dict(name='div', attrs={'class':["hcf-jump-to-comments","hcf-clear","hcf-magnify hcf-media-control"] }),
  41.                     dict(name='span', attrs={'class':["hcf-mainsearch",] }),
  42.                     dict(name='ul', attrs={'class':["hcf-tools"]}),
  43.                     dict(name='ul', attrs={'class': re.compile('hcf-services')})
  44.                   ]
  45.  
  46.     def parse_index(self):
  47.         soup = self.index_to_soup('http://www.tagesspiegel.de/zeitung/')
  48.  
  49.         def feed_title(div):
  50.             return ''.join(div.findAll(text=True, recursive=False)).strip() if div is not None else None
  51.  
  52.         articles = {}
  53.         key = None
  54.         ans = []
  55.         maincol = soup.find('div', attrs={'class':re.compile('hcf-main-col')})
  56.  
  57.         for div in maincol.findAll(True, attrs={'class':['hcf-teaser', 'hcf-header', 'story headline']}):
  58.  
  59.              if div['class'] == 'hcf-header':
  60.                  try:
  61.                      key = string.capwords(feed_title(div.em.a))
  62.                      articles[key] = []
  63.                      ans.append(key)
  64.                  except:
  65.                      continue
  66.  
  67.              elif div['class'] == 'hcf-teaser' and getattr(div.contents[0],'name','') == 'h2':
  68.                  a = div.find('a', href=True)
  69.                  if not a:
  70.                      continue
  71.                  url = 'http://www.tagesspiegel.de' + a['href']
  72.                  title = self.tag_to_string(a, use_alt=True).strip()
  73.                  description = ''
  74.                  pubdate = strftime('%a, %d %b')
  75.                  summary = div.find('p', attrs={'class':'hcf-teaser'})
  76.                  if summary:
  77.                      description = self.tag_to_string(summary, use_alt=False)
  78.  
  79.                  feed = key if key is not None else 'Uncategorized'
  80.                  if not articles.has_key(feed):
  81.                      articles[feed] = []
  82.                  if not 'podcasts' in url:
  83.                      articles[feed].append(
  84.                                dict(title=title, url=url, date=pubdate,
  85.                                     description=re.sub('mehr$', '', description),
  86.                                     content=''))
  87.  
  88.         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
  89.  
  90.         return ans
  91.