home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_4151 < prev    next >
Encoding:
Text File  |  2010-07-30  |  5.1 KB  |  111 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
  3.  
  4. '''
  5. Fetch Die Zeit.
  6. '''
  7.  
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9. from calibre.ebooks.BeautifulSoup import Tag
  10.  
  11. class ZeitDe(BasicNewsRecipe):
  12.  
  13.     title = 'ZEIT Online'
  14.     description = 'ZEIT Online'
  15.     language = 'de'
  16.     lang = 'de_DE'
  17.  
  18.     __author__ = 'Martin Pitt, Sujata Raman and Ingo Paschke'
  19.     use_embedded_content   = False
  20.     max_articles_per_feed = 40
  21.     remove_empty_feeds = True
  22.     no_stylesheets = True
  23.     no_javascript = True
  24.     encoding = 'utf-8'
  25.  
  26.     feeds =  [
  27.                ('Seite 1', 'http://newsfeed.zeit.de/index_xml'),
  28.                ('Politik', 'http://newsfeed.zeit.de/politik/index'),
  29.                ('Wirtschaft', 'http://newsfeed.zeit.de/wirtschaft/index'),
  30.                ('Meinung', 'http://newsfeed.zeit.de/meinung/index'),
  31.                ('Gesellschaft', 'http://newsfeed.zeit.de/gesellschaft/index'),
  32.                ('Kultur', 'http://newsfeed.zeit.de/kultur/index'),
  33.                ('Wissen', 'http://newsfeed.zeit.de/wissen/index'),
  34.                ('Digital', 'http://newsfeed.zeit.de/digital/index'),
  35.                ('Studium', 'http://newsfeed.zeit.de/studium/index'),
  36.                ('Karriere', 'http://newsfeed.zeit.de/karriere/index'),
  37.                ('Lebensart', 'http://newsfeed.zeit.de/lebensart/index'),
  38.                ('Reisen', 'http://newsfeed.zeit.de/reisen/index'),
  39.                ('Auto', 'http://newsfeed.zeit.de/auto/index'),
  40.                ('Sport', 'http://newsfeed.zeit.de/sport/index'),
  41.              ]
  42.  
  43.     extra_css = '''
  44.                 .supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
  45.                 .excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:small;}
  46.                 .title{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;}
  47.                 .caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
  48.                 .copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
  49.                 .article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
  50.                 .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
  51.                 .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small}
  52.                 .headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small}
  53.                 .inline{float:left;margin-top:0;margin-right:15px;position:relative;width:180px; }
  54.                 img.inline{float:none}
  55.                 .intertitle{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small;font-weight:700}
  56.                 .ebinfobox{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small;list-style-type:none;float:right;margin-top:0;border-left-style:solid;border-left-width:1px;padding-left:10px;}
  57.                 .infobox {border-style: solid; border-width: 1px;padding:8px;}
  58.                 .infobox dt {font-weight:700;}
  59.                 '''
  60.     #filter_regexps = [r'ad.de.doubleclick.net/']
  61.  
  62.     keep_only_tags = [
  63.                         dict(name='div', attrs={'class':["article"]}) ,
  64.                         dict(name='ul', attrs={'class':["tools"]}) ,
  65.                          ]
  66.     remove_tags = [
  67.                     dict(name='link'), dict(name='iframe'),dict(name='style'),dict(name='meta'),
  68.                     dict(name='div', attrs={'class':["pagination block","pagenav","inline link", "copyright"] }),
  69.                     dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }),
  70.                     dict(name='div', attrs={'id':["place_5","place_4","comments"]})
  71.                   ]
  72.  
  73.     remove_attributes = ['style', 'font']
  74.  
  75.     def get_article_url(self, article):
  76.         ans = article.get('link',None)
  77.         ans += "?page=all"
  78.  
  79.         if 'video' in ans or 'quiz' in ans :
  80.               ans = None
  81.         return ans
  82.  
  83.     def get_cover_url(self):
  84.         try:
  85.             inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
  86.             return inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
  87.         except:
  88.             return 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
  89.  
  90.     def preprocess_html(self, soup):
  91.         soup.html['xml:lang'] = self.lang
  92.         soup.html['lang']     = self.lang
  93.         mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
  94.         soup.head.insert(0,mtag)
  95.         title = soup.find('h2', attrs={'class':'title'})
  96.         if title is None:
  97.             print "no title"
  98.             return soup
  99.         info = Tag(soup,'ul',[('class','ebinfobox')])
  100.         tools = soup.find('ul', attrs={'class':'tools'})
  101.         #author = tools.find('li','author first')
  102.         for tag in ['author first', 'date', 'date first', 'author', 'source']:
  103.             line = tools.find('li', tag)
  104.             if line:
  105.                 info.insert(0,line)
  106.         title.parent.insert(0,info)
  107.         tools.extract()
  108.         return soup
  109.  
  110.  
  111.