home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_4058 < prev    next >
Encoding:
Text File  |  2010-04-28  |  4.3 KB  |  104 lines

  1.  
  2. __license__   = 'GPL v3'
  3. __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
  4. '''
  5. www.sueddeutsche.de/sz/
  6. '''
  7.  
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9. from calibre import strftime
  10.  
  11. class SueddeutcheZeitung(BasicNewsRecipe):
  12.     title                  = 'Sueddeutche Zeitung'
  13.     __author__             = 'Darko Miletic'
  14.     description            = 'News from Germany. Access to paid content.'
  15.     publisher              = 'Sueddeutche Zeitung'
  16.     category               = 'news, politics, Germany'
  17.     no_stylesheets         = True
  18.     oldest_article         = 2
  19.     encoding               = 'cp1252'
  20.     needs_subscription     = True
  21.     remove_empty_feeds     = True
  22.     delay                  = 2
  23.     PREFIX                 = 'http://www.sueddeutsche.de'
  24.     INDEX                  = PREFIX + '/app/epaper/textversion/'
  25.     use_embedded_content   = False
  26.     masthead_url           = 'http://pix.sueddeutsche.de/img/layout/header/logo.gif'
  27.     language               = 'de'
  28.     publication_type       = 'newspaper'
  29.     extra_css              = ' body{font-family: Arial,Helvetica,sans-serif} '
  30.  
  31.     conversion_options = {
  32.                           'comment'          : description
  33.                         , 'tags'             : category
  34.                         , 'publisher'        : publisher
  35.                         , 'language'         : language
  36.                         , 'linearize_tables' : True
  37.                         }
  38.  
  39.     remove_attributes = ['height','width']
  40.  
  41.     def get_browser(self):
  42.         br = BasicNewsRecipe.get_browser()
  43.         if self.username is not None and self.password is not None:
  44.             br.open(self.INDEX)
  45.             br.select_form(name='lbox')
  46.             br['login_name'    ] = self.username
  47.             br['login_passwort'] = self.password
  48.             br.submit()
  49.         return br
  50.  
  51.     remove_tags        =[
  52.                          dict(attrs={'class':'hidePrint'})
  53.                         ,dict(name=['link','object','embed','base','iframe'])
  54.                         ]
  55.     keep_only_tags     = [dict(attrs={'class':'artikelBox'})]
  56.     remove_tags_before =  dict(attrs={'class':'artikelTitel'})
  57.     remove_tags_after  =  dict(attrs={'class':'author'})
  58.  
  59.     feeds = [
  60.                (u'Politik'      , INDEX + 'Politik/'      )
  61.               ,(u'Seite drei'   , INDEX + 'Seite+drei/'   )
  62.               ,(u'Meinungsseite', INDEX + 'Meinungsseite/')
  63.               ,(u'Wissen'       , INDEX + 'Wissen/'       )
  64.               ,(u'Panorama'     , INDEX + 'Panorama/'     )
  65.               ,(u'Feuilleton'   , INDEX + 'Feuilleton/'   )
  66.               ,(u'Medien'       , INDEX + 'Medien/'       )
  67.               ,(u'Wirtschaft'   , INDEX + 'Wirtschaft/'   )
  68.               ,(u'Sport'        , INDEX + 'Sport/'        )
  69.               ,(u'Bayern'       , INDEX + 'Bayern/'       )
  70.               ,(u'Muenchen'     , INDEX + 'M%FCnchen/'    )
  71.             ]
  72.  
  73.     def parse_index(self):
  74.         src = self.index_to_soup(self.INDEX)
  75.         id = ''
  76.         for itt in src.findAll('a',href=True):
  77.             if itt['href'].startswith('/app/epaper/textversion/inhalt/'):
  78.                id = itt['href'].rpartition('/inhalt/')[2]
  79.         totalfeeds = []
  80.         lfeeds = self.get_feeds()
  81.         for feedobj in lfeeds:
  82.             feedtitle, feedurl = feedobj
  83.             self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
  84.             articles = []
  85.             soup = self.index_to_soup(feedurl + id)
  86.             tbl = soup.find(attrs={'class':'szprintd'})
  87.             for item in tbl.findAll(name='td',attrs={'class':'topthema'}):
  88.                 atag    = item.find(attrs={'class':'Titel'}).a
  89.                 ptag    = item.find('p')
  90.                 stag    = ptag.find('script')
  91.                 if stag:
  92.                    stag.extract()
  93.                 url           = self.PREFIX + atag['href']
  94.                 title         = self.tag_to_string(atag)
  95.                 description   = self.tag_to_string(ptag)
  96.                 articles.append({
  97.                                       'title'      :title
  98.                                      ,'date'       :strftime(self.timefmt)
  99.                                      ,'url'        :url
  100.                                      ,'description':description
  101.                                     })
  102.             totalfeeds.append((feedtitle, articles))
  103.         return totalfeeds
  104.