#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
'''
time.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe


class Time(BasicNewsRecipe):
    title             = u'Time'
    __author__        = 'Kovid Goyal and Sujata Raman'
    description       = 'Weekly magazine'
    encoding          = 'utf-8'
    no_stylesheets    = True
    language          = 'en'
    remove_javascript = True

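    # CSS injected into every generated article page; the selectors target
    # time.com's own class names.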
    extra_css = '''
        h1 {font-family:georgia,serif; color:#000000;}
        .mainHd {font-family:georgia,serif; color:#000000;}
        h2 {font-family:Arial,sans-serif;}
        .name {font-family:Arial,sans-serif; font-size:x-small; font-weight:bold;}
        .date {font-family:Arial,sans-serif; font-size:x-small; color:#999999;}
        .byline {font-family:Arial,sans-serif; font-size:x-small;}
        .photoBkt {font-size:x-small;}
        .vertPhoto {font-size:x-small;}
        .credits {font-family:Arial,sans-serif; font-size:x-small; color:gray;}
        .credit {font-family:Arial,sans-serif; font-size:x-small; color:gray;}
        .artTxt {font-family:georgia,serif;}
        #content {font-family:georgia,serif;}
        .caption {font-family:georgia,serif; font-size:x-small; color:#333333;}
        .credit {font-family:georgia,serif; font-size:x-small; color:#999999;}
        a:link {color:#CC0000;}
        .breadcrumb {font-family:Arial,sans-serif; font-size:x-small;}
    '''
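    # Keep only the article container and body blocks; strip navigation,
    # sharing widgets and other page furniture.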
    keep_only_tags = [
        dict(name='div', attrs={'id': ['content']}),
        dict(name='div', attrs={'class': ['artHd', 'artTxt', 'photoBkt',
                                          'vertPhoto', 'image', 'copy']}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': ['articleFooterNav', 'listsByTopic',
                                          'articleTools2', 'relatedContent',
                                          'sideContent', 'topBannerWrap',
                                          'articlePagination', 'nextUp', 'rtCol',
                                          'pagination', 'enlarge', 'contentTools2']}),
        dict(name='span', attrs={'class': ['see']}),
        dict(name='div', attrs={'id': ['header', 'articleSideBar', 'articleTools',
                                       'articleFooter', 'cmBotLt', 'quigoPackage']}),
        dict(name='a', attrs={'class': ['listLink']}),
        dict(name='ul', attrs={'id': ['shareSocial', 'tabs']}),
        dict(name='li', attrs={'class': ['back']}),
        dict(name='ul', attrs={'class': ['navCount']}),
    ]
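    # Articles can span several pages; follow links up to ten levels deep,
    # but only into URLs that match these patterns.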
    recursions = 10
    match_regexps = [
        r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+)?\.html',
        r'http://www.time.com/time/specials/packages/article/.*',
    ]

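    # Strip self-closing <meta> tags from the raw HTML before it is parsed.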
    preprocess_regexps = [
        # Non-greedy match, so only the tag itself is removed.
        (re.compile(r'<meta .+?/>'), lambda m: ''),
    ]

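    # Build the (section, articles) feeds from the magazine's table of
    # contents, picking up the cover image along the way.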
    def parse_index(self):
        soup = self.index_to_soup('http://www.time.com/time/magazine')

        # The "View Large Cover" link leads to a page carrying the
        # full-size cover image.
        img = soup.find('a', title='View Large Cover', href=True)
        if img is not None:
            cover_url = 'http://www.time.com' + img['href']
            try:
                nsoup = self.index_to_soup(cover_url)
                img = nsoup.find('img', src=re.compile('archive/covers'))
                if img is not None:
                    self.cover_url = img['src']
            except Exception:
                self.log.exception('Failed to fetch cover')

        # Each 'toc_seched' element is a section heading; its articles
        # follow it as siblings in the TOC markup.
        feeds = []
        parent = soup.find(id='tocGuts')
        for seched in parent.findAll(attrs={'class': 'toc_seched'}):
            section = self.tag_to_string(seched).capitalize()
            articles = list(self.find_articles(seched))
            feeds.append((section, articles))

        return feeds

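    # Yield the article entries that follow a section heading, stopping at
    # the <div> that starts the next section.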
    def find_articles(self, seched):
        for a in seched.findNextSiblings(attrs={'class': ['toc_hed', 'rule2']}):
            if a.name == 'div':
                break
            yield {
                'title': self.tag_to_string(a),
                'url': 'http://www.time.com' + a['href'],
                'date': '',
                'description': self.article_description(a),
            }

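    # Collect the loose text that follows an article link in the TOC and
    # use it as the article description.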
    def article_description(self, a):
        ans = []
        while True:
            t = a.nextSibling
            if t is None:
                break
            a = t
            if getattr(t, 'name', False):
                # Skip parenthetical notes and line breaks; a block element
                # or link means the description has ended.
                if t.get('class', '') == 'toc_parens' or t.name == 'br':
                    continue
                if t.name in ('div', 'a'):
                    break
                ans.append(self.tag_to_string(t))
            else:
                ans.append(unicode(t))
        return u' '.join(ans).replace(u'\xa0', u'').strip()

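    # Remove the in-article pagination controls from the assembled pages.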
    def postprocess_html(self, soup, first):
        for tag in soup.findAll(attrs={'class': ['artPag', 'pagination']}):
            tag.extract()
        return soup