home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/env python
-
- __license__ = 'GPL v3'
- __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
- '''
- time.com
- '''
-
- import re
- from calibre.web.feeds.news import BasicNewsRecipe
-
class Time(BasicNewsRecipe):
    '''
    Recipe that builds an e-book from the current issue of Time magazine.

    The table of contents is scraped from http://www.time.com/time/magazine
    by parse_index(); the remaining class attributes configure how the
    individual article pages are followed, cleaned up and styled.
    '''

    title = u'Time'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'en'
    remove_javascript = True

    # Styling applied to the downloaded articles.
    # NOTE(review): `.credit` is declared twice below; per the CSS cascade the
    # later (georgia / #999999) rule overrides the earlier one.
    extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
                    .mainHd{font-family:georgia,serif;color:#000000;}
                    h2 {font-family:Arial,Sans-serif;}
                    .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
                    .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
                    .byline{font-family:Arial,Sans-serif; font-size:x-small ;}
                    .photoBkt{ font-size:x-small ;}
                    .vertPhoto{font-size:x-small ;}
                    .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
                    .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
                    .artTxt{font-family:georgia,serif;}
                    #content{font-family:georgia,serif;}
                    .caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
                    .credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
                    a:link{color:#CC0000;}
                    .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
                '''

    # Only these containers are kept from each article page ...
    keep_only_tags = [
        dict(name="div", attrs={"id": ["content"]}),
        dict(name="div", attrs={"class": ["artHd", "artTxt", "photoBkt",
                                          "vertPhoto", "image", "copy"]}),
    ]

    # ... and these navigation / sharing / related-content elements are
    # removed from what remains.
    remove_tags = [
        dict(name="div", attrs={'class': ['articleFooterNav', 'listsByTopic',
                                          'articleTools2', 'relatedContent',
                                          'sideContent', 'topBannerWrap',
                                          'articlePagination', 'nextUp',
                                          "rtCol", "pagination", "enlarge",
                                          "contentTools2", ]}),
        dict(name="span", attrs={'class': ['see']}),
        dict(name="div", attrs={'id': ['header', 'articleSideBar',
                                       "articleTools", "articleFooter",
                                       "cmBotLt", "quigoPackage"]}),
        dict(name="a", attrs={'class': ['listLink']}),
        dict(name="ul", attrs={'id': ['shareSocial', 'tabs']}),
        dict(name="li", attrs={'class': ['back']}),
        dict(name="ul", attrs={'class': ['navCount']}),
    ]

    # Link following: up to 10 levels deep, restricted to URLs matching one of
    # the patterns below (numbered "-2" .. "-9" article pages and the
    # "specials/packages" articles).
    recursions = 10
    match_regexps = [
        r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',
        r'http://www.time.com/time/specials/packages/article/.*',
    ]

    # Strip self-closed <meta .../> tags from the raw HTML before parsing.
    preprocess_regexps = [
        (re.compile(r'<meta .+/>'), lambda m: ''),
    ]

    def parse_index(self):
        '''
        Scrape the magazine front page and return the list of feeds.

        As a side effect, tries to locate the current cover image and stores
        its address in self.cover_url. A failure while fetching the cover is
        logged and otherwise ignored.

        Returns a list of (section title, list of article dicts) tuples, one
        per element with class "toc_seched" inside the element with
        id "tocGuts".
        '''
        soup = self.index_to_soup('http://www.time.com/time/magazine')

        cover_link = soup.find('a', title="View Large Cover", href=True)
        if cover_link is not None:
            cover_url = 'http://www.time.com' + cover_link['href']
            try:
                cover_soup = self.index_to_soup(cover_url)
                cover_img = cover_soup.find(
                    'img', src=re.compile('archive/covers'))
                if cover_img is not None:
                    self.cover_url = cover_img['src']
            except Exception:
                # Best effort only: a missing cover must not abort the
                # download. `Exception` (rather than a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                self.log.exception('Failed to fetch cover')

        feeds = []
        # NOTE(review): find() returns None when the page has no element with
        # id "tocGuts"; the loop below would then raise AttributeError.
        parent = soup.find(id='tocGuts')
        for seched in parent.findAll(attrs={'class': 'toc_seched'}):
            section = self.tag_to_string(seched).capitalize()
            articles = list(self.find_articles(seched))
            feeds.append((section, articles))

        return feeds

    def find_articles(self, seched):
        '''
        Yield an article dict for every entry that follows a section heading.

        Walks the following siblings of `seched` whose class is "toc_hed" or
        "rule2" and stops at the first one that is a <div>. Every other
        match is expected to carry an href attribute and is yielded as a dict
        with the keys 'title', 'url', 'date' and 'description'.
        '''
        for a in seched.findNextSiblings(
                attrs={'class': ['toc_hed', 'rule2']}):
            # Exact comparison: the previous `a.name in "div"` was a substring
            # test and was also true for tags named "d", "i", "v", "di", "iv".
            if a.name == 'div':
                break
            yield {
                'title': self.tag_to_string(a),
                'url': 'http://www.time.com' + a['href'],
                'date': '',
                'description': self.article_description(a),
            }

    def article_description(self, a):
        '''
        Build the description of an article from the nodes that follow `a`.

        Siblings after `a` are visited in document order until the end of the
        sibling chain or until a <div> or <a> tag is reached. <br> tags and
        tags whose class is "toc_parens" are skipped; the text of every other
        tag and of every plain text node is collected.

        Returns the collected pieces joined by single spaces, with
        non-breaking spaces removed and surrounding whitespace stripped.
        '''
        ans = []
        while True:
            t = a.nextSibling
            if t is None:
                break
            a = t
            if getattr(t, 'name', False):
                # A tag node.
                if t.get('class', '') == 'toc_parens' or t.name == 'br':
                    continue
                if t.name in ('div', 'a'):
                    # Reached the next entry / divider.
                    break
                ans.append(self.tag_to_string(t))
            else:
                # A text node (no usable `name` attribute).
                ans.append(unicode(t))
        return u' '.join(ans).replace(u'\xa0', u'').strip()

    def postprocess_html(self, soup, first):
        '''
        Remove pagination widgets from a downloaded page.

        Every element whose class is "artPag" or "pagination" is extracted
        from `soup`; the modified soup is returned. `first` is not used.
        '''
        for tag in soup.findAll(attrs={'class': ['artPag', 'pagination']}):
            tag.extract()
        return soup
-