#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
'''
time.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html

class Time(BasicNewsRecipe):
    # recipe_disabled = ('This recipe has been disabled as TIME no longer'
    #                    ' publishes complete articles on the web.')
    title = u'Time'
    __author__ = 'Kovid Goyal'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'en'
    remove_javascript = True

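    # Keep only the article title/body containers and drop sharing widgets,
    # navigation and other page chrome from the fetched pages.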
    keep_only_tags = [
        {
            'class': ['artHd', 'articleContent',
                      'entry-title', 'entry-meta', 'entry-content', 'thumbnail']
        },
    ]
    remove_tags = [
        {'class': ['content-tools', 'quigo', 'see',
                   'first-tier-social-tools', 'navigation', 'enlarge lightbox']},
        {'id': ['share-tools']},
        {'rel': 'lightbox'},
    ]

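    # TIME splits long articles across multiple pages. Follow links up to 10
    # levels deep, but only those matching the page-2-and-up URL patterns below.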
    recursions = 10
    match_regexps = [
        r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',
        r'http://www.time.com/time/specials/packages/article/.*',
    ]

    # Remove stray <meta> tags from the downloaded article pages.
    preprocess_regexps = [(re.compile(
        r'<meta .+/>'), lambda m: '')]

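    # Build the issue's table of contents by scraping the current magazine
    # page rather than relying on RSS feeds.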
    def parse_index(self):
        raw = self.index_to_soup('http://www.time.com/time/magazine', raw=True)
        root = html.fromstring(raw)
        # The cover image sits behind a "View Large Cover" link.
        link = root.xpath('//a[.="View Large Cover" and @href]')
        if link:
            cover_url = 'http://www.time.com' + link[0].get('href')
            try:
                nsoup = self.index_to_soup(cover_url)
                img = nsoup.find('img', src=re.compile('archive/covers'))
                if img is not None:
                    self.cover_url = img['src']
            except Exception:
                self.log.exception('Failed to fetch cover')

        feeds = []
        parent = root.xpath('//div[@class="content-main-aside"]')[0]
        for sec in parent.xpath(
                'descendant::section[contains(@class, "sec-mag-section")]'):
            h3 = sec.xpath('./h3')
            if h3:
                section = html.tostring(h3[0], encoding='unicode',
                                        method='text').strip().capitalize()
                self.log('Found section', section)
                articles = list(self.find_articles(sec))
                if articles:
                    feeds.append((section, articles))

        return feeds

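    # Walk the <article> entries in one TOC section, yielding the article
    # metadata dict that calibre expects.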
    def find_articles(self, sec):
        for article in sec.xpath('./article'):
            h2 = article.xpath('./*[@class="entry-title"]')
            if not h2:
                continue
            a = h2[0].xpath('./a[@href]')
            if not a:
                continue
            title = html.tostring(a[0], encoding='unicode',
                                  method='text').strip()
            if not title:
                continue
            url = a[0].get('href')
            if url.startswith('/'):
                url = 'http://www.time.com' + url
            desc = ''
            p = article.xpath('./*[@class="entry-content"]')
            if p:
                desc = html.tostring(p[0], encoding='unicode',
                                     method='text')
            self.log('\t', title, ':\n\t\t', desc)
            yield {
                'title': title,
                'url': url,
                'date': '',
                'description': desc,
            }

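    # Strip pagination links so stitched-together multipage articles do not
    # show "next page" navigation in the output.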
    def postprocess_html(self, soup, first):
        for tag in soup.findAll(attrs={'class': ['artPag', 'pagination']}):
            tag.extract()
        return soup

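# For a quick local check of this recipe, calibre's conversion tool can be
# pointed at the recipe file directly; --test limits the download to a couple
# of articles per feed (the output filename here is arbitrary):
#
#   ebook-convert Time.recipe Time.epub --test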