#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
'''
time.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html

class Time(BasicNewsRecipe):
    # recipe_disabled = ('This recipe has been disabled as TIME no longer'
    #                    ' publishes complete articles on the web.')
    title = u'Time'
    __author__ = 'Kovid Goyal'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'en'
    remove_javascript = True

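    # Keep only the article title/body containers and drop sharing widgets,
    # navigation and other page chrome from the fetched pages.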
    keep_only_tags = [
        {
            'class': ['artHd', 'articleContent',
                      'entry-title', 'entry-meta', 'entry-content', 'thumbnail']
        },
    ]
    remove_tags = [
        {'class': ['content-tools', 'quigo', 'see',
                   'first-tier-social-tools', 'navigation', 'enlarge lightbox']},
        {'id': ['share-tools']},
        {'rel': 'lightbox'},
    ]

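    # TIME splits long articles across multiple pages. Follow links up to 10
    # levels deep, but only those matching the page-2-and-up URL patterns below.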
    recursions = 10
    match_regexps = [
        r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',
        r'http://www.time.com/time/specials/packages/article/.*',
    ]

    # Remove stray <meta> tags from the downloaded article pages.
    preprocess_regexps = [(re.compile(
        r'<meta .+/>'), lambda m: '')]

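    # Build the issue's table of contents by scraping the current magazine
    # page rather than relying on RSS feeds.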
    def parse_index(self):
        raw = self.index_to_soup('http://www.time.com/time/magazine', raw=True)
        root = html.fromstring(raw)
        # The cover image sits behind a "View Large Cover" link.
        link = root.xpath('//a[.="View Large Cover" and @href]')
        if link:
            cover_url = 'http://www.time.com' + link[0].get('href')
            try:
                nsoup = self.index_to_soup(cover_url)
                img = nsoup.find('img', src=re.compile('archive/covers'))
                if img is not None:
                    self.cover_url = img['src']
            except Exception:
                self.log.exception('Failed to fetch cover')

        feeds = []
        parent = root.xpath('//div[@class="content-main-aside"]')[0]
        for sec in parent.xpath(
                'descendant::section[contains(@class, "sec-mag-section")]'):
            h3 = sec.xpath('./h3')
            if h3:
                section = html.tostring(h3[0], encoding='unicode',
                                        method='text').strip().capitalize()
                self.log('Found section', section)
                articles = list(self.find_articles(sec))
                if articles:
                    feeds.append((section, articles))

        return feeds

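    # Walk the <article> entries in one TOC section, yielding the article
    # metadata dict that calibre expects.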
    def find_articles(self, sec):
        for article in sec.xpath('./article'):
            h2 = article.xpath('./*[@class="entry-title"]')
            if not h2:
                continue
            a = h2[0].xpath('./a[@href]')
            if not a:
                continue
            title = html.tostring(a[0], encoding='unicode',
                                  method='text').strip()
            if not title:
                continue
            url = a[0].get('href')
            if url.startswith('/'):
                url = 'http://www.time.com' + url
            desc = ''
            p = article.xpath('./*[@class="entry-content"]')
            if p:
                desc = html.tostring(p[0], encoding='unicode',
                                     method='text')
            self.log('\t', title, ':\n\t\t', desc)
            yield {
                'title': title,
                'url': url,
                'date': '',
                'description': desc,
            }

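    # Strip pagination links so stitched-together multipage articles do not
    # show "next page" navigation in the output.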
    def postprocess_html(self, soup, first):
        for tag in soup.findAll(attrs={'class': ['artPag', 'pagination']}):
            tag.extract()
        return soup

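# For a quick local check of this recipe, calibre's conversion tool can be
# pointed at the recipe file directly; --test limits the download to a couple
# of articles per feed (the output filename here is arbitrary):
#
#   ebook-convert Time.recipe Time.epub --test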