from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.threadpool import ThreadPool, makeRequests
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
import time, string, re
from datetime import datetime
from lxml import html

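# This recipe builds its index by hand instead of listing static feeds: it
# walks the full print edition RSS feed, resolves every entry to its
# printable page in parallel, and regroups the articles into the magazine's
# print sections.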
class Economist(BasicNewsRecipe):

    title = 'The Economist (free)'
    language = 'en'

    __author__ = "Kovid Goyal"
    description = ('Global news and current affairs from a European perspective.'
            ' Much slower than the subscription-based version.')

    oldest_article = 7.0
    cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
    remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
            dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
    keep_only_tags = [dict(id='ec-article-body')]
    no_stylesheets = True
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x:'</html>')]

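    # Build the index by hand: fetch the print edition RSS feed, drop
    # entries older than oldest_article, then resolve each remaining entry
    # on a ten-thread pool so its print section can be read from the
    # article page itself.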
    def parse_index(self):
        from calibre.web.feeds.feedparser import parse
        if self.test:
            self.oldest_article = 14.0
        raw = self.index_to_soup(
                'http://feeds.feedburner.com/economist/full_print_edition',
                raw=True)
        entries = parse(raw).entries
        pool = ThreadPool(10)
        self.feed_dict = {}
        requests = []
        for i, item in enumerate(entries):
            title       = item.get('title', _('Untitled article'))
            published = item.date_parsed
            if not published:
                published = time.gmtime()
            utctime = datetime(*published[:6])
            delta = datetime.utcnow() - utctime
            if delta.days*24*3600 + delta.seconds > 24*3600*self.oldest_article:
                self.log.debug('Skipping article %s as it is too old.'%title)
                continue
            link        = item.get('link', None)
            if not link:
                # Entries without a link cannot be downloaded
                continue
            description = item.get('description', '')
            author      = item.get('author', '')

            requests.append([i, link, title, description, author, published])
        if self.test:
            requests = requests[:4]
        requests = makeRequests(self.process_eco_feed_article, requests, self.eco_article_found,
                self.eco_article_failed)
        for r in requests: pool.putRequest(r)
        pool.wait()

        return self.eco_sort_sections([(t, a) for t, a in
            self.feed_dict.items()])

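    # Sort sections into the order they appear in the print magazine;
    # unknown section names fall to the end via the default rank of 100.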
    def eco_sort_sections(self, feeds):
        if not feeds:
            raise ValueError('No new articles found')
        order = {
            'The World This Week': 1,
            'Leaders': 2,
            'Letters': 3,
            'Briefing': 4,
            'Business': 5,
            'Finance And Economics': 6,
            'Science & Technology': 7,
            'Books & Arts': 8,
            'International': 9,
            'United States': 10,
            'Asia': 11,
            'Europe': 12,
            'The Americas': 13,
            'Middle East & Africa': 14,
            'Britain': 15,
            'Obituary': 16,
        }
        return sorted(feeds, cmp=lambda x,y:cmp(order.get(x[0], 100),
            order.get(y[0], 100)))

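    # Thread-pool worker: download the article, follow any redirect to the
    # canonical URL, switch to its /print variant, and read the section
    # name out of the last 'ec-article-info' element on the page.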
    def process_eco_feed_article(self, args):
        from calibre import browser
        i, url, title, description, author, published = args
        br = browser()
        ret = br.open(url)
        raw = ret.read()
        url = br.geturl().split('?')[0]+'/print'
        root = html.fromstring(raw)
        matches = root.xpath('//*[@class = "ec-article-info"]')
        feedtitle = 'Miscellaneous'
        if matches:
            feedtitle = string.capwords(html.tostring(matches[-1], method='text',
                    encoding=unicode).split('|')[-1].strip())
        return (i, feedtitle, url, title, description, author, published)

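    # Success callback: wrap the result in an Article to get normalized
    # summary and date fields, then file it under its section title.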
    def eco_article_found(self, req, result):
        from calibre.web.feeds import Article
        i, feedtitle, link, title, description, author, published = result
        self.log('Found print version for article:', title, 'in', feedtitle,
                'at', link)

        a = Article(i, title, link, author, description, published, '')

        article = dict(title=a.title, description=a.text_summary,
            date=time.strftime(self.timefmt, a.date), author=a.author, url=a.url)
        if feedtitle not in self.feed_dict:
            self.feed_dict[feedtitle] = []
        self.feed_dict[feedtitle].append(article)

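    # Failure callback: req.args[0] is the [i, link, title, ...] list passed
    # to the worker, so index 2 is the article title.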
    def eco_article_failed(self, req, tb):
        self.log.error('Failed to download %s with error:'%req.args[0][2])
        self.log.debug(tb)

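    # Yield the tables The Economist uses to float images: a right- or
    # center-aligned table with exactly one <img> and one or two <font>
    # caption cells.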
    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1,2) and len(x.findAll('img')) == 1:
                yield x

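    # Strip stray attributes from <body> and replace each image table with
    # a plain <div> holding the caption and the image, dropping hard-coded
    # width/height so the image can reflow in the e-book.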
    def postprocess_html(self, soup, first):
        body = soup.find('body')
        # Iterate over a copy: deleting an attribute mutates body.attrs
        for name, val in list(body.attrs):
            del body[name]
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            img.extract()
            del img['width']
            del img['height']
            div.insert(2, img)
            table.replaceWith(div)
        return soup
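
# Usage sketch (an assumption about your setup; flags can differ between
# calibre versions): save this file as, say, economist_free.recipe and run
#
#   ebook-convert economist_free.recipe economist.epub --test
#
# --test restricts the run to a handful of articles (see self.test above).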