home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_4240 < prev    next >
Encoding:
Text File  |  2010-10-26  |  7.9 KB  |  193 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
  5. '''
  6. nytimes.com
  7. '''
  8. import string, re, time
  9. from calibre import strftime
  10. from calibre.web.feeds.recipes import BasicNewsRecipe
  11. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  12.  
  13. def decode(self, src):
  14.     enc = 'utf-8'
  15.     if 'iso-8859-1' in src:
  16.         enc = 'cp1252'
  17.     return src.decode(enc, 'ignore')
  18.  
  19. class NYTimes(BasicNewsRecipe):
  20.  
  21.     title       = u'New York Times'
  22.     __author__  = 'Kovid Goyal/Nick Redding'
  23.     language = 'en'
  24.     requires_version = (0, 6, 36)
  25.  
  26.     description = 'Daily news from the New York Times (subscription version)'
  27.     timefmt = ' [%b %d]'
  28.     needs_subscription = True
  29.     remove_tags_before = dict(id='article')
  30.     remove_tags_after  = dict(id='article')
  31.     remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
  32.                                         'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
  33.                                         'icon enlargeThis','columnGroup  last','relatedSearchesModule']}),
  34.                    dict({'class':re.compile('^subNavigation')}),
  35.                    dict({'class':re.compile('^leaderboard')}),
  36.                    dict({'class':re.compile('^module')}),
  37.                    dict({'class':'metaFootnote'}),
  38.                    dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
  39.                             'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
  40.                             'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
  41.                             'relatedArticles', 'relatedTopics', 'adxSponLink']),
  42.                    dict(name=['script', 'noscript', 'style','form','hr'])]
  43.     encoding = decode
  44.     no_stylesheets = True
  45.     extra_css = '''
  46.                 .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
  47.                 .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
  48.                 .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
  49.                 .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
  50.                 .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
  51.                 .timestamp { font-size: small; }
  52.                 .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
  53.                 a:link {text-decoration: none; }'''
  54.  
  55.     def get_browser(self):
  56.         br = BasicNewsRecipe.get_browser()
  57.         if self.username is not None and self.password is not None:
  58.             br.open('http://www.nytimes.com/auth/login')
  59.             br.select_form(name='login')
  60.             br['USERID']   = self.username
  61.             br['PASSWORD'] = self.password
  62.             raw = br.submit().read()
  63.             if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
  64.                 raise Exception('Your username and password are incorrect')
  65.             #open('/t/log.html', 'wb').write(raw)
  66.         return br
  67.  
  68.     def get_masthead_url(self):
  69.         masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
  70.         #masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
  71.         br = BasicNewsRecipe.get_browser()
  72.         try:
  73.             br.open(masthead)
  74.         except:
  75.             self.log("\nMasthead unavailable")
  76.             masthead = None
  77.         return masthead
  78.  
  79.  
  80.     def get_cover_url(self):
  81.         cover = None
  82.         st = time.localtime()
  83.         year = str(st.tm_year)
  84.         month = "%.2d" % st.tm_mon
  85.         day = "%.2d" % st.tm_mday
  86.         cover = 'http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/nytfrontpage/scan.jpg'
  87.         br = BasicNewsRecipe.get_browser()
  88.         try:
  89.             br.open(cover)
  90.         except:
  91.             self.log("\nCover unavailable")
  92.             cover = None
  93.         return cover
  94.  
  95.     def short_title(self):
  96.         return 'New York Times'
  97.  
  98.     def parse_index(self):
  99.         self.encoding = 'cp1252'
  100.         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
  101.         self.encoding = decode
  102.  
  103.         def feed_title(div):
  104.             return ''.join(div.findAll(text=True, recursive=True)).strip()
  105.  
  106.         articles = {}
  107.         key = None
  108.         ans = []
  109.         url_list = []
  110.  
  111.         def handle_article(div):
  112.             a = div.find('a', href=True)
  113.             if not a:
  114.                 return
  115.             url = re.sub(r'\?.*', '', a['href'])
  116.             if not url.startswith("http"):
  117.                 return
  118.             if not url.endswith(".html"):
  119.                 return
  120.             if 'podcast' in url:
  121.                 return
  122.             url += '?pagewanted=all'
  123.             if url in url_list:
  124.                 return
  125.             url_list.append(url)
  126.             title = self.tag_to_string(a, use_alt=True).strip()
  127.             #self.log("Title: %s" % title)
  128.             description = ''
  129.             pubdate = strftime('%a, %d %b')
  130.             summary = div.find(True, attrs={'class':'summary'})
  131.             if summary:
  132.                 description = self.tag_to_string(summary, use_alt=False)
  133.             author = ''
  134.             authorAttribution = div.find(True, attrs={'class':'byline'})
  135.             if authorAttribution:
  136.                 author = self.tag_to_string(authorAttribution, use_alt=False)
  137.             else:
  138.                 authorAttribution = div.find(True, attrs={'class':'byline'})
  139.                 if authorAttribution:
  140.                     author = self.tag_to_string(authorAttribution, use_alt=False)
  141.             feed = key if key is not None else 'Uncategorized'
  142.             if not articles.has_key(feed):
  143.                 articles[feed] = []
  144.             articles[feed].append(
  145.                             dict(title=title, url=url, date=pubdate,
  146.                                 description=description, author=author,
  147.                                 content=''))
  148.  
  149.  
  150.  
  151.         # Find each instance of class="section-headline", class="story", class="story headline"
  152.         for div in soup.findAll(True,
  153.             attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
  154.  
  155.             if div['class'] in ['section-headline','sectionHeader']:
  156.                 key = string.capwords(feed_title(div))
  157.                 articles[key] = []
  158.                 ans.append(key)
  159.                 #self.log('Section: %s' % key)
  160.  
  161.             elif div['class'] in ['story', 'story headline'] :
  162.                 handle_article(div)
  163.             elif div['class'] == 'headlinesOnly multiline flush':
  164.                 for lidiv in div.findAll('li'):
  165.                     handle_article(lidiv)
  166.  
  167. #        ans = self.sort_index_by(ans, {'The Front Page':-1,
  168. #                                      'Dining In, Dining Out':1,
  169. #                                     'Obituaries':2})
  170.         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
  171.  
  172.         return ans
  173.  
  174.     def preprocess_html(self, soup):
  175.         kicker_tag = soup.find(attrs={'class':'kicker'})
  176.         if kicker_tag:
  177.             tagline = self.tag_to_string(kicker_tag)
  178.             #self.log("FOUND KICKER %s" % tagline)
  179.             if tagline=='Op-Ed Columnist':
  180.                 img_div = soup.find('div','inlineImage module')
  181.                 #self.log("Searching for photo")
  182.                 if img_div:
  183.                     img_div.extract()
  184.                     #self.log("Photo deleted")
  185.         refresh = soup.find('meta', {'http-equiv':'refresh'})
  186.         if refresh is None:
  187.             return soup
  188.         content = refresh.get('content').partition('=')[2]
  189.         raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
  190.         return BeautifulSoup(raw.decode('cp1252', 'replace'))
  191.  
  192.  
  193.