Maximum CD 2011 January

home *** CD-ROM | disk | FTP | other *** search

/ Maximum CD 2011 January / maximum-cd-2011-01.iso / DiscContents / calibre-0.7.26.msi / file_4240 < prev next >

Wrap

Text File | 2010-10-26 | 7.9 KB | 193 lines

#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import re
import string
import time

from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


def decode(self, src):
    """Decode raw page bytes, picking the codec from the page's own markup.

    Installed as the recipe's ``encoding`` callable. Pages whose source
    mentions ``iso-8859-1`` are decoded as cp1252; all others as UTF-8.
    Bytes that cannot be decoded are dropped.
    """
    enc = 'cp1252' if 'iso-8859-1' in src else 'utf-8'
    return src.decode(enc, 'ignore')


class NYTimes(BasicNewsRecipe):
    """Recipe that builds an issue from the nytimes.com "Today's Paper" index."""

    title = u'New York Times'
    __author__ = 'Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 6, 36)

    description = 'Daily news from the New York Times (subscription version)'
    timefmt = ' [%b %d]'
    needs_subscription = True

    # Keep only the element with id="article" and strip page furniture.
    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
    remove_tags = [
        dict(attrs={'class': [
            'articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
            'nextArticleLink clearfix', 'columnGroup doubleRule',
            'doubleRule', 'entry-meta', 'icon enlargeThis',
            'columnGroup last', 'relatedSearchesModule']}),
        {'class': re.compile('^subNavigation')},
        {'class': re.compile('^leaderboard')},
        {'class': re.compile('^module')},
        {'class': 'metaFootnote'},
        dict(id=[
            'inlineBox', 'footer', 'toolsRight', 'articleInline', 'login',
            'masthead', 'navigation', 'archive', 'side_search',
            'blog_sidebar', 'cCol', 'portfolioInline', 'side_tool',
            'side_index', 'header', 'readerReviewsCount', 'readerReviews',
            'relatedArticles', 'relatedTopics', 'adxSponLink']),
        dict(name=['script', 'noscript', 'style', 'form', 'hr']),
    ]
    encoding = decode
    no_stylesheets = True
    extra_css = '''
                .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
                .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .timestamp { font-size: small; }
                .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                a:link {text-decoration: none; }'''

    def get_browser(self):
        """Return a browser, logged in to nytimes.com when credentials are set.

        Raises:
            Exception: if the login response contains the site's
                "could not find the combination" message.
        """
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.select_form(name='login')
            br['USERID'] = self.username
            br['PASSWORD'] = self.password
            raw = br.submit().read()
            if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
                raise Exception('Your username and password are incorrect')
        return br

    def _url_if_reachable(self, url, failure_message):
        """Return ``url`` if a fresh browser can open it, else log and return None."""
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(url)
        except Exception:
            # Best effort: a missing image must not abort the download.
            self.log(failure_message)
            return None
        return url

    def get_masthead_url(self):
        """Return the masthead image URL, or None when it cannot be opened."""
        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
        return self._url_if_reachable(masthead, "\nMasthead unavailable")

    def get_cover_url(self):
        """Return the URL of today's front-page scan, or None when unavailable.

        The URL is built from the local date as ``YYYY/MM/DD``.
        """
        cover = ('http://graphics8.nytimes.com/images/'
                 + time.strftime('%Y/%m/%d', time.localtime())
                 + '/nytfrontpage/scan.jpg')
        return self._url_if_reachable(cover, "\nCover unavailable")

    def short_title(self):
        """Return the fixed short title of this publication."""
        return 'New York Times'

    def parse_index(self):
        """Scrape the Today's Paper index into sections of articles.

        Returns:
            A list of ``(section_title, articles)`` tuples in page order,
            where each article is a dict with the keys ``title``, ``url``,
            ``date``, ``description``, ``author`` and ``content``.
        """
        # The index page is fetched with the encoding forced to cp1252
        # (presumably read by index_to_soup); set it back afterwards even
        # if the fetch fails.
        self.encoding = 'cp1252'
        try:
            soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
        finally:
            self.encoding = decode

        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=True)).strip()

        articles = {}
        key = None          # title of the section currently being filled
        ans = []            # section titles in order of first appearance
        seen_urls = set()   # article URLs already collected

        def handle_article(div):
            a = div.find('a', href=True)
            if not a:
                return
            # Drop any query string, then keep only absolute .html links
            # that are not podcasts.
            url = re.sub(r'\?.*', '', a['href'])
            if not url.startswith("http"):
                return
            if not url.endswith(".html"):
                return
            if 'podcast' in url:
                return
            url += '?pagewanted=all'
            if url in seen_urls:
                return
            seen_urls.add(url)

            title = self.tag_to_string(a, use_alt=True).strip()
            pubdate = strftime('%a, %d %b')

            description = ''
            summary = div.find(True, attrs={'class': 'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)

            author = ''
            byline = div.find(True, attrs={'class': 'byline'})
            if byline:
                author = self.tag_to_string(byline, use_alt=False)

            # NOTE(review): 'Uncategorized' is never added to ``ans``, so
            # articles seen before the first section header are collected
            # here but not returned - confirm whether that is intended.
            feed = key if key is not None else 'Uncategorized'
            articles.setdefault(feed, []).append(
                dict(title=title, url=url, date=pubdate,
                     description=description, author=author,
                     content=''))

        # Find each instance of class="section-headline", class="story",
        # class="story headline", class="sectionHeader" and the
        # headlines-only lists.
        wanted_classes = ['section-headline', 'story', 'story headline',
                          'sectionHeader', 'headlinesOnly multiline flush']
        for div in soup.findAll(True, attrs={'class': wanted_classes}):
            if div['class'] in ['section-headline', 'sectionHeader']:
                key = string.capwords(feed_title(div))
                # A repeated header continues the existing section instead
                # of wiping its articles and listing the section twice.
                if key not in articles:
                    articles[key] = []
                    ans.append(key)
            elif div['class'] in ['story', 'story headline']:
                handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    handle_article(lidiv)

        # Sections are returned in page order; a custom ordering via
        # sort_index_by ('The Front Page', 'Dining In, Dining Out',
        # 'Obituaries') was previously disabled here.
        return [(section, articles[section])
                for section in ans if section in articles]

    def preprocess_html(self, soup):
        """Clean up an article page, following a meta refresh if present.

        Removes the inline photo from pages whose kicker reads
        'Op-Ed Columnist'. When the page carries a ``<meta
        http-equiv="refresh">`` with a target, the target is fetched from
        www.nytimes.com and its soup is returned instead.
        """
        kicker_tag = soup.find(attrs={'class': 'kicker'})
        if kicker_tag:
            tagline = self.tag_to_string(kicker_tag)
            if tagline == 'Op-Ed Columnist':
                img_div = soup.find('div', 'inlineImage module')
                if img_div:
                    img_div.extract()

        refresh = soup.find('meta', {'http-equiv': 'refresh'})
        if refresh is None:
            return soup
        # The target is whatever follows the first '=' in the content
        # attribute. Without one, keep the page already downloaded rather
        # than crashing or fetching the site root.
        target = (refresh.get('content') or '').partition('=')[2]
        if not target:
            return soup
        raw = self.browser.open_novisit('http://www.nytimes.com' + target).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))