#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import string, re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

def decode(self, src):
    enc = 'utf-8'
    if 'iso-8859-1' in src:
        enc = 'cp1252'
    return src.decode(enc, 'ignore')
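
# decode() is wired in below via the `encoding = decode` class attribute, so
# calibre uses it (rather than a fixed charset) when turning fetched page bytes
# into text: pages whose source mentions iso-8859-1 are decoded as cp1252,
# which also covers Windows "smart" punctuation; everything else is utf-8.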

class NYTimes(BasicNewsRecipe):

    title = u'New York Times'
    __author__ = 'Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 6, 36)

    description = 'Daily news from the New York Times (subscription version)'
    timefmt = ' [%b %d]'
    needs_subscription = True
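
    # remove_tags_before/remove_tags_after bracket the story at the
    # <div id="article"> element; the remove_tags list below then strips NYT
    # page chrome (toolbars, sharing widgets, ad slots, related-content
    # modules, scripts and forms) from what remains.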
    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
                                        'nextArticleLink clearfix', 'columnGroup doubleRule', 'doubleRule', 'entry-meta',
                                        'icon enlargeThis', 'columnGroup last', 'relatedSearchesModule']}),
                   dict({'class':re.compile('^subNavigation')}),
                   dict({'class':re.compile('^leaderboard')}),
                   dict({'class':re.compile('^module')}),
                   dict({'class':'metaFootnote'}),
                   dict(id=['inlineBox', 'footer', 'toolsRight', 'articleInline', 'login', 'masthead',
                            'navigation', 'archive', 'side_search', 'blog_sidebar', 'cCol', 'portfolioInline',
                            'side_tool', 'side_index', 'header', 'readerReviewsCount', 'readerReviews',
                            'relatedArticles', 'relatedTopics', 'adxSponLink']),
                   dict(name=['script', 'noscript', 'style', 'form', 'hr'])]
    encoding = decode
    no_stylesheets = True
    extra_css = '''
        .articleHeadline { margin-top: 0.5em; margin-bottom: 0.25em; }
        .credit { font-size: small; font-style: italic; line-height: 1em; margin-top: 5px; margin-left: 0; margin-right: 0; margin-bottom: 0; }
        .byline { font-size: small; font-style: italic; line-height: 1em; margin-top: 10px; margin-left: 0; margin-right: 0; margin-bottom: 0; }
        .dateline { font-size: small; line-height: 1em; margin-top: 5px; margin-left: 0; margin-right: 0; margin-bottom: 0; }
        .kicker { font-size: small; line-height: 1em; margin-top: 5px; margin-left: 0; margin-right: 0; margin-bottom: 0; }
        .timestamp { font-size: small; }
        .caption { font-size: small; line-height: 1em; margin-top: 5px; margin-left: 0; margin-right: 0; margin-bottom: 0; }
        a:link { text-decoration: none; }'''

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.select_form(name='login')
            br['USERID'] = self.username
            br['PASSWORD'] = self.password
            raw = br.submit().read()
            if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
                raise Exception('Your username and password are incorrect')
            #open('/t/log.html', 'wb').write(raw)
        return br
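
    # A quick way to sanity-check the login flow outside of a full download is
    # to run the recipe through calibre's ebook-convert; option spellings can
    # differ between calibre releases, so treat this as a sketch:
    #
    #   ebook-convert nytimes_sub.recipe out.epub --test -vv \
    #       --username you@example.com --password yourpassword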

    def get_masthead_url(self):
        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
        #masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(masthead)
        except Exception:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead


    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/nytfrontpage/scan.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            self.log("\nCover unavailable")
            cover = None
        return cover
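
    # The front-page scan lives at a date-stamped URL, e.g. for a hypothetical
    # March 15, 2010 issue:
    #   http://graphics8.nytimes.com/images/2010/03/15/nytfrontpage/scan.jpg
    # get_cover_url() probes today's URL and falls back to None when no scan
    # has been posted yet.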

    def short_title(self):
        return 'New York Times'

    def parse_index(self):
        # The "Today's Paper" index page is fetched with a fixed cp1252 charset;
        # afterwards the module-level decode() helper is restored for the
        # article pages themselves.
        self.encoding = 'cp1252'
        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
        self.encoding = decode

        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=True)).strip()

        articles = {}
        key = None
        ans = []
        url_list = []

        def handle_article(div):
            a = div.find('a', href=True)
            if not a:
                return
            url = re.sub(r'\?.*', '', a['href'])
            if not url.startswith("http"):
                return
            if not url.endswith(".html"):
                return
            if 'podcast' in url:
                return
            url += '?pagewanted=all'
            if url in url_list:
                return
            url_list.append(url)
            title = self.tag_to_string(a, use_alt=True).strip()
            #self.log("Title: %s" % title)
            description = ''
            pubdate = strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            author = ''
            authorAttribution = div.find(True, attrs={'class':'byline'})
            if authorAttribution:
                author = self.tag_to_string(authorAttribution, use_alt=False)
            feed = key if key is not None else 'Uncategorized'
            if feed not in articles:
                articles[feed] = []
            articles[feed].append(
                dict(title=title, url=url, date=pubdate,
                     description=description, author=author,
                     content=''))
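
        # Each article dict above uses the keys calibre's parse_index() contract
        # expects (title, url, date, description, content), plus an 'author'
        # field; articles are grouped under the current section name held in
        # `key`.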

        # Find each instance of class="section-headline", class="story", class="story headline"
        for div in soup.findAll(True,
                attrs={'class':['section-headline', 'story', 'story headline', 'sectionHeader', 'headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline', 'sectionHeader']:
                key = string.capwords(feed_title(div))
                articles[key] = []
                ans.append(key)
                #self.log('Section: %s' % key)

            elif div['class'] in ['story', 'story headline']:
                handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    handle_article(lidiv)

        # ans = self.sort_index_by(ans, {'The Front Page':-1,
        #                                'Dining In, Dining Out':1,
        #                                'Obituaries':2})
        ans = [(key, articles[key]) for key in ans if key in articles]

        return ans

    def preprocess_html(self, soup):
        kicker_tag = soup.find(attrs={'class':'kicker'})
        if kicker_tag:
            tagline = self.tag_to_string(kicker_tag)
            #self.log("FOUND KICKER %s" % tagline)
            if tagline == 'Op-Ed Columnist':
                img_div = soup.find('div', 'inlineImage module')
                #self.log("Searching for photo")
                if img_div:
                    img_div.extract()
                    #self.log("Photo deleted")
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open_novisit('http://www.nytimes.com' + content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
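
    # Some article URLs answer with only a <meta http-equiv="refresh"> stub;
    # preprocess_html() follows that redirect by hand (open_novisit) and
    # re-parses the target page, decoded as cp1252, so the rest of the recipe
    # sees the real article HTML.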