#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
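# A recipe like this is normally scheduled from calibre's GUI, but it can also
# be run with calibre's command-line tools. The file name, e-mail address, and
# password below are illustrative placeholders, not part of the original recipe:
#
#   ebook-convert nytimes_top_stories.recipe output.epub \
#       --username you@example.com --password secret
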
import re
import time
from calibre import entity_to_unicode
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
    Comment, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

    title = 'New York Times Top Stories'
    __author__ = 'GRiker'
    language = 'en'
    requires_version = (0, 7, 5)
    description = 'Top Stories from the New York Times'

    # List of sections typically included in Top Stories. To skip a section,
    # add a keyword from its right-hand column entry to excludeSectionKeywords below.
    sections = {
        'arts'             : 'Arts',
        'business'         : 'Business',
        'diningwine'       : 'Dining & Wine',
        'editorials'       : 'Editorials',
        'health'           : 'Health',
        'magazine'         : 'Magazine',
        'mediaadvertising' : 'Media & Advertising',
        'newyorkregion'    : 'New York/Region',
        'oped'             : 'Op-Ed',
        'politics'         : 'Politics',
        'science'          : 'Science',
        'sports'           : 'Sports',
        'technology'       : 'Technology',
        'topstories'       : 'Top Stories',
        'travel'           : 'Travel',
        'us'               : 'U.S.',
        'world'            : 'World'
    }

    # Add section keywords from the right column above to skip that section
    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
    # excludeSectionKeywords = ['Sports', 'Dining']
    # Fetch only Business and Technology:
    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
    # Fetch only Top Stories:
    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
    # By default, no sections are skipped.
    excludeSectionKeywords = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists). If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = True

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 40

    timefmt = ''
    needs_subscription = True
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
    cover_margins = (18, 18, 'grey99')

    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
    remove_tags = [
        dict(attrs={'class':[
            'articleFooter',
            'articleTools',
            'columnGroup doubleRule',
            'columnGroup singleRule',
            'columnGroup last',
            'doubleRule',
            'dottedLine',
            'entry-meta',
            'entry-response module',
            'icon enlargeThis',
            'leftNavTabs',
            'module box nav',
            'nextArticleLink',
            'nextArticleLink clearfix',
            'post-tools',
            'relatedSearchesModule',
            'side_tool',
            'singleAd',
            'subNavigation clearfix',
            'subNavigation tabContent active',
            'subNavigation tabContent active clearfix',
            ]}),
        dict(id=[
            'adxLeaderboard',
            'archive',
            'articleExtras',
            'articleInline',
            'blog_sidebar',
            'businessSearchBar',
            'cCol',
            'entertainmentSearchBar',
            'footer',
            'header',
            'header_search',
            'login',
            'masthead',
            'masthead-nav',
            'memberTools',
            'navigation',
            'portfolioInline',
            'relatedArticles',
            'respond',
            'side_search',
            'side_index',
            'side_tool',
            'toolsRight',
            ]),
        dict(name=['script', 'noscript', 'style'])]

    no_stylesheets = True
    extra_css = '.headline {text-align: left;}\n \
        .byline {font-family: monospace; \
                 text-align: left; \
                 margin-top: 0px; \
                 margin-bottom: 0px;}\n \
        .dateline {font-size: small; \
                   margin-top: 0px; \
                   margin-bottom: 0px;}\n \
        .timestamp {font-size: small; \
                    margin-top: 0px; \
                    margin-bottom: 0px;}\n \
        .source {text-align: left;}\n \
        .image {text-align: center;}\n \
        .credit {text-align: right; \
                 font-size: small; \
                 margin-top: 0px; \
                 margin-bottom: 0px;}\n \
        .articleBody {text-align: left;}\n \
        .authorId {text-align: left; \
                   font-style: italic;}\n '

    def dump_ans(self, ans):
        total_article_count = 0
        for section in ans:
            if self.verbose:
                self.log("section %s: %d articles" % (section[0], len(section[1])))
            for article in section[1]:
                total_article_count += 1
                if self.verbose:
                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                              article['url'].encode('cp1252','replace')))
        self.log("Queued %d articles" % total_article_count)

    def fixChars(self, string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", "&#8216;", string)

        # Replace rsquo (\x92)
        fixed = re.sub("\x92", "&#8217;", fixed)

        # Replace ldquo (\x93)
        fixed = re.sub("\x93", "&#8220;", fixed)

        # Replace rdquo (\x94)
        fixed = re.sub("\x94", "&#8221;", fixed)

        # Replace ndash (\x96)
        fixed = re.sub("\x96", "&#8211;", fixed)

        # Replace mdash (\x97)
        fixed = re.sub("\x97", "&#8212;", fixed)

        return fixed
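    # Illustrative example (not part of the original recipe):
    #   fixChars('\x93Hello\x94') returns '&#8220;Hello&#8221;'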

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            try:
                br.open('http://www.nytimes.com/auth/login')
                br.select_form(name='login')
                br['USERID'] = self.username
                br['PASSWORD'] = self.password
                br.submit()
            except:
                self.log("\nFailed to login")
        return br

    def skip_ad_pages(self, soup):
        # Skip ad pages served before actual article
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
            url += '?pagewanted=all'
            self.log.warn("Skipping ad to article at '%s'" % url)
            return self.index_to_soup(url, raw=True)
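    # Illustrative sketch of the interstitial markup this method expects; the
    # lookup above finds a tag with name="skip" and follows its parent's href,
    # which implies something like:
    #   <a href="http://www.nytimes.com/2010/01/01/world/01story.html?ref=home">
    #       <span name="skip">Skip to article</span></a>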

    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/nytfrontpage/scan.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nCover unavailable")
            cover = None
        return cover
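    # For example, on 2010-07-04 the cover URL built above would be:
    #   http://graphics8.nytimes.com/images/2010/07/04/nytfrontpage/scan.jpg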

    def index_to_soup(self, url_or_raw, raw=False):
        '''
        OVERRIDE of class method
        deals with various page encodings between index and articles
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False):
            if re.match(r'\w+://', url_or_raw):
                f = self.browser.open(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
                    raise RuntimeError('Could not fetch index from %s' % url_or_raw)
            else:
                _raw = url_or_raw
            if raw:
                return _raw

            if not isinstance(_raw, unicode) and self.encoding:
                _raw = _raw.decode(docEncoding, 'replace')
            massage = list(BeautifulSoup.MARKUP_MASSAGE)
            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
            return BeautifulSoup(_raw, markupMassage=massage)

        # Entry point
        soup = get_the_soup(self.encoding, url_or_raw)
        contentType = soup.find(True, attrs={'http-equiv':'Content-Type'})
        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
        if docEncoding == '':
            docEncoding = self.encoding

        if self.verbose > 2:
            self.log(" document encoding: '%s'" % docEncoding)
        if docEncoding != self.encoding:
            # Re-parse with the encoding declared by the document itself
            soup = get_the_soup(docEncoding, url_or_raw)

        return soup
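    # The charset sniffing above expects a declaration like (illustrative):
    #   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">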

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&amp;' so the resulting NCX stays valid XML
            massaged = re.sub("&", "&amp;", massaged)
            return self.fixChars(massaged)
        else:
            return description
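    # Illustrative example (assumes the entity handling above):
    #   massageNCXText('Arts &amp; Leisure \x97 Review')
    #   returns u'Arts &amp; Leisure &#8212; Review'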

    def parse_index(self):
        articles = {}
        ans = []

        feed = key = 'All Top Stories'
        articles[key] = []
        ans.append(key)
        self.log("Scanning 1 section ...")

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the outer table
        table = soup.find('table')
        previousTable = table

        # Find the deepest table containing the stories
        while True:
            table = table.find('table')
            if table.find(text=re.compile('top stories start')):
                previousTable = table
                continue
            else:
                table = previousTable
                break

        # There are multiple subtables, find the one containing the stories
        for block in table.findAll('table'):
            if block.find(text=re.compile('top stories start')):
                table = block
                break
            else:
                continue

        # Again there are multiple subtables, find the one containing the stories
        for storyblock in table.findAll('table'):
            if storyblock.find(text=re.compile('top stories start')):
                break
            else:
                continue

        skipThisSection = False
        todays_article_count = 0
        # Within this table are <font face="times new roman, times, sans serif"> entries
        self.log("Fetching feed Top Stories")
        for tr in storyblock.findAllNext('tr'):
            if tr.find('span') is not None:

                sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
                                                            'times new roman,times, sans serif',
                                                            'times new roman, times, sans serif']})
                section = None
                bylines = []
                descriptions = []
                pubdate = None

                # Get the Section title
                for (x, i) in enumerate(sectionblock.contents):
                    skipThisSection = False
                    # Extract the section title
                    if ('Comment' in str(i.__class__)):
                        if 'start(name=' in i:
                            section = i[i.find('=') + 1:-2]

                        if not self.sections.has_key(section):
                            skipThisSection = True
                            break

                        # Check for excluded section
                        if len(self.excludeSectionKeywords):
                            key = self.sections[section]
                            excluded = re.compile('|'.join(self.excludeSectionKeywords))
                            if excluded.search(key) or articles.has_key(key):
                                skipThisSection = True
                                break

                # Get the bylines and descriptions
                if not skipThisSection:
                    lines = sectionblock.contents
                    contentStrings = []

                    for line in lines:
                        if not isinstance(line, Comment) and line.strip and line.strip() > "":
                            contentStrings.append(line.strip())

                    # Gather the byline/description pairs
                    bylines = []
                    descriptions = []
                    for contentString in contentStrings:
                        if contentString[0:3] == 'By ' and contentString[3].isupper():
                            bylines.append(contentString)
                        else:
                            descriptions.append(contentString)

                    # Fetch the article titles and URLs
                    articleCount = len(sectionblock.findAll('span'))
                    todays_article_count += articleCount
                    for (i, span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})):
                        a = span.find('a', href=True)
                        url = re.sub(r'\?.*', '', a['href'])
                        url += '?pagewanted=all'

                        title = self.tag_to_string(a, use_alt=True)
                        # prepend the section name
                        title = self.sections[section] + ' &middot; ' + title

                        if not isinstance(title, unicode):
                            title = title.decode('utf-8', 'replace')

                        # Allow for unattributed, undescribed entries "Editor's Note"
                        if i >= len(descriptions):
                            description = None
                        else:
                            description = descriptions[i]

                        if len(bylines) == articleCount:
                            author = bylines[i]
                        else:
                            author = None

                        # Check for duplicates
                        duplicateFound = False
                        for article in articles[feed]:
                            if url == article['url']:
                                duplicateFound = True
                                break

                        if duplicateFound:
                            # Continue fetching, don't add this article
                            todays_article_count -= 1
                            continue

                        if not articles.has_key(feed):
                            articles[feed] = []
                        articles[feed].append(
                            dict(title=title, url=url, date=pubdate,
                                 description=description, author=author, content=''))
        # self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))

        ans = self.sort_index_by(ans, {'Top Stories':-1})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        self.dump_ans(ans)
        return ans
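    # parse_index() returns a list of (section, article-list) tuples; the
    # values below are illustrative:
    #   [('All Top Stories',
    #     [{'title': u'World &middot; Example Headline',
    #       'url': 'http://www.nytimes.com/2010/01/01/world/01example.html?pagewanted=all',
    #       'date': None, 'description': u'An example description.',
    #       'author': u'By A REPORTER', 'content': ''}])]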

    def preprocess_html(self, soup):
        return self.strip_anchors(soup)

    def postprocess_html(self, soup, first_fetch):

        if self.one_picture_per_article:
            # Remove all images after first
            largeImg = soup.find(True, {'class':'articleSpanImage'})
            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
            if largeImg:
                for inlineImg in inlineImgs:
                    inlineImg.extract()
            else:
                if inlineImgs:
                    firstImg = inlineImgs[0]
                    for inlineImg in inlineImgs[1:]:
                        inlineImg.extract()
                    # Move firstImg after headline
                    cgFirst = soup.find(True, {'class':'columnGroup first'})
                    if cgFirst:
                        # Strip all sibling NavigableStrings: noise
                        navstrings = cgFirst.findAll(text=True, recursive=False)
                        [ns.extract() for ns in navstrings]
                        headline_found = False
                        tag = cgFirst.find(True)
                        insertLoc = 0
                        while True:
                            insertLoc += 1
                            if hasattr(tag, 'class') and tag['class'] == 'articleHeadline':
                                headline_found = True
                                break
                            tag = tag.nextSibling
                            if not tag:
                                headline_found = False
                                break
                        if headline_found:
                            cgFirst.insert(insertLoc, firstImg)
                    else:
                        self.log(">>> No class:'columnGroup first' found <<<")

        # Change class="kicker" to <h3>
        kicker = soup.find(True, {'class':'kicker'})
        if kicker and kicker.contents[0]:
            h3Tag = Tag(soup, "h3")
            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
                         use_alt=False)))
            kicker.replaceWith(h3Tag)

        # Change captions to italic
        for caption in soup.findAll(True, {'class':'caption'}):
            if caption and caption.contents[0]:
                emTag = Tag(soup, "em")
                c = self.fixChars(self.tag_to_string(caption, use_alt=False)).strip()
                mp_off = c.find("More Photos")
                if mp_off >= 0:
                    c = c[:mp_off]
                emTag.insert(0, c)
                #hrTag = Tag(soup, 'hr')
                #hrTag['class'] = 'caption_divider'
                hrTag = Tag(soup, 'div')
                hrTag['class'] = 'divider'
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

        # Change <nyt_headline> to <h2>
        h1 = soup.find('h1')
        if h1:
            headline = h1.find("nyt_headline")
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                h1.replaceWith(tag)
        else:
            # Blog entry - replace headline, remove <hr> tags
            headline = soup.find('title')
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                soup.insert(0, tag)
                hrs = soup.findAll('hr')
                for hr in hrs:
                    hr.extract()

        # Change <h1> to <h3> - used in editorial blogs
        masthead = soup.find("h1")
        if masthead:
            # Nuke the href
            if masthead.a:
                del(masthead.a['href'])
            tag = Tag(soup, "h3")
            tag.insert(0, self.fixChars(masthead.contents[0]))
            masthead.replaceWith(tag)

        # Change <span class="bold"> to <b>
        for subhead in soup.findAll(True, {'class':'bold'}):
            if subhead.contents:
                bTag = Tag(soup, "b")
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)

        # Synthesize a section header
        dsk = soup.find('meta', attrs={'name':'dsk'})
        if dsk and dsk.has_key('content'):
            hTag = Tag(soup, 'h3')
            hTag['class'] = 'section'
            hTag.insert(0, NavigableString(dsk['content']))
            articleTag = soup.find(True, attrs={'id':'article'})
            if articleTag:
                articleTag.insert(0, hTag)

        # Add class="articleBody" to <div> so we can format with CSS
        divTag = soup.find('div', attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']

        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div', attrs={'id':'authorId'})
        if divTag and divTag.contents[0]:
            tag = Tag(soup, "p")
            tag['class'] = "authorId"
            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                       use_alt=False)))
            divTag.replaceWith(tag)

        return soup

    def strip_anchors(self, soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252', 'replace'))
        return soup