#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import re
import time
from calibre import entity_to_unicode
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
    Comment, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

    title       = 'New York Times Top Stories'
    __author__  = 'GRiker'
    language = 'en'
    requires_version = (0, 7, 5)
    description = 'Top Stories from the New York Times'

    # List of sections typically included in Top Stories.  Use a keyword from the
    # right column in the excludeSectionKeywords[] list to skip downloading that section
    sections = {
                 'arts'             :   'Arts',
                 'business'         :   'Business',
                 'diningwine'       :   'Dining & Wine',
                 'editorials'       :   'Editorials',
                 'health'           :   'Health',
                 'magazine'         :   'Magazine',
                 'mediaadvertising' :   'Media & Advertising',
                 'newyorkregion'    :   'New York/Region',
                 'oped'             :   'Op-Ed',
                 'politics'         :   'Politics',
                 'science'          :   'Science',
                 'sports'           :   'Sports',
                 'technology'       :   'Technology',
                 'topstories'       :   'Top Stories',
                 'travel'           :   'Travel',
                 'us'               :   'U.S.',
                 'world'            :   'World'
               }

    # Add section keywords from the right column above to skip that section
    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
    # excludeSectionKeywords = ['Sports', 'Dining']
    # Fetch only Business and Technology
    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
    # Fetch only Top Stories
    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
    # By default, no sections are skipped.
    excludeSectionKeywords = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists).  If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = True

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 40

    timefmt = ''
    needs_subscription = True
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
    cover_margins = (18,18,'grey99')

    remove_tags_before = dict(id='article')
    remove_tags_after  = dict(id='article')
    remove_tags = [dict(attrs={'class':[
                            'articleFooter',
                            'articleTools',
                            'columnGroup doubleRule',
                            'columnGroup singleRule',
                            'columnGroup last',
                            'columnGroup  last',
                            'doubleRule',
                            'dottedLine',
                            'entry-meta',
                            'entry-response module',
                            'icon enlargeThis',
                            'leftNavTabs',
                            'module box nav',
                            'nextArticleLink',
                            'nextArticleLink clearfix',
                            'post-tools',
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
                            'subNavigation clearfix',
                            'subNavigation tabContent active',
                            'subNavigation tabContent active clearfix',
                            ]}),
                   dict(id=[
                            'adxLeaderboard',
                            'archive',
                            'articleExtras',
                            'articleInline',
                            'blog_sidebar',
                            'businessSearchBar',
                            'cCol',
                            'entertainmentSearchBar',
                            'footer',
                            'header',
                            'header_search',
                            'login',
                            'masthead',
                            'masthead-nav',
                            'memberTools',
                            'navigation',
                            'portfolioInline',
                            'relatedArticles',
                            'respond',
                            'side_search',
                            'side_index',
                            'side_tool',
                            'toolsRight',
                            ]),
                   dict(name=['script', 'noscript', 'style'])]

    no_stylesheets = True
    extra_css = '.headline      {text-align:    left;}\n    \
                 .byline        {font-family:   monospace;  \
                                 text-align:    left;       \
                                 margin-top:    0px;        \
                                 margin-bottom: 0px;}\n     \
                 .dateline      {font-size:     small;      \
                                 margin-top:    0px;        \
                                 margin-bottom: 0px;}\n     \
                 .timestamp     {font-size:     small;      \
                                 margin-top:    0px;        \
                                 margin-bottom: 0px;}\n     \
                 .source        {text-align:    left;}\n    \
                 .image         {text-align:    center;}\n  \
                 .credit        {text-align:    right;      \
                                 font-size:     small;      \
                                 margin-top:    0px;        \
                                 margin-bottom: 0px;}\n     \
                 .articleBody   {text-align:    left;}\n    \
                 .authorId      {text-align:    left;       \
                                 font-style:    italic;}\n  '

    def dump_ans(self, ans) :
        total_article_count = 0
        for section in ans :
            if self.verbose:
                self.log("section %s: %d articles" % (section[0], len(section[1])) )
            for article in section[1]:
                total_article_count += 1
                if self.verbose:
                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                              article['url'].encode('cp1252','replace')))
        self.log( "Queued %d articles" % total_article_count )
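        # Illustrative output (hypothetical counts and titles) when verbose is on:
        #   section All Top Stories: 25 articles
        #       World · Quake Strikes Off Coast...      http://www.nytimes.com/...
        #   Queued 25 articles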

    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)

        # Replace rsquo (\x92)
        fixed = re.sub("\x92","’",fixed)

        # Replace ldquo (\x93)
        fixed = re.sub("\x93","“",fixed)

        # Replace rdquo (\x94)
        fixed = re.sub("\x94","”",fixed)

        # Replace ndash (\x96)
        fixed = re.sub("\x96","–",fixed)

        # Replace mdash (\x97)
        fixed = re.sub("\x97","—",fixed)

        return fixed
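        # For reference: the substitutions above map stray Windows-1252 bytes to
        # their typographic equivalents, e.g. "\x93quoted\x94" becomes "“quoted”"
        # and "\x97" becomes an em dash.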

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            try:
                br.open('http://www.nytimes.com/auth/login')
                br.select_form(name='login')
                br['USERID']   = self.username
                br['PASSWORD'] = self.password
                br.submit()
            except:
                self.log("\nFailed to login")
        return br

    def skip_ad_pages(self, soup):
        # Skip ad pages served before actual article
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
            url += '?pagewanted=all'
            self.log.warn("Skipping ad to article at '%s'" % url)
            return self.index_to_soup(url, raw=True)
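        # Illustrative example (hypothetical href): if the interstitial's skip link
        # is <a href="/2010/10/01/us/01story.html?ref=us"> wrapping a tag whose
        # name attribute is "skip", the article is re-fetched from
        # http://www.nytimes.com/2010/10/01/us/01story.html?pagewanted=all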

    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        cover = 'http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/nytfrontpage/scan.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nCover unavailable")
            cover = None
        return cover
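        # Worked example: on 2010-11-01 the cover URL assembled above is
        # http://graphics8.nytimes.com/images/2010/11/01/nytfrontpage/scan.jpg;
        # None is returned if that day's front-page scan cannot be opened.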

    def index_to_soup(self, url_or_raw, raw=False):
        '''
        OVERRIDE of class method
        deals with various page encodings between index and articles
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False) :
            if re.match(r'\w+://', url_or_raw):
                f = self.browser.open(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
            else:
                _raw = url_or_raw
            if raw:
                return _raw

            if not isinstance(_raw, unicode) and self.encoding:
                _raw = _raw.decode(docEncoding, 'replace')
            massage = list(BeautifulSoup.MARKUP_MASSAGE)
            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
            return BeautifulSoup(_raw, markupMassage=massage)

        # Entry point
        print "index_to_soup()"
        soup = get_the_soup( self.encoding, url_or_raw )
        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
        docEncoding =  str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
        if docEncoding == '' :
            docEncoding = self.encoding

        if self.verbose > 2:
            self.log( "  document encoding: '%s'" % docEncoding)
        if docEncoding != self.encoding :
            soup = get_the_soup(docEncoding, url_or_raw)

        return soup
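        # For reference: the charset is sliced out of the page's Content-Type meta
        # tag, e.g. <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
        # yields docEncoding 'iso-8859-1'; when that differs from self.encoding the
        # page is parsed a second time with the declared encoding.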

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&amp;' with '&'
            massaged = re.sub("&amp;","&", massaged)
            return self.fixChars(massaged)
        else:
            return description
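        # Illustrative: a description such as "Arts &amp; Leisure" comes back as
        # "Arts & Leisure", with any stray Windows-1252 punctuation normalized by
        # fixChars() before it reaches the Kindle TOC.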

    def parse_index(self):
        articles = {}
        ans = []

        feed = key = 'All Top Stories'
        articles[key] = []
        ans.append(key)
        self.log("Scanning 1 section ...")

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the outer table
        table = soup.find('table')
        previousTable = table

        # Find the deepest table containing the stories
        while True :
            table = table.find('table')
            if table.find(text=re.compile('top stories start')) :
                previousTable = table
                continue
            else :
                table = previousTable
                break

        # There are multiple subtables, find the one containing the stories
        for block in table.findAll('table') :
            if block.find(text=re.compile('top stories start')) :
                table = block
                break
            else :
                continue

        # Again there are multiple subtables, find the one containing the stories
        for storyblock in table.findAll('table') :
            if storyblock.find(text=re.compile('top stories start')) :
                break
            else :
                continue

        skipThisSection = False
        todays_article_count = 0
        # Within this table are <font face="times new roman, times, san serif"> entries
        self.log("Fetching feed Top Stories")
        for tr in storyblock.findAllNext('tr'):
            if tr.find('span') is not None :

                sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
                                                            'times new roman,times, sans serif',
                                                            'times new roman, times, sans serif']})
                section = None
                bylines = []
                descriptions = []
                pubdate = None

                # Get the Section title
                for (x,i) in enumerate(sectionblock.contents) :
                    skipThisSection = False
                    # Extract the section title
                    if ('Comment' in str(i.__class__)) :
                        if 'start(name=' in i :
                            section = i[i.find('=')+1:-2]

                        if not self.sections.has_key(section) :
                            skipThisSection = True
                            break

                        # Check for excluded section
                        if len(self.excludeSectionKeywords):
                            key = self.sections[section]
                            excluded = re.compile('|'.join(self.excludeSectionKeywords))
                            if excluded.search(key) or articles.has_key(key):
                                skipThisSection = True
                                break

                # Get the bylines and descriptions
                if not skipThisSection :
                    lines = sectionblock.contents
                    contentStrings = []

                    for line in lines:
                        if not isinstance(line, Comment) and line.strip and line.strip() > "":
                            contentStrings.append(line.strip())

                    # Gather the byline/description pairs
                    bylines = []
                    descriptions = []
                    for contentString in contentStrings:
                        if contentString[0:3] == 'By ' and contentString[3].isupper() :
                            bylines.append(contentString)
                        else:
                            descriptions.append(contentString)

                    # Fetch the article titles and URLs
                    articleCount = len(sectionblock.findAll('span'))
                    todays_article_count += articleCount
                    for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
                        a = span.find('a', href=True)
                        url = re.sub(r'\?.*', '', a['href'])
                        url += '?pagewanted=all'

                        title = self.tag_to_string(a, use_alt=True)
                        # prepend the section name
                        title = self.sections[section] + " · " + title

                        if not isinstance(title, unicode):
                            title = title.decode('utf-8', 'replace')

                        # Allow for unattributed, undescribed entries "Editor's Note"
                        if i >= len(descriptions) :
                            description = None
                        else :
                            description = descriptions[i]

                        if len(bylines) == articleCount :
                            author = bylines[i]
                        else :
                            author = None

                        # Check for duplicates
                        duplicateFound = False
                        if len(articles[feed]) > 1:
                            for article in articles[feed] :
                                if url == article['url'] :
                                    duplicateFound = True
                                    break

                            if duplicateFound:
                                # Continue fetching, don't add this article
                                todays_article_count -= 1
                                continue

                        if not articles.has_key(feed):
                            articles[feed] = []
                        articles[feed].append(
                            dict(title=title, url=url, date=pubdate,
                                 description=description, author=author, content=''))
#        self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))

        ans = self.sort_index_by(ans, {'Top Stories':-1})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        self.dump_ans(ans)
        return ans
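        # Each queued entry is a plain dict; a hypothetical example:
        #   {'title': u'World · Quake Strikes Off Coast', 'url': 'http://www.nytimes.com/...?pagewanted=all',
        #    'date': None, 'description': u'...', 'author': u'By JANE DOE', 'content': ''}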

    def preprocess_html(self, soup):
        return self.strip_anchors(soup)

    def postprocess_html(self, soup, first_fetch):

        if self.one_picture_per_article:
            # Remove all images after first
            largeImg = soup.find(True, {'class':'articleSpanImage'})
            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
            if largeImg:
                for inlineImg in inlineImgs:
                    inlineImg.extract()
            else:
                if inlineImgs:
                    firstImg = inlineImgs[0]
                    for inlineImg in inlineImgs[1:]:
                        inlineImg.extract()
                    # Move firstImg after headline
                    cgFirst = soup.find(True, {'class':'columnGroup  first'})
                    if cgFirst:
                        # Strip all sibling NavigableStrings: noise
                        navstrings = cgFirst.findAll(text=True, recursive=False)
                        [ns.extract() for ns in navstrings]
                        headline_found = False
                        tag = cgFirst.find(True)
                        insertLoc = 0
                        while True:
                            insertLoc += 1
                            if tag.has_key('class') and tag['class'] == 'articleHeadline':
                                headline_found = True
                                break
                            tag = tag.nextSibling
                            if not tag:
                                headline_found = False
                                break
                        if headline_found:
                            cgFirst.insert(insertLoc,firstImg)
                    else:
                        self.log(">>> No class:'columnGroup  first' found <<<")
        # Change class="kicker" to <h3>
        kicker = soup.find(True, {'class':'kicker'})
        if kicker and kicker.contents[0]:
            h3Tag = Tag(soup, "h3")
            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
                             use_alt=False)))
            kicker.replaceWith(h3Tag)

        # Change captions to italic -1
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption and caption.contents[0]:
                emTag = Tag(soup, "em")
                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                mp_off = c.find("More Photos")
                if mp_off >= 0:
                    c = c[:mp_off]
                emTag.insert(0, c)
                #hrTag = Tag(soup, 'hr')
                #hrTag['class'] = 'caption_divider'
                hrTag = Tag(soup, 'div')
                hrTag['class'] = 'divider'
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

        # Change <nyt_headline> to <h2>
        h1 = soup.find('h1')
        if h1:
            headline = h1.find("nyt_headline")
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                h1.replaceWith(tag)
        else:
            # Blog entry - replace headline, remove <hr> tags
            headline = soup.find('title')
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                soup.insert(0, tag)
                hrs = soup.findAll('hr')
                for hr in hrs:
                    hr.extract()

        # Change <h1> to <h3> - used in editorial blogs
        masthead = soup.find("h1")
        if masthead:
            # Nuke the href
            if masthead.a:
                del(masthead.a['href'])
            tag = Tag(soup, "h3")
            tag.insert(0, self.fixChars(masthead.contents[0]))
            masthead.replaceWith(tag)

        # Change <span class="bold"> to <b>
        for subhead in soup.findAll(True, {'class':'bold'}) :
            if subhead.contents:
                bTag = Tag(soup, "b")
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)

        # Synthesize a section header
        dsk = soup.find('meta', attrs={'name':'dsk'})
        if dsk and dsk.has_key('content'):
            hTag = Tag(soup,'h3')
            hTag['class'] = 'section'
            hTag.insert(0,NavigableString(dsk['content']))
            articleTag = soup.find(True, attrs={'id':'article'})
            if articleTag:
                articleTag.insert(0,hTag)

        # Add class="articleBody" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']

        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag and divTag.contents[0]:
            tag = Tag(soup, "p")
            tag['class'] = "authorId"
            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                             use_alt=False)))
            divTag.replaceWith(tag)

        return soup
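        # Illustrative (hypothetical markup): <div class="kicker">Op-Ed Columnist</div>
        # becomes <h3>Op-Ed Columnist</h3>; captions are re-wrapped in <em> followed
        # by a <div class="divider">, and when one_picture_per_article is True the
        # first inline image is relocated just below the headline.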

    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup
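        # Illustrative (hypothetical markup): <a href="/topic/mayors">the mayor</a>
        # is flattened to the plain text 'the mayor'; anchors wrapping an <img>
        # are left intact so images are preserved.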