#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
usatoday.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
import re

class USAToday(BasicNewsRecipe):

    title = 'USA Today'
    __author__ = 'GRiker'
    oldest_article = 1
    timefmt  = ''
    max_articles_per_feed = 20
    language = 'en'
    no_stylesheets = True
    extra_css = '.headline      {text-align:    left;}\n    \
                 .byline        {font-family:   monospace;  \
                                 text-align:    left;       \
                                 margin-bottom: 1em;}\n     \
                 .image         {text-align:    center;}\n  \
                 .caption       {text-align:    center;     \
                                 font-size:     smaller;    \
                                 font-style:    italic}\n   \
                 .credit        {text-align:    right;      \
                                 margin-bottom: 0em;        \
                                 font-size:     smaller;}\n \
                 .articleBody   {text-align:    left;}\n    '
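    # The classes styled above are assigned in postprocess_html() below.
    # linearize_tables unwraps the <table> markup built there (photo, credit,
    # caption) for output formats that render tables poorly.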
    conversion_options = { 'linearize_tables' : True }
    #simultaneous_downloads = 1
    feeds =  [
                ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
                ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
                ('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
                ('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'),
                ('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
                ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
                ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
                ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
                ('Sports Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
                ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
                ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
                ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
                ]
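    # Recipe API: keep_only_tags prunes each fetched article down to the listed
    # tags; remove_tags then deletes matching junk inside what remains.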
    keep_only_tags = [dict(attrs={'class':[
                                           'byLine',
                                           'inside-copy',
                                           'inside-head',
                                           'inside-head2',
                                           'item',
                                           'item-block',
                                           'photo-container',
                                           ]}),
                      dict(id=[
                               'applyMainStoryPhoto',
                               'permalink',
                               ])]

    remove_tags = [dict(attrs={'class':[
                                        'comments',
                                        'jump',
                                        'pagetools',
                                        'post-attributes',
                                        'tags',
                                        ]}),
                   dict(id=[])]

    #feeds =  [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')]

    def dump_hex(self, src, length=16):
        ''' Diagnostic '''
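        # Debugging helper (not called elsewhere in this recipe): prints an
        # offset/hex/ASCII dump of src, `length` bytes per row.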
        FILTER = ''.join([(len(repr(chr(x))) == 3) and chr(x) or '.' for x in range(256)])
        N = 0; result = ''
        while src:
            s, src = src[:length], src[length:]
            hexa = ' '.join(["%02X" % ord(x) for x in s])
            s = s.translate(FILTER)
            result += "%04X   %-*s   %s\n" % (N, length*3, hexa, s)
            N += length
        print result

    def fixChars(self, string):
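        # The article text occasionally carries raw cp1252 "smart punctuation"
        # bytes; map each one to its Unicode equivalent.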
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", u'\u2018', string)

        # Replace rsquo (\x92)
        fixed = re.sub("\x92", u'\u2019', fixed)

        # Replace ldquo (\x93)
        fixed = re.sub("\x93", u'\u201c', fixed)

        # Replace rdquo (\x94)
        fixed = re.sub("\x94", u'\u201d', fixed)

        # Replace ndash (\x96)
        fixed = re.sub("\x96", u'\u2013', fixed)

        # Replace mdash (\x97)
        fixed = re.sub("\x97", u'\u2014', fixed)

        return fixed

    def get_masthead_url(self):
        masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(masthead)
        except:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&#38;'
            massaged = re.sub("&", "&#38;", massaged)
            return self.fixChars(massaged)
        else:
            return description

    def parse_feeds(self, *args, **kwargs):
        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
        # Count articles for progress dialog
        article_count = 0
        for feed in parsed_feeds:
            article_count += len(feed)
        self.log("Queued %d articles" % article_count)
        return parsed_feeds

    def preprocess_html(self, soup):
        soup = self.strip_anchors(soup)
        return soup

    def postprocess_html(self, soup, first_fetch):
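        # Restructure the article: drop nav/gibberish divs, promote the headline
        # to <h2 class="headline">, normalize the byline, strip inline "jumpout"
        # teasers, then move the first photo (with credit and caption) into a
        # <div class="image"> directly below the headline.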

        # Remove navLinks <div class="inside-copy" style="padding-bottom:3px">
        navLinks = soup.find(True,{'style':'padding-bottom:3px'})
        if navLinks:
            navLinks.extract()

        # Remove <div class="inside-copy" style="margin-bottom:10px">
        gibberish = soup.find(True,{'style':'margin-bottom:10px'})
        if gibberish:
            gibberish.extract()

        # Change <inside-head> to <h2>
        headline = soup.find(True, {'class':['inside-head','inside-head2']})
        if not headline:
            headline = soup.find('h3')
        if headline:
            tag = Tag(soup, "h2")
            tag['class'] = "headline"
            tag.insert(0, headline.contents[0])
            headline.replaceWith(tag)
        else:
            print "unable to find headline:\n%s\n" % soup

        # Change byLine to byline, change commas to middot
        # Kindle renders commas in byline as '&'
        byline = soup.find(True, {'class':'byLine'})
        if byline:
            byline['class'] = 'byline'
            # Replace comma with middot
            byline.contents[0].replaceWith(re.sub(","," ·", byline.renderContents()))

        jumpout_punc_list = [':','?']
        # Remove the inline jumpouts in <div class="inside-copy">
        paras = soup.findAll(True, {'class':'inside-copy'})
        for para in paras:
            if re.match(r"<b>[\w\W]+ ",para.renderContents()):
                p = para.find('b')
                for punc in jumpout_punc_list:
                    punc_offset = p.contents[0].find(punc)
                    if punc_offset == -1:
                        continue
                    if punc_offset > 1:
                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
                            #print "extracting \n%s\n" % para.prettify()
                            para.extract()

        # Reset class for remaining
        paras = soup.findAll(True, {'class':'inside-copy'})
        for para in paras:
            para['class'] = 'articleBody'

        # Remove inline jumpouts in <p>
        paras = soup.findAll(['p'])
        for p in paras:
            if hasattr(p,'contents') and len(p.contents):
                for punc in jumpout_punc_list:
                    punc_offset = p.contents[0].find(punc)
                    if punc_offset == -1:
                        continue
                    if punc_offset > 2 and hasattr(p,'a') and len(p.contents):
                        #print "evaluating %s\n" % p.contents[0][:punc_offset+1]
                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
                            #print "extracting \n%s\n" % p.prettify()
                            p.extract()

        # Capture the first img, insert after headline
        imgs = soup.findAll('img')
        print "postprocess_html(): %d images" % len(imgs)
        if imgs:
            divTag = Tag(soup, 'div')
            divTag['class'] = 'image'
            body = soup.find('body')
            img = imgs[0]
            #print "img: \n%s\n" % img.prettify()

            # Table for photo and credit
            tableTag = Tag(soup,'table')

            # Photo
            trimgTag = Tag(soup, 'tr')
            tdimgTag = Tag(soup, 'td')
            tdimgTag.insert(0,img)
            trimgTag.insert(0,tdimgTag)
            tableTag.insert(0,trimgTag)

            # Credit
            trcreditTag = Tag(soup, 'tr')

            tdcreditTag = Tag(soup, 'td')
            tdcreditTag['class'] = 'credit'
            credit = soup.find('td',{'class':'photoCredit'})
            if credit:
                tdcreditTag.insert(0,NavigableString(credit.renderContents()))
            else:
                # img.get() avoids a KeyError when the img has no credit attribute
                credit = img.get('credit')
                if credit:
                    tdcreditTag.insert(0,NavigableString(credit))
                else:
                    tdcreditTag.insert(0,NavigableString(''))

            trcreditTag.insert(0,tdcreditTag)
            tableTag.insert(1,trcreditTag)
            dtc = 0
            divTag.insert(dtc,tableTag)
            dtc += 1

            if False:
                # Add the caption in the table
                tableCaptionTag = Tag(soup,'caption')
                tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents())
                tableTag.insert(1,tableCaptionTag)
                divTag.insert(dtc,tableTag)
                dtc += 1
                body.insert(1,divTag)
            else:
                # Add the caption below the table
                #print "Looking for caption in this soup:\n%s" % img.prettify()
                captionTag = Tag(soup,'p')
                captionTag['class'] = 'caption'
                # img.get() avoids a KeyError when the img has no alt attribute
                if img.get('alt'):
                    captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['alt']))
                    divTag.insert(dtc, captionTag)
                    dtc += 1
                else:
                    try:
                        captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['cutline']))
                        divTag.insert(dtc, captionTag)
                        dtc += 1
                    except:
                        pass

            hrTag = Tag(soup, 'hr')
            divTag.insert(dtc, hrTag)
            dtc += 1

            # Delete <div id="applyMainStoryPhoto">
            photoJunk = soup.find('div',{'id':'applyMainStoryPhoto'})
            if photoJunk:
                photoJunk.extract()

            # Insert img after headline
            tag = body.find(True)
            insertLoc = 0
            headline_found = False
            while True:
                # Scan the top-level tags; siblings may be NavigableStrings,
                # so check for a Tag with a class attribute before comparing
                insertLoc += 1
                if isinstance(tag, Tag) and tag.get('class') == 'headline':
                    headline_found = True
                    body.insert(insertLoc,divTag)
                    break
                tag = tag.nextSibling
                if not tag:
                    break

            if not headline_found:
                # Monolithic <div> - restructure
                tag = body.find(True)
                while True:
                    insertLoc += 1
                    try:
                        if isinstance(tag, Tag) and tag.get('class') == 'headline':
                            headline_found = True
                            tag.insert(insertLoc,divTag)
                            break
                    except:
                        pass
                    tag = tag.next
                    if not tag:
                        break

                # Yank out headline, img and caption
                headline = body.find('h2','headline')
                img = body.find('div','image')
                caption = body.find('p','caption')

                # body(0) is calibre_navbar
                # body(1) is <div class="item">

                btc = 1
                headline.extract()
                body.insert(1, headline)
                btc += 1
                if img:
                    img.extract()
                    body.insert(btc, img)
                    btc += 1
                if caption:
                    caption.extract()
                    body.insert(btc, caption)
                    btc += 1

            if len(imgs) > 1:
                if True:
                    [img.extract() for img in imgs[1:]]
                else:
                    # Format the remaining images
                    # This doesn't work yet
                    for img in imgs[1:]:
                        print "img:\n%s\n" % img.prettify()
                        divTag = Tag(soup, 'div')
                        divTag['class'] = 'image'

                        # Table for photo and credit
                        tableTag = Tag(soup,'table')

                        # Photo
                        trimgTag = Tag(soup, 'tr')
                        tdimgTag = Tag(soup, 'td')
                        tdimgTag.insert(0,img)
                        trimgTag.insert(0,tdimgTag)
                        tableTag.insert(0,trimgTag)

                        # Credit
                        trcreditTag = Tag(soup, 'tr')

                        tdcreditTag = Tag(soup, 'td')
                        tdcreditTag['class'] = 'credit'
                        try:
                            tdcreditTag.insert(0,NavigableString(img['credit']))
                        except:
                            tdcreditTag.insert(0,NavigableString(''))
                        trcreditTag.insert(0,tdcreditTag)
                        tableTag.insert(1,trcreditTag)
                        divTag.insert(0,tableTag)
                        soup.img.replaceWith(divTag)

        return soup

    def postprocess_book(self, oeb, opts, log) :
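        # Runs after the book is assembled: fill in missing TOC metadata
        # (author byline, description) by re-parsing each article's HTML
        # from the OEB manifest.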

        def extract_byline(href) :
            # <meta name="byline" content=
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find('div',attrs={'class':'byline'})
            if byline:
                byline['class'] = 'byline'
                # Replace comma with middot
                byline.contents[0].replaceWith(re.sub(u",", u" ·",
                    byline.renderContents(encoding=None)))
                return byline.renderContents(encoding=None)
            else :
                paras = soup.findAll(text=True)
                for para in paras:
                    if para.startswith("Copyright"):
                        return para[len('Copyright xxxx '):para.find('.')]
                return None

        def extract_description(href) :
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            description = soup.find('meta',attrs={'name':'description'})
            if description :
                return self.massageNCXText(description['content'])
            else:
                # Take first non-empty paragraph of article
                articleBody = soup.find('div',attrs={'id':['articleBody','item']})
                if articleBody:
                    paras = articleBody.findAll('p')
                    for p in paras:
                        if p.renderContents().strip():
                            return self.massageNCXText(self.tag_to_string(p,use_alt=False))
                else:
                    print "Didn't find <div id='articleBody'> in this soup:\n%s" % soup.prettify()
                    return None

        # Method entry point here
        # Single-section TOCs look different than multi-section TOCs
        if oeb.toc.depth() == 2 :
            for article in oeb.toc :
                if article.author is None :
                    article.author = extract_byline(article.href)
                if article.description is None :
                    article.description = extract_description(article.href)
        elif oeb.toc.depth() == 3 :
            for section in oeb.toc :
                for article in section :
                    article.author = extract_byline(article.href)
                    '''
                    if article.author is None :
                        article.author = self.massageNCXText(extract_byline(article.href))
                    else:
                        article.author = self.massageNCXText(article.author)
                    '''
                    if article.description is None :
                        article.description = extract_description(article.href)

    def strip_anchors(self, soup):
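        # Unwrap links: replace each <a> that does not wrap an image with its
        # plain text content, decoded from cp1252.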
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup
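
# A minimal sketch of how a recipe file like this is typically exercised from
# the command line while developing (assumes a calibre install; the output
# filename is arbitrary):
#
#   ebook-convert usatoday.recipe usatoday.epub --test
#
# --test fetches only a couple of articles from the first feeds, which keeps
# debug iterations on postprocess_html()/postprocess_book() fast.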