#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
usatoday.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
import re

class USAToday(BasicNewsRecipe):

    title = 'USA Today'
    __author__ = 'GRiker'
    oldest_article = 1
    timefmt  = ''
    max_articles_per_feed = 20
    language = 'en'
    no_stylesheets = True
    extra_css = '.headline      {text-align:    left;}\n    \
                 .byline        {font-family:   monospace;  \
                                 text-align:    left;       \
                                 margin-bottom: 1em;}\n     \
                 .image         {text-align:    center;}\n  \
                 .caption       {text-align:    center;     \
                                 font-size:     smaller;    \
                                 font-style:    italic}\n   \
                 .credit        {text-align:    right;      \
                                 margin-bottom: 0em;        \
                                 font-size:     smaller;}\n \
                 .articleBody   {text-align:    left;}\n    '
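    # The classes styled above are assigned in postprocess_html() below.
    # linearize_tables unwraps the <table> markup built there (photo, credit,
    # caption) for output formats that render tables poorly.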
    conversion_options = { 'linearize_tables' : True }
    #simultaneous_downloads = 1
    feeds =  [
                ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
                ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
                ('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
                ('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'),
                ('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
                ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
                ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
                ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
                ('Sports Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
                ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
                ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
                ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
                ]
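    # Recipe API: keep_only_tags prunes each fetched article down to the listed
    # tags; remove_tags then deletes matching junk inside what remains.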
    keep_only_tags = [dict(attrs={'class':[
                                           'byLine',
                                           'inside-copy',
                                           'inside-head',
                                           'inside-head2',
                                           'item',
                                           'item-block',
                                           'photo-container',
                                           ]}),
                      dict(id=[
                               'applyMainStoryPhoto',
                               'permalink',
                               ])]

    remove_tags = [dict(attrs={'class':[
                                        'comments',
                                        'jump',
                                        'pagetools',
                                        'post-attributes',
                                        'tags',
                                        ]}),
                   dict(id=[])]

    #feeds =  [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')]

    def dump_hex(self, src, length=16):
        ''' Diagnostic '''
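        # Debugging helper (not called elsewhere in this recipe): prints an
        # offset/hex/ASCII dump of src, `length` bytes per row.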
        FILTER = ''.join([(len(repr(chr(x))) == 3) and chr(x) or '.' for x in range(256)])
        N = 0; result = ''
        while src:
            s, src = src[:length], src[length:]
            hexa = ' '.join(["%02X" % ord(x) for x in s])
            s = s.translate(FILTER)
            result += "%04X   %-*s   %s\n" % (N, length*3, hexa, s)
            N += length
        print result

    def fixChars(self, string):
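        # The article text occasionally carries raw cp1252 "smart punctuation"
        # bytes; map each one to its Unicode equivalent.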
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", u'\u2018', string)

        # Replace rsquo (\x92)
        fixed = re.sub("\x92", u'\u2019', fixed)

        # Replace ldquo (\x93)
        fixed = re.sub("\x93", u'\u201c', fixed)

        # Replace rdquo (\x94)
        fixed = re.sub("\x94", u'\u201d', fixed)

        # Replace ndash (\x96)
        fixed = re.sub("\x96", u'\u2013', fixed)

        # Replace mdash (\x97)
        fixed = re.sub("\x97", u'\u2014', fixed)

        return fixed

    def get_masthead_url(self):
        masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(masthead)
        except:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&#38;'
            massaged = re.sub("&", "&#38;", massaged)
            return self.fixChars(massaged)
        else:
            return description

    def parse_feeds(self, *args, **kwargs):
        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
        # Count articles for progress dialog
        article_count = 0
        for feed in parsed_feeds:
            article_count += len(feed)
        self.log("Queued %d articles" % article_count)
        return parsed_feeds

    def preprocess_html(self, soup):
        soup = self.strip_anchors(soup)
        return soup

    def postprocess_html(self, soup, first_fetch):
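        # Restructure the article: drop nav/gibberish divs, promote the headline
        # to <h2 class="headline">, normalize the byline, strip inline "jumpout"
        # teasers, then move the first photo (with credit and caption) into a
        # <div class="image"> directly below the headline.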

        # Remove navLinks <div class="inside-copy" style="padding-bottom:3px">
        navLinks = soup.find(True,{'style':'padding-bottom:3px'})
        if navLinks:
            navLinks.extract()

        # Remove <div class="inside-copy" style="margin-bottom:10px">
        gibberish = soup.find(True,{'style':'margin-bottom:10px'})
        if gibberish:
            gibberish.extract()

        # Change <inside-head> to <h2>
        headline = soup.find(True, {'class':['inside-head','inside-head2']})
        if not headline:
            headline = soup.find('h3')
        if headline:
            tag = Tag(soup, "h2")
            tag['class'] = "headline"
            tag.insert(0, headline.contents[0])
            headline.replaceWith(tag)
        else:
            print "unable to find headline:\n%s\n" % soup

        # Change byLine to byline, change commas to middot
        # Kindle renders commas in byline as '&'
        byline = soup.find(True, {'class':'byLine'})
        if byline:
            byline['class'] = 'byline'
            # Replace comma with middot
            byline.contents[0].replaceWith(re.sub(","," ·", byline.renderContents()))

        jumpout_punc_list = [':','?']
        # Remove the inline jumpouts in <div class="inside-copy">
        paras = soup.findAll(True, {'class':'inside-copy'})
        for para in paras:
            if re.match(r"<b>[\w\W]+ ",para.renderContents()):
                p = para.find('b')
                for punc in jumpout_punc_list:
                    punc_offset = p.contents[0].find(punc)
                    if punc_offset == -1:
                        continue
                    if punc_offset > 1:
                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
                            #print "extracting \n%s\n" % para.prettify()
                            para.extract()

        # Reset class for remaining
        paras = soup.findAll(True, {'class':'inside-copy'})
        for para in paras:
            para['class'] = 'articleBody'

        # Remove inline jumpouts in <p>
        paras = soup.findAll(['p'])
        for p in paras:
            if hasattr(p,'contents') and len(p.contents):
                for punc in jumpout_punc_list:
                    punc_offset = p.contents[0].find(punc)
                    if punc_offset == -1:
                        continue
                    if punc_offset > 2 and hasattr(p,'a') and len(p.contents):
                        #print "evaluating %s\n" % p.contents[0][:punc_offset+1]
                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
                            #print "extracting \n%s\n" % p.prettify()
                            p.extract()

        # Capture the first img, insert after headline
        imgs = soup.findAll('img')
        print "postprocess_html(): %d images" % len(imgs)
        if imgs:
            divTag = Tag(soup, 'div')
            divTag['class'] = 'image'
            body = soup.find('body')
            img = imgs[0]
            #print "img: \n%s\n" % img.prettify()

            # Table for photo and credit
            tableTag = Tag(soup,'table')

            # Photo
            trimgTag = Tag(soup, 'tr')
            tdimgTag = Tag(soup, 'td')
            tdimgTag.insert(0,img)
            trimgTag.insert(0,tdimgTag)
            tableTag.insert(0,trimgTag)

            # Credit
            trcreditTag = Tag(soup, 'tr')

            tdcreditTag = Tag(soup, 'td')
            tdcreditTag['class'] = 'credit'
            credit = soup.find('td',{'class':'photoCredit'})
            if credit:
                tdcreditTag.insert(0,NavigableString(credit.renderContents()))
            else:
                # img.get() avoids a KeyError when the img has no credit attribute
                credit = img.get('credit')
                if credit:
                    tdcreditTag.insert(0,NavigableString(credit))
                else:
                    tdcreditTag.insert(0,NavigableString(''))

            trcreditTag.insert(0,tdcreditTag)
            tableTag.insert(1,trcreditTag)
            dtc = 0
            divTag.insert(dtc,tableTag)
            dtc += 1

            if False:
                # Add the caption in the table
                tableCaptionTag = Tag(soup,'caption')
                tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents())
                tableTag.insert(1,tableCaptionTag)
                divTag.insert(dtc,tableTag)
                dtc += 1
                body.insert(1,divTag)
            else:
                # Add the caption below the table
                #print "Looking for caption in this soup:\n%s" % img.prettify()
                captionTag = Tag(soup,'p')
                captionTag['class'] = 'caption'
                # img.get() avoids a KeyError when the img has no alt attribute
                if img.get('alt'):
                    captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['alt']))
                    divTag.insert(dtc, captionTag)
                    dtc += 1
                else:
                    try:
                        captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['cutline']))
                        divTag.insert(dtc, captionTag)
                        dtc += 1
                    except:
                        pass

            hrTag = Tag(soup, 'hr')
            divTag.insert(dtc, hrTag)
            dtc += 1

            # Delete <div id="applyMainStoryPhoto">
            photoJunk = soup.find('div',{'id':'applyMainStoryPhoto'})
            if photoJunk:
                photoJunk.extract()

            # Insert img after headline
            tag = body.find(True)
            insertLoc = 0
            headline_found = False
            while True:
                # Scan the top-level tags; siblings may be NavigableStrings,
                # so check for a Tag with a class attribute before comparing
                insertLoc += 1
                if isinstance(tag, Tag) and tag.get('class') == 'headline':
                    headline_found = True
                    body.insert(insertLoc,divTag)
                    break
                tag = tag.nextSibling
                if not tag:
                    break

            if not headline_found:
                # Monolithic <div> - restructure
                tag = body.find(True)
                while True:
                    insertLoc += 1
                    try:
                        if isinstance(tag, Tag) and tag.get('class') == 'headline':
                            headline_found = True
                            tag.insert(insertLoc,divTag)
                            break
                    except:
                        pass
                    tag = tag.next
                    if not tag:
                        break

                # Yank out headline, img and caption
                headline = body.find('h2','headline')
                img = body.find('div','image')
                caption = body.find('p','caption')

                # body(0) is calibre_navbar
                # body(1) is <div class="item">

                btc = 1
                headline.extract()
                body.insert(1, headline)
                btc += 1
                if img:
                    img.extract()
                    body.insert(btc, img)
                    btc += 1
                if caption:
                    caption.extract()
                    body.insert(btc, caption)
                    btc += 1

            if len(imgs) > 1:
                if True:
                    [img.extract() for img in imgs[1:]]
                else:
                    # Format the remaining images
                    # This doesn't work yet
                    for img in imgs[1:]:
                        print "img:\n%s\n" % img.prettify()
                        divTag = Tag(soup, 'div')
                        divTag['class'] = 'image'

                        # Table for photo and credit
                        tableTag = Tag(soup,'table')

                        # Photo
                        trimgTag = Tag(soup, 'tr')
                        tdimgTag = Tag(soup, 'td')
                        tdimgTag.insert(0,img)
                        trimgTag.insert(0,tdimgTag)
                        tableTag.insert(0,trimgTag)

                        # Credit
                        trcreditTag = Tag(soup, 'tr')

                        tdcreditTag = Tag(soup, 'td')
                        tdcreditTag['class'] = 'credit'
                        try:
                            tdcreditTag.insert(0,NavigableString(img['credit']))
                        except:
                            tdcreditTag.insert(0,NavigableString(''))
                        trcreditTag.insert(0,tdcreditTag)
                        tableTag.insert(1,trcreditTag)
                        divTag.insert(0,tableTag)
                        soup.img.replaceWith(divTag)

        return soup

    def postprocess_book(self, oeb, opts, log) :
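        # Runs after the book is assembled: fill in missing TOC metadata
        # (author byline, description) by re-parsing each article's HTML
        # from the OEB manifest.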

        def extract_byline(href) :
            # <meta name="byline" content=
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find('div',attrs={'class':'byline'})
            if byline:
                byline['class'] = 'byline'
                # Replace comma with middot
                byline.contents[0].replaceWith(re.sub(u",", u" ·",
                    byline.renderContents(encoding=None)))
                return byline.renderContents(encoding=None)
            else :
                paras = soup.findAll(text=True)
                for para in paras:
                    if para.startswith("Copyright"):
                        return para[len('Copyright xxxx '):para.find('.')]
                return None

        def extract_description(href) :
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            description = soup.find('meta',attrs={'name':'description'})
            if description :
                return self.massageNCXText(description['content'])
            else:
                # Take first non-empty paragraph of article
                articleBody = soup.find('div',attrs={'id':['articleBody','item']})
                if articleBody:
                    paras = articleBody.findAll('p')
                    for p in paras:
                        if p.renderContents().strip():
                            return self.massageNCXText(self.tag_to_string(p,use_alt=False))
                else:
                    print "Didn't find <div id='articleBody'> in this soup:\n%s" % soup.prettify()
                    return None

        # Method entry point here
        # Single-section TOCs look different than multi-section TOCs
        if oeb.toc.depth() == 2 :
            for article in oeb.toc :
                if article.author is None :
                    article.author = extract_byline(article.href)
                if article.description is None :
                    article.description = extract_description(article.href)
        elif oeb.toc.depth() == 3 :
            for section in oeb.toc :
                for article in section :
                    article.author = extract_byline(article.href)
                    '''
                    if article.author is None :
                        article.author = self.massageNCXText(extract_byline(article.href))
                    else:
                        article.author = self.massageNCXText(article.author)
                    '''
                    if article.description is None :
                        article.description = extract_description(article.href)

    def strip_anchors(self, soup):
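        # Unwrap links: replace each <a> that does not wrap an image with its
        # plain text content, decoded from cp1252.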
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup
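
# A minimal sketch of how a recipe file like this is typically exercised from
# the command line while developing (assumes a calibre install; the output
# filename is arbitrary):
#
#   ebook-convert usatoday.recipe usatoday.epub --test
#
# --test fetches only a couple of articles from the first feeds, which keeps
# debug iterations on postprocess_html()/postprocess_book() fast.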