- #!/usr/bin/env python
-
- __license__ = 'GPL v3'
- __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
- '''
- usatoday.com
- '''
-
- from calibre.web.feeds.news import BasicNewsRecipe
- from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
- import re
-
- class USAToday(BasicNewsRecipe):
-
- title = 'USA Today'
- __author__ = 'GRiker'
- oldest_article = 1
- timefmt = ''
- max_articles_per_feed = 20
- language = 'en'
- no_stylesheets = True
- extra_css = '.headline {text-align: left;}\n \
- .byline {font-family: monospace; \
- text-align: left; \
- margin-bottom: 1em;}\n \
- .image {text-align: center;}\n \
- .caption {text-align: center; \
- font-size: smaller; \
- font-style: italic}\n \
- .credit {text-align: right; \
- margin-bottom: 0em; \
- font-size: smaller;}\n \
- .articleBody {text-align: left;}\n '
- conversion_options = { 'linearize_tables' : True }
- #simultaneous_downloads = 1
- feeds = [
- ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
- ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
- ('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
- ('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'),
- ('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
- ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
- ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
- ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
- ('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
- ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
- ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
- ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
- ]
- keep_only_tags = [dict(attrs={'class':[
- 'byLine',
- 'inside-copy',
- 'inside-head',
- 'inside-head2',
- 'item',
- 'item-block',
- 'photo-container',
- ]}),
- dict(id=[
- 'applyMainStoryPhoto',
- 'permalink',
- ])]
-
- remove_tags = [dict(attrs={'class':[
- 'comments',
- 'jump',
- 'pagetools',
- 'post-attributes',
- 'tags',
- ]}),
- dict(id=[])]
-
- #feeds = [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')]
-
- def dump_hex(self, src, length=16):
- ''' Diagnostic '''
- FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
- N=0; result=''
- while src:
- s,src = src[:length],src[length:]
- hexa = ' '.join(["%02X"%ord(x) for x in s])
- s = s.translate(FILTER)
- result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
- N+=length
- print result
-
- def fixChars(self,string):
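- ''' Replace Windows-1252 punctuation bytes (\x91-\x97) with HTML entities '''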
- # Replace lsquo (\x91)
- fixed = re.sub("\x91","‘",string)
-
- # Replace rsquo (\x92)
- fixed = re.sub("\x92","’",fixed)
-
- # Replace ldquo (\x93)
- fixed = re.sub("\x93","“",fixed)
-
- # Replace rdquo (\x94)
- fixed = re.sub("\x94","”",fixed)
-
- # Replace ndash (\x96)
- fixed = re.sub("\x96","–",fixed)
-
- # Replace mdash (\x97)
- fixed = re.sub("\x97","—",fixed)
-
- return fixed
-
- def get_masthead_url(self):
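- ''' Return the masthead URL, or None if the image is unreachable '''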
- masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
- br = BasicNewsRecipe.get_browser(self)
- try:
- br.open(masthead)
- except:
- self.log("\nCover unavailable")
- masthead = None
- return masthead
-
- def massageNCXText(self, description):
- # Kindle TOC descriptions won't render certain characters
- if description:
- massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
- # Replace '&' with '&'
- massaged = re.sub("&","&", massaged)
- return self.fixChars(massaged)
- else:
- return description
-
- def parse_feeds(self, *args, **kwargs):
- parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
- # Count articles for progress dialog
- article_count = 0
- for feed in parsed_feeds:
- article_count += len(feed)
- self.log( "Queued %d articles" % article_count)
- return parsed_feeds
-
- def preprocess_html(self, soup):
- soup = self.strip_anchors(soup)
- return soup
-
- def postprocess_html(self, soup, first_fetch):
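- ''' Clean up each article page: remove nav links, promote the headline to <h2>, and move the lead image with its credit and caption to the top of the body '''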
-
- # Remove navLinks <div class="inside-copy" style="padding-bottom:3px">
- navLinks = soup.find(True,{'style':'padding-bottom:3px'})
- if navLinks:
- navLinks.extract()
-
- # Remove <div class="inside-copy" style="margin-bottom:10px">
- gibberish = soup.find(True,{'style':'margin-bottom:10px'})
- if gibberish:
- gibberish.extract()
-
- # Change <inside-head> to <h2>
- headline = soup.find(True, {'class':['inside-head','inside-head2']})
- if not headline:
- headline = soup.find('h3')
- if headline:
- tag = Tag(soup, "h2")
- tag['class'] = "headline"
- tag.insert(0, headline.contents[0])
- headline.replaceWith(tag)
- else:
- print "unable to find headline:\n%s\n" % soup
-
- # Change byLine to byline, change commas to middot
- # Kindle renders commas in byline as '&#44;'
- byline = soup.find(True, {'class':'byLine'})
- if byline:
- byline['class'] = 'byline'
- # Replace comma with middot
- byline.contents[0].replaceWith(re.sub(","," &middot;", byline.renderContents()))
-
- jumpout_punc_list = [':','?']
- # Remove the inline jumpouts in <div class="inside-copy">
- paras = soup.findAll(True, {'class':'inside-copy'})
- for para in paras:
- if re.match("<b>[\w\W]+ ",para.renderContents()):
- p = para.find('b')
- for punc in jumpout_punc_list:
- punc_offset = p.contents[0].find(punc)
- if punc_offset == -1:
- continue
- if punc_offset > 1:
- if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
- #print "extracting \n%s\n" % para.prettify()
- para.extract()
-
- # Reset class for remaining
- paras = soup.findAll(True, {'class':'inside-copy'})
- for para in paras:
- para['class'] = 'articleBody'
-
- # Remove inline jumpouts in <p>
- paras = soup.findAll(['p'])
- for p in paras:
- if hasattr(p,'contents') and len(p.contents):
- for punc in jumpout_punc_list:
- punc_offset = p.contents[0].find(punc)
- if punc_offset == -1:
- continue
- if punc_offset > 2 and hasattr(p,'a') and len(p.contents):
- #print "evaluating %s\n" % p.contents[0][:punc_offset+1]
- if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
- #print "extracting \n%s\n" % p.prettify()
- p.extract()
-
- # Capture the first img, insert after headline
- imgs = soup.findAll('img')
- print "postprocess_html(): %d images" % len(imgs)
- if imgs:
- divTag = Tag(soup, 'div')
- divTag['class'] = 'image'
- body = soup.find('body')
- img = imgs[0]
- #print "img: \n%s\n" % img.prettify()
-
- # Table for photo and credit
- tableTag = Tag(soup,'table')
-
- # Photo
- trimgTag = Tag(soup, 'tr')
- tdimgTag = Tag(soup, 'td')
- tdimgTag.insert(0,img)
- trimgTag.insert(0,tdimgTag)
- tableTag.insert(0,trimgTag)
-
- # Credit
- trcreditTag = Tag(soup, 'tr')
-
- tdcreditTag = Tag(soup, 'td')
- tdcreditTag['class'] = 'credit'
- credit = soup.find('td',{'class':'photoCredit'})
- if credit:
- tdcreditTag.insert(0,NavigableString(credit.renderContents()))
- else:
- credit = img['credit']
- if credit:
- tdcreditTag.insert(0,NavigableString(credit))
- else:
- tdcreditTag.insert(0,NavigableString(''))
-
- trcreditTag.insert(0,tdcreditTag)
- tableTag.insert(1,trcreditTag)
- dtc = 0
- divTag.insert(dtc,tableTag)
- dtc += 1
-
- if False:
- # Add the caption in the table
- tableCaptionTag = Tag(soup,'caption')
- tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents())
- tableTag.insert(1,tableCaptionTag)
- divTag.insert(dtc,tableTag)
- dtc += 1
- body.insert(1,divTag)
- else:
- # Add the caption below the table
- #print "Looking for caption in this soup:\n%s" % img.prettify()
- captionTag = Tag(soup,'p')
- captionTag['class'] = 'caption'
- if hasattr(img,'alt') and img['alt']:
- captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['alt']))
- divTag.insert(dtc, captionTag)
- dtc += 1
- else:
- try:
- captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['cutline']))
- divTag.insert(dtc, captionTag)
- dtc += 1
- except:
- pass
-
- hrTag = Tag(soup, 'hr')
- divTag.insert(dtc, hrTag)
- dtc += 1
-
- # Delete <div id="applyMainStoryPhoto"
- photoJunk = soup.find('div',{'id':'applyMainStoryPhoto'})
- if photoJunk:
- photoJunk.extract()
-
- # Insert img after headline
- tag = body.find(True)
- insertLoc = 0
- headline_found = False
- while True:
- # Scan the top-level tags
- insertLoc += 1
- if hasattr(tag,'class') and tag['class'] == 'headline':
- headline_found = True
- body.insert(insertLoc,divTag)
- break
- tag = tag.nextSibling
- if not tag:
- break
-
- if not headline_found:
- # Monolithic <div> - restructure
- tag = body.find(True)
- while True:
- insertLoc += 1
- try:
- if hasattr(tag,'class') and tag['class'] == 'headline':
- headline_found = True
- tag.insert(insertLoc,divTag)
- break
- except:
- pass
- tag = tag.next
- if not tag:
- break
-
- # Yank out headline, img and caption
- headline = body.find('h2','headline')
- img = body.find('div','image')
- caption = body.find('p', 'caption')
-
- # body(0) is calibre_navbar
- # body(1) is <div class="item">
-
- btc = 1
- headline.extract()
- body.insert(1, headline)
- btc += 1
- if img:
- img.extract()
- body.insert(btc, img)
- btc += 1
- if caption:
- caption.extract()
- body.insert(btc, caption)
- btc += 1
-
- if len(imgs) > 1:
- if True:
- [img.extract() for img in imgs[1:]]
- else:
- # Format the remaining images
- # This doesn't work yet
- for img in imgs[1:]:
- print "img:\n%s\n" % img.prettify()
- divTag = Tag(soup, 'div')
- divTag['class'] = 'image'
-
- # Table for photo and credit
- tableTag = Tag(soup,'table')
-
- # Photo
- trimgTag = Tag(soup, 'tr')
- tdimgTag = Tag(soup, 'td')
- tdimgTag.insert(0,img)
- trimgTag.insert(0,tdimgTag)
- tableTag.insert(0,trimgTag)
-
- # Credit
- trcreditTag = Tag(soup, 'tr')
-
- tdcreditTag = Tag(soup, 'td')
- tdcreditTag['class'] = 'credit'
- try:
- tdcreditTag.insert(0,NavigableString(img['credit']))
- except:
- tdcreditTag.insert(0,NavigableString(''))
- trcreditTag.insert(0,tdcreditTag)
- tableTag.insert(1,trcreditTag)
- divTag.insert(0,tableTag)
- soup.img.replaceWith(divTag)
-
- return soup
-
- def postprocess_book(self, oeb, opts, log) :
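- ''' Fill in missing author and description metadata for TOC entries '''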
-
- def extract_byline(href) :
- # <meta name="byline" content=
- soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
- byline = soup.find('div',attrs={'class':'byline'})
- if byline:
- byline['class'] = 'byline'
- # Replace comma with middot
- byline.contents[0].replaceWith(re.sub(u",", u" &middot;",
- byline.renderContents(encoding=None)))
- return byline.renderContents(encoding=None)
- else :
- paras = soup.findAll(text=True)
- for para in paras:
- if para.startswith("Copyright"):
- return para[len('Copyright xxxx '):para.find('.')]
- return None
-
- def extract_description(href) :
- soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
- description = soup.find('meta',attrs={'name':'description'})
- if description :
- return self.massageNCXText(description['content'])
- else:
- # Take first paragraph of article
- articleBody = soup.find('div',attrs={'id':['articleBody','item']})
- if articleBody:
- paras = articleBody.findAll('p')
- for p in paras:
- if p.renderContents() > '' :
- return self.massageNCXText(self.tag_to_string(p,use_alt=False))
- else:
- print "Didn't find <div id='articleBody'> in this soup:\n%s" % soup.prettify()
- return None
-
- # Method entry point here
- # Single section toc looks different than multi-section tocs
- if oeb.toc.depth() == 2 :
- for article in oeb.toc :
- if article.author is None :
- article.author = extract_byline(article.href)
- if article.description is None :
- article.description = extract_description(article.href)
- elif oeb.toc.depth() == 3 :
- for section in oeb.toc :
- for article in section :
- article.author = extract_byline(article.href)
- '''
- if article.author is None :
- article.author = self.massageNCXText(extract_byline(article.href))
- else:
- article.author = self.massageNCXText(article.author)
- '''
- if article.description is None :
- article.description = extract_description(article.href)
-
- def strip_anchors(self,soup):
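- ''' Flatten <a> tags that do not wrap an image into plain text '''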
- paras = soup.findAll(True)
- for para in paras:
- aTags = para.findAll('a')
- for a in aTags:
- if a.img is None:
- a.replaceWith(a.renderContents().decode('cp1252','replace'))
- return soup
-