home *** CD-ROM | disk | FTP | other *** search
- import re
- import urllib2
- from calibre.web.feeds.news import BasicNewsRecipe
- from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
-
class Ebert(BasicNewsRecipe):
    """Calibre recipe that scrapes Roger Ebert's section pages.

    The site is not read through RSS: ``parse_index`` downloads each
    section page listed in ``feeds`` and extracts article links and blurbs
    with the ``pattern*`` regular expressions defined below.
    """

    title = 'Roger Ebert'
    __author__ = 'Shane Erstad'
    description = 'Roger Ebert Movie Reviews'
    publisher = 'Chicago Sun Times'
    category = 'movies'
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    masthead_url = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
    language = 'en'
    remove_empty_feeds = False

    # Site root prepended to the relative hrefs found on the section pages.
    PREFIX = 'http://rogerebert.suntimes.com'

    # Listing regexes, matched against the raw HTML of a section page.
    # patternReviews captures (movie title, headline markup, blurb); the
    # other three capture (article <a> element, blurb).
    patternReviews = r'<span class="*?movietitle"*?>(.*?)</span>.*?<div class="*?headline"*?>(.*?)</div>(.*?)</div>'
    patternCommentary = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?COMMENTARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
    patternPeople = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?PEOPLE.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
    patternGlossary = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?GLOSSARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'

    # Feed title -> (name of the attribute holding its listing regex,
    # whether that regex captures the movie title as an extra first group).
    # The regex is looked up by attribute name so that a subclass overriding
    # one of the pattern* attributes is still honoured.
    _FEED_LAYOUTS = {
        'Reviews': ('patternReviews', True),
        'Great Movies': ('patternReviews', True),
        'Commentary': ('patternCommentary', False),
        'People': ('patternPeople', False),
        'Glossary': ('patternGlossary', False),
    }

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    # (feed title, section page URL) pairs; the titles must match the keys
    # of _FEED_LAYOUTS for the page to be scraped.
    feeds = [
        (u'Reviews', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews'),
        (u'Commentary', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY'),
        (u'Great Movies', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08'),
        (u'People', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE'),
        (u'Glossary', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY'),
    ]

    # Drop the "This is a printer friendly..." banner, up to and including
    # the <hr> that follows it.
    preprocess_regexps = [
        (re.compile(r'<font.*?>.*?This is a printer friendly.*?</font>.*?<hr>', re.DOTALL|re.IGNORECASE),
         lambda m: ''),
    ]

    def print_version(self, url):
        """Return *url* with the ``printart`` template parameter appended."""
        return url + '&template=printart'

    def _fetch_page(self, url):
        """Download *url* and return its raw body, always closing the response."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def parse_index(self):
        """Scrape every configured section page into calibre's index format.

        Returns a list of ``(feed title, articles)`` tuples, one per entry
        returned by ``get_feeds()``. Each article is a dict with the keys
        ``title``, ``date`` (always empty), ``url`` and ``description``.

        A feed whose title has no entry in ``_FEED_LAYOUTS`` is logged and
        returned with an empty article list instead of being scraped with an
        unrelated (or undefined) pattern.
        """
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.log('\tFeedurl: ', feedurl)
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []

            layout = self._FEED_LAYOUTS.get(feedtitle)
            if layout is None:
                self.log('\tNo listing pattern for feed, skipping: ', feedtitle)
                totalfeeds.append((feedtitle, articles))
                continue

            pattern_attr, has_movie_title = layout
            regex = re.compile(getattr(self, pattern_attr), re.IGNORECASE|re.DOTALL)
            page = self._fetch_page(feedurl)

            for match in regex.finditer(page):
                if has_movie_title:
                    movietitle, thislink, description = match.group(1, 2, 3)
                else:
                    movietitle = None
                    thislink, description = match.group(1, 2)

                self.log(thislink)

                # The captured markup may hold several anchors; each one
                # becomes its own article sharing the same blurb.
                for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')):
                    thisurl = self.PREFIX + link['href']

                    if has_movie_title:
                        thistitle = movietitle
                    else:
                        thistitle = self.tag_to_string(link)
                    if not thistitle:
                        thistitle = 'Ebert Journal Post'

                    # No publication date is extracted from the listing.
                    articles.append({
                        'title': thistitle,
                        'date': '',
                        'url': thisurl,
                        'description': description,
                    })

            totalfeeds.append((feedtitle, articles))

        return totalfeeds
-
-