home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/env python
- __license__ = 'GPL v3'
- __copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>'
- __docformat__ = 'restructuredtext en'
-
- '''
- theage.com.au
- '''
- from calibre import strftime
- from calibre.web.feeds.news import BasicNewsRecipe
- from calibre.ebooks.BeautifulSoup import BeautifulSoup
- import re
-
class TheAge(BasicNewsRecipe):
    '''
    Recipe for The Age (theage.com.au).

    The article list is read from the site's text-only index page
    (``/text/``) and the cover is taken from the front-page PDF linked on
    the "today's paper" page.
    '''

    title = 'The Age'
    description = 'Business News, World News and Breaking News in Melbourne, Australia'
    publication_type = 'newspaper'
    __author__ = 'Matthew Briggs'
    language = 'en_AU'

    max_articles_per_feed = 1000
    recursions = 0

    # Tag specifications: tables, scripts and styles, plus the links pointing
    # at the home page ('/') and at the text-mode index ('/text/').
    remove_tags = [
        dict(name=['table', 'script', 'noscript', 'style']),
        dict(name='a', attrs={'href': '/'}),
        dict(name='a', attrs={'href': '/text/'}),
    ]

    # Prefix used to turn site-relative links into absolute URLs.
    _SITE_URL = 'http://www.theage.com.au'

    # Sections that are emitted first, in this order, when the index has them.
    _SECTION_ORDER = ('National', 'World', 'Opinion', 'Columns', 'Business',
                      'Sport', 'Entertainment')

    # Text type of the running interpreter (``unicode`` on Python 2, ``str``
    # on Python 3); used to tell text nodes apart from tags.
    _TEXT_TYPE = type(u'')

    # Link to the front-page PDF (matched from the start of the href).
    _FRONTPAGE_RE = re.compile(
        r'http://www\.theage\.com\.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage\.pdf'
    )

    # A paragraph holding nothing but "[ | | ]"-style navigation residue.
    # NOTE(review): the original pattern spelled the second alternative as an
    # escaped blank followed by ';', which looks like an '&nbsp;' entity that
    # was decoded somewhere along the way. The literal entity and the
    # blank / no-break-space + ';' forms are all accepted here - confirm
    # against the upstream recipe.
    _NAV_RESIDUE_RE = re.compile(
        r'(?:\s|&nbsp;|[ \xa0];)*\[[|\s*]*\](?:\s|&nbsp;|[ \xa0];)*$'
    )

    _COPYRIGHT_NOTICE = ('This material is subject to copyright and any '
                         'unauthorised use, copying or mirroring is prohibited.')

    def get_browser(self):
        '''
        Return the base recipe's browser with refresh handling switched off.
        '''
        # The base-class method is looked up on the class, so ``self`` has to
        # be passed explicitly; calling it with no argument fails when
        # get_browser is an instance method.
        br = BasicNewsRecipe.get_browser(self)
        br.set_handle_refresh(False)
        return br

    def parse_index(self):
        '''
        Build the feed list from the text-only index page.

        Every ``<h3>`` starts a new section and each following ``<a>`` with a
        usable ``href`` becomes an article of that section.

        :return: list of ``(section title, [article dict, ...])`` tuples; the
                 sections named in ``_SECTION_ORDER`` come first, every other
                 section found on the page follows.
        '''
        soup = BeautifulSoup(self.browser.open(self._SITE_URL + '/text/').read())

        # Same date stamp for every article of this run.
        today = strftime('%a, %d %b')

        section = None
        sections = {}

        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                section = self.tag_to_string(tag)
                sections[section] = []
                continue

            # Links seen before the first (non-empty) heading have no section.
            if not section:
                continue

            # ``get`` copes with anchors that carry no href at all.
            url = (tag.get('href') or '').strip()

            # Make sure to skip: <a href="/">TheAge</a>
            if len(url) <= 1:
                continue

            if url.startswith('/'):
                url = self._SITE_URL + url

            sections[section].append({
                'title': self.tag_to_string(tag),
                'url': url,
                'date': today,
                'description': '',
                'content': '',
            })

        feeds = []

        # Insert feeds in specified order, if available. Popping removes the
        # section so it is not added a second time below, and sections that
        # are missing from the page are simply skipped instead of raising
        # KeyError.
        for name in self._SECTION_ORDER:
            if name in sections:
                feeds.append((name, sections.pop(name)))

        # Append what is left over...
        for name in sections:
            feeds.append((name, sections[name]))

        return feeds

    def get_cover_url(self):
        '''
        Return the URL of the front-page PDF linked from the "today's paper"
        page, or ``None`` when no link matches.
        '''
        soup = BeautifulSoup(self.browser.open(self._SITE_URL + '/todays-paper').read())

        for anchor in soup.findAll('a'):
            # Anchors without an href (e.g. named anchors) must not abort the
            # search, hence ``get`` rather than item access.
            href = anchor.get('href')
            if href and self._FRONTPAGE_RE.match(href):
                return href

        return None

    def preprocess_html(self, soup):
        '''
        Tidy the paragraphs of a downloaded article.

        Paragraphs whose own text is only text-mode navigation residue are
        removed, and the copyright notice gets a small font.

        :param soup: parsed article; modified in place
        :return: the same ``soup`` object
        '''
        for p in soup.findAll('p'):

            # Collapse the paragraph by joining the non-tag contents
            text = ''.join(node for node in p.contents
                           if isinstance(node, self._TEXT_TYPE))
            if not text:
                continue

            if self._NAV_RESIDUE_RE.match(text):
                # Filter out what's left of the text-mode navigation stuff
                p.extract()
            elif text == self._COPYRIGHT_NOTICE:
                # Shrink the fine print font
                p['style'] = 'font-size:small'

        return soup
-