home *** CD-ROM | disk | FTP | other *** search
- # -*- coding: utf-8 -*-
- from calibre.web.feeds.news import BasicNewsRecipe
-
class TheForce(BasicNewsRecipe):
    """calibre news recipe for the TheForce.net news feed.

    Articles come from the single RDF feed listed in ``feeds``. The class
    attributes select which part of each page is kept, and
    ``preprocess_html`` trims trailing boilerplate and rewrites linked
    images.
    """

    title = u'The Force'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'

    remove_stylesheets = True
    conversion_options = {'linearize_tables': True}
    remove_tags_after = dict(name='div', attrs={'class': 'KonaBody'})
    keep_only_tags = dict(
        name='td',
        attrs={'background': '/images/span/tile_story_bgtile.gif'}
    )
    remove_tags = [
        dict(name='iframe'),
    ]

    feeds = [
        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]

    @staticmethod
    def _drop_tag_and_following(tag):
        """Detach *tag* and every element returned by ``tag.findAllNext()``."""
        for later in tag.findAllNext():
            later.extract()
        tag.extract()

    def preprocess_html(self, soup):
        """Clean up an article's parsed HTML and return the same soup.

        Three passes are made over ``soup``, in this order:

        1. The first ``<i>`` element whose text contains the Star Wars
           Insider Facebook reminder is removed together with everything
           that follows it.
        2. If an element with class ``articleoption`` exists and has a
           ``<table>`` ancestor, that table and everything after it are
           removed.
        3. Every ``<img src=...>`` nested in an ``<a href=...>`` is
           replaced by turning the anchor itself into an ``<img>`` whose
           ``src`` is the text after the first ``=`` in the last
           ``?``-separated part of the href. Anchors whose href yields no
           such text are left alone.
           NOTE(review): presumably that query value is the URL of the
           full-size image -- confirm against the live site.

        :param soup: parsed article document; it is modified in place.
        :return: the modified ``soup``.
        """
        marker = 'Remember to join the Star Wars Insider Facebook'
        for tag in soup.findAll(name='i'):
            if marker in self.tag_to_string(tag):
                self._drop_tag_and_following(tag)
                # Only the first matching <i> is processed.
                break

        option = soup.find(attrs={'class': 'articleoption'})
        if option is not None:
            table = option.findParent('table')
            if table is not None:
                self._drop_tag_and_following(table)

        for img in soup.findAll('img', src=True):
            a = img.findParent('a', href=True)
            if a is None:
                continue
            url = a.get('href').split('?')[-1].partition('=')[-1]
            if url:
                # Drop the original image and reuse the anchor element as
                # the new <img>. The detached image is not touched again.
                img.extract()
                a.name = 'img'
                a['src'] = url
                del a['href']
        return soup
-