home *** CD-ROM | disk | FTP | other *** search
- from calibre.web.feeds.news import BasicNewsRecipe
- from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
-
-
class FokkeEnSukkeRecipe(BasicNewsRecipe):
    """Recipe that collects this week's Fokke en Sukke cartoons from foksuk.nl.

    ``parse_index`` reads the cartoon selector on the index page and builds one
    article per cartoon. ``preprocess_html`` reduces each fetched page to the
    cartoon ``div`` preceded by a title taken from an image's ``alt`` text.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 2

    title = u'Fokke en Sukke'
    publisher = u'Reid, Geleijnse & Van Tol'
    category = u'News, Cartoons'
    description = u'Popular Dutch daily cartoon Fokke en Sukke'

    conversion_options = {'comments': description, 'language': language, 'publisher': publisher}

    no_stylesheets = True
    extra_css = '''
        body{font-family: verdana, arial, helvetica, geneva, sans-serif ; margin: 0em; padding: 0em;}
        div.title {text-align: center; margin-bottom: 1em;}
        '''

    # Base URL; link hrefs found in the index are appended to it.
    INDEX = u'http://foksuk.nl'
    cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'

    keep_only_tags = [dict(name='div', attrs={'class': 'cartoon'})]

    def _make_article(self, title, link):
        """Return the article dict for the anchor ``link``, labelled ``title``."""
        return {'title': title, 'date': u'', 'url': self.INDEX + link['href'], 'description': ''}

    def parse_index(self):
        """Build the feed list from the cartoon selector on the index page.

        Returns a list holding a single ``[feed_title, articles]`` pair. The
        feed title is the content of the ``span.week`` element when present,
        otherwise the recipe title.

        Raises:
            ValueError: if the ``div.selectcartoon`` element is not found.
        """
        # A list with daynames as they _can_ appear in the index
        dayNames = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag']
        soup = self.index_to_soup(self.INDEX)

        # Find the links for the various cartoons for this week
        index = soup.find('div', attrs={'class': 'selectcartoon'})
        if index is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError('Could not find the cartoon index (div.selectcartoon) on ' + self.INDEX)

        links = index.findAll('a')
        maxIndex = len(links) - 1
        articles = []
        # The first link (position 0) is deliberately not examined, as in the
        # original loop. NOTE(review): presumably it is not a cartoon link -- confirm.
        for i in range(1, len(links)):
            # There can be more than one cartoon for a given day (currently either one or two).
            # If there's only one, there is just a link with the dayname.
            # If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>.
            # In that case we're interested in the last two.
            dayName = links[i].renderContents()
            if dayName not in dayNames:
                # Not a day link: a numbered link was already handled with its day.
                continue

            if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1'):
                articles.append(self._make_article(dayName + ' 1', links[i + 1]))
                # If there is a '1', there should be a '2' as well, but better safe than sorry
                if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2'):
                    articles.append(self._make_article(dayName + ' 2', links[i + 2]))
            else:
                # There is only one cartoon for this day. Add it to the list.
                articles.append(self._make_article(dayName, links[i]))

        # Might as well use the weeknumber as title; fall back to the recipe
        # title when the week label is missing rather than crashing.
        weekTag = index.find('span', attrs={'class': 'week'})
        if weekTag is not None:
            week = weekTag.renderContents()
        else:
            week = self.title

        return [[week, articles]]

    def preprocess_html(self, soup):
        """Return a fresh document containing only the titled cartoon.

        The title is the ``alt`` text of the first ``img`` in ``soup`` that has
        an ``alt`` attribute (empty when there is none). If ``soup`` has no
        ``div.cartoon``, it is returned unchanged.
        """
        cartoon = soup.find('div', attrs={'class': 'cartoon'})
        if cartoon is None:
            # Nothing to restructure; avoid calling insert() on None.
            return soup

        title = ''
        img = soup.find('img', attrs={'alt': True})
        if img is not None:
            title = img['alt']

        tag = Tag(soup, 'div', [('class', 'title')])
        tag.insert(0, title)
        cartoon.insert(0, tag)

        # We only want the cartoon, so throw out the index
        select = cartoon.find('div', attrs={'class': 'selectcartoon'})
        if select is not None:
            select.extract()

        freshSoup = self.getFreshSoup(soup)
        freshSoup.body.append(cartoon)

        return freshSoup

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML document that carries over ``oldSoup``'s title.

        The title text is copied only when ``oldSoup`` has a ``head`` with a
        ``title`` element; otherwise the new title stays empty.
        """
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        oldHead = oldSoup.head
        # Guard the head lookup too: a missing head would otherwise raise on .title
        if oldHead is not None and oldHead.title:
            freshSoup.head.title.append(self.tag_to_string(oldHead.title))
        return freshSoup
-
-
-