Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / wprost.recipe < prev next >

Wrap

Text File | 2011-09-09 | 4.0 KB | 94 lines

#!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2010, matek09, matek09@gmail.com' __copyright__ = 'Modified 2011, Mariusz Wolek <mariusz_dot_wolek @ gmail dot com>' from calibre.web.feeds.news import BasicNewsRecipe import re class Wprost(BasicNewsRecipe): EDITION = 0 FIND_LAST_FULL_ISSUE = True EXCLUDE_LOCKED = True ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif' title = u'Wprost' __author__ = 'matek09' description = 'Weekly magazine' encoding = 'ISO-8859-2' no_stylesheets = True language = 'pl' remove_javascript = True remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) '''keep_only_tags =[] keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))''' preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), (re.compile(r'display: block;'), lambda match: ''), (re.compile(r'\<td\>\<tr\>\<\/table\>'), lambda match: ''), (re.compile(r'\<table .*?\>'), lambda match: ''), (re.compile(r'\<tr>'), lambda match: ''), (re.compile(r'\<td .*?\>'), lambda match: '')] remove_tags =[] remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'})) extra_css = ''' .div-header {font-size: x-small; font-weight: bold} ''' #h2 {font-size: x-large; font-weight: bold} def is_blocked(self, a): if a.findNextSibling('img') is None: return False else: return True def find_last_issue(self): soup = self.index_to_soup('http://www.wprost.pl/archiwum/') a = 0 if self.FIND_LAST_FULL_ISSUE: ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED}) a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')}) else: a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')}) self.EDITION = a['href'].replace('/tygodnik/?I=', '') self.cover_url = a.img['src'] def parse_index(self): self.find_last_issue() soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION) feeds = [] for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}): articles = list(self.find_articles(main_block)) if len(articles) > 0: section = self.tag_to_string(main_block) feeds.append((section, articles)) return feeds def find_articles(self, main_block): for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}): if a.name in "td": break if self.EXCLUDE_LOCKED & self.is_blocked(a): continue yield { 'title' : self.tag_to_string(a), 'url' : 'http://www.wprost.pl' + a['href'], 'date' : '', 'description' : '' }