# Newsweek Polska — calibre news-download recipe
- # -*- coding: utf-8 -*-
- #!/usr/bin/env python
-
- __license__ = 'GPL v3'
- __copyright__ = '2010, matek09, matek09@gmail.com'
-
- from calibre.web.feeds.news import BasicNewsRecipe
- from calibre.ptempfile import PersistentTemporaryFile
- import datetime
-
-
- class Newsweek(BasicNewsRecipe):
- EDITION = '0'
- DATE = None
- YEAR = datetime.datetime.now().year
-
- title = u'Newsweek Polska'
- __author__ = 'matek09'
- description = 'Weekly magazine'
- encoding = 'utf-8'
- language = 'pl'
- remove_javascript = True
-
- temp_files = []
- articles_are_obfuscated = True
-
-
- def get_obfuscated_article(self, url):
- br = self.get_browser()
- br.open(url)
- source = br.response().read()
- page = self.index_to_soup(source)
-
- main_section = page.find(id='mainSection')
-
- title = main_section.find('h1')
- info = main_section.find('ul', attrs={'class' : 'articleInfo'})
- authors = info.find('li').find('h4')
- article = main_section.find('div', attrs={'id' : 'article'})
- html = unicode(title) + unicode(authors) + unicode(article)
- next = main_section.find('li', attrs={'class' : 'next'})
-
- while next:
- url = next.find('a')['href']
- br.open(url)
- source = br.response().read()
- page = self.index_to_soup(source)
- main_section = page.find(id='mainSection')
- article = main_section.find('div', attrs={'id' : 'article'})
- aside = article.find(id='articleAside')
- if aside is not None:
- aside.extract()
- html = html + unicode(article)
- next = main_section.find('li', attrs={'class' : 'next'})
-
-
- self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
- self.temp_files[-1].write(html)
- self.temp_files[-1].close()
- return self.temp_files[-1].name
-
- def is_full(self, issue_soup):
- while True:
- main_section = issue_soup.find(id='mainSection')
- next = main_section.find('li', attrs={'class' : 'next'})
- if len(main_section.findAll(attrs={'class' : 'locked'})) > 1:
- return False
- elif next is None:
- return True
- else:
- issue_soup = self.index_to_soup(next.find('a')['href'])
-
- def find_last_full_issue(self, archive_url):
- archive_soup = self.index_to_soup(archive_url)
- select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
- for option in select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value')):
- self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/','')
- issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
- if self.is_full(issue_soup):
- return
-
- self.YEAR = self.YEAR - 1
- self.find_last_full_issue(archive_url + ',' + str(self.YEAR))
-
- def parse_index(self):
- archive_url = 'http://www.newsweek.pl/wydania/archiwum'
- self.find_last_full_issue(archive_url)
- soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
- self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
- main_section = soup.find(id='mainSection')
- img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
- self.cover_url = img['src']
- feeds = []
- articles = {}
- sections = []
- while True:
- news_list = main_section.find('ul', attrs={'class' : 'newsList'})
- for h2 in news_list.findAll('h2'):
-
- article = self.create_article(h2)
- category_div = h2.findNext('div', attrs={'class' : 'kategorie'})
- section = self.tag_to_string(category_div)
- if articles.has_key(section):
- articles[section].append(article)
- else:
- articles[section] = [article]
- sections.append(section)
-
- next = main_section.find('li', attrs={'class' : 'next'})
- if next is None:
- break
- soup = self.index_to_soup(next.find('a')['href'])
- main_section = soup.find(id='mainSection')
-
- for section in sections:
- feeds.append((section, articles[section]))
- return feeds
-
- def create_article(self, h2):
- article = {}
- a = h2.find('a')
- article['title'] = self.tag_to_string(a)
- article['url'] = a['href']
- article['date'] = self.DATE
- desc = h2.findNext('p')
-
- if desc is not None:
- article['description'] = self.tag_to_string(desc)
- else:
- article['description'] = ''
- return article
-
-
-
-
-