home *** CD-ROM | disk | FTP | other *** search
- from calibre.web.feeds.recipes import BasicNewsRecipe
-
class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    """Calibre news recipe for the Toronto edition of Singtao Daily.

    Articles are collected by scraping each section page of
    news.singtao.ca (see ``parse_index``) and are then stripped of inline
    ``style``/``width`` attributes before conversion.
    """

    title = u'Singtao Daily - Canada'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'rty'
    description = 'Toronto Canada Chinese Newspaper'
    publisher = 'news.singtao.ca'
    category = 'Chinese, News, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://news.singtao.ca/i/site_2009/logo.jpg'
    # The trailing backslashes are line continuations inside the string
    # literal; they are kept so the resulting CSS text is unchanged apart
    # from insignificant leading whitespace.
    extra_css = '''
        @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\

        body {text-align: justify; margin-right: 8pt; font-family: 'DroidFont', serif;}\

        h1 {font-family: 'DroidFont', serif;}\

        .articledescription {font-family: 'DroidFont', serif;}
        '''
    keep_only_tags = [
        dict(name='div', attrs={'id': ['title', 'storybody']}),
        dict(name='div', attrs={'class': 'content'}),
    ]

    def parse_index(self):
        """Build the list of feeds, one per site section.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is the list returned by ``parse_section``. Sections
        for which no articles were found are left out.
        """
        # Section titles are written as unicode-escaped literals instead of
        # UTF-8 byte strings followed by ``.decode('utf-8')``: ``str`` has
        # no ``decode`` method on Python 3, while these literals produce the
        # same text on both Python 2 and Python 3.3+.
        sections = (
            ('Editorial',
             'http://news.singtao.ca/toronto/editorial.html'),
            (u'Toronto \u57ce\u5e02/\u793e\u5340',
             'http://news.singtao.ca/toronto/city.html'),
            (u'Canada \u52a0\u570b',
             'http://news.singtao.ca/toronto/canada.html'),
            ('Entertainment',
             'http://news.singtao.ca/toronto/entertainment.html'),
            ('World',
             'http://news.singtao.ca/toronto/world.html'),
            (u'Finance \u570b\u969b\u8ca1\u7d93',
             'http://news.singtao.ca/toronto/finance.html'),
            ('Sports',
             'http://news.singtao.ca/toronto/sports.html'),
        )
        feeds = []
        for title, url in sections:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        """Collect the articles linked from one section page.

        ``url`` is the address of the section index page. Returns a list of
        dicts with the keys ``'title'``, ``'url'`` and ``'description'``
        (the description is always empty). An empty list is returned when
        the page does not contain the expected article-list container.
        """
        soup = self.index_to_soup(url)
        container = soup.find(
            attrs={'class': ['newslist paddingL10T10',
                             'newslist3 paddingL10T10']})
        if container is None:
            # Without this guard a changed or empty section page would raise
            # AttributeError below and abort the whole download.
            self.log('No article list found at', url)
            return []

        current_articles = []
        for li in container.findAll('li'):
            a = li.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            href = a.get('href', False)
            if not href or not title:
                continue
            # Site-relative links are made absolute against the site root.
            if href.startswith('/'):
                href = 'http://news.singtao.ca' + href
            current_articles.append(
                {'title': title, 'url': href, 'description': ''})

        return current_articles

    def preprocess_html(self, soup):
        """Remove inline ``style`` and ``width`` attributes from all tags.

        The soup is modified in place and also returned.
        """
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(width=True):
            del item['width']
        return soup
-