Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / scientific_american.recipe < prev next >

Wrap

Text File | 2011-09-09 | 3.7 KB | 92 lines

#!/usr/bin/env python
__license__ = 'GPL v3'

import re
from calibre.web.feeds.news import BasicNewsRecipe


class ScientificAmerican(BasicNewsRecipe):
    """Recipe that builds an issue of Scientific American from its index page.

    ``parse_index`` scrapes the magazine landing page for the feature
    articles and the per-department article lists; ``postprocess_html``
    flattens topic links inside each downloaded article.
    """

    title = u'Scientific American'
    description = u'Popular Science. Monthly magazine.'
    category = 'science'
    __author__ = 'Starson17'
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    publisher = 'Nature Publishing Group'
    remove_empty_feeds = True
    remove_javascript = True
    oldest_article = 30
    max_articles_per_feed = 100

    # Landing page of the current issue; scraped by parse_index().
    INDEX_URL = 'http://www.scientificamerican.com/sciammag/'
    # Feed title used for department articles that show up before any
    # department header has been seen (previously they got a None title).
    DEFAULT_DEPARTMENT = 'Departments'

    conversion_options = {
        'linearize_tables': True,
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [
        dict(name='h2', attrs={'class': 'articleTitle'}),
        dict(name='p', attrs={'id': 'articleDek'}),
        dict(name='p', attrs={'class': 'articleInfo'}),
        dict(name='div', attrs={'id': ['articleContent']}),
        dict(name='img', attrs={'src': re.compile(
            r'/media/inline/blog/Image/', re.DOTALL | re.IGNORECASE)}),
    ]

    remove_tags = [dict(name='a', attrs={'class': 'tinyCommentCount'})]

    def _make_article(self, link, description_tag):
        """Return an article dict for ``link`` described by ``description_tag``.

        ``description_tag`` may be None, in which case the description is
        the empty string.
        """
        description = ''
        if description_tag is not None:
            description = self.tag_to_string(description_tag)
        return {
            'url': link['href'],
            'title': self.tag_to_string(link),
            'date': '',
            'description': description,
        }

    def parse_index(self):
        """Scrape the issue index and return a list of ``(title, articles)``.

        The first feed is always ``'Features'``; it is followed by one feed
        per department that has at least one article. Each article is a
        dict with ``url``, ``title``, ``date`` and ``description`` keys.
        As side effects, ``self.timefmt`` is set from the issue tag line and
        ``self.cover_url`` from the cover image, when those are present.
        Missing page sections are skipped instead of raising.
        """
        soup = self.index_to_soup(self.INDEX_URL)

        # Only override the time format when the issue tag is on the page.
        issuetag = soup.find('p', attrs={'id': 'articleDek'})
        if issuetag is not None:
            self.timefmt = ' [%s]' % (self.tag_to_string(issuetag))

        img = soup.find('img', alt='Scientific American Magazine', src=True)
        if img is not None:
            self.cover_url = img['src']

        feeds = []

        # --- Feature articles (primary column) ---
        features = []
        primary = soup.find(attrs={'class': 'primaryCol'})
        if primary is not None:
            for a in primary.findAll('a', attrs={'title': 'Feature'}):
                if not a.get('href'):
                    # An anchor without a target cannot be downloaded.
                    continue
                dek = a.parent.parent.find(attrs={'class': 'dek'})
                features.append(self._make_article(a, dek))
        feeds.append(('Features', features))

        # --- Department articles (secondary column) ---
        # The <li> elements are a flat list: a department header entry is
        # followed by the articles that belong to it.
        department = []
        title = self.DEFAULT_DEPARTMENT
        secondary = soup.find(attrs={'class': 'secondaryCol'})
        items = secondary.findAll('li') if secondary is not None else []
        for li in items:
            header = li.a
            if header is not None and 'department.cfm' in header.get('href', ''):
                # New department starts: flush the one collected so far.
                if department:
                    feeds.append((title, department))
                title = self.tag_to_string(header)
                department = []

            heading = li.h3
            story = heading.a if heading is not None else None
            if story is not None and 'article.cfm' in story.get('href', ''):
                department.append(self._make_article(story, li.p))

        if department:
            feeds.append((title, department))
        return feeds

    def postprocess_html(self, soup, first_fetch):
        """Replace links to ``topic.cfm`` pages with their plain text.

        Anchors without an ``href`` are ignored, and anchors that wrap
        nested markup are replaced by their flattened text (``item.string``
        would be None for those).
        """
        for item in soup.findAll('a', href=True):
            if 'topic.cfm' in item['href']:
                item.replaceWith(self.tag_to_string(item))
        return soup

    extra_css = '''
                p{font-weight: normal; font-size:small}
                li{font-weight: normal; font-size:small}
                .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
                h2{font-size:large; font-family:Arial,Helvetica,sans-serif;}
                h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
                '''