home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/env python
- __license__ = 'GPL v3'
-
- import re
- from calibre.web.feeds.news import BasicNewsRecipe
-
class ScientificAmerican(BasicNewsRecipe):
    """Calibre news recipe for the monthly Scientific American magazine.

    ``parse_index`` scrapes the magazine index page for the "Features"
    articles and for the per-department article lists;
    ``postprocess_html`` replaces links to topic pages in downloaded
    articles with their plain text.
    """

    title = u'Scientific American'
    description = u'Popular Science. Monthly magazine.'
    category = 'science'
    __author__ = 'Starson17'
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    publisher = 'Nature Publishing Group'
    remove_empty_feeds = True
    remove_javascript = True
    oldest_article = 30
    max_articles_per_feed = 100

    # Conversion metadata; reuses the class attributes defined above so the
    # values cannot drift apart.
    conversion_options = {
        'linearize_tables': True,
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Article-page elements selected by tag name and attributes: headline,
    # dek, byline/info paragraph, the article body and inline blog images.
    keep_only_tags = [
        dict(name='h2', attrs={'class': 'articleTitle'}),
        dict(name='p', attrs={'id': 'articleDek'}),
        dict(name='p', attrs={'class': 'articleInfo'}),
        dict(name='div', attrs={'id': ['articleContent']}),
        dict(name='img', attrs={'src': re.compile(r'/media/inline/blog/Image/', re.DOTALL|re.IGNORECASE)}),
    ]

    remove_tags = [dict(name='a', attrs={'class': 'tinyCommentCount'})]

    def parse_index(self):
        """Scrape the magazine index page and return the feeds to download.

        Side effects: sets ``self.timefmt`` from the text of the
        ``p#articleDek`` element when that element exists, and sets
        ``self.cover_url`` when the cover image is found.

        Returns:
            A list of ``(feed_title, articles)`` tuples.  Each article is a
            dict with the keys ``url``, ``title``, ``date`` and
            ``description``.  The first tuple is always the ``'Features'``
            feed (possibly with no articles); it is followed by one tuple
            per non-empty department found in the secondary column.
        """
        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')

        issuetag = soup.find('p', attrs={'id': 'articleDek'})
        if issuetag is not None:
            # Without the issue tag, keep whatever timefmt was already set
            # rather than producing an empty ' []' suffix.
            self.timefmt = ' [%s]' % (self.tag_to_string(issuetag))

        img = soup.find('img', alt='Scientific American Magazine', src=True)
        if img is not None:
            self.cover_url = img['src']

        feeds = [('Features', self._feature_articles(soup))]
        feeds.extend(self._department_feeds(soup))
        return feeds

    def _feature_articles(self, soup):
        """Return article dicts for the 'Feature' links in the primary column."""
        primary = soup.find(attrs={'class': 'primaryCol'})
        if primary is None:
            # Page layout changed or the column is missing: no features.
            return []

        features = []
        # href=True skips anchors that carry the title but no target.
        for a in primary.findAll('a', attrs={'title': 'Feature'}, href=True):
            # The description ("dek") lives two levels above the link.
            dek = a.parent.parent.find(attrs={'class': 'dek'})
            features.append({
                'url': a['href'],
                'title': self.tag_to_string(a),
                'date': '',
                'description': self.tag_to_string(dek) if dek is not None else '',
            })
        return features

    def _department_feeds(self, soup):
        """Return ``(department_title, articles)`` tuples from the secondary column.

        The ``<li>`` items are walked in document order: an item whose first
        link points at ``department.cfm`` starts a new department, and an
        item whose ``<h3>`` link points at ``article.cfm`` adds an article
        to the current one.  Departments without articles are not emitted.
        """
        secondary = soup.find(attrs={'class': 'secondaryCol'})
        if secondary is None:
            return []

        feeds = []
        title = None
        department = []
        for li in secondary.findAll('li'):
            first_link = li.a
            first_href = ''
            if first_link is not None:
                first_href = first_link.get('href') or ''
            if 'department.cfm' in first_href:
                # A new department header: flush the one collected so far.
                if department:
                    feeds.append((title, department))
                title = self.tag_to_string(first_link)
                department = []

            # Not every <li> has an <h3> headline; skip those that do not
            # instead of failing on the missing element.
            heading = li.h3
            headline = heading.a if heading is not None else None
            headline_href = ''
            if headline is not None:
                headline_href = headline.get('href') or ''
            if 'article.cfm' in headline_href:
                summary = li.p
                department.append({
                    'url': headline_href,
                    'title': self.tag_to_string(headline),
                    'date': '',
                    'description': self.tag_to_string(summary) if summary is not None else '',
                })

        # Flush the final department, which has no following header.
        if department:
            feeds.append((title, department))
        return feeds

    def postprocess_html(self, soup, first_fetch):
        """Replace links to ``topic.cfm`` pages with their plain text.

        Args:
            soup: Parsed HTML of a downloaded page; modified in place.
            first_fetch: Unused here; accepted to match the hook signature.

        Returns:
            The same ``soup`` object.
        """
        # href=True avoids a KeyError on anchors that have no href attribute.
        for item in soup.findAll('a', href=True):
            if 'topic.cfm' in item['href']:
                text = item.string
                if text is None:
                    # The anchor wraps nested markup, so .string is None;
                    # fall back to the flattened text of the whole anchor.
                    text = self.tag_to_string(item)
                item.replaceWith(text)
        return soup

    extra_css = '''
        p{font-weight: normal; font-size:small}
        li{font-weight: normal; font-size:small}
        .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
        h2{font-size:large; font-family:Arial,Helvetica,sans-serif;}
        h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
    '''
-