#!/usr/bin/env python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
sciam.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe

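# Download the current issue of Scientific American: the table of contents
# is scraped from the magazine page and each listed article is fetched.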
class ScientificAmerican(BasicNewsRecipe):
    title = u'Scientific American'
    description = u'Popular science. Monthly magazine.'
    __author__ = 'Kovid Goyal and Sujata Raman'
    language = 'en'
    remove_javascript = True
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    extra_css = '''
                p{font-weight: normal; font-size:small}
                li{font-weight: normal; font-size:small}
                .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
                h2{font-size:x-small;}
                h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
                '''
    remove_tags_before = dict(name='div', attrs={'class':'headline'})
    remove_tags_after  = dict(id=['article'])
    remove_tags        = [
                          dict(id=['sharetools', 'reddit']),
                          #dict(name='script'),
                          {'class':['float_left', 'atools']},
                          {'class': re.compile(r'also-in-this')},
                          dict(name='a', title=['Get the Rest of the Article', 'Subscribe', 'Buy this Issue']),
                          dict(name='img', alt=['Graphic - Get the Rest of the Article']),
                          dict(name='div', attrs={'class':['commentbox']}),
                          dict(name='h2', attrs={'class':['discuss_h2']}),
                         ]

    html2lrf_options = ['--base-font-size', '8']
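    # Articles can run to multiple pages: recurse one level into links, but
    # only follow URLs that look like continuation pages (page=2 and up).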
    recursions = 1
    match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14|15)']

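    # Build the issue contents by scraping the magazine's table-of-contents
    # page instead of relying on an RSS feed.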
    def parse_index(self):
        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')
        monthtag = soup.find('div', attrs={'id':'magazine-main_col2'})
        # tag_to_string already returns plain text, so no second conversion
        # is needed when building the issue date stamp.
        month = self.tag_to_string(monthtag.contents[1])
        self.timefmt = ' [%s]' % month
        img = soup.find('img', alt='Scientific American Magazine', src=True)
        if img is not None:
            self.cover_url = img['src']
        features, feeds = [], []
        # The right-hand column of the TOC page lists the feature articles.
        for p in soup.find(id='magazine-main_col2').findAll('p'):
            a = p.find('a', href=True)
            if a is None:
                continue
            desc = ''
            s = p.find('span', attrs={'class':'sub'})
            if s is not None:
                desc = self.tag_to_string(s)
            article = {
                    'url' : a['href'],
                    'title' : self.tag_to_string(a),
                    'date' : '',
                    'description' : desc,
                    }
            features.append(article)
        feeds.append(('Features', features))

        # The left-hand column lists the remaining departments: each div
        # starts a new section and the <a> tags that follow belong to it.
        section = []
        title = None
        for x in soup.find(id='magazine-main_col1').findAll(['div', 'a']):
            if x.name == 'div':
                # A new section heading: flush the previous section first.
                if section:
                    feeds.append((title, section))
                title = self.tag_to_string(x)
                section = []
            else:
                if 'article.cfm' in x['href']:
                    article = {
                            'url' : x['href'],
                            'title' : self.tag_to_string(x),
                            'date': '',
                            'description': '',
                        }
                    section.append(article)

        # Don't forget the final section.
        if section:
            feeds.append((title, section))

        return feeds
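    # Runs on each downloaded page: strip the pagination links everywhere,
    # and drop the repeated headline block from continuation pages so that
    # multi-page articles stitch together cleanly.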
    def postprocess_html(self, soup, first_fetch):
        if soup is not None:
            for span in soup.findAll('span', attrs={'class':'pagination'}):
                span.extract()
            if not first_fetch:
                div = soup.find('div', attrs={'class':'headline'})
                if div:
                    div.extract()
        return soup

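    # Applied to the raw HTML before parsing: remove subscription prompts
    # and the stray inline JavaScript the site embeds in article bodies.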
    preprocess_regexps = [
        (re.compile(r'Already a Digital subscriber.*Now</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'If your institution has site license access, enter.*here</a>.', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'to subscribe to our.*;.*\}', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'\)\(jQuery\);.*-->', re.DOTALL|re.IGNORECASE), lambda match: ''),
        ]
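
# A quick way to test a recipe like this from the command line (assuming the
# calibre CLI tools are installed; the filename below is hypothetical):
#
#   ebook-convert scientific_american.recipe .epub --test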