#!/usr/bin/env python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
sciam.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe

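# Download the current issue of Scientific American: the table of contents
# is scraped from the magazine page and each listed article is fetched.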
class ScientificAmerican(BasicNewsRecipe):
    title = u'Scientific American'
    description = u'Popular science. Monthly magazine.'
    __author__ = 'Kovid Goyal and Sujata Raman'
    language = 'en'
    remove_javascript = True
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    extra_css = '''
                p{font-weight: normal; font-size:small}
                li{font-weight: normal; font-size:small}
                .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
                h2{font-size:x-small;}
                h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
                '''
    remove_tags_before = dict(name='div', attrs={'class':'headline'})
    remove_tags_after  = dict(id=['article'])
    remove_tags        = [
                          dict(id=['sharetools', 'reddit']),
                          #dict(name='script'),
                          {'class':['float_left', 'atools']},
                          {'class': re.compile(r'also-in-this')},
                          dict(name='a', title=['Get the Rest of the Article', 'Subscribe', 'Buy this Issue']),
                          dict(name='img', alt=['Graphic - Get the Rest of the Article']),
                          dict(name='div', attrs={'class':['commentbox']}),
                          dict(name='h2', attrs={'class':['discuss_h2']}),
                         ]

    html2lrf_options = ['--base-font-size', '8']
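    # Articles can run to multiple pages: recurse one level into links, but
    # only follow URLs that look like continuation pages (page=2 and up).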
    recursions = 1
    match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14|15)']

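    # Build the issue contents by scraping the magazine's table-of-contents
    # page instead of relying on an RSS feed.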
    def parse_index(self):
        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')
        monthtag = soup.find('div', attrs={'id':'magazine-main_col2'})
        # tag_to_string already returns plain text, so no second conversion
        # is needed when building the issue date stamp.
        month = self.tag_to_string(monthtag.contents[1])
        self.timefmt = ' [%s]' % month
        img = soup.find('img', alt='Scientific American Magazine', src=True)
        if img is not None:
            self.cover_url = img['src']
        features, feeds = [], []
        # The right-hand column of the TOC page lists the feature articles.
        for p in soup.find(id='magazine-main_col2').findAll('p'):
            a = p.find('a', href=True)
            if a is None:
                continue
            desc = ''
            s = p.find('span', attrs={'class':'sub'})
            if s is not None:
                desc = self.tag_to_string(s)
            article = {
                    'url' : a['href'],
                    'title' : self.tag_to_string(a),
                    'date' : '',
                    'description' : desc,
                    }
            features.append(article)
        feeds.append(('Features', features))

        # The left-hand column lists the remaining departments: each div
        # starts a new section and the <a> tags that follow belong to it.
        section = []
        title = None
        for x in soup.find(id='magazine-main_col1').findAll(['div', 'a']):
            if x.name == 'div':
                # A new section heading: flush the previous section first.
                if section:
                    feeds.append((title, section))
                title = self.tag_to_string(x)
                section = []
            else:
                if 'article.cfm' in x['href']:
                    article = {
                            'url' : x['href'],
                            'title' : self.tag_to_string(x),
                            'date': '',
                            'description': '',
                        }
                    section.append(article)

        # Don't forget the final section.
        if section:
            feeds.append((title, section))

        return feeds
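    # Runs on each downloaded page: strip the pagination links everywhere,
    # and drop the repeated headline block from continuation pages so that
    # multi-page articles stitch together cleanly.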
    def postprocess_html(self, soup, first_fetch):
        if soup is not None:
            for span in soup.findAll('span', attrs={'class':'pagination'}):
                span.extract()
            if not first_fetch:
                div = soup.find('div', attrs={'class':'headline'})
                if div:
                    div.extract()
        return soup

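    # Applied to the raw HTML before parsing: remove subscription prompts
    # and the stray inline JavaScript the site embeds in article bodies.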
    preprocess_regexps = [
        (re.compile(r'Already a Digital subscriber.*Now</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'If your institution has site license access, enter.*here</a>.', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'to subscribe to our.*;.*\}', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'\)\(jQuery\);.*-->', re.DOTALL|re.IGNORECASE), lambda match: ''),
        ]
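
# A quick way to test a recipe like this from the command line (assuming the
# calibre CLI tools are installed; the filename below is hypothetical):
#
#   ebook-convert scientific_american.recipe .epub --test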