from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1277228948(BasicNewsRecipe):
    title                 = u'China Press USA'
    oldest_article        = 7
    max_articles_per_feed = 100

    __author__            = 'rty'
    __version__           = '1.0'
    language              = 'zh'
    publisher             = 'www.chinapressusa.com'
    description           = 'Overseas Chinese Network Newspaper in the USA'
    category              = 'News in Chinese, USA'
    remove_javascript     = True
    use_embedded_content  = False
    no_stylesheets        = True
    #encoding             = 'GB2312'
    encoding              = 'UTF-8'
    conversion_options    = {'linearize_tables': True}
    masthead_url          = 'http://www.chinapressusa.com/common/images/logo.gif'
    extra_css = '''
        @font-face { font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }
        body  { margin-right: 8pt; font-family: 'DroidFont', serif; }
        h1    { font-family: 'DroidFont', serif, sans-serif }
        .show { font-family: 'DroidFont', serif, sans-serif }
    '''
    feeds = [
        (u'\u65b0\u95fb\u9891\u9053', u'http://news.uschinapress.com/news.xml'),        # News channel
        (u'\u534e\u4eba\u9891\u9053', u'http://chinese.uschinapress.com/chinese.xml'),  # Chinese community channel
        (u'\u8bc4\u8bba\u9891\u9053', u'http://review.uschinapress.com/review.xml'),    # Commentary channel
    ]
    keep_only_tags = [
        dict(name='div', attrs={'class': 'show'}),
    ]
    remove_tags = [
        #dict(name='table', attrs={'class': 'xle'}),
        dict(name='div', attrs={'class': 'time'}),
    ]
    remove_tags_after = [
        dict(name='div', attrs={'class': 'bank17'}),
        #dict(name='a', attrs={'class': 'ab12'}),
    ]

    def append_page(self, soup, appendtag, position):
        # Recursively fetch the remaining pages of a multi-page article and
        # graft their content into the first page. self.INDEX is expected to
        # hold the site's base URL so the relative pager link can be resolved.
        pager = soup.find('div', attrs={'id': 'displaypagenum'})
        if pager:
            nexturl = self.INDEX + pager.a['href']
            soup2 = self.index_to_soup(nexturl)
            texttag = soup2.find('div', attrs={'class': 'show'})
            for it in texttag.findAll(style=True):
                del it['style']
            newpos = len(texttag.contents)
            self.append_page(soup2, texttag, newpos)
            texttag.extract()
            appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        # Declare the page language and UTF-8 charset so Chinese text renders correctly.
        mtag = '<meta http-equiv="Content-Language" content="zh-CN"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
        soup.head.insert(0, mtag)

        # Strip inline styles, pull in any follow-on pages, then drop the pager block.
        for item in soup.findAll(style=True):
            del item['style']
        self.append_page(soup, soup.body, 3)
        pager = soup.find('div', attrs={'id': 'displaypagenum'})
        if pager:
            pager.extract()
        return soup
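
# A quick way to try this recipe from the command line (assuming calibre's CLI
# tools are installed and the file is saved as, e.g., chinapressusa.recipe):
#
#   ebook-convert chinapressusa.recipe output.epub --test
#
# The names chinapressusa.recipe and output.epub are placeholders; --test limits
# the fetch to a couple of articles per feed for a fast sanity check.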