home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3663 < prev    next >
Encoding:
Text File  |  2010-01-21  |  5.9 KB  |  99 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
  3. '''
  4. Profile to download CNN
  5. '''
  6. from calibre.web.feeds.news import BasicNewsRecipe
  7. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  8.  
  9. class CNN(BasicNewsRecipe):
  10.  
  11.     title = 'CNN'
  12.     description = 'Global news'
  13.     timefmt  = ' [%d %b %Y]'
  14.     __author__ = 'Krittika Goyal and Sujata Raman'
  15.     language = 'en'
  16.  
  17.     no_stylesheets = True
  18.     use_embedded_content   = False
  19.     oldest_article        = 15
  20.     recursions = 1
  21.     match_regexps = [r'http://sportsillustrated.cnn.com/.*/[1-9].html']
  22.     max_articles_per_feed = 25
  23.  
  24.     extra_css = '''
  25.                 .cnn_strycntntlft{font-family :Arial,Helvetica,sans-serif;}
  26.                 h2{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
  27.                 .cnnTxtCmpnt{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
  28.                 .cnnTMcontent{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#575757}
  29.                 .storytext{font-family :Arial,Helvetica,sans-serif; font-size:small}
  30.                 .storybyline{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#575757}
  31.                 .credit{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757}
  32.                 .storyBrandingBanner{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#575757}
  33.                 .storytimestamp{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#575757}
  34.                 .timestamp{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#575757}
  35.                 .cnn_strytmstmp{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666;}
  36.                 .cnn_stryimg640caption{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666;}
  37.                 .cnn_strylccimg300cntr{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666;}
  38.                 .cnn_stryichgfcpt{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666;}
  39.                 .cnnByline{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666;}
  40.                 .cnn_bulletbin cnnStryHghLght{ font-size:xx-small;}
  41.                 .subhead p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
  42.                 .cnnStoryContent{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
  43.                 .cnnContentContainer{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
  44.                 .col1{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666;}
  45.                 .col3{color:#333333; font-family :Arial,Helvetica,sans-serif; font-size:x-small;font-weight:bold;}
  46.                 .cnnInlineT1Caption{font-family :Arial,Helvetica,sans-serif; font-size:x-small;font-weight:bold;}
  47.                 .cnnInlineT1Credit{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#333333;}
  48.                 .col10{color:#5A637E;}
  49.                 .cnnInlineRailBulletList{color:black;}
  50.                 .cnnLine0{font-family :Arial,Helvetica,sans-serif; color:#666666;font-weight:bold;}
  51.                 .cnnTimeStamp{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#333333;}
  52.                 .galleryhedDek{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#575757;}
  53.                 .galleryWidgetHeader{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#004276;}
  54.                 .article-content{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
  55.                 .cnnRecapStory{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
  56.                 h1{font-family :Arial,Helvetica,sans-serif; font-size:x-large}
  57.                 .captionname{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#575757;}
  58.                 inStoryIE{{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
  59.                 '''
  60.  
  61.     #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
  62.     #remove_tags_after  = dict(name='td', attrs={'class':'newptool1'})
  63.     remove_tags = [
  64.        dict(name='iframe'),
  65.        dict(name='div', attrs={'class':['cnnEndOfStory', 'cnnShareThisItem', 'cnn_strylctcntr cnn_strylctcqrelt', 'cnnShareBoxContent', 'cnn_strybtmcntnt', 'cnn_strycntntrgt']}),
  66.        dict(name='div', attrs={'id':['IEContainer', 'clickIncludeBox']}),
  67.        #dict(name='ul', attrs={'class':'article-tools'}),
  68.        #dict(name='ul', attrs={'class':'articleTools'}),
  69.     ]
  70.  
  71.     feeds =  [
  72.              ('Top News', 'http://rss.cnn.com/rss/cnn_topstories.rss'),
  73.              ('World', 'http://rss.cnn.com/rss/cnn_world.rss'),
  74.              ('U.S.', 'http://rss.cnn.com/rss/cnn_us.rss'),
  75.              #('Sports', 'http://rss.cnn.com/rss/si_topstories.rss'),
  76.              ('Business', 'http://rss.cnn.com/rss/money_latest.rss'),
  77.              ('Politics', 'http://rss.cnn.com/rss/cnn_allpolitics.rss'),
  78.              ('Law', 'http://rss.cnn.com/rss/cnn_law.rss'),
  79.              ('Technology', 'http://rss.cnn.com/rss/cnn_tech.rss'),
  80.              ('Science & Space', 'http://rss.cnn.com/rss/cnn_space.rss'),
  81.              ('Health', 'http://rss.cnn.com/rss/cnn_health.rss'),
  82.              ('Entertainment', 'http://rss.cnn.com/rss/cnn_showbiz.rss'),
  83.              ('Education', 'http://rss.cnn.com/rss/cnn_education.rss'),
  84.              ('Offbeat', 'http://rss.cnn.com/rss/cnn_offbeat.rss'),
  85.              ('Most Popular', 'http://rss.cnn.com/rss/cnn_mostpopular.rss')
  86.              ]
  87.     def preprocess_html(self, soup):
  88.         story = soup.find(name='div', attrs={'class':'cnnBody_Left'})
  89.         if story is None:
  90.            story = soup.find(name='div', attrs={'id':'cnnContentContainer'})
  91.            soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
  92.            body = soup.find(name='body')
  93.            body.insert(0, story)
  94.         else:
  95.            soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
  96.            body = soup.find(name='body')
  97.            body.insert(0, story)
  98.         return soup
  99.