#!/usr/bin/env python

__license__   = 'GPL v3'

'''
online.wsj.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from datetime import timedelta, date

class WSJ(BasicNewsRecipe):
    # formatting adapted from original recipe by Kovid Goyal and Sujata Raman
    title          = u'Wall Street Journal (free)'
    __author__     = 'Nick Redding'
    language = 'en'
    description = 'All the free content from the Wall Street Journal (business, financial and political news)'

    no_stylesheets = True
    timefmt = ' [%b %d]'

    # customization notes: delete sections you are not interested in,
    # set omit_paid_content to False if you want the paid content article snippets,
    # and set oldest_article to the maximum number of days back from today to
    # include articles; a sample customization sketch follows the settings below
    sectionlist = [
                        ['/home-page','Front Page'],
                        ['/public/page/news-opinion-commentary.html','Commentary'],
                        ['/public/page/news-global-world.html','World News'],
                        ['/public/page/news-world-business.html','US News'],
                        ['/public/page/news-business-us.html','Business'],
                        ['/public/page/news-financial-markets-stock.html','Markets'],
                        ['/public/page/news-tech-technology.html','Technology'],
                        ['/public/page/news-personal-finance.html','Personal Finance'],
                        ['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'],
                        ['/public/page/news-real-estate-homes.html','Real Estate'],
                        ['/public/page/news-career-jobs.html','Careers'],
                        ['/public/page/news-small-business-marketing.html','Small Business']
                    ]
    oldest_article = 2
    omit_paid_content = True
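    # a minimal customization sketch (values are illustrative only): to fetch
    # just the front page and markets, looking back a week, one could set
    #   sectionlist = [['/home-page','Front Page'],
    #                  ['/public/page/news-financial-markets-stock.html','Markets']]
    #   oldest_article = 7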

    extra_css   = '''h1{font-size:large; font-family:Times,serif;}
                    h2{font-family:Times,serif; font-size:small; font-style:italic;}
                    .subhead{font-family:Times,serif; font-size:small; font-style:italic;}
                    .insettipUnit {font-family:Times,serif;font-size:xx-small;}
                    .targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;}
                    .article{font-family:Times,serif; font-size:x-small;}
                    .tagline { font-size:xx-small;}
                    .dateStamp {font-family:Times,serif;}
                    h3{font-family:Times,serif; font-size:xx-small;}
                    .byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
                    .metadataType-articleCredits {list-style-type: none;}
                    h6{font-family:Times,serif; font-size:small; font-style:italic;}
                    .paperLocation{font-size:xx-small;}'''

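    # calibre trims everything outside the story: content before the tag
    # matching remove_tags_before and after remove_tags_after is dropped,
    # along with any tag matching an entry in remove_tags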
    remove_tags_before = {'class':re.compile('^articleHeadlineBox')}
    remove_tags =   [   {'id':re.compile('^articleTabs_tab_')},
                        #dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
                        #         "articleTabs_tab_interactive","articleTabs_tab_video",
                        #         "articleTabs_tab_map","articleTabs_tab_slideshow"]),
                        {'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
                                   'insettip','insetClose','more_in', "insetContent",
                        #          'articleTools_bottom','articleTools_bottom mjArticleTools',
                                   'aTools', 'tooltip',
                                   'adSummary', 'nav-inline','insetFullBracket']},
                        {'class':re.compile('^articleTools_bottom')},
                        dict(rel='shortcut icon')
                    ]
    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
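        # nothing extra is needed for the free content; per-recipe browser
        # tweaks (e.g. a subscriber login) would go here if ever required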
        return br


    def preprocess_html(self,soup):

        def decode_us_date(datestr):
            udate = datestr.strip().lower().split()
            m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(udate[0])+1
            d = int(udate[1])
            y = int(udate[2])
            return date(y,m,d)
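        # a quick usage sketch: decode_us_date('November 3 2010') returns
        # date(2010, 11, 3); the caller below removes the comma from the
        # dateline by splitting on ',' before passing the string in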

        # check if article is paid content
        if self.omit_paid_content:
            divtags = soup.findAll('div','tooltip')
            if divtags:
                for divtag in divtags:
                    if divtag.find(text="Subscriber Content"):
                        return None

        # check if article is too old
        datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
        if datetag:
            dateline_string = self.tag_to_string(datetag,False)
            date_items = dateline_string.split(',')
            datestring = date_items[0]+date_items[1]
            article_date = decode_us_date(datestring)
            earliest_date = date.today() - timedelta(days=self.oldest_article)
            if article_date < earliest_date:
                self.log("Skipping article dated %s" % datestring)
                return None
            datetag.parent.extract()

            # place dateline in article heading

            bylinetag = soup.find('h3','byline')
            if bylinetag:
                h3bylinetag = bylinetag
            else:
                bylinetag = soup.find('li','byline')
                if bylinetag:
                    h3bylinetag = bylinetag.h3
                    if not h3bylinetag:
                        h3bylinetag = bylinetag
                    bylinetag = bylinetag.parent
            if bylinetag:
                if h3bylinetag.a:
                    bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False)
                else:
                    bylinetext = self.tag_to_string(h3bylinetag,False)
                h3byline = Tag(soup,'h3',[('class','byline')])
                if bylinetext.isspace() or (bylinetext == ''):
                    h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
                else:
                    h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1]))
                bylinetag.replaceWith(h3byline)
            else:
                headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")})
                if headlinetag:
                    dateline = Tag(soup,'h3', [('class','byline')])
                    dateline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
                    headlinetag.insert(len(headlinetag),dateline)
        else: # if no date tag, don't process this page--it's not a news item
            return None
        # This gets rid of the superfluous bullet symbol preceding columnist bylines
        ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'})
        if ultag:
            a = ultag.h3
            if a:
                ultag.replaceWith(a)
        return soup

    def parse_index(self):

        articles = {}
        key = None
        ans = []
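        # articles maps a section title to its list of article dicts, and ans
        # records the section titles in the order the sections are fetched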

        def parse_index_page(page_name,page_title):

            def article_title(tag):
                atag = tag.find('h2') # title is usually in an h2 tag
                if not atag: # if not, get text from the a tag
                    atag = tag.find('a',href=True)
                    if not atag:
                        return ''
                    t = self.tag_to_string(atag,False)
                    if t == '':
                        # sometimes the title is in the second a tag
                        atag.extract()
                        atag = tag.find('a',href=True)
                        if not atag:
                            return ''
                        return self.tag_to_string(atag,False)
                    return t
                return self.tag_to_string(atag,False)

            def article_author(tag):
                atag = tag.find('strong') # author is usually in a strong tag
                if not atag:
                    atag = tag.find('h4') # if not, look for an h4 tag
                    if not atag:
                        return ''
                return self.tag_to_string(atag,False)

            def article_summary(tag):
                atag = tag.find('p')
                if not atag:
                    return ''
                subtag = atag.strong
                if subtag:
                    subtag.extract()
                return self.tag_to_string(atag,False)

            def article_url(tag):
                atag = tag.find('a',href=True)
                if not atag:
                    return ''
                url = re.sub(r'\?.*', '', atag['href'])
                return url
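            # strips any query string, e.g. a hypothetical href like
            # http://online.wsj.com/article/foo.html?mod=rss comes back as
            # http://online.wsj.com/article/foo.html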

            def handle_section_name(tag):
                # turns a tag into a section name with special processing
                # for What's News, U.S., World & U.S. and World
                s = self.tag_to_string(tag,False)
                if ("What" in s) and ("News" in s):
                    s = "What's News"
                elif (s == "U.S.") or (s == "World & U.S.") or (s == "World"):
                    s = s + " News"
                return s
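            # e.g. "What's News: Business & Finance" collapses to "What's News",
            # and "U.S." becomes "U.S. News"; any other heading passes through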

            mainurl = 'http://online.wsj.com'
            pageurl = mainurl+page_name
            #self.log("Page url %s" % pageurl)
            soup = self.index_to_soup(pageurl)
            # Find each instance of div with class including "headlineSummary"
            for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}):
                # divtag contains all article data as ul's and li's
                # first, check if there is an h3 tag which provides a section name
                stag = divtag.find('h3')
                if stag:
                    if stag.parent.get('class', '') == 'dynamic':
                        # a carousel of articles is too complex to extract a section name
                        # for each article, so we'll just call the section "Carousel"
                        section_name = 'Carousel'
                    else:
                        section_name = handle_section_name(stag)
                else:
                    section_name = "What's News"
                #self.log("div Section %s" % section_name)
                # find each top-level ul in the div
                # we don't restrict to class = newsItem because the section_name
                # sometimes changes via a ul tag inside the div
                for ultag in divtag.findAll('ul',recursive=False):
                    stag = ultag.find('h3')
                    if stag:
                        if stag.parent.name == 'ul':
                            # section name has changed
                            section_name = handle_section_name(stag)
                            #self.log("ul Section %s" % section_name)
                            # delete the h3 tag so it doesn't get in the way
                            stag.extract()
                    # find each top level li in the ul
                    for litag in ultag.findAll('li',recursive=False):
                        stag = litag.find('h3')
                        if stag:
                            # section name has changed
                            section_name = handle_section_name(stag)
                            #self.log("li Section %s" % section_name)
                            # delete the h3 tag so it doesn't get in the way
                            stag.extract()
                        # if there is a ul tag inside the li it is superfluous;
                        # it is probably a list of related articles
                        utag = litag.find('ul')
                        if utag:
                            utag.extract()
                        # now skip paid subscriber articles if desired
                        subscriber_tag = litag.find(text="Subscriber Content")
                        if subscriber_tag:
                            if self.omit_paid_content:
                                continue
                            # delete the tip div so it doesn't get in the way
                            tiptag = litag.find("div", { "class" : "tipTargetBox" })
                            if tiptag:
                                tiptag.extract()
                        h1tag = litag.h1
                        # if there's an h1 tag, its parent is a div which should replace
                        # the li tag for the analysis
                        if h1tag:
                            litag = h1tag.parent
                        h5tag = litag.h5
                        if h5tag:
                            # section name has changed
                            section_name = self.tag_to_string(h5tag,False)
                            #self.log("h5 Section %s" % section_name)
                            # delete the h5 tag so it doesn't get in the way
                            h5tag.extract()
                        url = article_url(litag)
                        if url == '':
                            continue
                        if url.startswith("/article"):
                            url = mainurl+url
                        if not url.startswith("http://online.wsj.com"):
                            continue
                        if not url.endswith(".html"):
                            continue
                        if 'video' in url:
                            continue
                        title = article_title(litag)
                        if title == '':
                            continue
                        #self.log("URL %s" % url)
                        #self.log("Title %s" % title)
                        pubdate = ''
                        #self.log("Date %s" % pubdate)
                        author = article_author(litag)
                        if author == '':
                            author = section_name
                        elif author == section_name:
                            author = ''
                        else:
                            author = section_name+': '+author
                        #if not author == '':
                        #    self.log("Author %s" % author)
                        description = article_summary(litag)
                        #if not description == '':
                        #    self.log("Description %s" % description)
                        if page_title not in articles:
                            articles[page_title] = []
                        articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))


        for page_name,page_title in self.sectionlist:
            parse_index_page(page_name,page_title)
            ans.append(page_title)

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans