#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
calibre recipe for slate.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag

class PeriodicalNameHere(BasicNewsRecipe):
    # Method variables for customizing downloads
    title                   = 'Slate'
    description             = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
    __author__              = 'GRiker and Sujata Raman'
    max_articles_per_feed   = 20
    oldest_article          = 7.0
    recursions              = 0
    delay                   = 0
    simultaneous_downloads  = 5
    timeout                 = 120.0
    timefmt                 = ''
    feeds                   = None
    no_stylesheets          = True
    encoding                = None
    language                = 'en'

    # Method variables for customizing feed parsing
    summary_length          = 250
    use_embedded_content    = None

    # Method variables for pre/post processing of HTML
    preprocess_regexps = [ (re.compile(r'<p><em>Disclosure: <strong>Slate</strong> is owned by the Washington Post.*</p>',
                                       re.DOTALL|re.IGNORECASE),
                            lambda match: ''),
                           (re.compile(r'<p><strong><em>Join the discussion about this story on.*</p>',
                                       re.DOTALL|re.IGNORECASE),
                            lambda match: '')   ]

    match_regexps           = []

    # The second entry is for 'Big Money', which comes from a different site and uses different markup
    keep_only_tags          = [dict(attrs={'id':['article_top', 'article_body']}),
                               dict(attrs={'id':['content']})]

    # The second entry is for 'Big Money', which comes from a different site and uses different markup
    remove_tags             = [dict(attrs={'id':['toolbox','recommend_tab','insider_ad_wrapper',
                                                 'article_bottom_tools_cntr','fray_article_discussion',
                                                 'fray_article_links','bottom_sponsored_links','author_bio',
                                                 'bizbox_links_bottom','ris_links_wrapper','BOXXLE']}),
                               dict(attrs={'id':['content-top','service-links-bottom','hed']})]

    excludedDescriptionKeywords =   ['Slate V','Twitter feed','podcast']
    excludedTitleKeywords =         ['Gabfest','Slate V','on Twitter']
    excludedAuthorKeywords =        []
    excludedContentKeywords =       ['http://twitter.com/Slate']

    extra_css = '''
                 .h1_subhead    {font-family:Arial; font-size:small;}
                 h1             {font-family:Verdana; font-size:large;}
                 .byline        {font-family:Georgia; margin-bottom:0px; color:#660033;}
                 .dateline      {font-family:Arial; font-size:smaller; height:0pt; color:#666666;}
                 .imagewrapper  {font-family:Verdana; font-size:x-small;}
                 .source        {font-family:Verdana; font-size:x-small;}
                 .credit        {font-family:Verdana; font-size:smaller;}
                 #article_body  {font-family:Verdana;}
                 #content       {font-family:Arial;}
                 .caption       {font-family:Verdana; font-style:italic; font-size:x-small;}
                 h3             {font-family:Arial; color:#666666; font-size:small;}
                 a              {color:#0066CC;}
                 '''

    # Local variables to extend class
    baseURL = 'http://slate.com'
    section_dates = []

    # Class extension methods
    def tag_to_strings(self, tag):
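        # Return a list of strings, one per direct child of tag: text nodes
        # are kept as-is, nested tags are flattened with tag_to_string()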
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = self.tag_to_string(item)
                if res:
                    strings.append(res)
        return strings

    def extract_sections(self):
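        # Scrape the slate.com front page: record the datelines for today's
        # and older sections, collect the <ul> article lists from the table of
        # contents, and prepend the top-stories list (if any) to the first section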
        soup = self.index_to_soup( self.baseURL )
        soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
        soup = soup.find(True, attrs={'id':'toc_links_container'})

        # Today's dateline is appended twice, presumably because the headline
        # list prepended below shows up as an extra <ul> when the sections are
        # re-parsed in extract_section_articles()
        todays_section = soup.find(True, attrs={'class':'todaydateline'})
        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))

        older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
        for older_section in older_section_dates :
            self.section_dates.append(self.tag_to_string(older_section,use_alt=False))

        if soup_top_stories:
            headline_stories = soup_top_stories.find('ul')
        else:
            headline_stories = None
        section_lists = soup.findAll('ul')
        # Prepend the headlines to the first section
        if headline_stories:
            section_lists[0].insert(0,headline_stories)

        sections = []
        for section in section_lists :
            sections.append(section)
        return sections

    def extract_section_articles(self, sections_html) :
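        # Build the structure parse_index() expects: a list of
        # (section_date, [article_dict, ...]) tuples, skipping articles whose
        # title, description or author matches an exclusion keyword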
        # Find the containers with section content
        soup = self.index_to_soup(str(sections_html))
        sections = soup.findAll('ul')

        articles = {}
        key = None
        ans = []

        for (i,section) in enumerate(sections) :

            # Get the section name
            if section.has_key('id') :
                key = self.section_dates[i]
                articles[key] = []
                ans.append(key)
            else :
                continue

            # Get the section article_list
            article_list = section.findAll('li')

            # Extract the article attributes
            for article in article_list :
                bylines = self.tag_to_strings(article)
                url = article.a['href']
                title = bylines[0]
                full_title = self.tag_to_string(article)

                author = None
                description = None
                pubdate = None

                # Two strings: special-case the "Today's Papers" column
                if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") != -1 :
                    description = "A summary of what's in the major U.S. newspapers."

                # Three strings: title, description, author
                if len(bylines) == 3 :
                    author = bylines[2].strip()
                    author = re.sub('[\r][\n][\t][\t\t]','', author)
                    author = re.sub(',','', author)
                    if bylines[1] is not None :
                        description = bylines[1]
                        full_byline = self.tag_to_string(article)
                        if full_byline.find('major U.S. newspapers') != -1 :
                            description = "A summary of what's in the major U.S. newspapers."

                # More than three strings: fold the extra pieces into the author
                if len(bylines) > 3 and author is not None:
                    author += " | "
                    for (j,substring) in enumerate(bylines[3:]) :
                        #print "substring: %s" % substring.encode('cp1252')
                        author += substring.strip()
                        if j < len(bylines[3:]) - 1 :
                            author += " | "

                # Skip articles whose descriptions contain excluded keywords
                if description is not None and len(self.excludedDescriptionKeywords):
                    excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
                    found_excluded = excluded.search(description)
                    if found_excluded :
                        if self.verbose : self.log("  >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose titles contain excluded keywords
                if full_title is not None and len(self.excludedTitleKeywords):
                    excluded = re.compile('|'.join(self.excludedTitleKeywords))
                    #self.log("evaluating full_title: %s" % full_title)
                    found_excluded = excluded.search(full_title)
                    if found_excluded :
                        if self.verbose : self.log("  >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose authors contain excluded keywords
                if author is not None and len(self.excludedAuthorKeywords):
                    excluded = re.compile('|'.join(self.excludedAuthorKeywords))
                    found_excluded = excluded.search(author)
                    if found_excluded :
                        if self.verbose : self.log("  >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Check to make sure we're not adding a duplicate
                skip_this_article = False
                for existing_article in articles[key] :
                    if existing_article['url'] == url :
                        skip_this_article = True
                        break

                if skip_this_article :
                    continue

                # Build the dictionary entry for this article
                feed = key
                if not articles.has_key(feed) :
                    articles[feed] = []
                articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
                                           author=author, content=''))

            # Promote 'newspapers' to the top of this section
            for (j,article) in enumerate(articles[key]) :
                if article['description'] is not None :
                    if article['description'].find('newspapers') != -1 :
                        articles[key].insert(0,articles[key].pop(j))

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        ans = self.remove_duplicates(ans)
        return ans

    def flatten_document(self, ans):
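        # Collapse all sections into a single 'All Articles' section so the
        # generated e-book presents one flat list of articles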
        flat_articles = []
        for section in ans :
            #self.log("flattening section %s: " % section[0])
            for article in section[1] :
                #self.log("moving %s to flat_articles[]" % article['title'])
                flat_articles.append(article)
        flat_section = ['All Articles', flat_articles]
        flat_ans = [flat_section]
        return flat_ans

    def remove_duplicates(self, ans):
        # Return ans with duplicate articles stripped: an article that appears
        # in more than one section is kept only in the first
        for (i,section) in enumerate(ans) :
            #self.log("section %s: " % section[0])
            for article in section[1] :
                #self.log("\t%s" % article['title'])
                #self.log("looking for %s" % article['url'])
                for subsequent_section in ans[i+1:] :
                    # Rebuild the list rather than deleting entries while iterating over them
                    subsequent_section[1][:] = [a for a in subsequent_section[1]
                                                if a['url'] != article['url']]
        return ans

    def print_version(self, url) :
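        # Request the single-page ('pagenum/all/') version of each article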
        return url + 'pagenum/all/'

    # Class methods
    def parse_index(self) :
        sections = self.extract_sections()
        section_list = self.extract_section_articles(sections)
        section_list = self.flatten_document(section_list)
        return section_list

    def get_browser(self) :
        return BasicNewsRecipe.get_browser()

    def stripAnchors(self,soup):
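        # Replace text-only <a> tags inside the article body's paragraphs with
        # their contents; links that wrap an image are left alone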
        body = soup.find('div',attrs={'id':['article_body','content']})
        if body is not None:
            paras = body.findAll('p')
            if paras is not None:
                for para in paras:
                    aTags = para.findAll('a')
                    if aTags is not None:
                        for a in aTags:
                            if a.img is None:
                                #print repr(a.renderContents())
                                a.replaceWith(a.renderContents().decode('utf-8','replace'))
        return soup

    def preprocess_html(self, soup) :
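        # Per-article cleanup before conversion: drop decorative images, abort
        # on excluded content, normalize thebigmoney.com markup and strip
        # in-text anchors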
        # Remove 'grayPlus4.png' images
        imgs = soup.findAll('img')
        if imgs is not None:
            for img in imgs:
                if re.search("grayPlus4.png",str(img)):
                    img.extract()

        # Abort the article if it contains excluded content keywords
        if len(self.excludedContentKeywords):
            excluded = re.compile('|'.join(self.excludedContentKeywords))
            found_excluded = excluded.search(str(soup))
            if found_excluded :
                print "excluded content found, skipping article"
                raise Exception('Excluded content')

        # Articles from www.thebigmoney.com use different tagging for byline, dateline and body
        head = soup.find('head')
        if head is not None and head.link is not None and re.search(r'www\.thebigmoney\.com', str(head)):
            byline = soup.find('div',attrs={'id':'byline'})
            if byline is not None:
                byline['class'] = byline['id']

            dateline = soup.find('div',attrs={'id':'dateline'})
            if dateline is not None:
                dateline['class'] = dateline['id']

            body = soup.find('div',attrs={'id':'content'})
            if body is not None:
                body['class'] = 'article_body'

            # Synthesize a department kicker
            h3Tag = Tag(soup,'h3')
            emTag = Tag(soup,'em')
            emTag.insert(0,NavigableString("the big money: Today's business press"))
            h3Tag.insert(0,emTag)
            soup.body.insert(0,h3Tag)

        # Strip anchors from HTML
        return self.stripAnchors(soup)

    def postprocess_html(self, soup, first_fetch) :
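        # Normalize downloaded article markup: department kicker, headline,
        # byline, dateline, captions and photo wrappers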
        # Fix up dept_kicker as <h3><em>
        dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
        if dept_kicker is not None :
            kicker_strings = self.tag_to_strings(dept_kicker)
            #kicker = kicker_strings[2] + kicker_strings[3]
            kicker = ''.join(kicker_strings[2:])
            kicker = re.sub(r'\.','',kicker)
            h3Tag = Tag(soup, "h3")
            emTag = Tag(soup, "em")
            emTag.insert(0,NavigableString(kicker))
            h3Tag.insert(0, emTag)
            dept_kicker.replaceWith(h3Tag)

        # Change <h1> to <h2>
        headline = soup.find("h1")
        if headline is not None :
            tag = headline.find("span")
            if tag is not None :
                tag.name = 'div'
            h2tag = Tag(soup, "h2")
            h2tag['class'] = "headline"
            strs = self.tag_to_strings(headline)
            result = ''
            for (i,substr) in enumerate(strs) :
                result += substr
                if i < len(strs) -1 :
                    result += '<br />'
            #h2tag.insert(0, result)
            #headline.replaceWith(h2tag)

        # Fix up the concatenated byline and dateline
        byline = soup.find(True,attrs={'class':'byline'})
        if byline is not None :
            bylineTag = Tag(soup,'div')
            bylineTag['class'] = 'byline'
            #bylineTag['height'] = '0em'
            bylineTag.insert(0,self.tag_to_string(byline))
            byline.replaceWith(bylineTag)

        dateline = soup.find(True, attrs={'class':'dateline'})
        if dateline is not None :
            datelineTag = Tag(soup, 'div')
            datelineTag['class'] = 'dateline'
            #datelineTag['margin-top'] = '0em'
            datelineTag.insert(0,self.tag_to_string(dateline))
            dateline.replaceWith(datelineTag)

        # Change captions to italic, add <hr>
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption is not None:
                emTag = Tag(soup, "em")
                emTag.insert(0, '<br />' + self.tag_to_string(caption))
                hrTag = Tag(soup, 'hr')
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

        # Fix photos
        for photo in soup.findAll('span',attrs={'class':'imagewrapper'}):
            if photo.a is not None and photo.a.img is not None:
                divTag = Tag(soup,'div')
                divTag['class'] = 'imagewrapper'
                divTag.insert(0,photo.a.img)
                photo.replaceWith(divTag)

        return soup

    def postprocess_book(self, oeb, opts, log) :
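        # Fill in missing author and description metadata in the book's table
        # of contents by re-reading each article's HTML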
        def extract_byline(href) :
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find(True,attrs={'class':'byline'})
            if byline is not None:
                return self.tag_to_string(byline,use_alt=False)
            else :
                return None

        def extract_description(href) :
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            paragraphs = soup.findAll('p')
            for p in paragraphs :
                if self.tag_to_string(p,use_alt=False).startswith('By ') or \
                   self.tag_to_string(p,use_alt=False).startswith('Posted '):
                    continue
                comment = p.find(text=lambda text:isinstance(text, Comment))
                if comment is not None:
                    continue
                else:
                    return self.tag_to_string(p,use_alt=False)[:self.summary_length] + '...'

            return None

        # Method entry point here
        # A single-section TOC looks different from a multi-section TOC
        if oeb.toc.depth() == 2 :
            for article in oeb.toc :
                if article.author is None :
                    article.author = extract_byline(article.href)
                if article.description is None :
                    article.description = extract_description(article.href)
        elif oeb.toc.depth() == 3 :
            for section in oeb.toc :
                for article in section :
                    if article.author is None :
                        article.author = extract_byline(article.href)
                    if article.description is None :
                        article.description = extract_description(article.href)