#!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__license__ = 'GPL v3'

'''
calibre recipe for slate.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag

class Slate(BasicNewsRecipe):
    # Class attributes for customizing downloads
    description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
    __author__ = 'GRiker, Sujata Raman and Nick Redding'
    max_articles_per_feed = 100
    oldest_article = 14
    recursions = 0
    delay = 0
    simultaneous_downloads = 5
    timeout = 120.0
    timefmt = ''
    feeds = None
    no_stylesheets = True
    encoding = None
    language = 'en'

    # Set to False to build the weekly edition from the dated table of contents
    slate_complete = True
    if slate_complete:
        title = 'Slate (complete)'
    else:
        title = 'Slate (weekly)'

    # Class attributes for customizing feed parsing
    summary_length = 250
    use_embedded_content = None

    # Class attributes for pre/post-processing of HTML
    preprocess_regexps = [(re.compile(r'<p><em>Disclosure: <strong>Slate</strong> is owned by the Washington Post.*</p>',
                                      re.DOTALL|re.IGNORECASE),
                           lambda match: ''),
                          (re.compile(r'<p><strong><em>Join the discussion about this story on.*</p>',
                                      re.DOTALL|re.IGNORECASE),
                           lambda match: '')]

    match_regexps = []

    # The second entry is for 'Big Money', which comes from a different site and uses different markup
    keep_only_tags = [dict(attrs={'id':['article_top', 'article_body']}),
                      dict(attrs={'id':['content']})]

    # The second entry is for 'Big Money', which comes from a different site and uses different markup
    remove_tags = [dict(attrs={'id':['toolbox','recommend_tab','insider_ad_wrapper',
                                     'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio',
                                     'bizbox_links_bottom','ris_links_wrapper','BOXXLE',
                                     'comments_button','add_comments_button','comments-to-fray','marriott_ad',
                                     'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}),
                   dict(attrs={'id':['content-top','service-links-bottom','hed']})]

    excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
    excludedTitleKeywords = ['Gabfest','Slate V','on Twitter']
    excludedAuthorKeywords = []
    excludedContentKeywords = ['http://twitter.com/Slate']

    extra_css = '''
        .h1_subhead{font-family:Arial; font-size:small; }
        h1{font-family:Verdana; font-size:large; }
        .byline {font-family:Georgia; margin-bottom: 0px; }
        .dateline {font-family:Arial; font-size: smaller; height: 0pt;}
        .imagewrapper {font-family:Verdana;font-size:x-small; }
        .source {font-family:Verdana; font-size:x-small;}
        .credit {font-family:Verdana; font-size: smaller;}
        #article_body {font-family:Verdana; }
        #content {font-family:Arial; }
        .caption{font-family:Verdana;font-style:italic; font-size:x-small;}
        h3{font-family:Arial; font-size:small}
        '''

    # Additional attributes specific to this recipe
    baseURL = 'http://slate.com'
    section_dates = []
    # Class extension methods
    def tag_to_strings(self, tag):
        # Like tag_to_string(), but returns a list of strings, one per child
        # of the tag, instead of a single joined string
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = self.tag_to_string(item, use_alt=False)
                if res:
                    strings.append(res)
        return strings

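    # Used when slate_complete is True: collect the section names from the
    # front-page navigation bar into section_dates, then return the article
    # lists (ul.view_links_list) found on the Briefing page. The first <li>
    # on the front page is assumed to link to the Briefing section.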
    def extract_named_sections(self):
        soup = self.index_to_soup(self.baseURL)
        soup_nav_bar = soup.find(True, attrs={'id':'nav'})
        briefing_nav = soup.find('li')
        briefing_url = briefing_nav.a['href']
        for section_nav in soup_nav_bar.findAll('li'):
            section_name = self.tag_to_string(section_nav, use_alt=False)
            self.section_dates.append(section_name)

        soup = self.index_to_soup(briefing_url)

        self.log("Briefing url = %s " % briefing_url)
        section_lists = soup.findAll('ul', 'view_links_list')

        sections = []
        for section in section_lists:
            sections.append(section)
        return sections

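    # Used when slate_complete is False: read the dated table of contents on
    # the front page, record each date heading (plus "Top Stories", if that
    # block is present) in section_dates, and return the corresponding <ul>
    # article lists with the Top Stories block prepended.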
    def extract_dated_sections(self):
        soup = self.index_to_soup(self.baseURL)
        soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'})
        if soup_top_stories:
            self.section_dates.append("Top Stories")
            self.log("SELECTION TOP STORIES %s" % "Top Stories")

        soup = soup.find(True, attrs={'id':'toc_links_container'})

        todays_section = soup.find(True, attrs={'class':'todaydateline'})
        self.section_dates.append(self.tag_to_string(todays_section, use_alt=False))
        self.log("SELECTION DATE %s" % self.tag_to_string(todays_section, use_alt=False))

        older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
        for older_section in older_section_dates:
            self.section_dates.append(self.tag_to_string(older_section, use_alt=False))
            self.log("SELECTION DATE %s" % self.tag_to_string(older_section, use_alt=False))

        if soup_top_stories:
            headline_stories = soup_top_stories
            self.log("HAVE top_stories")
        else:
            headline_stories = None
            self.log("NO top_stories")
        section_lists = soup.findAll('ul')
        # Prepend the headlines to the first section
        if headline_stories:
            section_lists.insert(0, headline_stories)

        sections = []
        for section in section_lists:
            sections.append(section)
        return sections

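    # Turn the raw section markup into the {section: [article dict]} mapping
    # and the ordered (section, articles) list that parse_index() must return,
    # applying the keyword exclusions and skipping duplicate URLs on the way.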
    def extract_section_articles(self, sections_html):
        # Find the containers with section content
        sections = sections_html

        articles = {}
        key = None
        ans = []

        for (i, section) in enumerate(sections):

            # Get the section name
            if section.has_key('id'):
                self.log("PROCESSING SECTION id = %s" % section['id'])
                key = self.section_dates[i]
                if key.startswith("Pod"):
                    continue
                if key.startswith("Blog"):
                    continue
                articles[key] = []
                ans.append(key)
            elif self.slate_complete:
                key = self.section_dates[i]
                if key.startswith("Pod"):
                    continue
                if key.startswith("Blog"):
                    continue
                self.log("PROCESSING SECTION name = %s" % key)
                articles[key] = []
                ans.append(key)
            else:
                self.log("SECTION %d HAS NO id" % i)
                continue

            # Get the section article_list
            article_list = section.findAll('li')

            # Extract the article attributes
            for article in article_list:
                bylines = self.tag_to_strings(article)
                url = article.a['href']
                title = bylines[0]
                full_title = self.tag_to_string(article, use_alt=False)
                #self.log("ARTICLE TITLE %s" % title)
                #self.log("ARTICLE FULL_TITLE %s" % full_title)
                #self.log("URL %s" % url)
                author = None
                description = None
                pubdate = None

                if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") != -1:
                    description = "A summary of what's in the major U.S. newspapers."

                # Three or more entries: bylines[1] is the description, bylines[2] the author
                if len(bylines) >= 3:
                    author = bylines[2].strip()
                    author = re.sub(r'\s+', ' ', author)
                    author = re.sub(',', '', author)
                    if bylines[1] is not None:
                        description = bylines[1]
                        full_byline = self.tag_to_string(article)
                        if full_byline.find('major U.S. newspapers') != -1:
                            description = "A summary of what's in the major U.S. newspapers."

                # Any remaining entries are additional contributors
                if len(bylines) > 3 and author is not None:
                    author += " | "
                    for (j, substring) in enumerate(bylines[3:]):
                        #print "substring: %s" % substring.encode('cp1252')
                        author += substring.strip()
                        # Don't leave a trailing separator after the last contributor
                        if j < len(bylines[3:]) - 1:
                            author += " | "

                # Skip articles whose descriptions contain excluded keywords
                if description is not None and len(self.excludedDescriptionKeywords):
                    excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
                    found_excluded = excluded.search(description)
                    if found_excluded:
                        self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose titles contain excluded keywords
                if full_title is not None and len(self.excludedTitleKeywords):
                    excluded = re.compile('|'.join(self.excludedTitleKeywords))
                    #self.log("evaluating full_title: %s" % full_title)
                    found_excluded = excluded.search(full_title)
                    if found_excluded:
                        self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose authors contain excluded keywords
                if author is not None and len(self.excludedAuthorKeywords):
                    excluded = re.compile('|'.join(self.excludedAuthorKeywords))
                    found_excluded = excluded.search(author)
                    if found_excluded:
                        self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Check to make sure we're not adding a duplicate
                skip_this_article = False
                for existing_article in articles[key]:
                    if existing_article['url'] == url:
                        skip_this_article = True
                        self.log("SKIPPING DUP %s" % url)
                        break

                if skip_this_article:
                    continue

                # Build the dictionary entry for this article
                feed = key
                if not articles.has_key(feed):
                    articles[feed] = []
                articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
                                           author=author, content=''))
                #self.log("KEY %s" % feed)
                #self.log("APPENDED %s" % url)
                # Promote 'newspapers' to the top of the section
                for (i, article) in enumerate(articles[feed]):
                    if article['description'] is not None:
                        if article['description'].find('newspapers') != -1:
                            articles[feed].insert(0, articles[feed].pop(i))

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return ans

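    # The single-page version of an article is assumed to be reachable by
    # appending pagenum/all/ to its URL.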
    def print_version(self, url):
        return url + 'pagenum/all/'

    # Recipe entry point
    def parse_index(self):
        if self.slate_complete:
            sections = self.extract_named_sections()
        else:
            sections = self.extract_dated_sections()
        section_list = self.extract_section_articles(sections)
        return section_list

    def get_masthead_url(self):
        masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
        br = self.get_browser()
        try:
            br.open(masthead)
        except:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

    def stripAnchors(self, soup):
        body = soup.find('div', attrs={'id':['article_body','content']})
        if body is not None:
            paras = body.findAll('p')
            if paras is not None:
                for para in paras:
                    aTags = para.findAll('a')
                    if aTags is not None:
                        for a in aTags:
                            if a.img is None:
                                #print repr(a.renderContents())
                                a.replaceWith(a.renderContents().decode('utf-8','replace'))
        return soup

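    # Per-article cleanup before conversion: drop 'grayPlus4.png' images,
    # reject articles whose HTML matches excludedContentKeywords, rework
    # articles from www.thebigmoney.com so they use the same classes as Slate
    # proper, then strip anchors from the body.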
    def preprocess_html(self, soup):

        # Remove 'grayPlus4.png' images
        imgs = soup.findAll('img')
        if imgs is not None:
            for img in imgs:
                if re.search(r'grayPlus4\.png', str(img)):
                    img.extract()

        # Reject articles that contain excluded content keywords
        if len(self.excludedContentKeywords):
            excluded = re.compile('|'.join(self.excludedContentKeywords))
            found_excluded = excluded.search(str(soup))
            if found_excluded:
                self.log("Excluded content found, rejecting article")
                raise Exception('Rejected article')

        # Articles from www.thebigmoney.com use different tagging for byline, dateline and body
        head = soup.find('head')
        if head.link is not None and re.search(r'www\.thebigmoney\.com', str(head)):
            byline = soup.find('div', attrs={'id':'byline'})
            if byline is not None:
                byline['class'] = byline['id']

            dateline = soup.find('div', attrs={'id':'dateline'})
            if dateline is not None:
                dateline['class'] = dateline['id']

            body = soup.find('div', attrs={'id':'content'})
            if body is not None:
                body['class'] = 'article_body'

            # Synthesize a department kicker
            h3Tag = Tag(soup, 'h3')
            emTag = Tag(soup, 'em')
            emTag.insert(0, NavigableString("the big money: Today's business press"))
            h3Tag.insert(0, emTag)
            soup.body.insert(0, h3Tag)

        # Strip anchors from the HTML
        return self.stripAnchors(soup)

    def postprocess_html(self, soup, first_fetch):

        # Fix up dept_kicker as <h3><em>
        dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
        if dept_kicker is not None:
            kicker_strings = self.tag_to_strings(dept_kicker)
            kicker = ''.join(kicker_strings[2:])
            kicker = re.sub(r'\.', '', kicker)
            h3Tag = Tag(soup, "h3")
            emTag = Tag(soup, "em")
            emTag.insert(0, NavigableString(kicker))
            h3Tag.insert(0, emTag)
            dept_kicker.replaceWith(h3Tag)
        else:
            self.log("No kicker--return null")
            return None

        # Fix up the concatenated byline and dateline
        byline = soup.find(True, attrs={'class':'byline'})
        if byline is not None:
            bylineTag = Tag(soup, 'div')
            bylineTag['class'] = 'byline'
            #bylineTag['height'] = '0em'
            bylineTag.insert(0, self.tag_to_string(byline))
            byline.replaceWith(bylineTag)

        dateline = soup.find(True, attrs={'class':'dateline'})
        if dateline is not None:
            datelineTag = Tag(soup, 'div')
            datelineTag['class'] = 'dateline'
            #datelineTag['margin-top'] = '0em'
            datelineTag.insert(0, self.tag_to_string(dateline))
            dateline.replaceWith(datelineTag)

        # Change captions to italic, add <hr>
        for caption in soup.findAll(True, {'class':'caption'}):
            if caption is not None:
                emTag = Tag(soup, "em")
                emTag.insert(0, '<br />' + self.tag_to_string(caption))
                hrTag = Tag(soup, 'hr')
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

        # Fix photos
        for photo in soup.findAll('span', attrs={'class':'imagewrapper'}):
            if photo.a is not None and photo.a.img is not None:
                divTag = Tag(soup, 'div')
                divTag['class'] = 'imagewrapper'
                divTag.insert(0, photo.a.img)
                photo.replaceWith(divTag)

        return soup

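    # After conversion, walk the generated TOC and fill in any missing author
    # or description metadata by re-parsing the downloaded article HTML.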
    def postprocess_book(self, oeb, opts, log):

        def extract_byline(href):
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find(True, attrs={'class':'byline'})
            if byline is not None:
                return self.tag_to_string(byline, use_alt=False)
            else:
                return None

        def extract_description(href):
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            paragraphs = soup.findAll('p')
            for p in paragraphs:
                if self.tag_to_string(p, use_alt=False).startswith('By ') or \
                   self.tag_to_string(p, use_alt=False).startswith('Posted '):
                    continue
                comment = p.find(text=lambda text: isinstance(text, Comment))
                if comment is not None:
                    continue
                else:
                    return self.tag_to_string(p, use_alt=False)[:self.summary_length] + '...'

            return None

        # Method entry point here
        # A single-section TOC looks different from a multi-section TOC
        if oeb.toc.depth() == 2:
            for article in oeb.toc:
                if article.author is None:
                    article.author = extract_byline(article.href)
                if article.description is None:
                    article.description = extract_description(article.href)
        elif oeb.toc.depth() == 3:
            for section in oeb.toc:
                for article in section:
                    if article.author is None:
                        article.author = extract_byline(article.href)
                    if article.description is None:
                        article.description = extract_description(article.href)