#!/usr/bin/env python

__license__ = 'GPL v3'

'''
macleans.ca
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from datetime import timedelta, date


class Macleans(BasicNewsRecipe):
    title = u'Macleans Magazine'
    __author__ = 'Nick Redding'
    language = 'en_CA'
    description = 'Macleans Magazine'

    no_stylesheets = True
    timefmt = ' [%b %d]'
    # Customization notes: delete any sections below that you are not
    # interested in, and set oldest_article to the maximum age, in days,
    # of articles to include.
    sectionlist = [
        ['http://www2.macleans.ca/', 'Front Page'],
        ['http://www2.macleans.ca/category/canada/', 'Canada'],
        ['http://www2.macleans.ca/category/world-from-the-magazine/', 'World'],
        ['http://www2.macleans.ca/category/business', 'Business'],
        ['http://www2.macleans.ca/category/arts-culture/', 'Culture'],
        ['http://www2.macleans.ca/category/opinion', 'Opinion'],
        ['http://www2.macleans.ca/category/health-from-the-magazine/', 'Health'],
        ['http://www2.macleans.ca/category/environment-from-the-magazine/', 'Environment'],
        ['http://www2.macleans.ca/category/education/', 'On Campus'],
        ['http://www2.macleans.ca/category/travel-from-the-magazine/', 'Travel'],
    ]
    oldest_article = 7
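
    # For example, a hypothetical customization for a shorter digest might keep
    # only two sections and widen the date window:
    #
    #   sectionlist = [
    #       ['http://www2.macleans.ca/', 'Front Page'],
    #       ['http://www2.macleans.ca/category/opinion', 'Opinion'],
    #   ]
    #   oldest_article = 14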

    # Formatting for the print version of articles
    extra_css = '''h2{font-family:Times,serif; font-size:large;}
                   small {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
                '''

    # Tag handling for the print version of articles: keep only the printable
    # article container and drop the post-metadata block
    keep_only_tags = [dict(id='tw-print')]
    remove_tags = [dict(attrs={'class': 'postmetadata'})]

    def preprocess_html(self, soup):
        # The print pages wrap images in links back to the web version, or mix
        # them into text paragraphs; pull each image out into its own <p> so it
        # renders cleanly in the e-book.
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent
            if parent_tag.name == 'a':
                # Replace the wrapping link with a plain paragraph holding the image
                new_tag = Tag(soup, 'p')
                new_tag.insert(0, img_tag)
                parent_tag.replaceWith(new_tag)
            elif parent_tag.name == 'p':
                if self.tag_to_string(parent_tag) != '':
                    # The image shares a paragraph with text: split into a <div>
                    # holding the image paragraph followed by the text paragraph
                    new_div = Tag(soup, 'div')
                    new_tag = Tag(soup, 'p')
                    new_tag.insert(0, img_tag)
                    parent_tag.replaceWith(new_div)
                    new_div.insert(0, new_tag)
                    new_div.insert(1, parent_tag)
        return soup
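
    # A minimal sketch of the transformation above (markup is illustrative,
    # not copied from the live site):
    #
    #   <a href="..."><img src="pic.jpg"/></a>
    #     becomes  <p><img src="pic.jpg"/></p>
    #   <p>Caption text <img src="pic.jpg"/></p>
    #     becomes  <div><p><img src="pic.jpg"/></p><p>Caption text </p></div>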

    def parse_index(self):

        articles = {}
        key = None
        ans = []

        def parse_index_page(page_url, page_title):

            def decode_date(datestr):
                # Parse byline dates of the form 'Weekday, Month D, YYYY',
                # e.g. 'Monday, January 4, 2010'
                dmysplit = datestr.strip().lower().split(',')
                mdsplit = dmysplit[1].split()
                m = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december'].index(mdsplit[0]) + 1
                d = int(mdsplit[1])
                y = int(dmysplit[2].split()[0])
                return date(y, m, d)

            def article_title(tag):
                # Title text of the first link inside the tag
                atag = tag.find('a', href=True)
                if not atag:
                    return ''
                return self.tag_to_string(atag)

            def article_url(tag):
                # Target of the first link inside the tag, pointed at the print version
                atag = tag.find('a', href=True)
                if not atag:
                    return ''
                return atag['href'] + 'print/'

            def article_description(tag):
                # First non-empty paragraph of text inside the tag
                for p_tag in tag.findAll('p'):
                    d = self.tag_to_string(p_tag, False)
                    if d != '':
                        return d
                return ''
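
            # Illustrative example (hypothetical markup): given
            #   <h2><a href="http://www2.macleans.ca/2010/01/04/some-post/">Headline</a></h2>
            # article_title returns 'Headline' and article_url returns
            # 'http://www2.macleans.ca/2010/01/04/some-post/print/', the
            # print-friendly page that keep_only_tags is tuned for.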

            def compound_h4_h3_title(tag):
                # Join the tag's <h4> and <h3> headings, when present, with an em dash
                if tag.h4:
                    if tag.h3:
                        return self.tag_to_string(tag.h4, False) + u'\u2014' + self.tag_to_string(tag.h3, False)
                    else:
                        return self.tag_to_string(tag.h4, False)
                elif tag.h3:
                    return self.tag_to_string(tag.h3, False)
                else:
                    return ''

            def compound_h2_h4_title(tag):
                # Join the tag's <h2> and <h4> headings, when present, with an em dash
                if tag.h2:
                    if tag.h4:
                        return self.tag_to_string(tag.h2, False) + u'\u2014' + self.tag_to_string(tag.h4, False)
                    else:
                        return self.tag_to_string(tag.h2, False)
                elif tag.h4:
                    return self.tag_to_string(tag.h4, False)
                else:
                    return ''

            def handle_article(header_tag, outer_tag):
                if header_tag:
                    url = article_url(header_tag)
                    title = article_title(header_tag)
                    # The entry's <h4> is parsed as 'Author - Weekday, Month D, YYYY'
                    author_date_tag = outer_tag.h4
                    if author_date_tag:
                        author_date = self.tag_to_string(author_date_tag, False).split(' - ')
                        author = author_date[0].strip()
                        article_date = decode_date(author_date[1])
                        earliest_date = date.today() - timedelta(days=self.oldest_article)
                        if article_date < earliest_date:
                            self.log("Skipping article dated %s" % author_date[1])
                        else:
                            excerpt_div = outer_tag.find('div', 'excerpt')
                            if excerpt_div:
                                description = article_description(excerpt_div)
                            else:
                                description = ''
                            if page_title not in articles:
                                articles[page_title] = []
                            articles[page_title].append(dict(title=title, url=url, date=author_date[1],
                                                             description=description, author=author, content=''))

            def handle_category_article(cat, header_tag, outer_tag):
                if not header_tag:
                    # Some category blocks lack a lead headline; skip them
                    return
                url = article_url(header_tag)
                title = article_title(header_tag)
                if title != '':
                    title = cat + u'\u2014' + title
                a_tag = outer_tag.find('span', 'authorLink')
                if a_tag:
                    author = self.tag_to_string(a_tag, False)
                    a_tag.parent.extract()
                else:
                    author = ''
                description = article_description(outer_tag)
                if page_title not in articles:
                    articles[page_title] = []
                articles[page_title].append(dict(title=title, url=url, date='',
                                                 description=description, author=author, content=''))
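
            # Both helpers above append entries in the shape calibre expects
            # parse_index() to return: dicts with title, url, date, description,
            # author and content keys, grouped under a section title.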

            soup = self.index_to_soup(page_url)

            if page_title == 'Front Page':
                # Special processing for the front page

                # Featured slideshow stories
                top_stories = soup.find('div', {"id": "macleansFeatured"})
                if top_stories:
                    for div_slide in top_stories.findAll('div', 'slide'):
                        url = article_url(div_slide)
                        div_title = div_slide.find('div', 'header')
                        if div_title:
                            title = self.tag_to_string(div_title, False)
                        else:
                            title = ''
                        description = article_description(div_slide)
                        if page_title not in articles:
                            articles[page_title] = []
                        articles[page_title].append(dict(title=title, url=url, date='',
                                                         description=description, author='', content=''))

                # 'From Macleans' list
                from_macleans = soup.find('div', {"id": "fromMacleans"})
                if from_macleans:
                    for li_tag in from_macleans.findAll('li', 'fromMacleansArticle'):
                        title = compound_h4_h3_title(li_tag)
                        url = article_url(li_tag)
                        description = article_description(li_tag)
                        if page_title not in articles:
                            articles[page_title] = []
                        articles[page_title].append(dict(title=title, url=url, date='',
                                                         description=description, author='', content=''))

                # Blog Central
                blog_central = soup.find('div', {"id": "bloglist"})
                if blog_central:
                    for li_tag in blog_central.findAll('li'):
                        title = compound_h2_h4_title(li_tag)
                        if li_tag.h4:
                            url = article_url(li_tag.h4)
                            if page_title not in articles:
                                articles[page_title] = []
                            articles[page_title].append(dict(title=title, url=url, date='',
                                                             description='', author='', content=''))

                # Disabled: the 'Need to Know' section (requires `import re` if re-enabled)
                # need_to_know = soup.find('div', {"id": "needToKnow"})
                # if need_to_know:
                #     for div_tag in need_to_know('div', attrs={'class': re.compile("^needToKnowArticle")}):
                #         title = compound_h4_h3_title(div_tag)
                #         url = article_url(div_tag)
                #         description = article_description(div_tag)
                #         if page_title not in articles:
                #             articles[page_title] = []
                #         articles[page_title].append(dict(title=title, url=url, date='',
                #                                          description=description, author='', content=''))

                # Per-category news blocks
                for news_category in soup.findAll('div', 'newsCategory'):
                    news_cat = self.tag_to_string(news_category.h4, False)
                    handle_category_article(news_cat, news_category.find('h2'), news_category.find('div'))
                    for news_item in news_category.findAll('li'):
                        handle_category_article(news_cat, news_item.h3, news_item)

                return

            # Find the div containing the highlight article
            div_post = soup.find('div', 'post')
            if div_post:
                handle_article(div_post.h1, div_post)

                # Find the divs containing the rest of the articles; this lookup
                # is guarded by the check above since it starts from div_post
                div_other = div_post.find('div', {"id": "categoryOtherPosts"})
                if div_other:
                    for div_entry in div_other.findAll('div', 'entry'):
                        handle_article(div_entry.h2, div_entry)

        for page_name, page_title in self.sectionlist:
            parse_index_page(page_name, page_title)
            ans.append(page_title)

        # Preserve section order, dropping any section that yielded no articles
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
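
# To try out changes to this recipe from the command line, the usual calibre
# workflow applies (output filename is arbitrary; --test fetches only a couple
# of articles per feed):
#
#   ebook-convert macleans.recipe macleans.epub --test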