#!/usr/bin/env python

__license__   = 'GPL v3'

'''
macleans.ca
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from datetime import timedelta, date

class Macleans(BasicNewsRecipe):
    title          = u'Macleans Magazine'
    __author__     = 'Nick Redding'
    language = 'en_CA'
    description = 'Macleans Magazine'

    no_stylesheets = True
    timefmt = ' [%b %d]'

    # customization notes: delete sections you are not interested in
    # set oldest_article to the maximum number of days back from today to include articles
    sectionlist = [
                        ['http://www2.macleans.ca/','Front Page'],
                        ['http://www2.macleans.ca/category/canada/','Canada'],
                        ['http://www2.macleans.ca/category/world-from-the-magazine/','World'],
                        ['http://www2.macleans.ca/category/business','Business'],
                        ['http://www2.macleans.ca/category/arts-culture/','Culture'],
                        ['http://www2.macleans.ca/category/opinion','Opinion'],
                        ['http://www2.macleans.ca/category/health-from-the-magazine/','Health'],
                        ['http://www2.macleans.ca/category/environment-from-the-magazine/','Environment'],
                        ['http://www2.macleans.ca/category/education/','On Campus'],
                        ['http://www2.macleans.ca/category/travel-from-the-magazine/','Travel']
                    ]
    oldest_article = 7

    # formatting for print version of articles
    extra_css   =   '''h2{font-family:Times,serif; font-size:large;}
                        small {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
                    '''

    # tag handling for print version of articles
    keep_only_tags = [dict(id='tw-print')]
    remove_tags = [dict(attrs={'class':'postmetadata'})]

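    # pull images out of <a> wrappers and, when an image shares a <p>
    # with text, move the image into its own paragraph ahead of the text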
    def preprocess_html(self,soup):
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent
            if parent_tag.name == 'a':
                # replace the enclosing link with a plain paragraph
                new_tag = Tag(soup,'p')
                new_tag.insert(0,img_tag)
                parent_tag.replaceWith(new_tag)
            elif parent_tag.name == 'p':
                if self.tag_to_string(parent_tag) != '':
                    # image shares a paragraph with text: wrap the image
                    # in its own <p> placed before the text paragraph
                    new_div = Tag(soup,'div')
                    new_tag = Tag(soup,'p')
                    new_tag.insert(0,img_tag)
                    parent_tag.replaceWith(new_div)
                    new_div.insert(0,new_tag)
                    new_div.insert(1,parent_tag)
        return soup

    def parse_index(self):

        articles = {}   # section title -> list of article dicts
        ans = []        # section titles, in sectionlist order

        def parse_index_page(page_url,page_title):

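            # parse bylines dated in the form 'Monday, January 25, 2010'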
            def decode_date(datestr):
                dmysplit = datestr.strip().lower().split(',')
                mdsplit = dmysplit[1].split()
                m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(mdsplit[0])+1
                d = int(mdsplit[1])
                y = int(dmysplit[2].split()[0])
                return date(y,m,d)

            def article_title(tag):
                atag = tag.find('a',href=True)
                if not atag:
                    return ''
                return self.tag_to_string(atag)

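            # article links get 'print/' appended to fetch the
            # printer-friendly page that keep_only_tags targets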
            def article_url(tag):
                atag = tag.find('a',href=True)
                if not atag:
                    return ''
                return atag['href']+'print/'

            def article_description(tag):
                for p_tag in tag.findAll('p'):
                    d = self.tag_to_string(p_tag,False)
                    if d != '':
                        return d
                return ''

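            # join paired heading tags into a single compound title,
            # separated by an em dash (u'\u2014')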
            def compound_h4_h3_title(tag):
                if tag.h4:
                    if tag.h3:
                        return self.tag_to_string(tag.h4,False)+u'\u2014'+self.tag_to_string(tag.h3,False)
                    else:
                        return self.tag_to_string(tag.h4,False)
                elif tag.h3:
                    return self.tag_to_string(tag.h3,False)
                else:
                    return ''

            def compound_h2_h4_title(tag):
                if tag.h2:
                    if tag.h4:
                        return self.tag_to_string(tag.h2,False)+u'\u2014'+self.tag_to_string(tag.h4,False)
                    else:
                        return self.tag_to_string(tag.h2,False)
                elif tag.h4:
                    return self.tag_to_string(tag.h4,False)
                else:
                    return ''

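            # section-page entries carry an h4 byline of the form
            # 'Author - Date'; articles older than oldest_article days are skipped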
            def handle_article(header_tag, outer_tag):
                if header_tag:
                    url = article_url(header_tag)
                    title = article_title(header_tag)
                    author_date_tag = outer_tag.h4
                    if author_date_tag:
                        author_date = self.tag_to_string(author_date_tag,False).split(' - ')
                        author = author_date[0].strip()
                        article_date = decode_date(author_date[1])
                        earliest_date = date.today() - timedelta(days=self.oldest_article)
                        if article_date < earliest_date:
                            self.log("Skipping article dated %s" % author_date[1])
                        else:
                            excerpt_div = outer_tag.find('div','excerpt')
                            if excerpt_div:
                                description = article_description(excerpt_div)
                            else:
                                description = ''
                            if page_title not in articles:
                                articles[page_title] = []
                            articles[page_title].append(dict(title=title,url=url,date=author_date[1],description=description,author=author,content=''))

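            # front-page category entries have no visible date; the author
            # sits in a span with class 'authorLink', which is removed
            # before the description is extracted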
            def handle_category_article(cat, header_tag, outer_tag):
                url = article_url(header_tag)
                title = article_title(header_tag)
                if title != '':
                    title = cat+u'\u2014'+title
                a_tag = outer_tag.find('span','authorLink')
                if a_tag:
                    author = self.tag_to_string(a_tag,False)
                    a_tag.parent.extract()
                else:
                    author = ''
                description = article_description(outer_tag)
                if page_title not in articles:
                    articles[page_title] = []
                articles[page_title].append(dict(title=title,url=url,date='',description=description,author=author,content=''))

            soup = self.index_to_soup(page_url)

            if page_title == 'Front Page':
                # special processing for the front page
                top_stories = soup.find('div',{ "id" : "macleansFeatured" })
                if top_stories:
                    for div_slide in top_stories.findAll('div','slide'):
                        url = article_url(div_slide)
                        div_title = div_slide.find('div','header')
                        if div_title:
                            title = self.tag_to_string(div_title,False)
                        else:
                            title = ''
                        description = article_description(div_slide)
                        if page_title not in articles:
                            articles[page_title] = []
                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))

                from_macleans = soup.find('div',{ "id" : "fromMacleans" })
                if from_macleans:
                    for li_tag in from_macleans.findAll('li','fromMacleansArticle'):
                        title = compound_h4_h3_title(li_tag)
                        url = article_url(li_tag)
                        description = article_description(li_tag)
                        if page_title not in articles:
                            articles[page_title] = []
                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))

                blog_central = soup.find('div',{ "id" : "bloglist" })
                if blog_central:
                    for li_tag in blog_central.findAll('li'):
                        title = compound_h2_h4_title(li_tag)
                        if li_tag.h4:
                            url = article_url(li_tag.h4)
                            if page_title not in articles:
                                articles[page_title] = []
                            articles[page_title].append(dict(title=title,url=url,date='',description='',author='',content=''))

#                the needToKnow block below is disabled; re-enabling it
#                also requires adding 'import re' at the top of the file
#                need_to_know = soup.find('div',{ "id" : "needToKnow" })
#                if need_to_know:
#                    for div_tag in need_to_know('div',attrs={'class' : re.compile("^needToKnowArticle")}):
#                        title = compound_h4_h3_title(div_tag)
#                        url = article_url(div_tag)
#                        description = article_description(div_tag)
#                        if page_title not in articles:
#                            articles[page_title] = []
#                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))

                for news_category in soup.findAll('div','newsCategory'):
                    news_cat = self.tag_to_string(news_category.h4,False)
                    handle_category_article(news_cat, news_category.find('h2'), news_category.find('div'))
                    for news_item in news_category.findAll('li'):
                        handle_category_article(news_cat,news_item.h3,news_item)

                return

            # find the div containing the highlight article
            div_post = soup.find('div','post')
            if div_post:
                h1_tag = div_post.h1
                handle_article(h1_tag,div_post)

                # find the divs containing the rest of the articles
                # (kept inside the check above so section pages without a
                # highlight post don't raise an AttributeError)
                div_other = div_post.find('div', { "id" : "categoryOtherPosts" })
                if div_other:
                    for div_entry in div_other.findAll('div','entry'):
                        h2_tag = div_entry.h2
                        handle_article(h2_tag,div_entry)

        for page_name,page_title in self.sectionlist:
            parse_index_page(page_name,page_title)
            ans.append(page_title)

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans