home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/env python
- __license__ = 'GPL v3'
- __author__ = 'Lorenzo Vigentini, based on Darko Miletic'
- __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
- __version__ = 'v1.02'
- __date__ = '14, March 2010'
- __description__ = 'Italian daily newspaper (english version)'
- # NOTE: the article URLs in the feed are broken because the permalink structure on the main site was changed erroneously, i.e.:
- # actual link in the feed: http://www.corriere.it/english/10_marzo_11/legitimate_impediment_approved_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml
- # this needs to be changed to the
- # real article URL: http://www.corriere.it/International/english/articoli/2010/03/11/legitimate_impediment_approved.shtml
- '''
- http://www.corriere.it/
- '''
-
- from calibre.web.feeds.news import BasicNewsRecipe
-
class ilCorriereEn(BasicNewsRecipe):
    """Recipe for the English edition of Il Corriere della Sera.

    The links published in the RSS feed use an old permalink layout, so
    ``get_article_url`` rewrites each of them into the layout found under
    ``International/english/articoli/``.
    """

    # Recipe metadata.
    author = 'Lorenzo Vigentini, based on Darko Miletic'
    description = 'Italian daily newspaper (english version)'

    cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
    title = u'Il Corriere della sera (english) '
    publisher = 'RCS Digital'
    category = 'News, politics, culture, economy, general interest'

    language = 'en'
    timefmt = '[%a, %d %b, %Y]'

    # Download limits (semantics defined by BasicNewsRecipe).
    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content = False
    recursion = 10

    # Strip scripts and site stylesheets from the fetched pages.
    remove_javascript = True
    no_stylesheets = True

    # Italian month names as they appear in feed links, mapped to the
    # two-digit month numbers used by the working article URLs.
    _MONTH_NUMBERS = {
        'gennaio': '01',
        'febbraio': '02',
        'marzo': '03',
        'aprile': '04',
        'maggio': '05',
        'giugno': '06',
        'luglio': '07',
        'agosto': '08',
        'settembre': '09',
        'ottobre': '10',
        'novembre': '11',
        'dicembre': '12',
    }

    def get_article_url(self, article):
        """Return the working article URL for a feed entry.

        Feed links look like
            http://www.corriere.it/english/10_marzo_11/<slug>_<id>.shtml
        and are rewritten to
            http://www.corriere.it/International/english/articoli/2010/03/11/<slug>.shtml

        article: the feed entry; only its ``link`` value (read with
            ``get``) is used.

        Returns the rewritten URL. If the entry has no link, or the link
        does not follow the old layout (too few path segments, no
        ``yy_month_dd`` component, unknown month name), the link is
        returned unchanged instead of raising.
        """
        link = article.get('link')
        if not link:
            return link

        # 'http://host/english/<date>/<file>' splits into
        # ['http:', '', host, 'english', <date>, <file>].
        segments = link.split('/')
        if len(segments) < 6:
            return link

        # The date component is 'yy_monthname_dd', e.g. '10_marzo_11'.
        date_parts = segments[4].split('_')
        if len(date_parts) < 3:
            return link

        # Dict lookup covers all twelve months (December included).
        month = self._MONTH_NUMBERS.get(date_parts[1])
        if month is None:
            return link

        base = '/'.join(segments[:3]) + '/' + 'International/english/articoli/'
        date_path = '20' + date_parts[0] + '/' + month + '/' + date_parts[2] + '/'

        # Clean the article title: keep the text before the first '-' and
        # drop its last nine characters (the '_' plus the eight-character
        # first group of the trailing id, e.g. '_de9ba480').
        # NOTE(review): a slug that itself contains '-' is truncated here;
        # no such link is known, so the original slicing is kept.
        slug = segments[5].split('-')[0][:-9]

        return base + date_path + slug + '.shtml'

    # Keep only the article container of the page.
    keep_only_tags = [dict(name='div', attrs={'class': ['news-dettaglio article', 'article']})]

    # Drop embedded objects and the navigation/toolbar widgets.
    remove_tags = [
        dict(name=['base', 'object', 'link', 'embed']),
        dict(name='div', attrs={'class': 'news-goback'}),
        dict(name='ul', attrs={'class': 'toolbar'})
    ]

    # Discard everything that follows the footnotes paragraph.
    remove_tags_after = dict(name='p', attrs={'class': 'footnotes'})

    feeds = [
        (u'News', u'http://www.corriere.it/rss/english.xml')
    ]
-