__license__   = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
arstechnica.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

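# Note: calibre.ebooks.BeautifulSoup is calibre's bundled BeautifulSoup 3 fork;
# the Tag(soup, 'br') constructor and the fromEncoding argument used below are
# BS3-era APIs and will not work against an external BeautifulSoup 4 install.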
class ArsTechnica2(BasicNewsRecipe):
    title                 = u'Ars Technica'
    language              = 'en'
    __author__            = 'Darko Miletic and Sujata Raman'
    description           = 'The art of technology'
    publisher             = 'Ars Technica'
    category              = 'news, IT, technology'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
    extra_css             = '''
        body {font-family: Arial,Helvetica,sans-serif}
        .title {text-align: left}
        .byline {font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
    '''

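    # Handed straight to the conversion pipeline: these keys become the
    # generated ebook's metadata (comments, tags, language, publisher).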
    conversion_options = {
        'comments'  : description
        ,'tags'      : category
        ,'language'  : language
        ,'publisher' : publisher
    }

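    # Fixups applied to the raw page source before parsing: normalize the
    # figure-container class (the site appears to use suffixed variants) and
    # empty the <head> of everything after </title>, i.e. inline scripts and
    # styles the recipe never uses.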
    preprocess_regexps = [
        # match through the closing quote so suffixed class variants collapse
        # to the plain class instead of leaving a stray attribute fragment
        (re.compile(r'<div class="news-item-figure[^"]*"', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
        ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
    ]

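    # Scrape scope: keep only the story containers, then prune embedded media,
    # the "read more" teaser, and fixed image dimensions so content reflows.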
    keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]

    remove_tags = [
        dict(name=['object','link','embed'])
        ,dict(name='div', attrs={'class':'read-more-link'})
    ]
    remove_attributes = ['width','height']

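    # One (section title, RSS url) pair per Ars Technica section; each becomes
    # a section in the generated ebook.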
    feeds = [
        (u'Infinite Loop (Apple content)'          , u'http://feeds.arstechnica.com/arstechnica/apple/'      )
        ,(u'Opposable Thumbs (Gaming content)'     , u'http://feeds.arstechnica.com/arstechnica/gaming/'     )
        ,(u'Gear and Gadgets'                      , u'http://feeds.arstechnica.com/arstechnica/gadgets/'    )
        ,(u'Chipster (Hardware content)'           , u'http://feeds.arstechnica.com/arstechnica/hardware/'   )
        ,(u'Uptime (IT content)'                   , u'http://feeds.arstechnica.com/arstechnica/business/'   )
        ,(u'Open Ended (Open Source content)'      , u'http://feeds.arstechnica.com/arstechnica/open-source/')
        ,(u'One Microsoft Way'                     , u'http://feeds.arstechnica.com/arstechnica/microsoft/'  )
        ,(u'Nobel Intent (Science content)'        , u'http://feeds.arstechnica.com/arstechnica/science/'    )
        ,(u'Law & Disorder (Tech policy content)'  , u'http://feeds.arstechnica.com/arstechnica/tech-policy/')
    ]

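    # Ars splits long stories across pages. Follow the pager's 'Next' link,
    # recurse to collect any further pages, then splice each continuation
    # body into the page being built.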
    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'class':'pager'})
        if pager:
            for atag in pager.findAll('a', href=True):
                link_text = self.tag_to_string(atag)  # do not shadow the builtin str
                if link_text.startswith('Next'):
                    nurl = 'http://arstechnica.com' + atag['href']
                    rawc = self.index_to_soup(nurl, True)
                    soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)

                    readmoretag = soup2.find('div', attrs={'class':'read-more-link'})
                    if readmoretag:
                        readmoretag.extract()
                    texttag = soup2.find('div', attrs={'class':'body'})
                    if texttag is None:  # malformed continuation page; skip it
                        continue
                    for it in texttag.findAll(style=True):
                        del it['style']

                    newpos = len(texttag.contents)
                    self.append_page(soup2, texttag, newpos)
                    texttag.extract()
                    pager.extract()
                    appendtag.insert(position, texttag)

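    # Per-article cleanup: space out the byline with line breaks, strip inline
    # styles, and pull any continuation pages into the article body.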
    def preprocess_html(self, soup):
        ftag = soup.find('div', attrs={'class':'byline'})
        if ftag:
            brtag  = Tag(soup, 'br')
            brtag2 = Tag(soup, 'br')
            ftag.insert(4, brtag)
            ftag.insert(5, brtag2)

        for item in soup.findAll(style=True):
            del item['style']

        self.append_page(soup, soup.body, 3)

        return soup

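    # The feed guid doubles as the article link, with a tracking query string
    # appended; prefer it over the link element, minus the query string.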
    def get_article_url(self, article):
        url = article.get('guid', None)
        if url is None:
            return None
        # keep the whole url when there is no '?' (rpartition yields '' then)
        return url.rpartition('?')[0] or url
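
# A quick way to test changes to this recipe is calibre's command line
# conversion tool (assuming the file is saved as arstechnica.recipe);
# --test limits the fetch to a couple of articles per feed:
#
#   ebook-convert arstechnica.recipe output.epub --test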