- import re
- from calibre.web.feeds.recipes import BasicNewsRecipe
- from calibre.constants import config_dir, CONFIG_DIR_MODE
- import os, os.path, urllib
- from hashlib import md5
-
- class OfficeSpaceBlogHu(BasicNewsRecipe):
-     __author__ = 'Zsolt Botykai'
-     title = u'Irodai patkányok'
-     description = u"officespace.blog.hu"
-     oldest_article = 10000
-     max_articles_per_feed = 10000
-     reverse_article_order = True
-     language = 'hu'
-     remove_javascript = True
-     remove_empty_feeds = True
-     no_stylesheets = True
-     use_embedded_content = False
-     feeds = [(u'Office Space', u'http://officespace.blog.hu/rss')]
-
-     masthead_url = 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'
-
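-     # keep only the main content wrapper; everything outside it is discarded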
-     keep_only_tags = [
-         dict(name='div', attrs={'id':['mainWrapper']})
-     ]
-
-     # 1.: drop align="left" so paragraphs stay justified
-     # 2.: remove empty paragraphs
-     # 3.: drop header and sidebar
-     # 4.: drop comments counter
-     # 5.: drop everything after article-tags
-     # 6-8.: drop audit images
-
-     preprocess_regexps = [
-         (re.compile(r'<p align="left"'), lambda m: '<p'),
-         (re.compile(r'<p>( |&nbsp;)*?</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-         (re.compile(r'<body[^>]+>.*?<div id="mainIn"', re.DOTALL|re.IGNORECASE), lambda match: '<body><div id="mainIn"'),
-         (re.compile(r'<h3 class="comments">.*?</h3>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-         (re.compile(r'<div class="related">.*?</body>', re.DOTALL|re.IGNORECASE), lambda match: '<body>'),
-         (re.compile(r'<img style="position: absolute;" src="[^"]+pixel\?uc.*?>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-         (re.compile(r'<noscript.+?noscript>', re.DOTALL|re.IGNORECASE), lambda m: ''),
-         (re.compile(r'<img style="position: absolute;top:-10px.+?>', re.DOTALL|re.IGNORECASE), lambda m: ''),
-     ]
-     extra_css = '''
-         body { background-color: white; color: black }
-     '''
-
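-     # the site has no separate cover image, so the masthead doubles as the cover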
-     def get_cover_url(self):
-         return 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'
-
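-     # Turn the tag links into plain comma-separated text, then replace every
-     # remaining anchor with its text so the e-book contains no links.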
-     def preprocess_html(self, soup):
-         for tagz in soup.findAll('h3', attrs={'class':'tags'}):
-             for taglink in tagz.findAll('a'):
-                 if taglink.string is not None:
-                     tstr = taglink.string + ','
-                     taglink.replaceWith(tstr)
-
-         for alink in soup.findAll('a'):
-             if alink.string is not None:
-                 tstr = alink.string
-                 alink.replaceWith(tstr)
-
-         return soup
-
-     # As seen here: http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
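-     # Stores a hash of every downloaded article per feed under calibre's config
-     # directory, so articles that were already fetched once are skipped next time.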
-     def parse_feeds(self):
-         recipe_dir = os.path.join(config_dir, 'recipes')
-         hash_dir = os.path.join(recipe_dir, 'recipe_storage')
-         feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
-         if not os.path.isdir(feed_dir):
-             os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)
-
-         feeds = BasicNewsRecipe.parse_feeds(self)
-
-         for feed in feeds:
-             feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
-             feed_fn = os.path.join(feed_dir, feed_hash)
-
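-             # hashes of articles seen on previous runs of this feed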
-             past_items = set()
-             if os.path.exists(feed_fn):
-                 with open(feed_fn) as f:
-                     for h in f:
-                         past_items.add(h.strip())
-
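-             # hash every article in the current run and drop the ones already seen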
-             cur_items = set()
-             for article in feed.articles[:]:
-                 item_hash = md5()
-                 if article.content: item_hash.update(article.content.encode('utf-8'))
-                 if article.summary: item_hash.update(article.summary.encode('utf-8'))
-                 item_hash = item_hash.hexdigest()
-                 if article.url:
-                     item_hash = article.url + ':' + item_hash
-                 cur_items.add(item_hash)
-                 if item_hash in past_items:
-                     feed.articles.remove(article)
-             # remember everything seen in this run for the next one
-             with open(feed_fn, 'w') as f:
-                 for h in cur_items:
-                     f.write(h + '\n')
-
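-         # honour remove_empty_feeds now that duplicate articles have been dropped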
-         remove = [f for f in feeds if len(f) == 0 and
-                   self.remove_empty_feeds]
-         for f in remove:
-             feeds.remove(f)
-
-         return feeds
-
-