# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

import time
import traceback
import copy
import re

from lxml import html

from calibre.web.feeds.feedparser import parse
from calibre.utils.logging import default_log
from calibre import entity_to_unicode, strftime
from calibre.utils.date import dt_factory, utcnow, local_tz

# NB: _ (the gettext translation function) and dynamic_property are not
# imported here; they are made available globally by the calibre runtime.


class Article(object):

    def __init__(self, id, title, url, author, summary, published, content):
        self.downloaded = False
        self.id = id
        self._title = title.strip() if title else title
        try:
            self._title = re.sub(r'&(\S+?);', entity_to_unicode, self._title)
        except:
            pass
        if not isinstance(self._title, unicode):
            self._title = self._title.decode('utf-8', 'replace')
        self.url = url
        self.author = author
        if author and not isinstance(author, unicode):
            author = author.decode('utf-8', 'replace')
        self.summary = summary
        if summary and not isinstance(summary, unicode):
            summary = summary.decode('utf-8', 'replace')
        if summary and '<' in summary:
            # Reduce an HTML summary to plain text; if lxml cannot parse it,
            # log the failure and discard the summary.
            try:
                s = html.fragment_fromstring(summary, create_parent=True)
                summary = html.tostring(s, method='text', encoding=unicode)
            except:
                print 'Failed to process article summary, deleting:'
                print summary.encode('utf-8')
                traceback.print_exc()
                summary = u''
        self.text_summary = summary
        self.author = author
        self.content = content
        self.date = published
        self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
        self.localtime = self.utctime.astimezone(local_tz)
        self._formatted_date = None

    @dynamic_property
    def formatted_date(self):

        def fget(self):
            if self._formatted_date is None:
                self._formatted_date = strftime(' [%a, %d %b %H:%M]',
                        t=self.localtime.timetuple())
            return self._formatted_date

        def fset(self, val):
            if isinstance(val, unicode):
                self._formatted_date = val

        return property(fget=fget, fset=fset)

    @dynamic_property
    def title(self):

        def fget(self):
            t = self._title
            if not isinstance(t, unicode) and hasattr(t, 'decode'):
                t = t.decode('utf-8', 'replace')
            return t

        def fset(self, val):
            self._title = val

        return property(fget=fget, fset=fset)

    def __repr__(self):
        return (u'Title : %s\nURL : %s\nAuthor : %s\nSummary : %s\nDate : %s\nHas content : %s\n' %
                (self.title, self.url, self.author, self.summary[:20] + '...',
                 self.localtime.strftime('%a, %d %b, %Y %H:%M'),
                 bool(self.content))).encode('utf-8')

    def __str__(self):
        return repr(self)

    def is_same_as(self, other_article):
        if self.url:
            return self.url == getattr(other_article, 'url', False)
        return self.content == getattr(other_article, 'content', False)

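
# Feed holds the Article objects belonging to a single feed. It can be
# populated either from a feedparser result (populate_from_feed) or from a
# pre-parsed list of article dictionaries (populate_from_preparsed_feed);
# both paths apply the oldest_article cutoff and the max_articles_per_feed cap.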
class Feed(object):

    def __init__(self, get_article_url=lambda item: item.get('link', None),
                 log=default_log):
        self.logger = log
        self.get_article_url = get_article_url

    def populate_from_feed(self, feed, title=None, oldest_article=7,
                           max_articles_per_feed=100):
        entries = feed.entries
        feed = feed.feed
        # Fall back to the title advertised by the feed when none is supplied.
        self.title = feed.get('title', _('Unknown feed')) if not title else title
        self.description = feed.get('description', '')
        image = feed.get('image', {})
        self.image_url = image.get('href', None)
        self.image_width = image.get('width', 88)
        self.image_height = image.get('height', 31)
        self.image_alt = image.get('title', '')

        self.articles = []
        self.id_counter = 0
        self.added_articles = []
        self.oldest_article = oldest_article

        for item in entries:
            if len(self.articles) >= max_articles_per_feed:
                break
            self.parse_article(item)

    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
                                     max_articles_per_feed=100):
        self.title = unicode(title) if title else _('Unknown feed')
        self.description = ''
        self.image_url = None
        self.articles = []
        self.added_articles = []
        self.oldest_article = oldest_article
        self.id_counter = 0

        for item in articles:
            if len(self.articles) >= max_articles_per_feed:
                break
            id = item.get('id', 'internal id#' + str(self.id_counter))
            if id in self.added_articles:
                return
            self.added_articles.append(id)
            self.id_counter += 1
            published = time.gmtime(item.get('timestamp', time.time()))
            title = item.get('title', _('Untitled article'))
            link = item.get('url', None)
            description = item.get('description', '')
            content = item.get('content', '')
            author = item.get('author', '')
            article = Article(id, title, link, author, description,
                              published, content)
            # Keep only articles no older than oldest_article days.
            delta = utcnow() - article.utctime
            if delta.days * 24 * 3600 + delta.seconds <= 24 * 3600 * self.oldest_article:
                self.articles.append(article)
            d = item.get('date', '')
            article.formatted_date = d

    def parse_article(self, item):
        id = item.get('id', 'internal id#' + str(self.id_counter))
        if id in self.added_articles:
            return
        published = item.get('date_parsed', time.gmtime())
        if not published:
            published = time.gmtime()
        self.id_counter += 1
        self.added_articles.append(id)
        title = item.get('title', _('Untitled article'))
        try:
            link = self.get_article_url(item)
        except:
            self.logger.warning('Failed to get link for %s' % title)
            self.logger.debug(traceback.format_exc())
            link = None
        description = item.get('summary', None)
        author = item.get('author', None)
        # Feedparser stores the body as a list of content objects; join their
        # values into a single unicode string.
        content = [i.value for i in item.get('content', []) if i.value]
        content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')
                   for i in content]
        content = u'\n'.join(content)
        if not link and not content:
            return
        article = Article(id, title, link, author, description, published,
                          content)
        # Keep only articles no older than oldest_article days.
        delta = utcnow() - article.utctime
        if delta.days * 24 * 3600 + delta.seconds <= 24 * 3600 * self.oldest_article:
            self.articles.append(article)

    def reverse(self):
        self.articles.reverse()

    def __iter__(self):
        return iter(self.articles)

    def __len__(self):
        return len(self.articles)

    def __repr__(self):
        res = [('%20s\n' % '').replace(' ', '_') + repr(art) for art in self]
        return '\n' + '\n'.join(res) + '\n'

    def __str__(self):
        return repr(self)

    def __bool__(self):
        for article in self:
            if getattr(article, 'downloaded', False):
                return True
        return False

    def has_embedded_content(self):
        # Heuristic: the feed is considered to carry full article bodies if
        # the average content/summary length exceeds 2000 characters.
        length = 0
        for a in self:
            if a.content or a.summary:
                length += max(len(a.content if a.content else ''),
                              len(a.summary if a.summary else ''))
        return length > 2000 * len(self)

    def has_article(self, article):
        for a in self:
            if a.is_same_as(article):
                return True
        return False

    def find(self, article):
        for i, a in enumerate(self):
            if a.is_same_as(article):
                return i
        return -1

    def remove(self, article):
        # find() returns -1 when the article is not present.
        i = self.find(article)
        if i > -1:
            self.articles[i:i + 1] = []

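
# FeedCollection is a list of the non-empty feeds that removes articles
# appearing in more than one feed, remembering them in self.duplicates so
# that restore_duplicates() can later re-add each one to its feed as a stub
# whose url points at the location of the copy that was kept.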
class FeedCollection(list):

    def __init__(self, feeds):
        # Keep only feeds that actually contain articles.
        list.__init__(self, [f for f in feeds if len(f.articles) > 0])
        found_articles = set([])
        duplicates = set([])

        def in_set(s, a):
            for x in s:
                if a.is_same_as(x):
                    return x
            return None

        print '#feeds', len(self)
        print map(len, self)
        for f in self:
            dups = []
            for a in f:
                first = in_set(found_articles, a)
                if first is not None:
                    dups.append(a)
                    duplicates.add((first, f))
                else:
                    found_articles.add(a)
            for x in dups:
                f.articles.remove(x)

        self.duplicates = duplicates
        print len(duplicates)
        print map(len, self)

    def find_article(self, article):
        for j, f in enumerate(self):
            for i, a in enumerate(f):
                if a is article:
                    return (j, i)

    def restore_duplicates(self):
        temp = []
        for article, feed in self.duplicates:
            art = copy.deepcopy(article)
            j, i = self.find_article(article)
            art.url = '../feed_%d/article_%d/index.html' % (j, i)
            temp.append((feed, art))
        for feed, art in temp:
            feed.articles.append(art)


def feed_from_xml(raw_xml, title=None, oldest_article=7,
                  max_articles_per_feed=100,
                  get_article_url=lambda item: item.get('link', None),
                  log=default_log):
    feed = parse(raw_xml)
    pfeed = Feed(get_article_url=get_article_url, log=log)
    pfeed.populate_from_feed(feed, title=title, oldest_article=oldest_article,
                             max_articles_per_feed=max_articles_per_feed)
    return pfeed


def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
                     log=default_log):
    feeds = []
    for title, articles in index:
        pfeed = Feed(log=log)
        pfeed.populate_from_preparsed_feed(title, articles,
                oldest_article=oldest_article,
                max_articles_per_feed=max_articles_per_feed)
        feeds.append(pfeed)
    return feeds
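

# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original module. It assumes a
# working calibre environment on Python 2; the RSS document below is invented
# purely for demonstration.
if __name__ == '__main__':
    SAMPLE_RSS = '''<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
  <channel>
    <title>Example feed</title>
    <item>
      <title>First article</title>
      <link>http://example.com/articles/1</link>
      <description>A short plain-text summary of the first article.</description>
    </item>
  </channel>
</rss>'''

    demo_feed = feed_from_xml(SAMPLE_RSS, title=u'Example feed',
            oldest_article=7, max_articles_per_feed=10)
    print demo_feed
    print 'Has embedded content:', demo_feed.has_embedded_content()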