home *** CD-ROM | disk | FTP | other *** search
- ##
- ## web2lrf profile to download articles from Barrons.com
- ## can download subscriber-only content if username and
- ## password are supplied.
- ##
- '''
- '''
-
- import re
-
- from calibre.web.feeds.news import BasicNewsRecipe
-
- class Barrons(BasicNewsRecipe):
-
- title = 'Barron\'s'
- max_articles_per_feed = 50
- needs_subscription = True
- language = 'en'
-
- __author__ = 'Kovid Goyal and Sujata Raman'
- description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
- timefmt = ' [%a, %b %d, %Y]'
- use_embedded_content = False
- no_stylesheets = True
- match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
- conversion_options = {'linearize_tables': True}
- ##delay = 1
-
- ## Don't grab articles more than 7 days old
- oldest_article = 7
-
- extra_css = '''
- .datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
- h3{font-family:Georgia,"Times New Roman",Times,serif; }
- h2{font-family:Georgia,"Times New Roman",Times,serif; }
- h1{ font-family:Georgia,"Times New Roman",Times,serif; }
- .byline{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
- .subhead{font-family:Georgia,"Times New Roman",Times,serif; font-size: small;}
- .articlePage{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
- .insettipUnit{font-size: x-small;}
- '''
- remove_tags = [
- dict(name ='div', attrs={'class':['tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
- dict(name = 'a', attrs ={'class':'insetClose'})
- ]
-
- preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
- [
- ## Remove anything before the body of the article.
- (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
-
- ## Remove any insets from the body of the article.
- (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
-
- ## Remove any reprint info from the body of the article.
- (r'<hr size.*?<p', lambda match : '<p'),
-
- ## Remove anything after the end of the article.
- (r'<!-- article end.*?</body>', lambda match : '</body>'),
- ]
- ]
-
- def get_browser(self):
- br = BasicNewsRecipe.get_browser()
- if self.username is not None and self.password is not None:
- br.open('http://commerce.barrons.com/auth/login')
- br.select_form(name='login_form')
- br['user'] = self.username
- br['password'] = self.password
- br.submit()
- return br
-
- ## Use the print version of a page when available.
-
- def print_version(self, url):
- main, sep, rest = url.rpartition('?')
- return main + '#printmode'
-
- def postprocess_html(self, soup, first):
-
- for tag in soup.findAll(name=['ul', 'li']):
- tag.name = 'div'
- for tag in soup.findAll(name ='div', attrs={'id': "articleThumbnail_1"}):
- tag.extract()
-
- return soup
-
- ## Comment out the feeds you don't want retrieved.
- ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
-
- def get_feeds(self):
- return [
- ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
- ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
- ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
- ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
- ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
- ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
- ]
-
- def get_article_url(self, article):
- return article.get('link', None)
-
-
- def get_cover_url(self):
- cover_url = None
- index = 'http://online.barrons.com/home-page'
- soup = self.index_to_soup(index)
- link_item = soup.find('ul',attrs={'class':'newsItem barronsMag'})
- if link_item:
- cover_url = link_item.img['src']
- return cover_url
-
-
- ## Logout of website
- ## NOT CURRENTLY WORKING
- # def cleanup(self):
- # try:
- # self.browser.set_debug_responses(True)
- # import sys, logging
- # logger = logging.getLogger("mechanize")
- # logger.addHandler(logging.StreamHandler(sys.stdout))
- # logger.setLevel(logging.INFO)
-
- # res = self.browser.open('http://online.barrons.com/logout')
- # except:
- # import traceback
- # traceback.print_exc()
-
-
-
-