Maximum CD 2010 November

home *** CD-ROM | disk | FTP | other *** search

/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_3625 < prev next >

Wrap

Text File | 2010-01-23 | 5.1 KB | 132 lines

##
## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
calibre news recipe for Barrons.com.

Defines the ``Barrons`` recipe class, which logs in to the site when a
username and password are available and fetches the listed RSS feeds.
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class Barrons(BasicNewsRecipe):
    '''Recipe that downloads Barron's articles from the feeds in get_feeds().'''

    title = 'Barron\'s'
    max_articles_per_feed = 50
    needs_subscription = True
    language = 'en'

    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
    timefmt = ' [%a, %b %d, %Y]'
    use_embedded_content = False
    no_stylesheets = True
    # Raw string: keeps the literal backslash before '?' for the regex engine
    # without relying on Python passing through an unknown string escape.
    match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    conversion_options = {'linearize_tables': True}
    ##delay = 1

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    extra_css = '''
        .datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
        h3{font-family:Georgia,"Times New Roman",Times,serif; }
        h2{font-family:Georgia,"Times New Roman",Times,serif; }
        h1{ font-family:Georgia,"Times New Roman",Times,serif; }
        .byline{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
        .subhead{font-family:Georgia,"Times New Roman",Times,serif; font-size: small;}
        .articlePage{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
        .insettipUnit{font-size: x-small;}
        '''

    remove_tags = [
        dict(name='div', attrs={'class': ['tabContainer artTabbedNav', 'rssToolBox hidden', 'articleToolbox']}),
        dict(name='a', attrs={'class': 'insetClose'}),
    ]

    # Each (pattern, replacement) pair is compiled case-insensitively with
    # DOTALL so that '.' also spans newlines in the page markup.
    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),

            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match: '<p'),

            ## Remove any reprint info from the body of the article.
            (r'<hr size.*?<p', lambda match: '<p'),

            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match: '</body>'),
        ]
    ]

    def get_browser(self):
        '''Return a browser, submitting the login form first when both
        self.username and self.password are set.'''
        # NOTE(review): called on the class without an instance; this assumes
        # the base get_browser accepts being called that way -- confirm
        # against the calibre version in use.
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://commerce.barrons.com/auth/login')
            br.select_form(name='login_form')
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    ## Use the print version of a page when available.
    def print_version(self, url):
        '''Return url with everything from its last '?' onwards replaced by
        '#printmode'. A url without any '?' just gets '#printmode' appended.'''
        main, sep, _ = url.rpartition('?')
        if not sep:
            # rpartition puts the whole string in the last slot when the
            # separator is absent; without this the URL itself would be lost.
            main = url
        return main + '#printmode'

    def postprocess_html(self, soup, first):
        '''Rename every <ul>/<li> tag to <div>, drop the div whose id is
        "articleThumbnail_1", and return the modified soup.'''
        for tag in soup.findAll(name=['ul', 'li']):
            tag.name = 'div'
        for tag in soup.findAll(name='div', attrs={'id': "articleThumbnail_1"}):
            tag.extract()
        return soup

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
    def get_feeds(self):
        '''Return the list of (feed title, feed URL) pairs to download.'''
        return [
            ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
            ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
            ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
            ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
            ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
            ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
        ]

    def get_article_url(self, article):
        '''Return the article's 'link' entry, or None when it has none.'''
        return article.get('link', None)

    def get_cover_url(self):
        '''Return the src of the first image inside the home page's
        "newsItem barronsMag" list, or None when it cannot be found.'''
        index = 'http://online.barrons.com/home-page'
        soup = self.index_to_soup(index)
        link_item = soup.find('ul', attrs={'class': 'newsItem barronsMag'})
        if not link_item:
            return None
        img = link_item.img
        if img is None:
            # The list exists but holds no <img>; there is no cover to report.
            return None
        return img.get('src')

    ## Logout of website
    ## NOT CURRENTLY WORKING
    # def cleanup(self):
    #     try:
    #         self.browser.set_debug_responses(True)
    #         import sys, logging
    #         logger = logging.getLogger("mechanize")
    #         logger.addHandler(logging.StreamHandler(sys.stdout))
    #         logger.setLevel(logging.INFO)
    #         res = self.browser.open('http://online.barrons.com/logout')
    #     except:
    #         import traceback
    #         traceback.print_exc()