home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_708 (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2010-08-06  |  37.0 KB  |  755 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.6)
  3.  
  4. import unittest
  5. from BeautifulSoup import *
  6.  
  7. class SoupTest(unittest.TestCase):
  8.     
  9.     def assertSoupEquals(self, toParse, rep = None, c = BeautifulSoup, encoding = None):
  10.         if rep == None:
  11.             rep = toParse
  12.         
  13.         obj = c(toParse)
  14.         if encoding is None:
  15.             rep2 = obj.decode()
  16.         else:
  17.             rep2 = obj.encode(encoding)
  18.         self.assertEqual(rep2, rep)
  19.  
  20.  
  21.  
  22. class FollowThatTag(SoupTest):
  23.     
  24.     def setUp(self):
  25.         ml = '\n        <a id="x">1</a>\n        <A id="a">2</a>\n        <b id="b">3</a>\n        <b href="foo" id="x">4</a>\n        <ac width=100>4</ac>'
  26.         self.soup = BeautifulStoneSoup(ml)
  27.  
  28.     
  29.     def testFindAllByName(self):
  30.         matching = self.soup('a')
  31.         self.assertEqual(len(matching), 2)
  32.         self.assertEqual(matching[0].name, 'a')
  33.         self.assertEqual(matching, self.soup.findAll('a'))
  34.         self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))
  35.  
  36.     
  37.     def testFindAllByAttribute(self):
  38.         matching = self.soup.findAll(id = 'x')
  39.         self.assertEqual(len(matching), 2)
  40.         self.assertEqual(matching[0].name, 'a')
  41.         self.assertEqual(matching[1].name, 'b')
  42.         matching2 = self.soup.findAll(attrs = {
  43.             'id': 'x' })
  44.         self.assertEqual(matching, matching2)
  45.         strainer = SoupStrainer(attrs = {
  46.             'id': 'x' })
  47.         self.assertEqual(matching, self.soup.findAll(strainer))
  48.         self.assertEqual(len(self.soup.findAll(id = None)), 1)
  49.         self.assertEqual(len(self.soup.findAll(width = 100)), 1)
  50.         self.assertEqual(len(self.soup.findAll(junk = None)), 5)
  51.         self.assertEqual(len(self.soup.findAll(junk = [
  52.             1,
  53.             None])), 5)
  54.         self.assertEqual(len(self.soup.findAll(junk = re.compile('.*'))), 0)
  55.         self.assertEqual(len(self.soup.findAll(junk = True)), 0)
  56.         self.assertEqual(len(self.soup.findAll(junk = True)), 0)
  57.         self.assertEqual(len(self.soup.findAll(href = True)), 1)
  58.  
  59.     
  60.     def testFindallByClass(self):
  61.         soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
  62.         self.assertEqual(soup.find('a', '1').string, 'Bar')
  63.  
  64.     
  65.     def testFindAllByList(self):
  66.         matching = self.soup([
  67.             'a',
  68.             'ac'])
  69.         self.assertEqual(len(matching), 3)
  70.  
  71.     
  72.     def testFindAllByHash(self):
  73.         matching = self.soup({
  74.             'a': True,
  75.             'b': True })
  76.         self.assertEqual(len(matching), 4)
  77.  
  78.     
  79.     def testFindAllText(self):
  80.         soup = BeautifulSoup('<html>\xbb</html>')
  81.         self.assertEqual(soup.findAll(text = re.compile('.*')), [
  82.             u'┬╗'])
  83.  
  84.     
  85.     def testFindAllByRE(self):
  86.         import re
  87.         r = re.compile('a.*')
  88.         self.assertEqual(len(self.soup(r)), 3)
  89.  
  90.     
  91.     def testFindAllByMethod(self):
  92.         
  93.         def matchTagWhereIDMatchesName(tag):
  94.             return tag.name == tag.get('id')
  95.  
  96.         matching = self.soup.findAll(matchTagWhereIDMatchesName)
  97.         self.assertEqual(len(matching), 2)
  98.         self.assertEqual(matching[0].name, 'a')
  99.  
  100.     
  101.     def testParents(self):
  102.         soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
  103.         b = soup.b
  104.         self.assertEquals(len(b.findParents('ul', {
  105.             'id': 'foo' })), 2)
  106.         self.assertEquals(b.findParent('ul')['a'], 'b')
  107.  
  108.     PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')
  109.     
  110.     def testNext(self):
  111.         soup = self.PROXIMITY_TEST
  112.         b = soup.find('b', {
  113.             'id': 2 })
  114.         self.assertEquals(b.findNext('b')['id'], '3')
  115.         self.assertEquals(b.findNext('b')['id'], '3')
  116.         self.assertEquals(len(b.findAllNext('b')), 2)
  117.         self.assertEquals(len(b.findAllNext('b', {
  118.             'id': 4 })), 1)
  119.  
  120.     
  121.     def testPrevious(self):
  122.         soup = self.PROXIMITY_TEST
  123.         b = soup.find('b', {
  124.             'id': 3 })
  125.         self.assertEquals(b.findPrevious('b')['id'], '2')
  126.         self.assertEquals(b.findPrevious('b')['id'], '2')
  127.         self.assertEquals(len(b.findAllPrevious('b')), 2)
  128.         self.assertEquals(len(b.findAllPrevious('b', {
  129.             'id': 2 })), 1)
  130.  
  131.     SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')
  132.     
  133.     def testNextSibling(self):
  134.         soup = self.SIBLING_TEST
  135.         tag = 'blockquote'
  136.         b = soup.find(tag, {
  137.             'id': 2 })
  138.         self.assertEquals(b.findNext(tag)['id'], '2.1')
  139.         self.assertEquals(b.findNextSibling(tag)['id'], '3')
  140.         self.assertEquals(b.findNextSibling(tag)['id'], '3')
  141.         self.assertEquals(len(b.findNextSiblings(tag)), 2)
  142.         self.assertEquals(len(b.findNextSiblings(tag, {
  143.             'id': 4 })), 1)
  144.  
  145.     
  146.     def testPreviousSibling(self):
  147.         soup = self.SIBLING_TEST
  148.         tag = 'blockquote'
  149.         b = soup.find(tag, {
  150.             'id': 3 })
  151.         self.assertEquals(b.findPrevious(tag)['id'], '2.1')
  152.         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
  153.         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
  154.         self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
  155.         self.assertEquals(len(b.findPreviousSiblings(tag, id = 1)), 1)
  156.  
  157.     
  158.     def testTextNavigation(self):
  159.         soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
  160.         baz = soup.find(text = 'Baz')
  161.         self.assertEquals(baz.findParent('i')['id'], '1')
  162.         self.assertEquals(baz.findNext(text = 'Blee'), 'Blee')
  163.         self.assertEquals(baz.findNextSibling(text = 'Blee'), 'Blee')
  164.         self.assertEquals(baz.findNextSibling(text = 'Blargh'), None)
  165.         self.assertEquals(baz.findNextSibling('hr')['id'], '1')
  166.  
  167.  
  168.  
  169. class SiblingRivalry(SoupTest):
  170.     
  171.     def testSiblings(self):
  172.         soup = BeautifulSoup('<ul><li>1<p>A</p>B<li>2<li>3</ul>')
  173.         secondLI = soup.find('li').nextSibling
  174.         if secondLI.name == 'li':
  175.             pass
  176.         self.assert_(secondLI.string == '2')
  177.         self.assertEquals(soup.find(text = '1').nextSibling.name, 'p')
  178.         self.assertEquals(soup.find('p').nextSibling, 'B')
  179.         self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
  180.  
  181.  
  182.  
  183. class TagsAreObjectsToo(SoupTest):
  184.     
  185.     def testLen(self):
  186.         soup = BeautifulSoup('<top>1<b>2</b>3</top>')
  187.         self.assertEquals(len(soup.top), 3)
  188.  
  189.  
  190.  
  191. class StringEmUp(SoupTest):
  192.     
  193.     def testString(self):
  194.         s = BeautifulSoup('<b>foo</b>')
  195.         self.assertEquals(s.b.string, 'foo')
  196.  
  197.     
  198.     def testLackOfString(self):
  199.         s = BeautifulSoup('<b>f<i>e</i>o</b>')
  200.         self.assert_(not (s.b.string))
  201.  
  202.  
  203.  
  204. class ThatsMyLimit(SoupTest):
  205.     
  206.     def testBasicLimits(self):
  207.         s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
  208.         self.assertEquals(len(s.findAll('br')), 4)
  209.         self.assertEquals(len(s.findAll('br', limit = 2)), 2)
  210.         self.assertEquals(len(s('br', limit = 2)), 2)
  211.  
  212.  
  213.  
  214. class OnlyTheLonely(SoupTest):
  215.     
  216.     def setUp(self):
  217.         x = []
  218.         for i in range(1, 6):
  219.             x.append('<a id="%s">' % i)
  220.             for j in range(100, 103):
  221.                 x.append('<b id="%s.%s">Content %s.%s</b>' % (i, j, i, j))
  222.             
  223.             x.append('</a>')
  224.         
  225.         self.x = ''.join(x)
  226.  
  227.     
  228.     def testOnly(self):
  229.         strainer = SoupStrainer('b')
  230.         soup = BeautifulSoup(self.x, parseOnlyThese = strainer)
  231.         self.assertEquals(len(soup), 15)
  232.         strainer = SoupStrainer(id = re.compile('100.*'))
  233.         soup = BeautifulSoup(self.x, parseOnlyThese = strainer)
  234.         self.assertEquals(len(soup), 5)
  235.         strainer = SoupStrainer(text = re.compile('10[01].*'))
  236.         soup = BeautifulSoup(self.x, parseOnlyThese = strainer)
  237.         self.assertEquals(len(soup), 10)
  238.         strainer = SoupStrainer(text = (lambda x: x[8] == '3'))
  239.         soup = BeautifulSoup(self.x, parseOnlyThese = strainer)
  240.         self.assertEquals(len(soup), 3)
  241.  
  242.  
  243.  
  244. class PickleMeThis(SoupTest):
  245.     
  246.     def setUp(self):
  247.         self.page = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"\n"http://www.w3.org/TR/REC-html40/transitional.dtd">\n<html>\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>Beautiful Soup: We called him Tortoise because he taught us.</title>\n<link rev="made" href="mailto:leonardr@segfault.org">\n<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">\n<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">\n<meta name="author" content="Leonard Richardson">\n</head>\n<body>\n<a href="foo">foo</a>\n<a href="foo"><b>bar</b></a>\n</body>\n</html>'
  248.         self.soup = BeautifulSoup(self.page)
  249.  
  250.     
  251.     def testPickle(self):
  252.         import pickle
  253.         dumped = pickle.dumps(self.soup, 2)
  254.         loaded = pickle.loads(dumped)
  255.         self.assertEqual(loaded.__class__, BeautifulSoup)
  256.         self.assertEqual(loaded.decode(), self.soup.decode())
  257.  
  258.     
  259.     def testDeepcopy(self):
  260.         deepcopy = deepcopy
  261.         import copy
  262.         deepcopy(BeautifulSoup('<a></a>'))
  263.         copied = deepcopy(self.soup)
  264.         self.assertEqual(copied.decode(), self.soup.decode())
  265.  
  266.     
  267.     def testUnicodePickle(self):
  268.         import cPickle as pickle
  269.         html = '<b>' + chr(195) + '</b>'
  270.         soup = BeautifulSoup(html)
  271.         dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
  272.         loaded = pickle.loads(dumped)
  273.         self.assertEqual(loaded.decode(), soup.decode())
  274.  
  275.  
  276.  
  277. class WriteOnlyCode(SoupTest):
  278.     
  279.     def testModifyAttributes(self):
  280.         soup = BeautifulSoup('<a id="1"></a>')
  281.         soup.a['id'] = 2
  282.         self.assertEqual(soup.decode(), '<a id="2"></a>')
  283.         del soup.a['id']
  284.         self.assertEqual(soup.decode(), '<a></a>')
  285.         soup.a['id2'] = 'foo'
  286.         self.assertEqual(soup.decode(), '<a id2="foo"></a>')
  287.  
  288.     
  289.     def testNewTagCreation(self):
  290.         soup = BeautifulSoup()
  291.         a = Tag(soup, 'a')
  292.         ol = Tag(soup, 'ol')
  293.         a['href'] = 'http://foo.com/'
  294.         self.assertRaises((KeyError,), (lambda : ol['href']))
  295.  
  296.     
  297.     def testTagReplacement(self):
  298.         text = '<a><b></b><c>Foo<d></d></c></a><a><e></e></a>'
  299.         soup = BeautifulSoup(text)
  300.         c = soup.c
  301.         soup.c.replaceWith(c)
  302.         self.assertEquals(soup.decode(), text)
  303.         soup = BeautifulSoup('<b>Argh!</b>')
  304.         soup.find(text = 'Argh!').replaceWith('Hooray!')
  305.         newText = soup.find(text = 'Hooray!')
  306.         b = soup.b
  307.         self.assertEqual(newText.previous, b)
  308.         self.assertEqual(newText.parent, b)
  309.         self.assertEqual(newText.previous.next, newText)
  310.         self.assertEqual(newText.next, None)
  311.         soup = BeautifulSoup('<a><b>Argh!</b><c></c><d></d></a>')
  312.         soup.b.insert(1, 'Hooray!')
  313.         newText = soup.find(text = 'Hooray!')
  314.         self.assertEqual(newText.previous, 'Argh!')
  315.         self.assertEqual(newText.previous.next, newText)
  316.         self.assertEqual(newText.previousSibling, 'Argh!')
  317.         self.assertEqual(newText.previousSibling.nextSibling, newText)
  318.         self.assertEqual(newText.nextSibling, None)
  319.         self.assertEqual(newText.next, soup.c)
  320.         text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
  321.         soup = BeautifulSoup(text)
  322.         (no, show) = soup.findAll('b')
  323.         show.replaceWith(no)
  324.         self.assertEquals(soup.decode(), "<html>There's  business like <b>no</b> business</html>")
  325.         soup = BeautifulSoup('<a><b>Find</b><c>lady!</c><d></d></a>')
  326.         tag = Tag(soup, 'magictag')
  327.         tag.insert(0, 'the')
  328.         soup.a.insert(1, tag)
  329.         b = soup.b
  330.         c = soup.c
  331.         theText = tag.find(text = True)
  332.         findText = b.find(text = 'Find')
  333.         self.assertEqual(findText.next, tag)
  334.         self.assertEqual(tag.previous, findText)
  335.         self.assertEqual(b.nextSibling, tag)
  336.         self.assertEqual(tag.previousSibling, b)
  337.         self.assertEqual(tag.nextSibling, c)
  338.         self.assertEqual(c.previousSibling, tag)
  339.         self.assertEqual(theText.next, c)
  340.         self.assertEqual(c.previous, theText)
  341.         soup = BeautifulSoup('<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>')
  342.         f = soup.f
  343.         a = soup.a
  344.         c = soup.c
  345.         e = soup.e
  346.         weText = a.find(text = 'We')
  347.         soup.b.replaceWith(soup.f)
  348.         self.assertEqual(soup.decode(), '<a>We<f>refuse</f></a><e>to<g>service</g></e>')
  349.         self.assertEqual(f.previous, weText)
  350.         self.assertEqual(weText.next, f)
  351.         self.assertEqual(f.previousSibling, weText)
  352.         self.assertEqual(f.nextSibling, None)
  353.         self.assertEqual(weText.nextSibling, f)
  354.  
  355.     
  356.     def testAppend(self):
  357.         doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
  358.         soup = BeautifulSoup(doc)
  359.         second_para = soup('p')[1]
  360.         bold = soup.find('b')
  361.         soup('p')[1].append(soup.find('b'))
  362.         self.assertEqual(bold.parent, second_para)
  363.         self.assertEqual(soup.decode(), "<p>Don't leave me .</p> <p>Don't leave me.<b>here</b></p>")
  364.  
  365.     
  366.     def testTagExtraction(self):
  367.         text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
  368.         soup = BeautifulSoup(text)
  369.         extracted = soup.find('div', id = 'nav').extract()
  370.         self.assertEqual(soup.decode(), '<html>Real content here.</html>')
  371.         self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
  372.         text = '<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>'
  373.         soup = BeautifulStoneSoup(text)
  374.         doc = soup.doc
  375.         (numbers, roman, letters) = soup('a')
  376.         self.assertEqual(roman.parent, doc)
  377.         oldPrevious = roman.previous
  378.         endOfThisTag = roman.nextSibling.previous
  379.         self.assertEqual(oldPrevious, '2')
  380.         self.assertEqual(roman.next, 'i')
  381.         self.assertEqual(endOfThisTag, 'ii')
  382.         self.assertEqual(roman.previousSibling, numbers)
  383.         self.assertEqual(roman.nextSibling, letters)
  384.         roman.extract()
  385.         self.assertEqual(roman.parent, None)
  386.         self.assertEqual(roman.previous, None)
  387.         self.assertEqual(roman.next, 'i')
  388.         self.assertEqual(letters.previous, '2')
  389.         self.assertEqual(roman.previousSibling, None)
  390.         self.assertEqual(roman.nextSibling, None)
  391.         self.assertEqual(endOfThisTag.next, None)
  392.         self.assertEqual(roman.b.contents[0].next, None)
  393.         self.assertEqual(numbers.nextSibling, letters)
  394.         self.assertEqual(letters.previousSibling, numbers)
  395.         self.assertEqual(len(doc.contents), 2)
  396.         self.assertEqual(doc.contents[0], numbers)
  397.         self.assertEqual(doc.contents[1], letters)
  398.         text = '<a>1<b>2<c>Hollywood, baby!</c></b></a>3'
  399.         soup = BeautifulStoneSoup(text)
  400.         one = soup.find(text = '1')
  401.         three = soup.find(text = '3')
  402.         toExtract = soup.b
  403.         soup.b.extract()
  404.         self.assertEqual(one.next, three)
  405.         self.assertEqual(three.previous, one)
  406.         self.assertEqual(one.parent.nextSibling, three)
  407.         self.assertEqual(three.previousSibling, soup.a)
  408.  
  409.  
  410.  
  411. class TheManWithoutAttributes(SoupTest):
  412.     
  413.     def testHasKey(self):
  414.         text = "<foo attr='bar'>"
  415.         self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))
  416.  
  417.  
  418.  
  419. class QuoteMeOnThat(SoupTest):
  420.     
  421.     def testQuotedAttributeValues(self):
  422.         self.assertSoupEquals("<foo attr='bar'></foo>", '<foo attr="bar"></foo>')
  423.         text = '<foo attr=\'bar "brawls" happen\'>a</foo>'
  424.         soup = BeautifulSoup(text)
  425.         self.assertEquals(soup.decode(), text)
  426.         soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
  427.         newText = '<foo attr=\'Brawls happen at "Bob&squot;s Bar"\'>a</foo>'
  428.         self.assertSoupEquals(soup.decode(), newText)
  429.         self.assertSoupEquals('<this is="really messed up & stuff">', '<this is="really messed up & stuff"></this>')
  430.  
  431.  
  432.  
  433. class YoureSoLiteral(SoupTest):
  434.     
  435.     def testLiteralMode(self):
  436.         text = '<script>if (i<imgs.length)</script><b>Foo</b>'
  437.         soup = BeautifulSoup(text)
  438.         self.assertEqual(soup.script.contents[0], 'if (i<imgs.length)')
  439.         self.assertEqual(soup.b.contents[0], 'Foo')
  440.  
  441.     
  442.     def testTextArea(self):
  443.         text = '<textarea><b>This is an example of an HTML tag</b><&<&</textarea>'
  444.         soup = BeautifulSoup(text)
  445.         self.assertEqual(soup.textarea.contents[0], '<b>This is an example of an HTML tag</b><&<&')
  446.  
  447.  
  448.  
  449. class OperatorOverload(SoupTest):
  450.     
  451.     def testTagNameAsFind(self):
  452.         soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
  453.         self.assertEqual(soup.b.i, soup.find('b').find('i'))
  454.         self.assertEqual(soup.b.i.string, 'bar')
  455.         self.assertEqual(soup.b['id'], '1')
  456.         self.assertEqual(soup.b.contents[0], 'foo')
  457.         self.assert_(not (soup.a))
  458.         self.assertEqual(soup.bTag.iTag.string, 'bar')
  459.         self.assertEqual(soup.b.iTag.string, 'bar')
  460.         self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
  461.  
  462.  
  463.  
  464. class NestableEgg(SoupTest):
  465.     
  466.     def testParaInsideBlockquote(self):
  467.         soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
  468.         self.assertEqual(soup.blockquote.p.b.string, 'Foo')
  469.         self.assertEqual(soup.blockquote.b.string, 'Foo')
  470.         self.assertEqual(soup.find('p', recursive = False).string, 'Bar')
  471.  
  472.     
  473.     def testNestedTables(self):
  474.         text = '<table id="1"><tr><td>Here\'s another table:\n        <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>'
  475.         soup = BeautifulSoup(text)
  476.         self.assertEquals(soup.table.table.td.string, 'Juicy text')
  477.         self.assertEquals(len(soup.findAll('table')), 2)
  478.         self.assertEquals(len(soup.table.findAll('table')), 1)
  479.         self.assertEquals(soup.find('table', {
  480.             'id': 2 }).parent.parent.parent.name, 'table')
  481.         text = '<table><tr><td><div><table>Foo</table></div></td></tr></table>'
  482.         soup = BeautifulSoup(text)
  483.         self.assertEquals(soup.table.tr.td.div.table.contents[0], 'Foo')
  484.         text = '<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>\n        <tfoot><tr>Baz</tr></tfoot></table>'
  485.         soup = BeautifulSoup(text)
  486.         self.assertEquals(soup.table.thead.tr.contents[0], 'Foo')
  487.  
  488.     
  489.     def testBadNestedTables(self):
  490.         soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
  491.         self.assertEquals(soup.table.tr.table.tr['id'], 'nested')
  492.  
  493.  
  494.  
# NOTE(review): several string literals in this class look damaged by the
# listing the file was recovered from: HTML entities appear to have been
# decoded inside the literals (two of them no longer parse, and some
# ``replace('&', '&')`` calls are no-ops), and non-ASCII unicode literals
# appear as mis-decoded byte sequences. The code is left exactly as found;
# restore these literals from a pristine copy before relying on the tests.
class CleanupOnAisleFour(SoupTest):
    
    def testSelfClosingtag(self):
        # <br/> is rendered as "<br />" by the HTML parser.
        self.assertEqual(BeautifulSoup('Foo<br/>Bar').find('br').decode(), '<br />')
        self.assertSoupEquals('<p>test1<br/>test2</p>', '<p>test1<br />test2</p>')
        # An unknown tag is closed normally unless it is passed in
        # selfClosingTags.
        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(soup.decode(), '<p>test1<selfclosing>test2</selfclosing></p>')
        soup = BeautifulStoneSoup(text, selfClosingTags = 'selfclosing')
        self.assertEqual(soup.decode(), '<p>test1<selfclosing />test2</p>')

    
    def testSelfClosingTagOrNot(self):
        # The same markup renders differently under the two parser classes.
        text = '<item><link>http://foo.com/</link></item>'
        self.assertEqual(BeautifulStoneSoup(text).decode(), text)
        self.assertEqual(BeautifulSoup(text).decode(), '<item><link />http://foo.com/</item>')

    
    def testBooleanAttributes(self):
        # A valueless attribute must round-trip unchanged.
        text = '<td nowrap>foo</td>'
        self.assertSoupEquals(text, text)

    
    def testCData(self):
        # A CDATA section round-trips and is found as a CData text node.
        xml = '<root>foo<![CDATA[foobar]]>bar</root>'
        self.assertSoupEquals(xml, xml)
        r = re.compile('foo.*bar')
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text = r).string, 'foobar')
        self.assertEquals(soup.find(text = r).__class__, CData)

    
    def testComments(self):
        # A comment round-trips and is found as a Comment text node.
        xml = 'foo<!--foobar-->baz'
        self.assertSoupEquals(xml)
        r = re.compile('foo.*bar')
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text = r).string, 'foobar')
        self.assertEquals(soup.find(text = 'foobar').__class__, Comment)

    
    def testDeclaration(self):
        # A declaration round-trips and is found as a Declaration node.
        xml = 'foo<!DOCTYPE foobar>baz'
        self.assertSoupEquals(xml)
        r = re.compile('.*foo.*bar')
        soup = BeautifulSoup(xml)
        text = 'DOCTYPE foobar'
        self.assertEquals(soup.find(text = r).string, text)
        self.assertEquals(soup.find(text = text).__class__, Declaration)
        # A doctype whose name contains a namespace prefix.
        namespaced_doctype = '<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"><html>foo</html>'
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEquals(soup.contents[0], 'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEquals(soup.html.contents[0], 'foo')

    
    def testEntityConversions(self):
        # Exercises the XML_ENTITIES / HTML_ENTITIES / XHTML_ENTITIES settings
        # of convertEntities.
        # NOTE(review): the input and expected literals in this method look
        # entity-decoded / mis-decoded by the listing; two do not parse.
        text = '<<sacré bleu!>>'
        soup = BeautifulStoneSoup(text)
        self.assertSoupEquals(text)
        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES
        soup = BeautifulStoneSoup(text, convertEntities = xmlEnt)
        self.assertEquals(soup.decode(), '<<sacré bleu!>>')
        soup = BeautifulStoneSoup(text, convertEntities = xmlEnt)
        self.assertEquals(soup.decode(), '<<sacré bleu!>>')
        soup = BeautifulStoneSoup(text, convertEntities = htmlEnt)
        self.assertEquals(soup.decode(), u'<<sacr├⌐ bleu!>>')
        text = '<™''
        soup = BeautifulStoneSoup(text, convertEntities = xmlEnt)
        self.assertEquals(soup.decode(), u"<™'")
        soup = BeautifulStoneSoup(text, convertEntities = htmlEnt)
        self.assertEquals(soup.decode(), u'<Γäó'')
        soup = BeautifulStoneSoup(text, convertEntities = xhtmlEnt)
        self.assertEquals(soup.decode(), u"<Γäó'")

    
    def testNonBreakingSpaces(self):
        # NOTE(review): input presumably held non-breaking-space entities
        # before the listing decoded them -- confirm.
        soup = BeautifulSoup('<a>  </a>', convertEntities = BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u'<a>┬á┬á</a>')

    
    def testWhitespaceInDeclaration(self):
        # Whitespace after "<!" is dropped in the rendering.
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    
    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    
    def testIncompleteDeclaration(self):
        # An unterminated "<!" sequence must round-trip unchanged.
        self.assertSoupEquals('a<!b <p>c')

    
    def testEntityReplacement(self):
        # NOTE(review): as listed this is a plain round trip; the literal may
        # originally have contained an entity -- confirm.
        self.assertSoupEquals('<b>hello there</b>')

    
    def testEntitiesInAttributeValues(self):
        # Entity handling inside attribute values.
        # NOTE(review): the two identical calls below presumably differed in
        # the entity spelling before the listing decoded them -- confirm.
        self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', encoding = 'utf-8')
        self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', encoding = 'utf-8')
        soup = BeautifulSoup('<x t=">™">', convertEntities = BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u'<x t=">Γäó"></x>')
        uri = 'http://crummy.com?sacré&bleu'
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities = BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), link.replace('é', u'├⌐'))
        uri = 'http://crummy.com?sacré&bleu'
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities = BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.a['href'], uri.replace('é', u'├⌐'))

    
    def testNakedAmpersands(self):
        # Ampersands that do not start a recognised entity.
        # NOTE(review): replace('&', '&') is a no-op as listed; the
        # replacement was presumably an escaped ampersand -- confirm.
        html = {
            'convertEntities': BeautifulStoneSoup.HTML_ENTITIES }
        soup = BeautifulStoneSoup('AT&T ', **html)
        self.assertEquals(soup.decode(), 'AT&T ')
        nakedAmpersandInASentence = 'AT&T was Ma Bell'
        soup = BeautifulStoneSoup(nakedAmpersandInASentence, **html)
        self.assertEquals(soup.decode(), nakedAmpersandInASentence.replace('&', '&'))
        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        validURL = invalidURL.replace('&', '&')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEquals(soup.decode(), validURL)
        soup = BeautifulStoneSoup(validURL)
        self.assertEquals(soup.decode(), validURL)
  622.  
  623.  
  624.  
  625. class EncodeRed(SoupTest):
  626.     
  627.     def testUnicodeDammitStandalone(self):
  628.         markup = '<foo>\x92</foo>'
  629.         dammit = UnicodeDammit(markup)
  630.         self.assertEquals(dammit.unicode, '<foo>’</foo>')
  631.         hebrew = '\xed\xe5\xec\xf9'
  632.         dammit = UnicodeDammit(hebrew, [
  633.             'iso-8859-8'])
  634.         self.assertEquals(dammit.unicode, u'╫¥╫ò╫£╫⌐')
  635.         self.assertEquals(dammit.originalEncoding, 'iso-8859-8')
  636.  
  637.     
  638.     def testGarbageInGarbageOut(self):
  639.         ascii = '<foo>a</foo>'
  640.         asciiSoup = BeautifulStoneSoup(ascii)
  641.         self.assertEquals(ascii, asciiSoup.decode())
  642.         unicodeData = u'<foo>├╝</foo>'
  643.         utf8 = unicodeData.encode('utf-8')
  644.         self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')
  645.         unicodeSoup = BeautifulStoneSoup(unicodeData)
  646.         self.assertEquals(unicodeData, unicodeSoup.decode())
  647.         self.assertEquals(unicodeSoup.foo.string, u'├╝')
  648.         utf8Soup = BeautifulStoneSoup(utf8, fromEncoding = 'utf-8')
  649.         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
  650.         self.assertEquals(utf8Soup.originalEncoding, 'utf-8')
  651.         utf8Soup = BeautifulStoneSoup(unicodeData)
  652.         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
  653.         self.assertEquals(utf8Soup.originalEncoding, None)
  654.  
  655.     
  656.     def testHandleInvalidCodec(self):
  657.         for bad_encoding in [
  658.             '.utf8',
  659.             '...',
  660.             'utF---16.!']:
  661.             soup = BeautifulSoup(u'R├ñksm├╢rg├Ñs'.encode('utf-8'), fromEncoding = bad_encoding)
  662.             self.assertEquals(soup.originalEncoding, 'utf-8')
  663.         
  664.  
  665.     
  666.     def testUnicodeSearch(self):
  667.         html = u'<html><body><h1>R├ñksm├╢rg├Ñs</h1></body></html>'
  668.         soup = BeautifulSoup(html)
  669.         self.assertEqual(soup.find(text = u'R├ñksm├╢rg├Ñs'), u'R├ñksm├╢rg├Ñs')
  670.  
  671.     
  672.     def testRewrittenXMLHeader(self):
  673.         euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
  674.         utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
  675.         soup = BeautifulStoneSoup(euc_jp)
  676.         if soup.originalEncoding != 'euc-jp':
  677.             raise Exception("Test failed when parsing euc-jp document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it.")
  678.         soup.originalEncoding != 'euc-jp'
  679.         self.assertEquals(soup.originalEncoding, 'euc-jp')
  680.         self.assertEquals(soup.renderContents('utf-8'), utf8)
  681.         old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
  682.         new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>"
  683.         self.assertSoupEquals(old_text, new_text)
  684.  
  685.     
  686.     def testRewrittenMetaTag(self):
  687.         no_shift_jis_html = '<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'
  688.         soup = BeautifulSoup(no_shift_jis_html)
  689.         strainer = SoupStrainer('pre')
  690.         soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese = strainer)
  691.         self.assertEquals(soup.contents[0].name, 'pre')
  692.         meta_tag = '<meta content="text/html; charset=x-sjis" http-equiv="Content-type" />'
  693.         shift_jis_html = '<html><head>\n%s\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>' % meta_tag
  694.         soup = BeautifulSoup(shift_jis_html)
  695.         if soup.originalEncoding != 'shift-jis':
  696.             raise Exception("Test failed when parsing shift-jis document with meta tag '%s'.If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it." % meta_tag)
  697.         soup.originalEncoding != 'shift-jis'
  698.         self.assertEquals(soup.originalEncoding, 'shift-jis')
  699.         content_type_tag = soup.meta['content']
  700.         self.assertEquals(content_type_tag[content_type_tag.find('charset='):], 'charset=%SOUP-ENCODING%')
  701.         content_type = str(soup.meta)
  702.         index = content_type.find('charset=')
  703.         self.assertEqual(content_type[index:index + len('charset=utf8') + 1], 'charset=utf-8')
  704.         content_type = soup.meta.encode('shift-jis')
  705.         index = content_type.find('charset=')
  706.         self.assertEqual(content_type[index:index + len('charset=shift-jis')], 'charset=shift-jis'.encode())
  707.         self.assertEquals(soup.encode('utf-8'), '<html><head>\n<meta content="text/html; charset=utf-8" http-equiv="Content-type" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</pre></body></html>')
  708.         self.assertEquals(soup.encode('shift-jis'), shift_jis_html.replace('x-sjis'.encode(), 'shift-jis'.encode()))
  709.         isolatin = '<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>'
  710.         soup = BeautifulSoup(isolatin)
  711.         utf8 = isolatin.replace('ISO-Latin-1'.encode(), 'utf-8'.encode())
  712.         utf8 = utf8.replace('\xe9', '\xc3\xa9')
  713.         self.assertSoupEquals(soup.encode('utf-8'), utf8, encoding = 'utf-8')
  714.  
  715.     
  716.     def testHebrew(self):
  717.         iso_8859_8 = '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
  718.         utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
  719.         soup = BeautifulStoneSoup(iso_8859_8, fromEncoding = 'iso-8859-8')
  720.         self.assertEquals(soup.encode('utf-8'), utf8)
  721.  
  722.     
  723.     def testSmartQuotesNotSoSmartAnymore(self):
  724.         self.assertSoupEquals('\x91Foo\x92 <!--blah-->', '‘Foo’ <!--blah-->')
  725.  
  726.     
  727.     def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
  728.         smartQuotes = 'Il a dit, \x8bSacré bleu!\x9b'
  729.         soup = BeautifulSoup(smartQuotes)
  730.         self.assertEquals(soup.decode(), 'Il a dit, ‹Sacré bleu!›')
  731.         soup = BeautifulSoup(smartQuotes, convertEntities = 'html')
  732.         self.assertEquals(soup.encode('utf-8'), 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
  733.  
  734.     
  735.     def testDontSeeSmartQuotesWhereThereAreNone(self):
  736.         utf_8 = '\xe3\x82\xb1\xe3\x83\xbc\xe3\x82\xbf\xe3\x82\xa4 Watch'
  737.         self.assertSoupEquals(utf_8, encoding = 'utf-8')
  738.  
  739.  
  740.  
  741. class Whitewash(SoupTest):
  742.     
  743.     def testPreservedWhitespace(self):
  744.         self.assertSoupEquals('<pre>   </pre>')
  745.         self.assertSoupEquals('<pre> woo  </pre>')
  746.  
  747.     
  748.     def testCollapsedWhitespace(self):
  749.         self.assertSoupEquals('<p>   </p>', '<p> </p>')
  750.  
  751.  
  752. if __name__ == '__main__':
  753.     unittest.main()
  754.  
  755.