# home *** CD-ROM | disk | FTP | other *** search   (archive-page navigation residue, not part of the program)
# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)

'''
N-Triples Parser
License: GPL 2, W3C, BSD, or MIT
Author: Sean B. Palmer, inamidst.com
Documentation:
   http://inamidst.com/proj/rdf/ntriples-doc

Command line usage:
   ./ntriples.py <URI>    - parses URI as N-Triples
   ./ntriples.py --help   - prints out this help message
# @@ fully empty document?
'''
import re

# --- N-Triples tokens, as regular-expression source text -------------------
# <scheme:rest>; group 1 is the text between the angle brackets.
uriref = r'<([^:]+:[^\s"<>]+)>'
# "..." with backslash escapes allowed; group 1 is the still-escaped body.
literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
# Optional literal suffix: either @lang or ^^<datatype-uri>.
litinfo = r'(?:@([a-z]+(?:-[a-z0-9]+)*)|\^\^' + uriref + r')?'

# --- Compiled patterns ------------------------------------------------------
# One input line ended by CRLF, CR or LF; group 1 is the line content.
r_line = re.compile(r'([^\r\n]*)(?:\r\n|\r|\n)')
# Optional run of spaces/tabs (always matches, possibly empty).
r_wspace = re.compile(r'[ \t]*')
# Mandatory run of spaces/tabs (separator between terms).
r_wspaces = re.compile(r'[ \t]+')
# The '.' that ends a triple, with optional surrounding spaces/tabs.
r_tail = re.compile(r'[ \t]*\.[ \t]*')
r_uriref = re.compile(uriref)
# Blank node label "_:name"; group 1 is the name.
r_nodeid = re.compile(r'_:([A-Za-z][A-Za-z0-9]*)')
# Groups: 1 = escaped literal body, 2 = language tag, 3 = datatype URI.
r_literal = re.compile(literal + litinfo)

# Size argument passed to file.read() each time the parser refills its buffer.
bufsiz = 2048
# When false, the strict unquote()/uriquote() definitions in this module are
# replaced by lenient ones (see the "if not validate:" sections).
validate = False
class Node(unicode):
    '''Plain unicode subclass with no added behaviour.

    Nothing else in this listing refers to it; the URI, bNode and Literal
    names used by the parser are imported from rdflib instead.
    '''
    pass
# Term constructors used by NTriplesParser, under the short names it expects.
from rdflib import URIRef as URI
from rdflib import BNode as bNode
from rdflib import Literal
class Sink(object):
    '''Default triple consumer: counts the triples it is given and prints them.'''

    def __init__(self):
        # Number of triples received so far.
        self.length = 0

    def triple(self, s, p, o):
        '''Count one (subject, predicate, object) triple and print it.'''
        self.length += 1
        # Explicit tuple: prints the tuple's repr, exactly as the Python 2
        # statement "print (s, p, o)" did.
        print((s, p, o))
class ParseError(Exception):
    '''Raised for any input this module cannot parse as N-Triples.'''
    pass
# Single-character escapes: letter after the backslash -> character it denotes.
quot = {
    't': '\t',
    'n': '\n',
    'r': '\r',
    '"': '"',
    '\\': '\\',
}
# Run of printable ASCII that needs no unescaping: 0x20-0x7E except the
# double quote (0x22) and the backslash (0x5C).
r_safe = re.compile(r'([\x20\x21\x23-\x5B\x5D-\x7E]+)')
# A backslash followed by one of the keys of `quot`.
r_quot = re.compile(r'\\(t|n|r|"|\\)')
# \uXXXX (group 1) or \UXXXXXXXX (group 2); upper-case hex digits only.
r_uniquot = re.compile(r'\\u([0-9A-F]{4})|\\U([0-9A-F]{8})')
def unquote(s):
    '''Unquote an N-Triples string (strict version).

    Consumes `s` from the left, accepting only runs of safe printable ASCII
    (r_safe), the single-character escapes listed in `quot`, and
    \\uXXXX / \\UXXXXXXXX escapes with upper-case hex digits.

    Returns the unescaped text as a unicode object.

    Raises ParseError for a code point above 0x10FFFF, for a backslash that
    does not start a recognised escape, and for any other character that
    r_safe does not allow.
    '''
    result = []
    while s:
        m = r_safe.match(s)
        if m:
            s = s[m.end():]
            result.append(m.group(1))
            continue

        m = r_quot.match(s)
        if m:
            # A simple escape is always exactly two characters long.
            s = s[2:]
            result.append(quot[m.group(1)])
            continue

        m = r_uniquot.match(s)
        if m:
            s = s[m.end():]
            (u, U) = m.groups()
            # Exactly one of the two groups is set, depending on which
            # alternative (\u or \U) matched.
            codepoint = int(u or U, 16)
            if codepoint > 0x10FFFF:
                raise ParseError('Disallowed codepoint: %08X' % codepoint)
            # NOTE(review): on a narrow (UCS-2) Python 2 build unichr()
            # raises ValueError for code points above 0xFFFF - confirm the
            # target interpreter if \U escapes beyond the BMP matter.
            result.append(unichr(codepoint))
        elif s.startswith('\\'):
            raise ParseError('Illegal escape at: %s...' % s[:10])
        else:
            raise ParseError('Illegal literal character: %r' % s[0])
    return unicode(''.join(result))
if not validate:

    def unquote(s):
        '''Lenient replacement for the strict unquote() defined above.

        Hands the whole string to Python's 'unicode-escape' codec instead of
        checking it against the N-Triples escape rules.
        NOTE(review): that codec presumably accepts escapes N-Triples does
        not define (e.g. \\x.. and octal) - confirm if strictness matters.
        '''
        return s.decode('unicode-escape')
# Any single character in the range 0x80-0xFF.
r_hibyte = re.compile(r'([\x80-\xFF])')


def uriquote(uri):
    '''Return `uri` with every 0x80-0xFF character replaced by %XX.

    XX is the character's ordinal as two upper-case hex digits; all other
    characters are passed through unchanged.
    '''
    return r_hibyte.sub(lambda m: '%%%02X' % ord(m.group(1)), uri)
if not validate:

    def uriquote(uri):
        '''Lenient replacement for uriquote(): return `uri` unchanged.'''
        return uri
class NTriplesParser(object):
    '''An N-Triples Parser.

    Reads lines from a file-like object, splits each one into subject,
    predicate and object, and passes every triple to sink.triple().

    Usage:
        p = NTriplesParser(sink=MySink())
        sink = p.parse(f) # file; use parsestring for a string
    '''

    def __init__(self, sink=None):
        '''Remember the sink that receives triples (a new Sink() if omitted).'''
        # "is not None" rather than truthiness, so a sink object that
        # happens to be falsy is still honoured.
        if sink is not None:
            self.sink = sink
        else:
            self.sink = Sink()

    def parse(self, f):
        '''Parse f as an N-Triples file.

        `f` must have a read() method.  Returns the sink.
        Raises ParseError if `f` is not file-like or a line is invalid.
        '''
        if not hasattr(f, 'read'):
            raise ParseError('Item to parse must be a file-like object.')
        self.file = f
        self.buffer = ''
        while True:
            self.line = self.readline()
            if self.line is None:
                break
            try:
                self.parseline()
            except ParseError:
                # self.line is consumed as it is parsed, so this reports
                # the part of the line that was still unread at the failure.
                raise ParseError('Invalid line: %r' % self.line)
        return self.sink

    def parsestring(self, s):
        '''Parse s as an N-Triples string.

        Returns None; the triples are delivered to self.sink.
        Raises ParseError if `s` is not a string or a line is invalid.
        '''
        if not isinstance(s, basestring):
            raise ParseError('Item to parse must be a string instance.')
        from cStringIO import StringIO
        f = StringIO()
        f.write(s)
        f.seek(0)
        self.parse(f)

    def readline(self):
        '''Read an N-Triples line from buffered input.

        Returns the next line without its terminator, or None when the input
        ends on a line boundary.  Raises ParseError('EOF in line') when the
        input ends with text that has no line terminator.
        '''
        # Lines may end in CRLF, CR or LF (see r_line), so the file's own
        # readline() is not used; chunks of bufsiz are scanned instead.
        if not self.buffer:
            buffer = self.file.read(bufsiz)
            if not buffer:
                return None
            self.buffer = buffer

        while True:
            m = r_line.match(self.buffer)
            if m:
                # A CRLF split across two chunks is seen as CR, then as an
                # empty LF-terminated line; parseline() skips empty lines.
                self.buffer = self.buffer[m.end():]
                return m.group(1)
            buffer = self.file.read(bufsiz)
            if not buffer:
                raise ParseError('EOF in line')
            self.buffer += buffer

    def parseline(self):
        '''Parse self.line and send the resulting triple to the sink.

        Empty lines and lines starting with '#' (after leading spaces/tabs)
        are ignored.  Raises ParseError on any syntax error.
        '''
        self.eat(r_wspace)
        if not self.line or self.line.startswith('#'):
            return None
        subject = self.subject()
        self.eat(r_wspaces)
        predicate = self.predicate()
        self.eat(r_wspaces)
        obj = self.object()
        self.eat(r_tail)
        if self.line:
            raise ParseError('Trailing garbage')
        self.sink.triple(subject, predicate, obj)

    def peek(self, token):
        '''Return True if the unread part of the line starts with `token`.'''
        return self.line.startswith(token)

    def eat(self, pattern):
        '''Consume `pattern` from the start of self.line and return the match.

        `pattern` is a compiled regular expression.  Raises ParseError if it
        does not match at the start of the unread part of the line.
        '''
        m = pattern.match(self.line)
        if not m:
            raise ParseError('Failed to eat %s' % pattern)
        self.line = self.line[m.end():]
        return m

    def subject(self):
        '''Parse a subject: a uriref or, failing that, a nodeID.'''
        subj = self.uriref() or self.nodeid()
        if not subj:
            raise ParseError('Subject must be uriref or nodeID')
        return subj

    def predicate(self):
        '''Parse a predicate, which must be a uriref.'''
        pred = self.uriref()
        if not pred:
            raise ParseError('Predicate must be uriref')
        return pred

    def object(self):
        '''Parse an object: a uriref, a nodeID or a literal, tried in that order.'''
        objt = self.uriref() or self.nodeid() or self.literal()
        # Identity test on purpose: an empty literal is falsy but valid, and
        # only the False returned by literal() means "nothing matched".
        if objt is False:
            raise ParseError('Unrecognised object type')
        return objt

    def uriref(self):
        '''Consume a <uriref> and return it as a URI, or False if none starts here.

        Raises ParseError (via eat) if the text starts with '<' but does not
        match r_uriref.
        '''
        if self.peek('<'):
            uri = self.eat(r_uriref).group(1)
            uri = unquote(uri)
            uri = uriquote(uri)
            return URI(uri)
        return False

    def nodeid(self):
        '''Consume a _:name node and return it as a bNode, or False if none starts here.

        Raises ParseError (via eat) if the text starts with '_' but does not
        match r_nodeid.
        '''
        if self.peek('_'):
            return bNode(self.eat(r_nodeid).group(1))
        return False

    def literal(self):
        '''Consume a quoted literal and return it as a Literal, or False if none starts here.

        Raises ParseError (via eat) if the text starts with '"' but does not
        match r_literal.
        '''
        if self.peek('"'):
            (lit, lang, dtype) = self.eat(r_literal).groups()
            # Normalise a missing or empty language/datatype to None.
            lang = lang or None
            dtype = dtype or None
            # Defensive: r_literal's suffix is an alternation, so at most
            # one of the two can actually be captured.
            if lang and dtype:
                raise ParseError("Can't have both a language and a datatype")
            lit = unquote(lit)
            return Literal(lit, lang, dtype)
        return False
def parseURI(uri):
    '''Fetch `uri`, parse it as N-Triples and print how many triples it held.

    Triples go to the parser's default Sink.  ParseError propagates if the
    content is invalid.
    '''
    import urllib
    parser = NTriplesParser()
    u = urllib.urlopen(uri)
    try:
        sink = parser.parse(u)
    finally:
        # Close the connection even when parsing fails.
        u.close()
    # Single-argument form prints the same text as the original
    # "print 'Length of input:', sink.length" statement.
    print('Length of input: %s' % sink.length)
def main():
    '''Command-line entry point.

    With exactly one argument that is not a help flag, parse that argument
    as a URI; otherwise print the module docstring as usage help.
    '''
    import sys
    args = sys.argv[1:]
    # The usage text promises "--help", which must not be fetched as a URI.
    if len(args) == 1 and args[0] not in ('-h', '--help'):
        parseURI(args[0])
    else:
        print(__doc__)


if __name__ == '__main__':
    main()