Dynamic HTML in Action

home *** CD-ROM | disk | FTP | other *** search

/ Dynamic HTML in Action / Dynamicke-HTML-v-akci-covermount.bin / XML / PARSER / XMLINST.EXE / classes / com / ms / xml / parser / Parser.java < prev next >

Wrap

Java Source | 1998-05-26 | 96.2 KB | 3,007 lines

/* * @(#)Parser.java 1.0 6/3/97 * * Copyright (c) 1997 Microsoft, Corp. All Rights Reserved. * */ package com.ms.xml.parser; import com.ms.xml.om.Element; import com.ms.xml.om.ElementImpl; import com.ms.xml.om.ElementFactory; import com.ms.xml.om.ElementFactoryImpl; import com.ms.xml.util.EnumWrapper; import com.ms.xml.util.Name; import com.ms.xml.util.Atom; import com.ms.xml.util.XMLInputStream; import com.ms.xml.util.XMLOutputStream; import java.lang.String; import java.util.Hashtable; import java.util.Stack; import java.util.Enumeration; import java.util.Vector; import java.io.*; import java.net.*; /** * This class implements an eXtensible Markup Language (XML) parser according to the * latest World Wide Web Consortium (W3C) working draft of the XML specification. * This parser class is used internally by the XML document * load method, so you shouldn't need to use it directly. * @version 1.0, 6/3/97 */ public class Parser { static final int TAGSTART = '<'; static final int TAGEND = '>'; static final int SLASH = '/'; static final int EQ = '='; static final int LPAREN = '('; static final int RPAREN = ')'; static final int BANG = '!'; static final int QMARK = '?'; static final int DASH = '-'; static final int PERCENT = '%'; static final int AMP = '&'; static final int LEFTSQB = '['; static final int RIGHTSQB = ']'; static final int QUOTE = '\''; static final int OR = '|'; static final int ASTERISK = '*'; static final int PLUS = '+'; static final int HASH = '#'; static final int COMMA = ','; static final int INVALIDTOKEN = 0; static final int EOF = -1; static final int WHITESPACE = -2; static final int WORDCHAR = -3; static final int NAME = -4; static final int TEXT = -5; static final int PITAGSTART = -6; static final int PITAGEND = -7; static final int DECLTAGSTART = -8; static final int CLOSETAGSTART = -9; static final int EMPTYTAGEND = -10; static final int COMMENT = -11; static final int DOCTYPE = -12; static final int SYSTEM = -13; static final int 
CDATATAGSTART = -14; static final int ELEMENT = -15; static final int EMPTY = -16; static final int ANY = -17; static final int PCDATA = -18; static final int ATTLIST = -19; static final int CDATA = -20; static final int ID = -21; static final int IDREF = -22; static final int IDREFS = -23; static final int ENTITY = -24; static final int ENTITIES = -25; static final int NMTOKEN = -26; static final int NMTOKENS = -27; static final int NOTATION = -28; static final int ENUMERATION = -29; static final int FIXED = -30; static final int REQUIRED = -31; static final int IMPLIED = -32; static final int NDATA = -33; static final int INCLUDETAGSTART= -34; static final int IGNORETAGSTART = -35; static final int NAMESPACE = -36; static final int EXTENDS = -37; static final int IMPLEMENTS = -38; static final int XML = -39; static final int VERSION = -40; static final int ENCODING = -41; static final int STANDALONE = -42; static final int CDEND = -43; static final int PUBLIC = -100; /** * Creates a new parser object. */ public Parser() { String version = System.getProperty("java.version"); Float fver = new Float(version); jdk11 = (fver.doubleValue() >= 1.1) ? true : false; caseInsensitive = false; } /** * Parses the XML document pointed to by the given URL and * creates the corresponding XML document hierarchy. * @param url the url points to the XML document to parse. * @param factory used to create XML Elements during parsing. * @param dtd the object that the parser stores DTD information in. * @param root the root node to start with and add children to * during parsing. * @param loadext whether to load external DTD's and/or entities * @exception ParseException if syntax or other error encountered. 
*/
    public final void parse(URL url, ElementFactory factory, DTD dtd, Element root, boolean caseInsensitive, boolean loadExt) throws ParseException
    {
        this.dtd = dtd;
        this.root = root;
        this.loadexternal = loadExt;
        setURL(url);            // opens the stream and primes the lookahead
        setFactory(factory);
        this.caseInsensitive = caseInsensitive;
        safeParse();
    }

    /**
     * Runs parseDocument() and closes the XML input stream afterwards,
     * whether parsing succeeded or failed with any kind of exception.
     * A failure while closing is deliberately ignored so that it cannot
     * mask the parse result.
     * @exception ParseException if parseDocument() reports an error.
     */
    final void safeParse() throws ParseException
    {
        try
        {
            parseDocument();
        }
        finally
        {
            // close on every exit path, including unchecked exceptions
            if (xmlIn != null)
            {
                try
                {
                    xmlIn.close();
                }
                catch (Exception ignored)
                {
                    // best effort: nothing useful can be done if close fails
                }
            }
        }
    }

    /**
     * Parses the XML from given input stream.
     * @param in the input stream containing XML data to parse.
     * @param factory used to create XML Elements during parsing.
     * @param dtd the object that the parser stores DTD information in.
     * @param root the root node to start with and add children to
     * during parsing.
     * @param caseInsensitive whether the document is to be treated as
     * case-insensitive.
     * @param loadExt whether to load external DTD's and/or entities
     * @exception ParseException if syntax or other error encountered.
     */
    final public void parse(InputStream in, ElementFactory factory, DTD dtd, Element root, boolean caseInsensitive, boolean loadExt) throws ParseException
    {
        this.dtd = dtd;
        url = null;             // no document URL when reading from a raw stream
        this.root = root;
        this.loadexternal = loadExt;
        setInputStream(in);
        setFactory(factory);
        this.caseInsensitive = caseInsensitive;
        safeParse();
    }

    /**
     * Reports an error to the specified output stream: the exception
     * message, the location (source, line, column) and the chain of
     * element contexts that were open when the error occurred.
     * @param e The exception to report.
     * @param out The output stream to write the report to.
     * @return No return value.
*/ final public void report(ParseException e, OutputStream out) { PrintStream o = new PrintStream(out); String s = null; o.println(e.getMessage()); if (e.owner instanceof Parser) { URL u = ((Parser)e.owner).url; if (u != null) s = u.toString(); } else if (e.owner instanceof Entity) { s = "Parsing <" + ((Entity)e.owner).name + ">"; } else { s = "Parsing"; } o.println("Location: " + s + "(" + e.line + "," + e.column + ")"); o.print("Context: "); for (int i = 0; i < contextAt; i++) { Name name = ((Context)contexts.elementAt(i)).e.getTagName(); if (name != null) o.print("<" + name + ">"); } o.print("<"); if (current != null) o.print(current.e.getTagName()); o.println(">"); } /** * Creates an output stream that best matches the XML data format * found during parsing. * For example, this will match big endian or little endian * XML data formats. * @param out The output stream. * @return an <code>XMLOutputStream</code> object that uses the newline * separator * defined by the system property "line.separator". */ public final XMLOutputStream createOutputStream(OutputStream out) { if (xmlIn != null) return xmlIn.createOutputStream(out); return null; } /** * throw error */ final void error(String s) throws ParseException { int i = 1; // BUGBUG: the position may still be incorrect if (token == NAME) i = name.toString().length(); throw new ParseException(s, reader.line, reader.column - 1 - i, reader.owner); } /** * get next char and update line number / char position */ final void advance() throws ParseException { lookahead = reader.read(); // if EOF and reading 'included' entity pop it... while (lookahead == -1 && reader.owner != this) { // For external text entities there may be some PCDATA // left over that needs to be added also. if (charAt != 0) { addPCDATA(); } reader = reader.prev; pop(); // pop the entity element if (! inTag) charAt = 0; lookahead = reader.read(); } } /** * return next token * @exception ParseException when syntax or other error is encountered. 
*/ final int nextToken() throws ParseException { bufAt = 0; int bufStart = bufAt; if (inTag || ! current.preserveWS) { while (isWhiteSpaceChar((char)lookahead)) { if (! inTag) { buf[bufAt++] = (char)lookahead; seenWS = true; } advance(); } } if (inTag) { switch (lookahead) { case -1: token = EOF; break; case '>': token = TAGEND; inTag = false; advance(); break; case '/': advance(); if (lookahead == '>') { token = EMPTYTAGEND; inTag = false; advance(); } break; case '?': advance(); if (current.type == Element.ELEMENTDECL) { token = QMARK; } else { if (lookahead == '>') { token = PITAGEND; inTag = false; advance(); } else { token = QMARK; } } break; case '=': case '(': case ')': case ',': case '|': case '[': case ']': case '*': case '+': case '#': token = lookahead; advance(); break; case '%': advance(); if (substitution > 0 && isNameChar((char)lookahead)) { scanEntityRef(true); return nextToken(); } token = PERCENT; break; case '\"': case '\'': quote = (char)lookahead; token = QUOTE; advance(); break; default: if (isNameChar((char)lookahead) || nameSpaceSeparator == (char)lookahead) { scanName("name"); if (keyword > 0) { token = lookup(name.getName()); } } else { error("Unexpected token '" + (char)lookahead + "' inside tag <" + current.e.getTagName() + ">"); } } } else { if (seenWS && ! current.lastWasWS && (lookahead == -1 || lookahead == '<') ) { addNewElement( Element.WHITESPACE , null, false, new String(buf, bufStart, bufAt - bufStart)); } switch (lookahead) { case -1: token = EOF; break; case '<': inTag = true; seenWS = false; // reset advance(); switch (lookahead) { case '?': token = PITAGSTART; advance(); break; case '!': token = DECLTAGSTART; advance(); if (lookahead == '-') { advance(); if (lookahead == '-') { token = COMMENT; advance(); } else { error("Bad comment start syntax. 
Expected '' </code> */ final Element parseComment() throws ParseException { Element e = addNewElement(Element.COMMENT, nameComment, false, null); charAt = 0; boolean stop = false; while (lookahead != -1) { chars[charAt++] = (char)lookahead; if (lookahead == '-') { advance(); if (lookahead == '-') { advance(); if (lookahead == TAGEND) { charAt--; stop = true; } else if (strict) { error("Bad comment syntax. Expected '>'."); } else { reader.push((char)lookahead); lookahead = '-'; } } } else { advance(); } if (charAt == chars.length || stop) { push(e,nameComment,Element.COMMENT); addNewElement(Element.CDATA, null, false, new String(chars, 0, charAt)); pop(); charAt = 0; if (stop) break; } } parseToken(TAGEND, "comment end"); return e; } /** * Parses CDATA <code> '<![CDATA[' data ']]>' </code> */ final void parseCDATA() throws ParseException { charAt = 0; boolean stop = false; while (lookahead != -1) { chars[charAt++] = (char)lookahead; if (lookahead == ']') { advance(); if (lookahead == ']') { advance(); if (lookahead == TAGEND) { charAt--; stop = true; } else { reader.push((char)lookahead); lookahead = ']'; } } } else { advance(); } if (charAt == chars.length || stop) { addNewElement(Element.CDATA, nameCDATA, false, new String(chars, 0, charAt)); charAt = 0; if (stop) break; } } parseToken(TAGEND, "CDATA end"); } /** * Set url and open an input stream for it */ final private void setURL(URL u) throws ParseException { url = u; String vendor = System.getProperty("java.vendor"); boolean msft = (vendor.indexOf("Microsoft") == 0); String os = System.getProperty("os.name"); boolean windows = (os.indexOf("Windows") == 0); boolean opt = msft && jdk11 && windows; String err; // // on windows system, try to use optimized inputstream // if (opt) { try { xmlIn = new XMLInputStream(u); reader = new EntityReader(xmlIn, 1, 1, null, this); advance(); return; } catch (IOException e) { opt = false; err = e.toString(); } } // // If not on windows, or if on windows but failed to open the 
optimized // input stream, then use the I/O facilities provided by Java // if (!opt) { try { setInputStream( new BufferedInputStream(url.openStream())); } catch (IOException e) { throw new ParseException("Error opening input stream for \"" + url.toString() + "\": " + e.toString()); } } } final private void setInputStream(InputStream in) throws ParseException { xmlIn = new XMLInputStream(in); reader = new EntityReader(xmlIn, 1, 1, null, this); advance(); } final private void setFactory(ElementFactory f) { factory = f; } /** * Factory to create elements on the parse tree. */ ElementFactory factory; /** * DTD object. */ DTD dtd; boolean validating; // whether we loaded any DTD's for validation. /** * Root of tree. */ Element root; /** * Stack to keep track of contexts */ Vector contexts = new Vector(16); int contextAt = 0; /** * Current element context. */ Context current; /** * Inputstream used to read Unicode chars */ EntityReader reader; XMLInputStream xmlIn; /** * True if parsing inside tag */ boolean inTag; /** * true when scanner is still collapsing white space. 
*/
    boolean seenWS;

    /**
     * next character; -1 means the reader has reached end of input
     */
    int lookahead;

    /**
     * quote char (the ' or " most recently seen by the scanner inside a tag)
     */
    char quote;

    /**
     * chars collected up to 8K (8192 chars)
     */
    char chars[] = new char[8192];

    /**
     * char index into chars[]
     */
    int charAt;

    /**
     * buf collected up to 8K (8192 chars)
     */
    char buf[] = new char[8192];

    /**
     * char index into buf[]
     */
    int bufAt;

    /**
     * counter to allow collecting multiple names into buf
     */
    int nameappend;

    /**
     * Token matching hashtable (keyword string to Integer token type)
     */
    static Hashtable tokens;

    /**
     * token type
     */
    int token;

    /**
     * counter to allow keyword checking; the scanner passes a scanned name
     * to lookup() only while this is greater than zero
     */
    int keyword;

    /**
     * counter to disable name uppercasing
     */
    int nouppercase;

    /**
     * counter to allow parameter substitution in token; the scanner expands
     * a '%' reference inside a tag only while this is greater than zero
     */
    int substitution;

    /**
     * break char for parseInternalSubset
     */
    int breakText;

    /**
     * switch to allow parsing name tokens in scanName
     */
    int nametoken;

    /**
     * switch to allow parsing unqualified name in scanName
     */
    int simplename;

    /**
     * switch to allow namespace when scan entity reference name in scanName
     */
    int inEntityRef;

    /**
     * whether or not to expand named entities.
     */
    boolean expandNamedEntities;

    /**
     * whether the Java runtime is version 1.1 or later; assigned by the
     * constructor and consulted by setURL()
     */
    static boolean jdk11;

    /**
     * current name fetched
     */
    Name name;

    /**
     * current string or text fetched
     */
    String text;

    /**
     * URL of the document being parsed; null when parsing from an
     * InputStream
     */
    URL url;

    /**
     * name in DOCTYPE tag
     */
    Name docType;

    /**
     * whether we are parsing internal subset
     */
    boolean internalSubset;

    /**
     * whether the document is caseInsensitive
     */
    boolean caseInsensitive;

    /**
     * whether the parser is parsing the first line of a document
     */
    boolean firstLine = true;

    /**
     * XML element declaration (attribute definitions for the XML
     * declaration, built in the static initializer)
     */
    static ElementDecl XMLDecl;

    /**
     * If the keyword of the latest conditional section is a parameter entity reference, this
     * variable records the reference name; otherwise null.
     */
    Name conditionRef;

    /**
     * Whether standalone was specified in XML declaration.
     * (default is false).
*/
    boolean standAlone;

    /**
     * Whether to load external DTD's
     */
    boolean loadexternal;

    /**
     * Char type table: for each character 0..255, a bit set of the F* flags below.
     */
    static int chartype[] = new int[256];

    /**
     * Char upper case table: Character.toUpperCase() for each character 0..255.
     */
    static char charupper[] = new char[256];

    static final int FWHITESPACE = 1;
    static final int FDIGIT = 2;
    static final int FLETTER = 4;
    static final int FMISCNAME = 8;
    static final int FSTARTNAME = 16;

    static final char nameSpaceSeparator = ':';

    /**
     * This flag specifies whether the parser should enforce strict
     * XML compliance rules or whether to allow some SGML like things
     * like SGML comment syntax.
     */
    static boolean strict = false;

    /**
     * predefined names
     */
    static Name nameComment;
    static Name nameCDATA;
    static Name namePCDATA;
    static Name nameVERSION;
    static Name nameENCODING;
    static Name nameDOCTYPE;
    static Name nameXML;
    static Name nameStandalone;
    static Name nameYes;
    static Name nameNo;
    static Name nameURL;
    static Name namePUBLICID;
    static Name nameNAME;
    static Name nameXMLSpace;
    static Name nameXMLSpace2;
    static Name nameXMLAS;
    static Name nameXMLHREF;
    static Name nameXMLNS;
    static Name nameXMLNameSpace;
    static Atom atomXML;
    static Name nameXMLLang;

    /*
     * One-time class setup: interns the predefined names, builds the
     * attribute definitions for the XML declaration, fills the keyword
     * table and the character classification tables.
     */
    static
    {
        nameComment = Name.create("--");
        nameCDATA = Name.create("[CDATA[");
        namePCDATA = Name.create("PCDATA");
        nameVERSION = Name.create("version");
        nameENCODING = Name.create("encoding");
        nameStandalone = Name.create("standalone");
        nameDOCTYPE = Name.create("DOCTYPE");
        nameXML = Name.create("xml");
        nameYes = Name.create("yes");
        nameNo = Name.create("no");
        nameURL = Name.create("URL");
        namePUBLICID = Name.create("PUBLICID");
        nameNAME = Name.create("NAME");
        nameXMLSpace = Name.create("xml-space","xml");
        nameXMLSpace2 = Name.create("space","xml");
        nameXMLAS = Name.create("prefix", "xml");
        nameXMLHREF = Name.create("src", "xml");
        nameXMLNS = Name.create("ns", "xml");
        nameXMLNameSpace = Name.create("namespace", "xml");
        atomXML = Atom.create("xml");
        nameXMLLang = Name.create("lang","xml");

        //
        // '<?XML' VersionInfo EncodingDecl? SDDecl? S? '?>'
        XMLDecl = new ElementDecl(nameXML);

        //
        // VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
        XMLDecl.addAttDef(new AttDef(nameVERSION, AttDef.CDATA, "1.0", AttDef.FIXED));

        //
        // S 'encoding' Eq QEncoding
        XMLDecl.addAttDef(new AttDef(nameENCODING, AttDef.CDATA, "UTF-8", AttDef.IMPLIED));

        // SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
        Vector an = new Vector(2);
        an.addElement(nameYes);
        an.addElement(nameNo);
        XMLDecl.addAttDef(new AttDef(nameStandalone, AttDef.ENUMERATION,
            (Name)an.elementAt(0), AttDef.IMPLIED, an));

        // add recognized names to the hashtable
        tokens = new Hashtable();
        tokens.put("DOCTYPE", new Integer(DOCTYPE));
        tokens.put("SYSTEM", new Integer(SYSTEM));
        tokens.put("PUBLIC", new Integer(PUBLIC));
        tokens.put("ENTITY", new Integer(ENTITY));
        tokens.put("ELEMENT", new Integer(ELEMENT));
        tokens.put("EMPTY", new Integer(EMPTY));
        tokens.put("ANY", new Integer(ANY));
        tokens.put("PCDATA", new Integer(PCDATA));
        tokens.put("ATTLIST", new Integer(ATTLIST));
        tokens.put("CDATA", new Integer(CDATA));
        tokens.put("ID", new Integer(ID));
        tokens.put("IDREF", new Integer(IDREF));
        tokens.put("IDREFS", new Integer(IDREFS));
        tokens.put("ENTITIES", new Integer(ENTITIES));
        tokens.put("NMTOKEN", new Integer(NMTOKEN));
        tokens.put("NMTOKENS", new Integer(NMTOKENS));
        tokens.put("FIXED", new Integer(FIXED));
        tokens.put("REQUIRED", new Integer(REQUIRED));
        tokens.put("IMPLIED", new Integer(IMPLIED));
        tokens.put("NDATA", new Integer(NDATA));
        tokens.put("NOTATION", new Integer(NOTATION));
        tokens.put("INCLUDE", new Integer(INCLUDETAGSTART));
        tokens.put("IGNORE", new Integer(IGNORETAGSTART));
        tokens.put("namespace", new Integer(NAMESPACE));
        tokens.put("EXTENDS", new Integer(EXTENDS));
        tokens.put("IMPLEMENTS", new Integer(IMPLEMENTS));
        tokens.put("xml", new Integer(XML));
        tokens.put("version", new Integer(VERSION));
        tokens.put("encoding", new Integer(ENCODING));
        tokens.put("standalone", new Integer(STANDALONE));

        for (int i = 0; i < 256; i++)
        {
            char c = (char)i;
            chartype[i] = 0;
            // Whitespace is exactly space, tab, LF, FF and CR. This block
            // runs at class-initialization time, before any constructor can
            // assign the jdk11 flag, so a test on that flag here could never
            // be true; the set is therefore spelled out directly.
            if (c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r')
                chartype[i] = FWHITESPACE;
            if (Character.isLetter(c))
                chartype[i] |= FLETTER;
            if (Character.isDigit(c))
                chartype[i] |= FDIGIT;
            charupper[i] = Character.toUpperCase(c);
        }
        chartype['.'] |= FMISCNAME;
        chartype['-'] |= FMISCNAME;
        chartype['_'] |= FMISCNAME | FSTARTNAME;
        chartype[0xb7] |= FMISCNAME; // Extender
    }
}