home *** CD-ROM | disk | FTP | other *** search
Java Source | 1997-11-05 | 15.7 KB | 499 lines |
- /*
- * @(#)XMLInputStream.java 1.0 6/10/97
- *
- * Copyright (c) 1997 Microsoft, Corp. All Rights Reserved.
- *
- */
- package com.ms.xml.util;
-
- import java.io.*;
- import java.net.*;
-
- /**
- *
- * A Reader specifically for dealing with different encoding formats
- * as well as liitleendian files.
- *
- * @version 1.0, 6/10/97
- */
- public class XMLInputStream extends InputStream
- {
- /**
- * The state enumerators
- */
- static final int INPUTSR = 1; // The different states that concern the read()
- static final int UCS2 = 2; // The POP states imply that there are characters
- static final int ASCII = 3; // stored in the next[] stack. There are separate
- static final int INPUTSR_POP = 4; // states for POP in order to speed up the read().
- static final int UCS2_POP = 5; // There is still room for future additions, such
- static final int ASCII_POP = 6; // as a UCS-4 state.
- // NOTE: All encodings that can use InputStreamReader
- // fall into the INPUTSR state
- /**
- * input buffer size on windows platforms
- */
- static final int SIZE = 1024; // input buffer size
-
- /**
- * encoding numbers
- */
- static final int INTUTF8 = 0;
- static final int INTASCII = 1;
- static final int INTUCS2 = 2;
- static final int INTUCS4 = 3;
- static final int INTEBCDIC = 4;
- static final int INT1252 = 5;
-
-
- /**
- * Builds the XMLInputStream.
- * This constructor is intended to be called on Windows to speed up input
- * speed. The decoding is done by IXMLStream
- *
- * Note:
- * This constructor relies on CXMLStream class in xmlurlstream.dll. If
- * xmlurlstream.dll or xmlurlstream.tlb is not properly registered on the
- * system, or the encoding of the input stream cannot be handled,
- * this constructor throws an IOException
- */
- public XMLInputStream(URL url) throws IOException
- {
- InputStream in;
- try
- {
- in = new BufferedInputStream(url.openStream());
- }
- catch (IOException e)
- {
- throw new IOException("Error opening input stream for \"" +
- url.toString() + "\": " + e.toString());
- }
- setInputStream(in);
- }
-
-
- /**
- * Builds the XMLInputStream.
- * Reads the first four bytes of the InputStream in order to make a guess
- * as to the character encoding of the file.
- * Assumes that the document is following the XML standard and that
- * any non-UTF-8 file will begin with a <?XML> tag.
- */
- public XMLInputStream(InputStream in)
- {
- setInputStream(in);
- }
-
- public void setInputStream(InputStream in)
- {
- String version = System.getProperty("java.version");
- jdk11 = version.equals("1.1") ? true : false;
-
- littleendian = false;
- caseInsensitive = false;
-
- byteOrderMark = false;
- encoding = "UTF-8"; // Default encoding
-
- this.in = in;
- this.insr = null;
-
- readState = ASCII;
-
- boolean setDefault = false;
-
- try
- {
- char c1, c2, c3, c4;
-
- c1 = (char)in.read();
- c2 = (char)in.read();
- c3 = (char)in.read();
- c4 = (char)in.read();
- if( c1 == 0xFE && c2 == 0xFF && c3 == 0x00 && c4 == 0x3C )
- {
- // UCS-2, big-endian
- littleendian = false;
- byteOrderMark = true;
- readState = UCS2_POP;
- encoding = "UCS-2";
- }
- else if( c1 == 0xFF && c2 == 0xFE && c3 == 0x3C && c4 == 0x00 )
- {
- // UCS-2, little-endian
- littleendian = true;
- byteOrderMark = true;
- readState = UCS2_POP;
- encoding = "UCS-2";
-
- this.in = new ByteSwapInputStream( in );
- }
- else if( c1 == 0x00 && c2 == 0x3C && c3 == 0x00 && c4 == 0x3F )
- {
- // UCS-2, big-endian, no Byte Order Mark
- littleendian = false;
- readState = UCS2_POP;
- encoding = "UCS-2";
- }
- else if( c1 == 0x3C && c2 == 0x00 && c3 == 0x3F && c4 == 0x00 )
- {
- // UCS-2, little-endian, no Byte Order Mark
- littleendian = true;
- readState = UCS2_POP;
- encoding = "UCS-2";
-
- this.in = new ByteSwapInputStream( in );
- }
- else if( c1 == 0x3C && c2 == 0x3F &&
- Character.toUpperCase(c3) == 0x58 &&
- Character.toUpperCase(c4) == 0x4D )
- {
- // UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC,
- // or any other encoding that ensures that ASCII has normal positions
- readState = ASCII_POP;
- encoding = "ASCII";
- }
- else if( c1 == 0x00 && c2 == 0x00 && c3 == 0x00 && c4 == 0x3C )
- {
- // UCS-4, big-endian machine (1234 order)
- readState = ASCII_POP; // Until UCS-4 is implemented
- encoding = "UCS-4";
- }
- else if( c1 == 0x3C && c2 == 0x00 && c3 == 0x00 && c4 == 0x00 )
- {
- // UCS-4, little-endian machine (4321 order)
- readState = ASCII_POP; // Until UCS-4 is implemented
- encoding = "UCS-4";
- }
- else if( c1 == 0x00 && c2 == 0x00 && c3 == 0x3C && c4 == 0x00 )
- {
- // UCS-4, unusual octet order (2143 order)
- readState = ASCII_POP; // Until UCS-4 is implemented
- encoding = "UCS-4";
- }
- else if( c1 == 0x00 && c2 == 0x3C && c3 == 0x00 && c4 == 0x00 )
- {
- // UCS-4, unusual octet order (3412 order)
- readState = ASCII_POP; // Until UCS-4 is implemented
- encoding = "UCS-4";
- }
- else if( c1 == 0x4C && c2 == 0x6F && c3 == 0xE7 && c4 == 0xD4 )
- {
- // EBCDIC - We do NOT support this!
- readState = ASCII_POP; // Until EBCDIC is implemented
- encoding = "EBCDIC";
- }
- else
- {
- // UTF-8 without an <?XML> tag (assuming data is not corrupt)
- setDefault = true;
- }
-
- if( !encoding.equals( "UCS-2" ) )
- {
- push(c4);
- push(c3);
- push(c2);
- push(c1);
- }
- else
- {
- if( littleendian )
- {
- push(c3);
- push(c4);
- if( !byteOrderMark )
- {
- push(c1);
- push(c2);
- }
- }
- else
- {
- push(c4);
- push(c3);
- if( !byteOrderMark )
- {
- push(c2);
- push(c1);
- }
- }
- }
- }
- catch (IOException e)
- {
- // Can't do lookahead, so use default UTF-8
- setDefault = true;
- }
-
- pos = -1;
-
- if( setDefault )
- {
- try
- {
- if (! jdk11)
- throw new IOException("Readers not supported in JDK 1.0");
- // guess that the <?xml encoding=...?> tag will be read in
- // less that 4096 bytes.
- if (! in.markSupported())
- {
- in = new BufferedInputStream(in);
- }
- in.mark(4096);
- insr = new InputStreamReader( in, "UTF8" );
- readState = INPUTSR_POP;
- encoding = "UTF-8";
- }
- catch( IOException e2 )
- {
- // If there is an exception we can
- // just continue and treat file like ASCII text.
- readState = ASCII_POP;
- encoding = "ASCII";
- }
- }
- }
-
- private void push(char next)
- {
- if (index == 3)
- {
- System.exit(0);
- }
- this.next[++index] = next;
- }
-
- /**
- * Returns the next unicode char in the stream. The read done
- * depends on the current readState. POP states imply that there
- * are characters that have been pushed onto the next[] stack.
- */
- public int read() throws IOException
- {
- // On other platform
- switch( readState )
- {
- case INPUTSR:
- pos++;
- return insr.read();
- case ASCII:
- return in.read();
- case UCS2:
- {
- int b1, b2;
-
- b1 = in.read();
-
- if( b1 == -1 )
- return -1;
-
- b2 = in.read();
-
- return ((b1 << 8) | b2);
- }
- case INPUTSR_POP:
- if (index >= 0)
- {
- return next[index--];
- }
- else
- {
- readState = INPUTSR;
- return read();
- }
- case UCS2_POP:
- {
- int b1, b2;
-
- if (index >= 0)
- {
- b1 = next[index--];
- }
- else
- {
- readState = UCS2;
- b1 = in.read();
- }
-
- if( b1 == -1 )
- return -1;
-
- if (index >= 0)
- {
- b2 = next[index--];
- }
- else
- {
- readState = UCS2;
- b2 = in.read();
- }
-
- return ((b1 << 8) | b2);
- }
- case ASCII_POP:
- default:
- if (index >= 0)
- {
- return next[index--];
- }
- else
- {
- readState = ASCII;
- return in.read();
- }
- }
- }
-
-
- /**
- * Defines the character encoding of the stream. The new character encoding
- * must agree with the encoding determined by the constructer. setEncoding
- * is used to clarify between encodings that are not fully determinable
- * through the first four bytes in a stream and not to change the encoding.
- * This method must be called within 4096 reads() after construction.
- */
- public void setEncoding( String encoding ) throws IOException
- {
- insr = null;
- String encvm; // Java VM's version of encoding.
- int newEncoding = 0;
-
- if( encoding.equalsIgnoreCase( "ISO-10646-UCS-2" ) ||
- encoding.equalsIgnoreCase( "UCS-2" ) )
- {
- if( !this.encoding.equalsIgnoreCase( "UCS-2" ) )
- throw new IOException( "Illegal Change of Encoding" );
-
- readState = UCS2;
- this.encoding = "UCS-2";
- return;
- }
- else if( encoding.equalsIgnoreCase( "Shift_JIS" ) )
- {
- encvm = "SJIS";
- }
- else if( encoding.equalsIgnoreCase( "ISO-8859-1" ) )
- {
- encvm = "8859_1";
- }
- else if( encoding.equalsIgnoreCase( "ISO-10646-UCS-4" ) )
- { // UCS-4 NOT YET SUPPORTED!
- throw new IOException( "UCS-4 not yet supported" );
- }
- else if( encoding.equalsIgnoreCase( "UTF-8" ) )
- {
- encvm = "UTF8";
- newEncoding = INTUTF8;
- }
- else
- {
- encvm = encoding; // try passing through to VM...
- }
-
- if( !this.encoding.equalsIgnoreCase( "ASCII" ) &&
- !this.encoding.equalsIgnoreCase( "UTF-8" ) )
- throw new IOException( "Illegal Change of Encoding" );
-
-
- if (this.encoding.equalsIgnoreCase( "ASCII" ))
- {
- insr = null;
- readState = ASCII_POP;
- }
- else
- {
- if (jdk11) {
- if (pos != -1)
- {
- in.reset(); // This fixes a nasty bug in that InputStreamReaders
- in.skip(pos+1); // now buffer their input.
- }
- insr = new InputStreamReader( in, encvm );
- readState = INPUTSR;
- this.encoding = encoding;
- }
- else
- {
- throw new IOException( encvm + " is not supported by your Java Virtual Machine." +
- " Try installing the latest VM from http://www.microsoft.com/java/download.htm");
- }
- }
- }
-
-
- /**
- * Creates a new XMLOutputStream with the proper initial state.
- * XMLOutputStreams should always be created through this method
- * if the output stream is to mimic this input stream.
- */
- public XMLOutputStream createOutputStream( OutputStream out)
- {
- XMLOutputStream xmlOut = new XMLOutputStream( out );
- try {
- xmlOut.setEncoding( encoding,
- littleendian,
- byteOrderMark );
- } catch (IOException e ) {
- // Hmm. This should never happen because we already
- // successfully created the input stream.
- }
- return xmlOut;
- }
-
- /**
- * Close the stream and release system resources.
- */
- public void close() throws IOException
- {
- if (insr != null)
- insr.close();
- else if (in != null)
- in.close();
- }
-
- /**
- * Character pushed back into the stream.
- * Need to be able push back four characters for auto-encoding
- * detection support.
- */
- private int next[] = new int[4];
- private int index = -1;
-
- /**
- * The stream readers
- */
- private InputStream in;
- private InputStreamReader insr;
-
- /**
- * We remember the current position in the input stream so that
- * we can re-scan input after doing a reset() when setEncoding
- * is called. We have to do a mark()/reset() on setEncoding because
- * the first UTF8 InputStreamReader may buffer the input !
- */
- private int pos = 0;
-
- /**
- * Character encoding state
- */
- private String encoding;
- private boolean littleendian; // file littleendian (only applies to UCS-2 encoded files)
- private boolean byteOrderMark; // byteOrderMark at the beginning of file (UCS-2)
- private int readState;
- private boolean jdk11;
-
- /**
- * Instance varibales for Windows platforms
- */
- private int intEncoding = -1; // encoding of input stream
- private boolean eof = false; // whether the end of input stream has been reached
-
- /**
- * Whether the document to be parsed caseInsensitive.
- * (if so, all names are folded to upper case).
- * Default is 'false'.
- */
- public boolean caseInsensitive;
- }
-