home *** CD-ROM | disk | FTP | other *** search
- /*
- *******************************************************************************
- * *
- * COPYRIGHT: *
- * (C) Copyright International Business Machines Corporation, 1998, 1999 *
- * Licensed Material - Program-Property of IBM - All Rights Reserved. *
- * US Government Users Restricted Rights - Use, duplication, or disclosure *
- * restricted by GSA ADP Schedule Contract with IBM Corp. *
- * *
- *******************************************************************************
- *
- */
- // XMLConverter.cpp
- // To convert one encoded XML file to another
-
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <assert.h>
-
- #ifdef _WIN32
- # include <windows.h>
- #endif
-
- #include "utypes.h"
- #include "ustring.h"
- #include "ucnv.h"
- #include "ucnv_err.h"
- #include "uloc.h"
- #include "uchar.h"
-
/* Raw byte type used for the encoding-signature tables and sniffing code. */
typedef unsigned char BYTE;

/* Buffer and table sizes. */
#define MAXFILENAMELEN 1024     /* not referenced by the code in this file */
#define RAWBUFSIZE     4096     /* size of the raw/scratch I/O buffers */
#define ENCODINGCOUNT  5        /* value given to Encodings_Count below */
#define FIRSTLINEBUF   256      /* capacity of the first-line scratch buffer */

/* Scratch line buffer; not referenced by the code in this file. */
char firstLine[128] = { 0 };

/* Encoding name taken from (or assumed for) the XML declaration; filled in by
 * convertFirstLine() and read back by convertFile(). */
char encodingNameInFile[256] = { 0 };

/* Set by the -v command-line switch; enables progress/diagnostic output. */
bool verbose = false;
-
- extern void convertFile(char*, char*, char*, UConverter*);
- extern void usage();
- extern void printChars(unsigned char*, int);
- extern int getInputEncodingType(const BYTE* rawBuffer,
- unsigned long byteCount);
- extern long convertFirstLine(FILE* inF,
- char* inEncName,
- FILE* outF,
- char* outEncName,
- char* ptrBuf,
- unsigned long toRead,
- UChar* uBuf);
- extern void catString(char* thisString, bool quote);
- extern int32_t XMLUConvert( UConverter* inConverter,
- UConverter* outConverter,
- const char* inBuffer,
- int32_t* inBufSize,
- char* outBuffer,
- int32_t outBufCapacity,
- bool_t flush,
- UErrorCode* err);
- extern void XMLU_fromCodepageToCodepage( UConverter* outConverter,
- UConverter* inConverter,
- char** target,
- const char* targetLimit,
- const char** source,
- const char* sourceLimit,
- int32_t* offsets,
- bool_t flush,
- UErrorCode* err);
-
- static const BYTE gEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94 };
- #if 0
- //not supported encodings
- static const BYTE gUCS4BPre[] = { 0x00, 0x00, 0x00, 0x3C };
- static const BYTE gUCS4LPre[] = { 0x3C, 0x00, 0x00, 0x00 };
- #endif
- static const BYTE gUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F };
- static const BYTE gUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00 };
- static const char gXMLDecl_ASCII[]= { 0x3C, 0x3F, 0x78, 0x6D, 0x6C };
-
- enum Encodings
- {
- EBCDIC = 0,
- UCS_4B = 1,
- UCS_4L = 2,
- US_ASCII = 3,
- UTF_8 = 4,
- UTF_16B = 5,
- UTF_16L = 6,
-
- Encodings_Count = ENCODINGCOUNT,
- Encodings_Min = EBCDIC,
- Encodings_Max = UTF_16L,
-
- OtherEncoding = 999
- };
-
-
/*
 * usage
 * Writes the command-line help text to stdout.
 *
 *   exeName - program name, substituted into the synopsis, the legend and
 *             the closing example.
 */
void usage(char * exeName)
{
    // Help lines that contain no substitution, in the order they are shown.
    static const char* const fixedHelp[] =
    {
        "-h \t= to get help (this information!) \n ",
        "-v \t= set verbose on; \n \t\t to get more information about the conversion process \n ",
        "-e \t= This is a mandatory option and follows with the targetEncName",
        " \t\t E.g., output encoding can be like : \n \t\t ascii, utf8, utf-16be, utf-16le, ebcdic-cp-us \n",
        "trgEncName \t= The output encoding type needed. \n \t\t It always should follow the -e switch\n",
        "inputFile \t= The input XML file name \n",
        "outputFile \t= The output XML file name \n",
        " \n For example: \n "
    };
    const size_t fixedHelpCount = sizeof(fixedHelp) / sizeof(fixedHelp[0]);

    // Synopsis and legend header carry the program name.
    fprintf(stdout, "\n USAGE: \n \t%s [-h] [-v] -e trgEncName inputFile outputFile \n\n", exeName);
    fprintf(stdout, " %s = Exe name \n ", exeName);

    for (size_t k = 0; k < fixedHelpCount; ++k)
    {
        fputs(fixedHelp[k], stdout);
    }

    // Worked example, again with the program name.
    fprintf(stdout, " \t %s -e utf8 pr-utf-16.xml pr-utf-8.xml \n\n\n ", exeName);
}
-
-
-
- int main(int argc, char** argv)
- {
- UErrorCode err = U_ZERO_ERROR;
- char* inFileName;
- char* outFileName;
- char * encName = NULL;
-
- UConverter* conv = NULL;
-
- for (int i=0; i< argc; i++)
- {
- if (!strcmp( argv[i], "-h") || (argc < 5) )
- {
- usage(argv[0]);
- exit(1);
- }
- if (!strcmp( argv[i], "-v"))
- verbose = true;
- if (!strcmp( argv[i], "-e"))
- {
- if ( argc == i+4)
- {
- encName = new char[strlen(argv[i+1]) +1];
- strcpy(encName, argv[i+1]);
- inFileName = new char[strlen(argv[i+2]) +1];
- strcpy(inFileName, argv[i+2]);
- outFileName = new char[strlen(argv[i+3]) +1];
- strcpy(outFileName, argv[i+3]);
- break;
- }
- else
- {
- usage(argv[0]);
- exit(1);
- }
- }
- }
-
- conv = ucnv_open(encName, &err);
- if (U_FAILURE(err))
- {
- if (verbose)
- {
- fprintf(stderr, "Could not create converter to: %s\n", encName);
- #if defined(_DEBUG) && defined(XP_CPLUSPLUS)
- fprintf (stderr,"FAILURE! (%s) (%d)\n", errorName(err), err);
- #endif
- }
- ucnv_close(conv);
- exit(1);
- }
-
- fprintf(stdout, "Converting %s to %s...\n", inFileName, outFileName);
- convertFile(encName, inFileName, outFileName, conv);
- fprintf(stdout, "Finished transcoding file: %s\n", inFileName);
-
- ucnv_close(conv);
- if (encName)
- delete encName;
- return 0;
- }
-
- void convertFile(char* encName, char* iFN, char* oFN, UConverter* outConvrtr)
- {
- //Read the input file
- //
- FILE* inFile = fopen( iFN, "rb");
- if (inFile == NULL) {
- if (verbose)
- fprintf(stderr, "Could not open input file - %s for reading \n", iFN);
- exit(1);
- }
-
- FILE* outFile = fopen(oFN, "wb");
- if (outFile == NULL)
- {
- if (verbose)
- fprintf(stderr, "Could not open output file - %s for writing \n", oFN);
- fclose(inFile);
- return;
- }
-
- char rawBuf[RAWBUFSIZE];
- char* pRawBuf = NULL;
- unsigned long bytesRead = 0;
- UErrorCode err = U_ZERO_ERROR;
-
- //get the file size
- //
- unsigned int curPos = ftell(inFile);
-
- if(verbose)
- fprintf(stderr, "curPos = %d\n", curPos);
-
- if (curPos == 0xFFFFFFFF)
- {
- fprintf(stderr, "fileSize - Could not save current pos \n");
- exit(1);
- }
-
- // Seek to the end and save that value for return
- //
- if ( fseek(inFile, 0 , SEEK_END) )
- {
- fprintf(stderr, "fileSize - Could not seek to end \n");
- exit(1);
- }
-
- const unsigned int endPos = ftell(inFile);
- if (endPos == 0xFFFFFFFF)
- {
- fprintf(stderr, "fileSize - Could not get the end pos \n");
- exit(1);
- }
-
- // And put the pointer back
- //
- if (fseek(inFile, curPos, SEEK_SET))
- {
- fprintf(stderr, "fileSize - Could not seek back to original pos \n");
- exit(1);
- }
-
- if (curPos >= endPos)
- {
- fprintf(stderr,"Reached end of input file while reading \n");
- exit(1);
- }
-
- unsigned int bytesLeft = endPos - curPos;
- if (verbose)
- fprintf(stdout,"Input file size is %d \n", bytesLeft);
-
- unsigned int toRead = (RAWBUFSIZE > bytesLeft) ? bytesLeft : RAWBUFSIZE;
-
- //Read the infile
- //
- bytesRead = fread( (void*)rawBuf, 1, toRead, inFile);
- if (ferror(inFile))
- {
- fprintf(stderr," couldnot read file for input encoding \n");
- exit(1);
- }
-
- if (bytesRead == 0)
- {
- fprintf(stderr," couldnot fill raw buffer \n");
- exit(1);
- }
- pRawBuf = rawBuf;
-
- // get the input encoding type
- int inputEnc = getInputEncodingType((const BYTE*)rawBuf, bytesRead);
- if (inputEnc == OtherEncoding)
- {
- fprintf(stderr, " Unknown encoded input file. \n Only input encodings supported in the first line are \n");
- fprintf(stderr, " ascii, ebcdic-cp-us, utf8, utf-16be, utf-16le \n");
- exit(1);
- }
-
- //transcoding the first line from inEncodName to ascii and then replacing
- //the encoding=inEncodingName to encoding=outEncodingName
- //
-
- UChar ucBuf[RAWBUFSIZE];
- char * inEncodName;
- char* tmpPtr = (char*) rawBuf;
-
- //get the input encoding name
- //
- switch (inputEnc)
- {
- case 0 :
- inEncodName = new char[strlen("ebcdic-cp-us") +1];
- strcpy(inEncodName, "ebcdic-cp-us");
- break;
- case 3 :
- inEncodName = new char[strlen("ascii") +1];
- strcpy(inEncodName, "ascii");
- break;
- case 4 :
- inEncodName = new char[strlen("utf8") +1];
- strcpy(inEncodName, "utf8");
- break;
- case 5 :
- inEncodName = new char[strlen("utf-16be") +1];
- strcpy(inEncodName, "utf-16be");
- break;
- case 6 :
- inEncodName = new char[strlen("utf-16le") +1];
- strcpy(inEncodName, "utf-16le");
- break;
- default :
- break;
- };
-
- if(verbose)
- {
- fprintf(stderr, "inConverter = %s\n", inEncodName);
- }
-
- UConverter* inConvrtr = ucnv_open(inEncodName, &err);
- //now read and transcode the input to output file
- //Process the firstline separately
- //
- long afterFirstLine = convertFirstLine(inFile, inEncodName, outFile, encName,
- pRawBuf, toRead, (UChar*)ucBuf);
-
- //move the pointer after the first line
- //
- if (fseek(inFile, (unsigned long) afterFirstLine, SEEK_SET))
- {
- fprintf(stderr, "fileSize - Could not set the cursor to %d after the first line \n", afterFirstLine);
- exit(1);
- }
- else
- if(verbose)
- fprintf(stderr,"Seeked to %d OK \n", afterFirstLine);
- bytesLeft = endPos - afterFirstLine;
- toRead = (RAWBUFSIZE > bytesLeft) ? bytesLeft : RAWBUFSIZE;
-
- // read the rest of the input file
- //
- if (verbose)
- fprintf(stdout,"The first line consists of %d bytes \n", afterFirstLine);
- if (encodingNameInFile !=NULL)
- {
- if (inEncodName)
- delete inEncodName;
- inEncodName = new char[strlen(encodingNameInFile)+1];
- strcpy(inEncodName, encodingNameInFile);
- ucnv_close(inConvrtr);
- inConvrtr = ucnv_open(inEncodName, &err);
- }
- if (verbose)
- fprintf(stdout, "Input Encoding type = %s, Output Encoding type = %s \n", inEncodName, encName);
-
- char *outBuf = new char[RAWBUFSIZE];
- int outBufSize = RAWBUFSIZE;
- bool tFlush = false;
- err = U_ZERO_ERROR;
-
- if (verbose)
- fprintf(stdout, "processing the rest of the file \n");
- while( (bytesRead = fread((void *) rawBuf, 1, toRead, inFile)) > 0 || !tFlush)
- {
- int32_t bytesNeeded = XMLUConvert( inConvrtr,
- outConvrtr,
- pRawBuf,
- (int32_t*)&bytesRead,
- outBuf,
- outBufSize,
- tFlush,
- &err);
- if (bytesNeeded > 0)
- {
- long bout =
- fwrite((void *) outBuf, 1, bytesNeeded, outFile);
- if (bout != bytesNeeded)
- {
- fprintf(stderr, "Wrote only %d bytes.\n", bout);
- fclose(inFile);
- fclose(outFile);
- }
- }
-
- if ((err != U_BUFFER_OVERFLOW_ERROR) && U_FAILURE(err) )
- {
- #if defined(_DEBUG)
- fprintf (stderr, "Error transcoding rest of the file: (%s) %d\n", errorName(err), err);
- #endif
- fclose(inFile);
- fclose(outFile);
- exit(1);
- }
- if ((bytesRead > 0) && (err !=U_ZERO_ERROR))
- {
- if(verbose)
- fprintf(stderr, "err=%d * read %d bytes\n", err,bytesRead);
-
- if (fseek(inFile, (curPos+bytesRead), SEEK_SET))
- {
- fprintf(stderr, "fileSize - Could not set the input cursor to %d (curpos=%d, bytesRead=%d)\n", curPos+bytesRead,curPos,bytesRead);
- exit(1);
- }
- curPos = ftell(inFile);
- bytesLeft = endPos - curPos;
- }
- else
- {
- curPos = ftell(inFile);
- bytesLeft = endPos - curPos;
- }
- toRead = (RAWBUFSIZE > bytesLeft) ? bytesLeft : RAWBUFSIZE;
- if (toRead < RAWBUFSIZE) tFlush = true;
- if (err == U_BUFFER_OVERFLOW_ERROR)
- err = U_ZERO_ERROR;
- }
- ucnv_close(inConvrtr);
- delete inEncodName;
- fclose(inFile);
- fclose(outFile);
- };
-
-
-
- int getInputEncodingType(const BYTE* rawBuffer, unsigned long byteCount)
- {
- //match the first four bytes of the input buffer with the encoding types available
- //checking for ASCII
- //
- if (byteCount > 5)
- {
- if (!memcmp(rawBuffer, gXMLDecl_ASCII, 5))
- return US_ASCII;
- }
-
- // If the count of raw bytes is less than 2, it cannot be anything
- // we understand, so return UTF-8 as a fallback.
- //
- if (byteCount < 2)
- return UTF_8;
-
- // We know its at least two bytes, so lets check for a UTF-16 BOM.
- //
- if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
- return UTF_16B;
- else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
- return UTF_16L;
-
- // Oh well, not one of those. So now lets see if we have at least 4
- // bytes. If not, then we are out of ideas and can return UTF-8 as the
- // fallback.
- //
- if (byteCount < 4)
- return OtherEncoding;
-
- // We have at least 4 bytes. So lets check the 4 byte sequences that
- // indicate other UTF-16 encodings.
- //
- if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C))
- {
- #if 0
- //not supported encodings
- if (!memcmp(rawBuffer, gUCS4BPre, 4))
- return UCS_4B;
- else if (!memcmp(rawBuffer, gUCS4LPre, 4))
- return UCS_4L;
- else
- #endif
- if (!memcmp(rawBuffer, gUTF16BPre, 4))
- return UTF_16B;
- else if (!memcmp(rawBuffer, gUTF16LPre, 4))
- return UTF_16L;
- }
-
- // See if we have enough bytes to possibly match the EBCDIC prefix.
- // If so, try it.
- //
- if (!memcmp(rawBuffer, gEBCDICPre, 4))
- return EBCDIC;
-
- // Does not seem to be anything we know, so go with UTF-8 to get at
- // least through the first line and see what it really is.
- //
- return OtherEncoding;
- }
-
-
- long convertFirstLine( FILE* inF, char* inEncName,
- FILE* outF, char* outEncName,
- char* ptrBuf, unsigned long toRead,
- UChar* uBuf)
- {
- //Here we read the inputFile with the specified buffer size.
- //Then convert this to ascii. then read the first line and convert to
- //output and input encoding types and return for rest of the conversion
- //
-
- if (fseek(inF, 0, SEEK_SET))
- {
- fprintf(stderr, "file - Could not seek the begin pos \n");
- exit(1);
- }
-
- unsigned long bytesRead = fread( (void*)ptrBuf, 1, toRead, inF);
-
- char tempBuf[RAWBUFSIZE];
- int bufLength = 0;
- long bytesNeeded = 0;
- UErrorCode err = U_ZERO_ERROR;
-
- bytesNeeded = ucnv_convert("ascii",
- inEncName,
- (char*) tempBuf,
- 0,
- (const char*) ptrBuf,
- bytesRead,
- &err);
-
- if (err == U_BUFFER_OVERFLOW_ERROR)
- {
- err = U_ZERO_ERROR;
- }
- else if (U_FAILURE(err))
- {
- #if defined(_DEBUG)
- printf ("Error transcoding first line of input file: (%s) %d\n", errorName(err), err);
- #endif
- fclose(inF);
- fclose(outF);
- exit(1);
- }
-
- ucnv_convert("ascii",
- inEncName,
- (char*) tempBuf,
- bytesNeeded,
- (const char*) ptrBuf,
- bytesRead,
- &err);
-
- if (U_FAILURE(err))
- {
- #if defined(_DEBUG)
- printf ("Error transcoding2 first line of input file: (%s) %d\n", errorName(err), err);
- #endif
- fclose(inF);
- fclose(outF);
- exit(1);
- }
- else
- {
- //read the tempBuf to get the first line
- //
- char firstLineBuf[FIRSTLINEBUF];
- int tempBufLength = 0;
-
- for( bufLength = 0, tempBufLength=0; bufLength < FIRSTLINEBUF; bufLength++, tempBufLength++)
- {
- if ((tempBufLength == 0) && ((inEncName == "utf-16be") || (inEncName == "utf-16le") || (inEncName == "utf16")) )
- tempBufLength++;
- firstLineBuf[bufLength] = (char)tempBuf[tempBufLength];
- if (tempBuf[tempBufLength] == 0x3E) {
- firstLineBuf[bufLength+1] = '\0';
- break;
- }
-
- }
- char* pFLB = new char[sizeof(firstLineBuf) +1];
- strcpy(pFLB, firstLineBuf);
-
- //if the file doesnot contain the version string line then its and illegal file
- //
- if (firstLineBuf[0] != 0x3C )
- {
- fprintf(stderr,"Illegal xml file: It doesnot contain the xml declaration statement on the first line \n");
- fclose(inF);
- fclose(outF);
- exit(1);
- }
-
- bool encString = true;
- bool stdString = true;
- bool encInsertMid = false;
- bool encInsertLast = false;
- bool dQuote = true;
- char* doubleQuote = "\"";
- char* singleQuote = "\'";
-
- if (!strstr( (const char*)pFLB, doubleQuote))
- {
- if (!strstr( (const char*)pFLB, singleQuote))
- {
- fprintf(stderr,"Illegal xml file: It doesnot contain the approprite xml declaration \n");
- fclose(inF);
- fclose(outF);
- exit(1);
- }
- dQuote = false;
- }
-
- char* newString = strstr( (const char*) pFLB, "encoding");
- char* stringWithEnc = 0;
-
- if (!newString)
- encString = false;
- else
- {
- stringWithEnc = new char[strlen(newString)+1];
- strcpy(stringWithEnc, newString);
- }
-
- newString = strstr( (const char*) pFLB, "standalone");
- char* stringWithStd = 0;
- if (!newString)
- stdString = false;
- else
- {
- stringWithStd = new char[strlen(newString)+1];
- strcpy(stringWithStd, newString);
- }
-
- if (!encString && !stdString)
- encInsertLast = true;
- if (!encString && stdString)
- encInsertMid = true;
-
- //Encodingname for the rest of the input file could be different.
- //If its not specified in the first line then assume it to be UTF8
- if (encInsertLast || encInsertMid)
- {
- //if the encoding type was found utf16 family or ebcdic and
- // the encoding string is not present in the file then its an error
- if (!strcmp(inEncName, "utf-16be")
- || !strcmp(inEncName, "utf-16le")
- || !strcmp(inEncName, "ebcdic-cp-us"))
- {
- fprintf(stderr, "Illegal xml file: it doesnot contain the encoding string in the first line of the input file\n");
- fclose(inF);
- fclose(outF);
- exit(1);
- }
- strcpy(encodingNameInFile, inEncName);
- }
-
- char* tempString = " encoding=";
- char* dupFLB = strdup(pFLB);
- int stringTwoLength = 0;
-
- /* build up the length */
- stringTwoLength = bufLength;
-
- if(tempString)
- stringTwoLength += strlen(tempString);
-
- if(outEncName)
- stringTwoLength += strlen(outEncName);
-
- if(stringWithStd)
- stringTwoLength += strlen(stringWithStd);
-
- stringTwoLength += 5;
-
- char* stringTwo = new char[stringTwoLength];
-
- if (encInsertLast) {
- char* stringOne = new char[bufLength];
- strncpy(stringOne, pFLB, bufLength-1);
- strcpy(stringOne+bufLength-1, "");
- stringTwo = strcpy(stringTwo, stringOne);
- strcat(stringTwo, tempString);
- catString(stringTwo, dQuote);
- strcat(stringTwo, outEncName);
- catString(stringTwo, dQuote);
- strcat(stringTwo , " ?>");
- delete stringOne;
- }
- //insert the string before 'standalone' statement
- else if (encInsertMid) {
- char* stringThree = new char[bufLength + strlen(tempString) + strlen(outEncName) + 5];
- if (dQuote)
- stringThree = strtok(dupFLB, doubleQuote);
- else
- stringThree = strtok(dupFLB, singleQuote);
-
- strcpy(stringTwo, stringThree);
- catString(stringTwo, dQuote);
-
- char* tmpString;
- if (dQuote)
- tmpString = strtok(0, doubleQuote);
- else
- tmpString = strtok(0, singleQuote);
- if (tmpString != NULL)
- strcat(stringTwo, tmpString);
-
- catString(stringTwo, dQuote);
- strcat(stringTwo, tempString);
- catString(stringTwo, dQuote);
-
- strcat(stringTwo, outEncName);
- if (dQuote)
- strcat(stringTwo, "\" ");
- else
- strcat(stringTwo, "\' ");
- strcat(stringTwo, stringWithStd);
- delete stringThree;
- }
- //if the encoding string is there then modify the output encoding name in it.
- else if (encString)
- {
- char* stringFive = new char[strlen(dupFLB)+1];
-
- if (dQuote)
- stringFive = strtok (dupFLB, doubleQuote);
- else
- stringFive = strtok (dupFLB, singleQuote);
-
- strcpy(stringTwo, stringFive);
- catString(stringTwo, dQuote);
- while (stringFive != NULL)
- {
- if (dQuote)
- stringFive = strtok(0,doubleQuote);
- else
- stringFive = strtok(0,singleQuote);
-
- if (stringFive == NULL)
- break;
- strcat(stringTwo, stringFive);
-
- char* n1String = strstr(stringFive, ">");
- if (!n1String)
- catString(stringTwo, dQuote);
-
- char* nString = strstr(stringFive, "encoding");
- if (nString)
- {
- strcat(stringTwo, outEncName);
- if (dQuote)
- stringFive = strtok(0, doubleQuote);
- else
- stringFive = strtok(0, singleQuote);
- strcpy(encodingNameInFile, stringFive); //this is the encoded string name
- catString(stringTwo, dQuote);
- }
- }
- if (stringFive != NULL)
- {
- delete stringFive;
- stringFive = 0;
- }
- }
-
- // introduce the first order bytes for utf16 be and le files
- //
- if (!strcmp(outEncName, "utf-16be") || !strcmp(outEncName, "utf16"))
- {
- uBuf[0] = 0xFE;
- fwrite( (void*) uBuf, 1, 1, outF);
- uBuf[0] = 0xFF;
- fwrite( (void*) uBuf, 1, 1, outF);
- } else if (!strcmp(outEncName , "utf-16le"))
- {
- uBuf[0] = 0xFF;
- fwrite( (void*) uBuf, 1, 1, outF);
- uBuf[0] = 0xFE;
- fwrite( (void*) uBuf, 1, 1, outF);
- }
-
- err = U_ZERO_ERROR;
- long oneChar = 0;
- while ( *stringTwo != '\0' )
- {
- //transcode character-by-character
- oneChar = ucnv_convert(outEncName,
- "ascii",
- (char*) uBuf,
- 0,
- (const char*) stringTwo,
- 1,
- &err);
- if (err == U_BUFFER_OVERFLOW_ERROR)
- {
- err = U_ZERO_ERROR;
- }
- else if (U_FAILURE(err))
- {
- #if defined(_DEBUG)
- fprintf (stderr, "Error transcoding char-by-char: (%s) %d\n", errorName(err), err);
- #endif
- fclose(inF);
- fclose(outF);
- exit(1);
- }
-
- ucnv_convert(outEncName,
- "ascii",
- (char*) uBuf,
- oneChar,
- (const char*) stringTwo,
- 1,
- &err);
- if (U_FAILURE(err))
- {
- #if defined(_DEBUG)
- fprintf (stderr, "Error transcoding2 char-by-char: (%s) %d\n", errorName(err), err);
- #endif
- fclose(inF);
- fclose(outF);
- exit(1);
- }
- fwrite( (void*) uBuf, 1, oneChar, outF);
- stringTwo++;
- }
- }
-
-
- //Now get the pointer offset after the first line in the input file
- //and return this position
- //
- char* newInEncName = new char[strlen(inEncName) +1];
- strcpy(newInEncName, inEncName);
- if (encodingNameInFile !=NULL)
- {
- if (inEncName)
- delete newInEncName;
- newInEncName = new char[strlen(encodingNameInFile)+1];
- strcpy(newInEncName, encodingNameInFile);
- }
-
- char oldBuf[RAWBUFSIZE];
- int bufHere = bufLength +1;
- if (!strcmp(newInEncName, "utf-16be") || !strcmp(newInEncName, "utf16") || !strcmp(newInEncName, "utf-16le"))
- {
- bufHere +=1;
- memcpy((void*)oldBuf, (void*) tempBuf, bufHere);
- }
- else
- memcpy((void*)oldBuf, (void*) tempBuf, bufHere);
-
- char newBuf[RAWBUFSIZE];
- long endBytes = 0;
- //transcode this ascii type to the input encoding type
- //and get the pointer to the end of first line in the input buffer
- //
- err = U_ZERO_ERROR;
- endBytes = ucnv_convert(newInEncName,
- "ascii",
- (char*) newBuf,
- 0,
- (const char*) oldBuf,
- bufHere,
- &err);
-
- if (err == U_BUFFER_OVERFLOW_ERROR)
- {
- err = U_ZERO_ERROR;
- }
- else if (U_FAILURE(err))
- {
- #if defined(_DEBUG)
- fprintf (stderr, "Error transcoding from ascii to input encoding: (%s) %d\n", errorName(err), err);
- #endif
- fclose(inF);
- fclose(outF);
- exit(1);
- }
- ucnv_convert(newInEncName,
- "ascii",
- (char*) newBuf,
- endBytes,
- (const char*) oldBuf,
- bufHere,
- &err);
- if (U_FAILURE(err))
- {
- #if defined(_DEBUG)
- fprintf (stderr, "Error transcoding2 from ascii to input encoding: (%s) %d\n", errorName(err), err);
- #endif
- delete newInEncName;
- fclose(inF);
- fclose(outF);
- exit(1);
- }
-
- return endBytes;
- }
-
-
- int32_t XMLUConvert( UConverter* inConverter,
- UConverter* outConverter,
- const char* inBuffer,
- int32_t* inBufSize,
- char* outBuffer,
- int32_t outBufCapacity,
- bool_t flush,
- UErrorCode* err)
- {
- const char* inBufferAlias = inBuffer;
- char* outBufferAlias = outBuffer;
- const char* inBufferEnd = inBuffer + *inBufSize;
- const char* outBufferEnd = outBuffer + outBufCapacity;
- //const char* consumed;
-
- if (U_FAILURE(*err)) return 0;
-
- XMLU_fromCodepageToCodepage(outConverter,
- inConverter,
- &outBufferAlias,
- outBufferEnd,
- &inBufferAlias,
- inBufferEnd,
- NULL,
- flush,
- err);
-
- if (*err == U_INDEX_OUTOFBOUNDS_ERROR) *err = U_BUFFER_OVERFLOW_ERROR;
-
- // *inBufSize = inBufferAlias;
- return outBufferAlias - outBuffer;
- }
-
- void XMLU_fromCodepageToCodepage( UConverter* outConverter,
- UConverter* inConverter,
- char** target,
- const char* targetLimit,
- const char** source,
- const char* sourceLimit,
- int32_t* offsets,
- bool_t flush,
- UErrorCode* err)
- {
-
- #if 0
- UChar out_chunk[RAWBUFSIZE];
- const UChar* out_chunk_limit = out_chunk + RAWBUFSIZE;
- UChar* out_chunk_alias;
- UChar const* out_chunk_alias2;
- UChar const* consumed_UChars;
-
-
- if (U_FAILURE(*err)) return;
-
- *consumed = *source;
- /*loops until the input buffer is completely consumed
- *or if an error has be encountered
- *first we convert from inConverter codepage to Unicode
- *then from Unicode to outConverter codepage
- */
-
- while ((sourceLimit != *source) && U_SUCCESS(*err))
- {
- out_chunk_alias = out_chunk;
- *source = *consumed;
- ucnv_reset(inConverter);
- ucnv_toUnicode(inConverter,
- &out_chunk_alias,
- out_chunk_limit,
- source,
- sourceLimit,
- consumed,
- flush,
- err);
-
- /*U_INDEX_OUTOFBOUNDS_ERROR means that the output "CHUNK" is full
- *we will require at least another loop (it's a recoverable error)
- */
-
- if (U_SUCCESS(*err) || (*err == U_INDEX_OUTOFBOUNDS_ERROR))
- {
- *err = U_ZERO_ERROR;
- out_chunk_alias2 = out_chunk;
-
- while ((out_chunk_alias2 != out_chunk_alias) && U_SUCCESS(*err))
- {
- ucnv_fromUnicode(outConverter,
- target,
- targetLimit,
- &out_chunk_alias2,
- out_chunk_alias,
- &consumed_UChars,
- FALSE,
- err);
-
- }
- }
- else break;
- }
- return;
-
- #endif
-
-
- UChar out_chunk[RAWBUFSIZE];
- const UChar *out_chunk_limit = out_chunk + RAWBUFSIZE;
- UChar *out_chunk_alias;
- UChar const *out_chunk_alias2;
-
-
- if (U_FAILURE (*err)) return;
-
-
- /*loops until the input buffer is completely consumed
- *or if an error has be encountered
- *first we convert from inConverter codepage to Unicode
- *then from Unicode to outConverter codepage
- */
- while ((*source != sourceLimit) && U_SUCCESS (*err))
- {
- out_chunk_alias = out_chunk;
- ucnv_toUnicode (inConverter,
- &out_chunk_alias,
- out_chunk_limit,
- source,
- sourceLimit,
- NULL,
- flush,
- err);
-
- /*U_INDEX_OUTOFBOUNDS_ERROR means that the output "CHUNK" is full
- *we will require at least another loop (it's a recoverable error)
- */
-
- if (U_SUCCESS (*err) || (*err == U_INDEX_OUTOFBOUNDS_ERROR))
- {
- *err = U_ZERO_ERROR;
- out_chunk_alias2 = out_chunk;
-
- while ((out_chunk_alias2 != out_chunk_alias) && U_SUCCESS (*err))
- {
- ucnv_fromUnicode (outConverter,
- target,
- targetLimit,
- &out_chunk_alias2,
- out_chunk_alias,
- NULL,
- TRUE,
- err);
-
- }
- }
- else
- break;
- }
-
- return;
- }
-
/*
 * catString
 * Appends a single quote character to the NUL-terminated string thisString.
 *
 *   thisString - destination; must have room for one more character.
 *   quote      - true appends a double quote ("), false a single quote (').
 */
void catString(char* thisString, bool quote)
{
    const char* mark = quote ? "\"" : "\'";
    strcat(thisString, mark);
}
-