home *** CD-ROM | disk | FTP | other *** search
- /*
- * (C) Copyright IBM Corp. 1998 - All Rights Reserved
- * @version 1.0 06/19/98
- * @author Helena Shih
- * Based on Taligent international support for C++
- */
-
- #include <stdio.h>
- #include <stdlib.h>
- #include <iostream.h>
- #include <string.h>
- #include <assert.h>
-
- #include "ucmp16.h"
- CompactShortArray* ulxfrmArray = 0;
-
/*
 * Numeric codes for the character categories handled by this tool.
 * The two-letter tag noted beside each constant is the abbreviation that
 * the tagStrings/tagValues tables in this file associate with it.
 */
enum ECharTypeMapping {
    /* unassigned */
    UNASSIGNED             = 0,     /* Cn */

    /* letters */
    UPPERCASE_LETTER       = 1,     /* Lu */
    LOWERCASE_LETTER       = 2,     /* Ll */
    TITLECASE_LETTER       = 3,     /* Lt */
    MODIFIER_LETTER        = 4,     /* Lm */
    OTHER_LETTER           = 5,     /* Lo */

    /* marks */
    NON_SPACING_MARK       = 6,     /* Mn */
    ENCLOSING_MARK         = 7,     /* Me */
    COMBINING_SPACING_MARK = 8,     /* Mc */

    /* numbers */
    DECIMAL_DIGIT_NUMBER   = 9,     /* Nd */
    LETTER_NUMBER          = 10,    /* Nl */
    OTHER_NUMBER           = 11,    /* No */

    /* separators */
    SPACE_SEPARATOR        = 12,    /* Zs */
    LINE_SEPARATOR         = 13,    /* Zl */
    PARAGRAPH_SEPARATOR    = 14,    /* Zp */

    /* control, format, private use, surrogate */
    CONTROL                = 15,    /* Cc */
    FORMAT                 = 16,    /* Cf */
    PRIVATE_USE            = 17,    /* Co */
    SURROGATE              = 18,    /* Cs */

    /* punctuation */
    DASH_PUNCTUATION       = 19,    /* Pd */
    START_PUNCTUATION      = 20,    /* Ps */
    END_PUNCTUATION        = 21,    /* Pe */
    CONNECTOR_PUNCTUATION  = 22,    /* Pc */
    OTHER_PUNCTUATION      = 23,    /* Po */

    /* symbols */
    MATH_SYMBOL            = 24,    /* Sm */
    CURRENCY_SYMBOL        = 25,    /* Sc */
    MODIFIER_SYMBOL        = 26,    /* Sk */
    OTHER_SYMBOL           = 27,    /* So */

    /* quotation-style punctuation */
    INITIAL_PUNCTUATION    = 28,    /* Pi */
    FINAL_PUNCTUATION      = 29     /* Pf */
};
-
- static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;
- const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
- const int16_t tagValues[] =
- {
- /* Mn */ (int16_t)NON_SPACING_MARK,
- /* Mc */ (int16_t)COMBINING_SPACING_MARK,
- /* Me */ (int16_t)ENCLOSING_MARK,
- /* Nd */ (int16_t)DECIMAL_DIGIT_NUMBER,
- /* Nl */ (int16_t)LETTER_NUMBER,
- /* No */ (int16_t)OTHER_NUMBER,
- /* Zs */ (int16_t)SPACE_SEPARATOR,
- /* Zl */ (int16_t)LINE_SEPARATOR,
- /* Zp */ (int16_t)PARAGRAPH_SEPARATOR,
- /* Cc */ (int16_t)CONTROL,
- /* Cf */ (int16_t)FORMAT,
- /* Cs */ (int16_t)SURROGATE,
- /* Co */ (int16_t)PRIVATE_USE,
- /* Cn */ (int16_t)UNASSIGNED,
- /* Lu */ (int16_t)UPPERCASE_LETTER,
- /* Ll */ (int16_t)LOWERCASE_LETTER,
- /* Lt */ (int16_t)TITLECASE_LETTER,
- /* Lm */ (int16_t)MODIFIER_LETTER,
- /* Lo */ (int16_t)OTHER_LETTER,
- /* Pc */ (int16_t)CONNECTOR_PUNCTUATION,
- /* Pd */ (int16_t)DASH_PUNCTUATION,
- /* Ps */ (int16_t)START_PUNCTUATION,
- /* Pe */ (int16_t)END_PUNCTUATION,
- /* Po */ (int16_t)OTHER_PUNCTUATION,
- /* Sm */ (int16_t)MATH_SYMBOL,
- /* Sc */ (int16_t)CURRENCY_SYMBOL,
- /* Sk */ (int16_t)MODIFIER_SYMBOL,
- /* So */ (int16_t)OTHER_SYMBOL,
- /* Pi */ (int16_t)INITIAL_PUNCTUATION,
- /* Pf */ (int16_t)FINAL_PUNCTUATION
- };
- int
- MakeProp(char* str)
- {
- int result = 0;
- char* matchPosition;
-
- matchPosition = strstr(tagStrings, str);
- if (matchPosition == 0) fprintf(stderr, "unrecognized type letter %s", str);
- else result = ((matchPosition - tagStrings) / 2);
- return result;
- }
-
- CompactShortArray*
- getArray(FILE *input)
- {
- if (ulxfrmArray == 0) {
- char buffer[1000];
- char* bufferPtr;
- int set = FALSE;
- char type[3];
-
- try {
- ulxfrmArray = ucmp16_open((int16_t)0xffff);
- int32_t unicode, otherunicode, digit, i;
- while (TRUE) {
- otherunicode = 0xffff;
- digit = -1;
- bufferPtr = fgets(buffer, 999, input);
- if (bufferPtr == NULL) break;
- if (bufferPtr[0] == '#' || bufferPtr[0] == '\n' || bufferPtr[0] == 0) continue;
- sscanf(bufferPtr, "%X", &unicode);
- assert(0 <= unicode && unicode < 65536);
- bufferPtr = strchr(bufferPtr, ';');
- assert(bufferPtr != NULL);
- bufferPtr = strchr(bufferPtr + 1, ';');
- strncpy(type, ++bufferPtr, 2); // go to start of third field
- assert(type != NULL);
- type[2] = 0;
- int typeResult = tagValues[MakeProp(type)];
- // check for the decimal values
- bufferPtr++;
- for (i = 3; i < 8; i++) {
- bufferPtr = strchr(bufferPtr, ';');
- assert(bufferPtr != NULL);
- bufferPtr++;
- }
- sscanf(bufferPtr, "%X", &digit);
- if (((typeResult == DECIMAL_DIGIT_NUMBER) || (typeResult == OTHER_NUMBER)) &&
- (digit >= 0 && digit <= 9)){
- buffer[10];
- sprintf(buffer, "0x%04X", unicode);
- cout << " { " << buffer << ", " << digit << "}, \n";
- }
- bufferPtr++;
- for (i = 8; i < 12; i++) {
- bufferPtr = strchr(bufferPtr, ';');
- assert(bufferPtr != NULL);
- bufferPtr++;
- }
- sscanf(bufferPtr, "%X", &otherunicode);
- // the Unicode char has a equivalent uppercase
- if ((typeResult == LOWERCASE_LETTER) && (0 <= otherunicode && otherunicode < 65536)) {
- set = TRUE;
- }
- if ((typeResult == UPPERCASE_LETTER) && !set) {
- bufferPtr++;
- sscanf(bufferPtr, "%X", &otherunicode);
- if (0 <= otherunicode && otherunicode < 65536) {
- set = TRUE;
- }
- }
- if ((set == TRUE) && (ucmp16_get(ulxfrmArray, (UChar)unicode) == (int16_t)0xffff))
- ucmp16_set(ulxfrmArray, (UChar)unicode, (int16_t)otherunicode);
- set = FALSE;
- }
-
- if (input) fclose(input);
- ucmp16_compact(ulxfrmArray);
- }
- catch (...) {
- fprintf(stderr, "Error Occured while parsing unicode data file.\n");
- }
- }
- return ulxfrmArray;
- }
-
- void
- writeArrays()
- {
- const int16_t* values = ucmp16_getArray(ulxfrmArray);
- const uint16_t* indexes = ucmp16_getIndex(ulxfrmArray);
- int32_t i;
- int32_t cnt = ucmp16_getCount(ulxfrmArray);
- cout << "\nconst uint32_t Unicode::caseIndex[] = {\n ";
- for (i = 0; i < ucmp16_getkIndexCount()-1; i++)
- {
- cout << "(uint16_t)" << ((indexes[i] >= 0) ? (int)indexes[i] : (int)(indexes[i]+ucmp16_getkUnicodeCount()))
- << ", ";
- if (i != 0)
- if (i % 3 == 0)
- cout << "\n ";
- }
- cout << " (uint16_t)" << ((indexes[ucmp16_getkIndexCount()-1] >= 0) ? (int)indexes[i] : (int)(indexes[i]+ucmp16_getkUnicodeCount()))
- << " };\n";
- cout << "\nconst int16_t Unicode::caseValues[] = {\n ";
- for (i = 0; i < cnt-1; i++)
- {
- cout << "(int16_t)" << (int16_t)values[i] << ", ";
- if (i != 0)
- if (i % 5 == 0)
- cout << "\n ";
- }
- cout << " (char)" << (int16_t)values[cnt-1] << " }\n";
- cout << "const int32_t Unicode::caseCount = " << cnt << ";\n";
- }
- /**
- * The main function builds the CharType data array and prints it to System.out
- */
- void main(int argc, char** argv)
- {
- CompactShortArray* arrays = 0;
- FILE *input = 0;
- if (argc != 2) {
- printf("Usage : chartype filename\n\n");
- exit(1);
- }
- input = fopen(argv[1], "r");
- if (input == 0) {
- printf("Cannot open the input file: %s\n\n", argv[1]);
- exit(1);
- }
- arrays = getArray(input);
- writeArrays();
- }
-
-