kermit.columbia.edu

home *** CD-ROM | disk | FTP | other *** search

/ kermit.columbia.edu / kermit.columbia.edu.tar / kermit.columbia.edu / archives / charsets.zip / utf8.c < prev next >

Wrap

C/C++ Source or Header | 2004-08-15 | 13KB | 429 lines

/*
  U T F 8 . C

  Dumps a selected portion of Unicode Plane 0 in UTF8 to standard output.
  Output is one line per character:

    [c] U+xxxx &#ddddd; name

  where:
    c       is the character in UTF-8,
    xxxx    is the 4-digit hex code,
    ddddd   is the decimal code, written as a numeric character reference,
    name    is the character's name from the Unicode database.

  Usage:
    utf8                  Dump all of BMP except controls.
    utf8 hex              Dump <hex> through FFFF.
    utf8 hex1 hex2        Dump <hex1> through <hex2>.
    utf8 -w [hex [hex]]   As above but suitable for the Web.
    utf8 -p directory     Include this to specify directory for database files.

  Default location for Unicode database files is (tried in this order):
    /pub/ftp/kermit/charsets/
    /www/data/ftp/kermit/charsets/

  Obtain up-to-date copies of database files from:
    http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
    http://www.unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt

  If the Unicode database can't be found, the characters are dumped without
  names.  Assumes Unicode database is in ascending order with one record per
  line: first field is hex code, second field is name; field separator is
  semicolon (;).  The DerivedBidiClass file need not be in code order.

  ANSI C required.

  F. da Cruz, Columbia University, May 2000.

  Updated 10 June 2003: new -w option makes a Web version that:
   - Puts &lt; for <, &gt; for >, &amp; for &.
   - Puts a space before each combining mark
   - Puts U+200E (LTRM) after each RTL character.
   - Automatically substitutes space for control/formatting characters.
  Updated 18 Jun 2003:
   - new -p option to specify path for database files.
   - Reads DerivedBidiClass.txt to get BIDI class for undefined code points.
   - getfields() strips leading and trailing blanks from each field.
  Updated 25 Jun 2003:
   - Show hex code as U+xxxx to avoid having the digits in certain entries
     turned into Hindi digits (don't ask!)
  Updated 15 Aug 2004:
   - Show decimal NCRs.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define USHORT unsigned short
#define ULONG unsigned long
#define CONST const
#define CHAR unsigned char

#ifndef MAXPATHLEN
#define MAXPATHLEN 1024
#endif /* MAXPATHLEN */

#define MAXFIELDS 16                    /* Capacity of field[] */
#define MAXBIDI 256                     /* Capacity of bidi[] */

/* Lead-byte marker for a UTF-8 sequence, indexed by sequence length */
CHAR firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};

/* Default directories for Unicode database files */

char * ucdata  = "/pub/ftp/kermit/charsets/";
char * ucdata2 = "/www/data/ftp/kermit/charsets/";

char * argv0;                           /* My name */
char ltrm[4];                           /* Left To Right Mark */
char line[1024];                        /* Database line buffer */
char * field[MAXFIELDS];                /* Fields within line */

struct lohi {                           /* Struct for ranges */
    int lo;
    int hi;
};
struct lohi bidi[MAXBIDI];              /* Default RTL blocks */
int b = 0;                              /* Number of RTL blocks */

/*
  Convert one UCS-2 value to UTF-8.

  ucs2   Code to convert.
  utf8   Receives a pointer to a static, NUL-terminated buffer holding the
         result.  The buffer is overwritten by the next call.

  Returns the number of UTF-8 bytes (1 to 3 unless DO_UCS4 is defined).
*/
int
ucs2_to_utf8(USHORT ucs2, CHAR ** utf8)
{
    static CHAR utf8return[8] = {0,0,0,0,0,0,0,0};
    CONST ULONG byteMask = 0xBF;        /* Clears bit 6 of a trail byte */
    CONST ULONG byteMark = 0x80;        /* Sets bit 7 of a trail byte */
    int utf8len = 0;
    int i = 0;

    if (ucs2 < 0x80) {
        utf8len = 1;
    } else if (ucs2 < 0x800) {
        utf8len = 2;
    } else
#ifdef DO_UCS4
      /* This is always true for UCS-2 but would be needed for UCS-4 */
      /* When ucs2 is USHORT this gives compiler warnings. */
      if (ucs2 <= 0xffff)
#endif /* DO_UCS4 */
    {
        utf8len = 3;
    }
#ifdef DO_UCS4
    /* The following would be for UCS-4 */
    else if (ucs2 < 0x200000) {
        utf8len = 4;
    } else if (ucs2 < 0x4000000) {
        utf8len = 5;
    } else if (ucs2 <= 0x7FFFFFFFUL) {  /* 31 bits = max for UCS4 */
        utf8len = 6;
    } else {
        utf8len = 2;
        ucs2 = 0xFFFD;                  /* Replacement for invalid char */
    }
#endif /* DO_UCS4 */

    i = utf8len;                        /* Index into utf8return */
    utf8return[i--] = 0;                /* Null terminate the string */

    /* Fill from the last byte backwards, six payload bits per trail byte */
    switch (utf8len) {
      case 6:
        utf8return[i--] = (CHAR)((ucs2 | byteMark) & byteMask);
        ucs2 >>= 6;
        /* FALLTHROUGH */
      case 5:
        utf8return[i--] = (CHAR)((ucs2 | byteMark) & byteMask);
        ucs2 >>= 6;
        /* FALLTHROUGH */
      case 4:
        utf8return[i--] = (CHAR)((ucs2 | byteMark) & byteMask);
        ucs2 >>= 6;
        /* FALLTHROUGH */
      case 3:
        utf8return[i--] = (CHAR)((ucs2 | byteMark) & byteMask);
        ucs2 >>= 6;
        /* FALLTHROUGH */
      case 2:
        utf8return[i--] = (CHAR)((ucs2 | byteMark) & byteMask);
        ucs2 >>= 6;
        /* FALLTHROUGH */
      case 1:
        utf8return[i] = (CHAR)(ucs2 | firstByteMark[utf8len]);
        break;
      default:
        break;
    }
    *utf8 = utf8return;
    return(utf8len);
}

/* Print a usage message naming program s on stderr and exit with status 1 */
void
usage(const char * s)
{
    fprintf(stderr,"Usage: %s [-w] [-p directory] [hex [hex]]\n",
            s ? s : "utf8");
    exit(1);
}

/*
  Convert a hex string to an integer.

  Any non-hex character, a null pointer, or a value too large to be returned
  as a non-negative int (assuming int is at least 32 bits) is treated as a
  usage error, which exits.  An empty string yields 0.
*/
int
hextoint(const char * s)
{
    unsigned int x = 0;
    int d = 0;
    char c;

    if (!s)
      usage(argv0);
    while ((c = *s++) != '\0') {
        if (c >= 'A' && c <= 'F')
          d = c - 'A' + 10;
        else if (c >= 'a' && c <= 'f')
          d = c - 'a' + 10;
        else if (c >= '0' && c <= '9')
          d = c - '0';
        else
          usage(argv0);
        if (x > 0x07FFFFFFU)            /* Next shift would not fit in int */
          usage(argv0);
        x = (x << 4) + (unsigned int)d;
    }
    return((int)x);
}

/* Clear database fields */
void
clearfields(void)
{
    int i;
    for (i = 0; i < MAXFIELDS; i++)
      field[i] = (char *)0;
}

/*
  Split database entry s in place into field[].

  Each ';' or '#' ends a field; the field is stored with leading and trailing
  blanks and tabs removed.  A '#' also ends the scan, since it introduces a
  comment.  Text after the last separator is not stored as a field.  At most
  MAXFIELDS fields are stored.  field[] entries that are not assigned here
  keep their previous values, so callers clear them first.
*/
void
getfields(char * s)
{
    int i = 0;
    char * p;                           /* Scan pointer */
    char * start = s;                   /* Raw start of the current field */
    char * f;                           /* Field start after trimming */
    char * end;                         /* One past the field's last char */
    char sep;

    for (p = s; *p && i < MAXFIELDS; p++) {
        if (*p != ';' && *p != '#')
          continue;
        sep = *p;                       /* Remember it before overwriting */
        f = start;
        while (*f == ' ' || *f == '\t') /* Trim leading whitespace; */
          f++;                          /* stops at p, still the separator */
        field[i++] = f;
        *p = '\0';                      /* End of this field */
        end = p;
        while (end > f && (end[-1] == ' ' || end[-1] == '\t'))
          *--end = '\0';                /* Trim trailing whitespace */
        if (sep == '#')                 /* Comment introducer terminates */
          return;
        start = p + 1;                  /* Advance to next field */
    }
}

/*
  Open file "name" in directory "path" for reading.

  A '/' is inserted between the two when path is non-empty and does not
  already end with one.  Returns a null pointer if either argument is null,
  if the combined length exceeds MAXPATHLEN, or if fopen() fails.
*/
FILE *
fileopen(const char * path, const char * name)
{
    char filename[MAXPATHLEN+2];        /* Room for path + '/' + name + NUL */
    size_t plen, nlen;

    if (!path || !name)
      return((FILE *)0);
    plen = strlen(path);
    nlen = strlen(name);
    if (plen > MAXPATHLEN || nlen > MAXPATHLEN - plen)
      return((FILE *)0);
    memcpy(filename,path,plen);
    if (plen > 0 && filename[plen-1] != '/')
      filename[plen++] = '/';
    memcpy(filename+plen,name,nlen+1);  /* Copies the terminating NUL too */
    return(fopen(filename,"r"));
}

/*
  Given s == hex "XXXX" or "XXXX..XXXX" constructs lo,hi pair.
  The dots in s are overwritten with NULs.  For a single value hi == lo.
*/
struct lohi
splitpair(char * s)
{
    struct lohi x;
    char * q;

    for (q = s; *q; q++)                /* Find the first dot, if any */
      if (*q == '.')
        break;
    while (*q == '.')                   /* Cut the string at the dots */
      *q++ = '\0';
    x.lo = hextoint(s);
    x.hi = *q ? hextoint(q) : x.lo;
    return(x);
}

static char ncrbuf[32];                 /* Decimal NCR of current character */

/* Nonzero if BIDI class string cls starts with 'R' or is exactly "AL" */
static int
is_rtl_class(const char * cls)
{
    return(cls[0] == 'R' || strcmp(cls,"AL") == 0);
}

/* qsort() comparison: order ranges by ascending lo */
static int
compare_lohi(const void * pa, const void * pb)
{
    const struct lohi * ra = (const struct lohi *)pa;
    const struct lohi * rb = (const struct lohi *)pb;
    return((ra->lo > rb->lo) - (ra->lo < rb->lo));
}

/*
  Load bidi[] with the RTL ranges listed in DerivedBidiClass.txt and sort
  them by starting code.  Leaves the table empty if the file can't be opened.
*/
static void
load_bidi_table(void)
{
    FILE * fp;
    struct lohi z;

    fp = fileopen(ucdata,"DerivedBidiClass.txt");
    if (!fp && ucdata2)
      fp = fileopen(ucdata2,"DerivedBidiClass.txt");
    if (!fp)
      return;

    while (fgets(line,(int)sizeof(line),fp)) {
        if (line[0] == '#' || !line[0]) /* Comment line */
          continue;
        clearfields();
        getfields(line);                /* Separate fields */
        if (!field[0] || !field[1])     /* Blank line or no properties */
          continue;
        if (!is_rtl_class(field[1]))
          continue;
        z = splitpair(field[0]);
        if (z.lo > 0xFFFF)              /* Only Plane 0 is ever dumped */
          continue;
        if (b >= MAXBIDI) {
            fprintf(stderr,"Too many BIDI blocks (%d max)\n",MAXBIDI);
            fclose(fp);
            exit(1);
        }
        bidi[b++] = z;
    }
    fclose(fp);
    if (b > 1)                          /* File need not be in code order */
      qsort(bidi,(size_t)b,sizeof(bidi[0]),compare_lohi);
}

/* Nonzero if code cp lies inside one of the ranges in the sorted bidi[] */
static int
default_rtl(int cp)
{
    int i;
    for (i = 0; i < b; i++) {
        if (bidi[i].lo > cp)            /* (Table is sorted) */
          break;
        if (cp <= bidi[i].hi)
          return(1);
    }
    return(0);
}

/*
  Parse the command line, load the database files if they can be found, and
  print one line per code in the requested range.  Returns 0; usage errors
  exit with status 1.
*/
int
main(int argc, char * argv[])
{
    FILE * fp;                          /* Unicode database file pointer */
    USHORT x;                           /* Unicode value */
    CHAR * utf8 = (CHAR *)0;            /* UTF-8 buffer pointer */
    const char * subst;                 /* Replacement text, if any */
    const char * name;                  /* Character name */
    const char * argv1 = (char *)0;
    const char * argv2 = (char *)0;
    int i, m, current = -1, all = 1, web = 0;
    int rtl, combining;
    unsigned int xx, from = 0, to = 0;

    argv0 = (argc > 0 && argv[0]) ? argv[0] : "utf8"; /* My name */

    for (i = 1; i < argc; i++) {        /* Parse command-line args */
        if (*argv[i] == '-') {
            if (!strcmp(argv[i],"-w")) {
                web++;
            } else if (!strcmp(argv[i],"-p")) {
                if (++i >= argc)        /* -p needs a directory */
                  usage(argv0);
                ucdata = argv[i];
                ucdata2 = (char *)0;
            } else {
                usage(argv0);
            }
        } else if (!argv1) {
            argv1 = argv[i];
            all = 0;                    /* Explicit range given */
        } else if (!argv2) {
            argv2 = argv[i];
        } else {
            usage(argv0);
        }
    }
    if (!argv1) argv1 = "20";           /* Supply defaults */
    if (!argv2) argv2 = "FFFF";

    x = (USHORT)hextoint("200E");       /* UTF-8 for LTRM */
    m = ucs2_to_utf8(x,&utf8);
    if (m > 3) m = 3;
    memcpy(ltrm,utf8,(size_t)m);
    ltrm[m] = '\0';

    from = (unsigned int)hextoint(argv1); /* Get range as ints */
    if (from < 32)                      /* Check range and sanity */
      usage(argv0);
    to = (unsigned int)hextoint(argv2);
    if (to > 0xffff)
      usage(argv0);

    /* Load table of default BIDI class for character blocks */

    load_bidi_table();

    /* Open Unicode Character Database file */

    fp = fileopen(ucdata,"UnicodeData.txt");
    if (!fp && ucdata2)
      fp = fileopen(ucdata2,"UnicodeData.txt");

    /* Main loop... */

    clearfields();                      /* Initialize database fields */

    for (xx = from; xx <= to; xx++) {   /* Loop through range */
        x = (USHORT)xx;                 /* Convert index to unsigned short */

        if (all && (x == 0x7F || (x >= 0x80 && x < 0xA0))) /* Skip controls */
          continue;

        if (web) {                      /* If making a Web table */
            /* Including Han crashes all known browsers (2003) */
            if ((x >= 0x2b0e && x <= 0x303f) || /* Skip CJK */
                (x >= 0x3130 && x <= 0x319f) || /* but keep kana, bopomofo */
                (x >= 0x3200 && x <= 0xfff8)) {
                if (x == 0x2b0e || x == 0x3130 || x == 0x3200)
                  printf("...\n");      /* Mark the start of each gap */
                continue;
            }
        }
        name = (char *)0;               /* Initialize name */
        rtl = combining = 0;            /* and attributes */

        while (fp && current < (int)x) { /* Get entry for code x */
            if (fgets(line,(int)sizeof(line),fp)) { /* Read a record */
                if (line[0] == '#' || !line[0])
                  continue;
                clearfields();          /* Old fields pointed into line */
                getfields(line);        /* Separate the fields */
                if (!field[0])          /* No separator on this line */
                  continue;
                current = hextoint(field[0]); /* Get code */
            } else {                    /* Read failed */
                fclose(fp);             /* Close the database */
                fp = (FILE *)0;         /* and don't try reading it again */
            }
        }
        if (current == (int)x) {        /* If it's the desired record */
            name = field[1];            /* get name of this character */
            if (web) {                  /* and if making a web table */
                if (field[3])           /* get character properties */
                  combining = (int)strtol(field[3],(char **)0,10);
                if (field[4])
                  rtl = is_rtl_class(field[4]);
            }
        } else {                        /* This char is undefined */
            /* NOTE(review): this lookup also runs without -w - confirm */
            rtl = default_rtl((int)x);  /* Get its default bidi category */
        }
        if (!name)
          name = "(unknown)";           /* Supply this if name not known */

        subst = (char *)0;
        if (web) {
            switch (x) {                /* Sensitive HTML characters */
              case '<': subst = "&lt;";  break;
              case '&': subst = "&amp;"; break;
              case '>': subst = "&gt;";  break;
              default:
                if (current == (int)x && field[2]) {
                    const char * cat = field[2]; /* General category */
                    if (cat[0] == 'C' ||         /* Controls as Space */
                        (cat[0] == 'Z' && cat[1] != 's')) /* LS and PS */
                      subst = " ";
                }
                break;
            }
        }
        putchar('[');                   /* Print character in brackets */
        if (combining > 0)              /* If combining */
          putchar(' ');                 /* put a space to combine with. */
        if (subst) {
            fputs(subst,stdout);
        } else {                        /* Anything but ">&<" or Control */
            m = ucs2_to_utf8(x,&utf8);  /* convert to UTF-8 */
            for (i = 0; i < m; i++)     /* Copy UTF-8 bytes */
              putchar(utf8[i]);
        }
        if (combining == 233 || combining == 234) /* Combining double */
          putchar(' ');                 /* Another space after */
        if (rtl)                        /* If RTL put LTR Mark after */
          printf("%s",ltrm);
        putchar(']');                   /* Closing bracket */

        sprintf(ncrbuf,"&#%d;",(int)x); /* At most 9 chars for a USHORT */
        printf(" U+%04X %12s %s\n",(unsigned int)x,ncrbuf,name);

        if (current == (int)x)          /* Clear data */
          clearfields();
    }
    if (fp)
      fclose(fp);
    return(0);
}