home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
kermit.columbia.edu
/
kermit.columbia.edu.tar
/
kermit.columbia.edu
/
archives
/
charsets.zip
/
utf8.c
< prev
next >
Wrap
C/C++ Source or Header
|
2004-08-15
|
13KB
|
429 lines
/*
U T F 8 . C
Dumps a selected portion of Unicode Plane 0 in UTF8 to standard output.
Output is one line per character:
[c] xxxx name
where:
c is the character in UTF-8,
xxxx is the 4-digit hex code,
name is the character's name from the Unicode database.
Usage:
utf8 Dump all of BMP except controls.
utf8 hex Dump <hex> through FFFF.
utf8 hex1 hex2 Dump <hex1> through <hex2>.
utf8 -w [hex [hex]] As above but suitable for the Web.
utf8 -p directory Include this to specify directory for database files.
Default location for Unicode database files is:
/pub/ftp/kermit/charsets/
/www/data/ftp/kermit/charsets/
Obtain up-to-date copies of database files from:
http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
http://www.unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt
If the Unicode database can't be found, the characters are dumped
without names.
Assumes Unicode database is in ascending order with one record
per line: first field is hex code, second field is name; field
separator is semicolon (;). The DerivedBidiClass file need not be
in code order.
ANSI C required.
F. da Cruz, Columbia University, May 2000.
Updated 10 June 2003: new -w option makes a Web version that:
- Puts &lt; for <, &gt; for >, &amp; for &.
- Puts a space before each combining mark
- Puts U+200E (LTRM) after each RTL character.
- Automatically substitutes space for control/formatting characters.
Updated 18 Jun 2003:
- new -p option to specify path for database files.
- Reads DerivedBidiClass.txt to get BIDI class for undefined code points.
- getfields() strips leading and trailing blanks from each field.
Updated 25 Jun 2003:
- Show hex code as U+xxxx to avoid having the digits in certain entries
turned into Hindi digits (don't ask!)
Updated 15 Aug 2004:
- Show decimal NCRs.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Shorthand names for the basic types used throughout this file */
#define USHORT unsigned short
#define ULONG unsigned long
#define CONST const
#define CHAR unsigned char
/* Size basis for the filename buffer in fileopen(); a value that is */
/* already defined when this point is reached is left alone. */
#ifndef MAXPATHLEN
#define MAXPATHLEN 1024
#endif /* MAXPATHLEN */
/* Bits OR'ed into the first byte of a UTF-8 sequence by ucs2_to_utf8(), */
/* indexed by the length of the sequence in bytes. */
CHAR firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
/* Default directories for Unicode database files. */
/* main() tries ucdata first, then ucdata2 if it is non-null; the -p option */
/* replaces ucdata and sets ucdata2 to null. */
char * ucdata = "/pub/ftp/kermit/charsets/";
char * ucdata2 = "/www/data/ftp/kermit/charsets/";
char * argv0; /* My name (argv[0]), passed to usage() */
char ltrm[4]; /* Left To Right Mark: UTF-8 for U+200E, filled in by main() */
char line[1024]; /* Database line buffer; getfields() splits it in place */
char * field[16]; /* Fields within line (pointers into it), set by getfields() */
struct lohi { /* Struct for ranges */
int lo; /* First code of the range */
int hi; /* Last code of the range (equals lo for a single code) */
};
struct lohi bidi[256]; /* Default RTL blocks, loaded from DerivedBidiClass.txt */
int b = 0; /* Number of RTL blocks stored in bidi[] */
/*
  ucs2_to_utf8() - Convert one UCS-2 value to its UTF-8 byte sequence.

  ucs2: the 16-bit code to convert.
  utf8: receives a pointer to a static, NUL-terminated buffer holding the
        result.  The buffer is reused (overwritten) by every call.

  Returns the number of bytes in the UTF-8 sequence, not counting the
  terminating NUL (1, 2 or 3 for a 16-bit argument).
*/
int
ucs2_to_utf8(USHORT ucs2, CHAR ** utf8) {       /* Convert UCS-2 to UTF-8 */
    static CHAR utf8return[8]={0,0,0,0,0,0,0,0};
    CONST ULONG byteMask = 0xBF;                /* Clears bit 6 of a trail byte */
    CONST ULONG byteMark = 0x80;                /* Sets bit 7 of a trail byte */
    int utf8len = 0;
    int i = 0;

    if (ucs2 < 0x80) {
        utf8len = 1;
    } else if (ucs2 < 0x800) {
        utf8len = 2;
    } else
#ifdef DO_UCS4
    /* This is always true for UCS-2 but would be needed for UCS-4 */
    /* When ucs2 is USHORT this gives compiler warnings. */
    if (ucs2 <= 0xffff)
#endif /* DO_UCS4 */
    {
        utf8len = 3;
    }
#ifdef DO_UCS4
    /* The following would be for UCS-4 */
    else if (ucs2 < 0x200000) {
        utf8len = 4;
    } else if (ucs2 < 0x4000000) {
        utf8len = 5;
    } else if (ucs2 <= 0x7FFFFFFFUL) {          /* 31 bits = max for UCS4 */
        utf8len = 6;
    } else {
        utf8len = 3;                            /* U+FFFD takes three bytes */
        ucs2 = 0xFFFD;                          /* Replacement for invalid char */
    }
#endif /* DO_UCS4 */

    i = utf8len;                                /* Index into utf8return */
    utf8return[i--] = 0;                        /* Null terminate the string */

    /* Fill the trail bytes from last to first, six payload bits each. */
    while (i > 0) {
        utf8return[i--] = (CHAR)((ucs2 | byteMark) & byteMask);
        ucs2 >>= 6;
    }
    /* What is left goes in the lead byte together with the length marker. */
    utf8return[0] = (CHAR)(ucs2 | firstByteMark[utf8len]);

    *utf8 = utf8return;
    return(utf8len);
}
/*
  usage() - Print a one-line usage summary on stderr and exit.

  s: the program name to show in the message.

  Does not return; the process exits with status 1.
*/
void
usage(char * s) {
    fprintf(stderr,"Usage: %s [-w] [-p directory] [hex [hex]]\n",s);
    exit(1);
}
/*
  hextoint() - Convert a string of hexadecimal digits to an integer.

  s: NUL-terminated string of hex digits (either case); must not be null.
     An empty string yields 0.

  Returns the value of the string.  If the string contains a character
  that is not a hex digit, or has more digits than fit in an unsigned
  int, usage() is called, which prints the usage message and exits.
*/
int
hextoint(const char * s) {              /* Convert hex string to integer */
    unsigned int x = 0;
    int d = 0;
    char c;

    while ((c = *s++) != '\0') {
        if (c >= 'A' && c <= 'F')
            d = c - 'A' + 10;
        else if (c >= 'a' && c <= 'f')
            d = c - 'a' + 10;
        else if (c >= '0' && c <= '9')
            d = c - '0';
        else
            usage(argv0);
        if (x > ((unsigned int)-1 >> 4)) /* Next shift would lose bits */
            usage(argv0);
        x = (x << 4) + (unsigned int)d;
    }
    return((int)x);
}
/* clearfields() - Reset every entry of the field[] table to a null pointer. */
void
clearfields() {
    char ** slot = field;               /* Walks the table */
    char ** stop = field + 16;          /* One past the last entry */

    while (slot < stop)
        *slot++ = (char *)0;
}
/*
  getfields() - Split a database record into fields, in place.

  s: the record, a NUL-terminated string that is modified: each field
     separator is overwritten with NUL, as are blanks and tabs at the end
     of each field.

  Fields are terminated by ';' or '#'.  A pointer to the start of each
  terminated field, with leading blanks and tabs skipped, is stored in
  field[0], field[1], ... (at most 16).  A '#' ends the field before it
  and stops the scan, so trailing comments are never split into fields.
  Text after the last separator is not stored.  Entries of field[] that
  are not assigned are left as they were; call clearfields() first.
*/
void
getfields(char * s) {                   /* Get fields from database entry */
    int n = 0;                          /* Number of fields stored */
    char * start;                       /* Where the current field begins */
    char * p, * first, * last;
    char sep;

    if (!s)
        return;
    start = s;
    for (p = s; *p && n < 16; p++) {
        if (*p != ';' && *p != '#')     /* Not a field separator */
            continue;
        sep = *p;                       /* Remember it before overwriting */
        first = start;
        while (*first == ' ' || *first == '\t') /* Trim leading whitespace */
            first++;
        field[n++] = first;
        *p = '\0';                      /* End of this field */
        last = p;                       /* Trim trailing whitespace, but */
        while (last > first &&          /* never back past the field start */
               (last[-1] == ' ' || last[-1] == '\t'))
            *--last = '\0';
        if (sep == '#')                 /* Comment introducer terminates */
            return;
        start = p + 1;                  /* Advance to next field */
    }
}
/*
  fileopen() - Open a file in a given directory for reading.

  path: directory name; a '/' is appended if it is nonempty and does not
        already end with one.  An empty string means "use name as is".
  name: file name within the directory.

  Returns the open stream, or a null pointer if either argument is null,
  if strlen(path) + strlen(name) exceeds MAXPATHLEN, or if fopen() fails.
*/
FILE *
fileopen(char * path, char * name) {
    char filename[MAXPATHLEN+2];        /* path + '/' + name + NUL */
    size_t plen, nlen, n;

    if (!path || !name)
        return((FILE *)0);
    plen = strlen(path);
    nlen = strlen(name);
    if (plen > MAXPATHLEN || nlen > MAXPATHLEN - plen)
        return((FILE *)0);

    /* At most MAXPATHLEN bytes of text plus one slash plus the NUL, so */
    /* the buffer cannot overflow and the result is always terminated. */
    n = plen;
    if (plen > 0) {
        memcpy(filename,path,plen);
        if (path[plen-1] != '/')
            filename[n++] = '/';
    }
    memcpy(&filename[n],name,nlen+1);   /* Includes the terminating NUL */
    return(fopen(filename,"r"));
}
/* Given s == hex "XXXX" or "XXXX..XXXX" constructs lo,hi pair */
struct lohi
splitpair(char * s) {
struct lohi x;
char * p, * q;
p = s;
for (q = p; *q; q++)
if (*q == '.')
break;
if (*q == '.') {
while (*q == '.')
*q++ = '\0';
}
x.lo = hextoint(p);
x.hi = *q ? hextoint(q) : x.lo;
return(x);
}
static char ncrbuf[32];
int
main(argc,argv) int argc; char *argv[]; {
FILE * fp; /* Unicode database file pointer */
USHORT x; /* Unicode values */
CHAR * buf = NULL; /* UTF-8 buffer pointer */
char c, * s, * p, * bp; /* Workers... */
char * argv1 = (char *)0, * argv2 = (char *)0;
int i, m, current = -1, all = 0, web = 0;
int rtl, combining;
unsigned int xx, from = 0, to = 0;
struct lohi z;
argv0 = argv[0]; /* My name */
all = 1;
for (i = 1; i < argc; i++) { /* Parse command-line args */
if (*argv[i] == '-') {
if (!strcmp(argv[i],"-w")) {
web++;
continue;
} else if (!strcmp(argv[i],"-p")) {
i++;
ucdata = argv[i];
ucdata2 = (char *)0;
} else {
usage(argv0);
}
} else if (!argv1) {
argv1 = argv[i];
all = 0;
} else if (!argv2) {
argv2 = argv[i];
} else
usage(argv0);
}
if (!argv1) argv1 = "20"; /* Supply defaults */
if (!argv2) argv2 = "FFFF";
bp = (char *)buf;
x = hextoint("200E"); /* UTF-8 for LTRM */
m = ucs2_to_utf8(x,&buf);
if (m > 3) m = 3;
for (i = 0; i < m; i++)
ltrm[i] = buf[i];
ltrm[3] = '\0';
from = hextoint(argv1); /* Get range as ints */
if (from < 32) /* Check range and sanity */
usage(argv0);
to = hextoint(argv2);
if (to > 0xffff)
usage(argv0);
/* Load table of default BIDI class for character blocks */
fp = fileopen(ucdata,"DerivedBidiClass.txt");
if (!fp && ucdata2)
fp = fileopen(ucdata2,"DerivedBidiClass.txt");
while (fp) { /* Get entry for code x */
if ((bp = fgets(line,1023,fp))) { /* Read a line */
clearfields();
if (line[0] == '#' || !line[0])
continue;
getfields(line); /* Separate fields */
if (!field[0]) continue; /* Comment or blank line */
if (!field[1]) continue; /* No properties */
z = splitpair(field[0]);
if (*(field[1]) == 'R' || !strcmp(field[1],"AL")) {
bidi[b] = z; /* (might not be portable)*/
b++;
if (b > 254) {
fprintf(stderr,"Too many BIDI blocks (255 max)\n");
exit(1);
}
}
} else
break;
}
if (fp) { /* Close file */
fclose(fp);
fp = (FILE *)0;
}
if (b > 1) { /* Have to sort these */
int i, j; /* Bubble sort is fine */
struct lohi t; /* it's a small array */
for (i = 0; i < b-1; i++) {
for (j = i+1; j < b; j++) {
if (bidi[i].lo > bidi[j].lo) {
t = bidi[i]; /* warning: struct assignment */
bidi[i] = bidi[j]; /* might not be portable */
bidi[j] = t;
}
}
}
}
/* Open Unicode Character Database file */
fp = fileopen(ucdata,"UnicodeData.txt"); /* Unicode Data file */
if (!fp && ucdata2)
fp = fileopen(ucdata2,"UnicodeData.txt");
/* Main loop... */
clearfields; /* Initialize database fields */
for (xx = from; xx <= to; xx++) { /* Loop through range */
x = xx; /* Convert index to unsigned short */
if (all && (x == 0x7F || (x >= 0x80 && x < 0xA0))) /* Skip controls */
continue;
if (web) { /* If making a Web table */
/* Including Han crashes all known browsers (2003) */
if (x >= 0x2b0e && x <= 0x303f || /* Skip CJK */
x >= 0x3130 && x <= 0x319f || /* but keep kana and bopomofu */
x >= 0x3200 && x <= 0xfff8) {
if (x == 0x2b0e || x == 0x3130 || x == 0x3200)
printf("...\n");
continue;
}
}
p = (char *)0; /* Initialize name */
rtl = combining = 0; /* and attributes */
while (fp && current < x) { /* Get entry for code x */
if ((bp = fgets(line,1023,fp))) { /* Read a record */
if (line[0] == '#' || !line[0])
continue;
getfields(line); /* Separate the fields */
current = (unsigned) hextoint(field[0]); /* Get code */
} else { /* Read failed */
fclose(fp); /* Close the database */
fp = NULL; /* and don't try reading it again */
break;
}
}
if (current == x) { /* If it's the desired record */
p = field[1]; /* get name of this character */
if (web) { /* and if making a web table */
if (field[3]) /* get character properties */
combining = atoi(field[3]);
if (field[4])
rtl = (*(field[4]) == 'R' || !strcmp(field[4],"AL"));
}
} else { /* This char is undefined */
int i; /* Get its default bidi category */
for (i = 0; i < b; i++) {
if (bidi[i].lo > x) /* (Table is sorted) */
break;
if (x >= bidi[i].lo && x <= bidi[i].hi) {
rtl = 1;
break;
}
}
}
if (!p) p = "(unknown)"; /* Supply this if name not known */
putchar('['); /* Print UTF-8 character in brackets */
buf = (CHAR *)0; /* Initialize value */
if (web) { /* Sensitive HTML characters */
switch(x) {
case '<': buf = (CHAR *)"<" ; m = 4; break;
case '&': buf = (CHAR *)"&"; m = 5; break;
case '>': buf = (CHAR *)">" ; m = 4; break;
default:
if (x == current && field[2]) { /* Print Controls as Space */
char * t = field[2];
if (*t == 'C') {
buf = (CHAR *)" ";
m = 1;
} else if (*t == 'Z' && *(t+1) != 's') { /* LS and PS */
buf = (CHAR *)" ";
m = 1;
}
}
}
}
if (!buf) /* Anything but ">&<" or Control */
m = ucs2_to_utf8(x,&buf); /* convert to UTF-8 */
if (combining > 0) /* If combining */
putchar(' '); /* put a space to combine with. */
for (i = 0; i < m; i++) /* Copy UTF-8 bytes */
putchar(buf[i]);
if (combining == 233 || combining == 234) /* Combining double */
putchar(' '); /* Another space after */
if (rtl) /* If RTL put LTR Mark after */
printf("%s",ltrm);
putchar(']'); /* Closing bracket */
sprintf(ncrbuf,"&#%d;",x);
printf(" U+%04X %12s %s\n",x,ncrbuf,p); /* Print codes and name */
if (current == x) /* Clear data */
clearfields();
}
}