Tools

home *** CD-ROM | disk | FTP | other *** search

/ Tools / WinSN5.0Ver.iso / NETSCAP.50 / WIN1998.ZIP / ns / lib / libi18n / unicvt.c < prev next >

Wrap

C/C++ Source or Header | 1998-04-08 | 51.0 KB | 1,787 lines

/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.0 (the "NPL"); you may not use this file except in
 * compliance with the NPL. You may obtain a copy of the NPL at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the NPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
 * for the specific language governing rights and limitations under the
 * NPL.
 *
 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All Rights
 * Reserved.
 */
/* unicvt.c
 * --------
 *
 *
 * This file implements conversions from one Unicode format to another
 * Unicode format.
 *
 * There are no conversions to/from other encodings.
 *
 * There are stream conversions between UTF-8 and UCS-2, and between
 * UTF-8 and UTF-7.
 * It generates a DLL on Win 32, and at present, normal libraries on Mac, X,
 * and Win16.
*/ #define _UNICVT_DLL_ 1 #include "intlpriv.h" #include "unicpriv.h" #include "xp.h" #include <string.h> #ifdef XP_WIN32 #define XP_ALLOC_PRIV malloc #else #define XP_ALLOC_PRIV XP_ALLOC #endif typedef struct utf7_encoding_method_data { int16 *fromb64; unsigned char *tob64; unsigned char *shift; unsigned char startshift; unsigned char endshift; } utf7_encoding_method_data; int32 ucs2_to_utf8_buffer(const uint16 *ucs2p, int32 num_chars, unsigned char *utf8p, int32 num_utf8_bytes, int32 *utf8_bytes_written); /* Private Helper function prototypes */ PRIVATE int16 one_utf8_to_ucs2_char(const unsigned char *utf8p, const unsigned char *utf8endp, uint16 *onecharp); PRIVATE int16 one_ucs2_to_utf8_char(unsigned char *tobufp, unsigned char *tobufendp, uint16 onechar); PRIVATE unsigned char *intl_utf72utf8( CCCDataObject obj, const unsigned char *utf7buf, int32 utf7bufsz, utf7_encoding_method_data* opt ); PRIVATE unsigned char *intl_utf82utf7( CCCDataObject obj, const unsigned char *utf8buf, int32 utf8bufsz, utf7_encoding_method_data* opt ); PRIVATE uint16 pad_and_write(uint32 buffer, unsigned char *tobufp, int16 bufferBitCount, utf7_encoding_method_data* opt); PRIVATE void swap_ucs2_bytes(unsigned char *ucsbuf, int32 ucsbufsz); /* Private constants */ #define MAX_UCS2 0xFFFF #define DEFAULT_CHAR 0x003F /* Default char is "?" */ #define BYTE_MASK 0xBF #define BYTE_MARK 0x80 #define MAX_ASCII 0x7F #define NOT_BASE64 -1 /* Take care of different API for different platforms */ #ifdef XP_WIN32 /* UNICVTAPI def now accomplished in libi18n.h */ /*#define UNICVTAPI __declspec(dllexport)*/ /* THIS #define IS VERY BAD AND SHOULD BE CHANGED WHEN WE REVISIT * THE ERROR HANDLING STUFF AND MOVE IT ALL OUT OF XPSTR.H * THE CALL SHOULD BE: extern int MK_OUT_OF_MEMORY; BUT WE HAVE * CHICKEN AND EGG LINKING PROBLEMS ON WIN32 BECAUSE THE DLL * MUST BE COMPILED BEFORE THE int IS DECLARED. 
*/ #define MK_OUT_OF_MEMORY -207 #else /* !XP_WIN32 */ /* UNICVTAPI def now accomplished in libi18n.h */ /*#define UNICVTAPI*/ extern int MK_OUT_OF_MEMORY; #endif /*!XP_WIN32 */ /* UCS-2 to UTF-8 conversion routines */ /* * mz_ucs2utf8 * ----------- * * Takes a CCCDataObject, a buffer of UCS-2 data, and the size of that buffer. * Allocates and returns the translation of the UCS-2 data in UTF-8. The caller * is responsible for freeing the allocated memory. If the UCS-2 data is not * complete, and ends on a character boundary, the extra byte of data is stored * in uncvtbuf, and will be used the next time this function is called. * * Note about swapping: UCS-2 data can come in big-endian or little-endian * order, so we need to be aware of the need to potentially swap the data. * On the very first block of the stream we will discover (because UCS-2 * always begins with a byte order mark) whether the data is of the same or * opposite endian-ness from us. * The information is store in FromCSID * The use of uncvtbuf: * uncvtbuf[0] is 0 or 1 * uncvtbuf[0] == 0 - there are no left over last time * uncvtbuf[0] == 1 - there one byte left over last time stored in uncvtbuf[1] * */ MODULE_PRIVATE UNICVTAPI unsigned char * mz_ucs2utf8( CCCDataObject obj, const unsigned char *ucsbuf, /* UCS-2 buf for conv */ int32 ucsbufsz) /* UCS-2 buf size in bytes */ { int32 tobufsz; unsigned char *tobuf = NULL; unsigned char *tobufp, *tobufendp,*ucsp, *ucsendp; int16 numUTF8bytes; uint16 onechar; XP_Bool needToSwap = FALSE; int scanstate = 0; unsigned p1=0, p2; unsigned char *uncvtbuf =INTL_GetCCCUncvtbuf(obj); if(INTL_GetCCCFromCSID(obj) == CS_UCS2_SWAP) needToSwap = TRUE; /* Allocate Memory */ /* In the worst case, one UCS2 could expand to three byte */ /* so, the ration is 2:3 */ tobufsz = (3*(ucsbufsz + 1)) / 2 + 2; if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL) { INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY); return(NULL); } /* do the set up */ tobufendp = 
tobuf + tobufsz; /* point to the end of buffer */ tobufp = tobuf; /* point to the begining of buffer */ ucsp = (unsigned char *)ucsbuf; ucsendp = (unsigned char *)ucsbuf + ucsbufsz; /* Get the unconvert byte */ if(uncvtbuf[0] > 0) { p1 = uncvtbuf[1]; scanstate++; } /* Do the conversion */ while( ucsp < ucsendp ) { if(scanstate++ == 0) { p1 = *ucsp; } else { p2 = *ucsp; scanstate = 0; onechar = (p1 << 8) | (p2); /* Look for (and strip) BYTE_ORDER_MARK */ if(onechar == NEEDS_SWAP_MARK) { INTL_SetCCCFromCSID(obj, CS_UCS2_SWAP); needToSwap = TRUE; } else if(onechar == BYTE_ORDER_MARK) { INTL_SetCCCFromCSID(obj, CS_UCS2); needToSwap = FALSE; } else { if(needToSwap) numUTF8bytes = one_ucs2_to_utf8_char(tobufp, tobufendp, (uint16)((p2 << 8) | (p1))); else numUTF8bytes = one_ucs2_to_utf8_char(tobufp, tobufendp, onechar); if(numUTF8bytes == -1) break; /* out of space in tobuf */ tobufp += numUTF8bytes; } } ucsp ++; } *tobufp = '\0'; /* NULL terminate dest. data */ INTL_SetCCCLen(obj, tobufp - tobuf); /* length of processed data, in bytes */ /* If there are left over, set it to uncvtbuf[1] */ if((uncvtbuf[0] = scanstate) != 0) uncvtbuf[1] = p1; return(tobuf); } /* UTF-8 to UCS-2 */ /* * mz_utf82ucs * ----------- * * This function takes a streams object, a buffer of utf8 data, and the * size of that buffer. It allocates, fills, and returns a buffer of the * equivalent UCS-2 data. The caller is responsible for freeing that * data. If the UTF-8 data cannot be completely converted, the unconverted * final bytes will be stored in uncvtbuf and used on the next call. * * Note: UCS-2 data must always begin with a byte order mark, so we * must write that at the beginning of our stream. This function * employs obj->cvtflag to determine if it is indeed at the beginning * of the stream. obj->cvtflag starts at 0, and we switch it to 1 * as we write the byte order mark. 
* * A note on endian-ness: This function will return UCS-2 data of the * same endian-ness as the machine we are running on. To generate data * of the opposite endian-ness, use mz_utf82ucsswap. */ MODULE_PRIVATE UNICVTAPI unsigned char * mz_utf82ucs( CCCDataObject obj, const unsigned char *utf8buf, /* UTF-8 buf for conv */ int32 utf8bufsz) /* UTF-8 buf size in bytes */ { unsigned char *tobuf = NULL; int32 tobufsz; unsigned char *tobufp, *utf8p; /* current byte in bufs */ unsigned char *tobufendp, *utf8endp; /* end of buffers */ int32 uncvtlen; unsigned char *uncvtbuf = INTL_GetCCCUncvtbuf(obj); uint16 onechar; int16 numoctets; #define ucsbufsz tobufsz #define ucsbuf tobuf #define ucsp tobufp #define ucsendp tobufendp /* Allocate a dest buffer: */ /* At worst, all the octets are ASCII, and each 1 byte of UTF 8 * will take 2 bytes of UCS-2, plus 2 for NULL termination (and * possibly 2 for byte order mark) */ uncvtlen = strlen((char *)uncvtbuf); tobufsz = 2*(utf8bufsz + uncvtlen) + 4; if (!tobufsz) { return NULL; } if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL) { INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY); return(NULL); } /* Initialize pointers, etc. */ utf8p = (unsigned char *)utf8buf; utf8endp = utf8p + utf8bufsz - 1; /* leave room for NULL termination (as sentinel?)*/ #define uncvtp tobufp /* use tobufp as temp index for uncvtbuf */ /* If prev. unconverted chars, append unconverted * chars w/new chars and try to process. 
*/ if (uncvtbuf[0] != '\0') { uncvtp = uncvtbuf + uncvtlen; while (uncvtp < (uncvtbuf + UNCVTBUF_SIZE) && utf8p <= utf8endp) *uncvtp++ = *utf8p++; *uncvtp = '\0'; /* nul terminate as sentinel */ utf8p = uncvtbuf; /* process unconverted first */ utf8endp = uncvtp - 1; } #undef uncvtp tobufp = tobuf; tobufendp = tobufp + tobufsz - 3; /* save space for terminating null */ /* write byte order mark */ if(!(INTL_GetCCCCvtflag(obj))) { *((uint16 *) tobufp) = (uint16) BYTE_ORDER_MARK; tobufp += 2; INTL_SetCCCCvtflag(obj, TRUE); } WHILELOOP: while( (tobufp <= tobufendp) && (utf8p <= utf8endp) ) { numoctets = one_utf8_to_ucs2_char(utf8p, utf8endp, &onechar); if(numoctets == -1) break; /* not enought utf8 data */ utf8p += numoctets; /* Check to make sure there's space to write onechar */ if((tobufp+2) >= tobufendp) break; *((uint16 *) tobufp) = (onechar <= MAX_UCS2 ? onechar : DEFAULT_CHAR); tobufp +=2; } if(uncvtbuf[0] != '\0') { /* Just processed unconverted chars. * ucsp points to 1st unprocessed char * in ucsbuf. Some may have been * processed while processing unconverted * chars, so setup ptrs. not to process * them twice. */ /* If nothing was converted, there wasn't * enough UCS-2 data. Stop and get more * data. */ if(utf8p == uncvtbuf) { /* nothing was converted */ *tobufp = '\0'; return(NULL); } utf8endp = (unsigned char *) utf8buf + utf8bufsz - 1; utf8p = (unsigned char *) utf8buf + (utf8p - uncvtbuf - uncvtlen); uncvtbuf[0] = '\0'; /* No more unconverted chars.*/ goto WHILELOOP; /* Process new data */ } *tobufp = '\0'; /* NULL terminate dest. data */ INTL_SetCCCLen(obj, tobufp - tobuf); /* length of processed data, in bytes */ if(utf8p <= utf8endp) { /* unconverted utf8 left? */ tobufp = uncvtbuf; /* just using tobufp as a temp index. 
*/ while (utf8p <= utf8endp) *tobufp++ = *utf8p++; *tobufp = '\0'; /* NULL terminate, as a sentinel */ } #undef ucsbufsz #undef ucsbuf #undef ucsp #undef ucsendp return(tobuf); } /* * mz_utf82ucsswap * --------------- * * mz_utf82ucs will convert the UTF-8 data to UCS-2 data of the same * endian-ness of the platform the client is running on. Occasionally, * this is not what is desired. mz_utf82ucsswap converts the UTF-8 * data to UCS-2 of the opposite endian-ness. */ MODULE_PRIVATE UNICVTAPI unsigned char * mz_utf82ucsswap( CCCDataObject obj, const unsigned char *utf8buf, /* UTF-8 buf for conv */ int32 utf8bufsz) /* UTF-8 buf size in bytes */ { unsigned char *result; result = mz_utf82ucs(obj, utf8buf, utf8bufsz); swap_ucs2_bytes(result, INTL_GetCCCLen(obj)); return(result); } /* UTF-7 to UTF-8 conversion routines */ /* mz_utf72utf8 * ------------ * * Takes a streams object, a buffer of UTF-7 data, and the size of * that buffer. Allocates, fills, and returns a buffer of UTF-8 * data. (Its size is returned in the CCCDataObject.) The caller * is responsible for freeing the returned buffer. * * Note: UTF-7 has the property that multiple characters of UTF-7 * may make up a single character of UTF-8. Also, a single UTF-7 char * may contribute bits to more than one utf8 character. If such a * UTF-7 character is involved at the end of the current chunk, it won't * be save-able in uncvtbuf. For this reason, we also need to * save the bit buffer. It turns out that we also need to save the * fact that we are within a shifted sequence, because there is no * other way for that information to persist between chunks of a * stream. If we save a buffer, then we are certainly in the middle * of a shifted sequence, but even if there is no buffer to save, we * may still be in a shifted sequence. * * The streams module gives me one int32 - obj->cvtflag - in which * to save my state. This means that to save all my data, I'll need * to do a few bit-wise operations. 
* * Arbitrarily, the top two bytes will hold the buffer, the next byte * holds the count of relevant bits in the buffer, and the low order * byte will hold 0 if we are not in a shiftSequence, 1 if we are. * * Since we will only save a buffer and bufferBitCount if we are * in a shift sequence when this chunk terminates, obj->cvtflag == 0 * when we do not terminate in a shift sequence. */ /* tables for RFC1642- UTF7 */ PRIVATE int16 rfc1642_fromb64[128] = { /* 0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 30 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 40 */ -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, /* 50 */ 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, /* 60 */ -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, /* 70 */ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* 80 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, /* 90 */ 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, /* 100 */ 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, /* 110 */ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, /* 120 */ 49, 50, 51, -1, -1, -1, -1, -1 }; PRIVATE unsigned char rfc1642_tob64[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; PRIVATE unsigned char rfc1642_shift[128] = { /* 0 1 2 3 4 5 6 7 */ /* 8 9 A B C D E F */ /* 0x00 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, /* 0x08 */ TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, /* 0x10 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, /* 0x18 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, /* 0x20 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x28 */ FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, /* 0x30 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x38 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x40 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x48 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x50 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x58 */ 
FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, /* 0x60 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x68 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x70 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x78 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE }; PRIVATE utf7_encoding_method_data rfc1642_utf7 = { rfc1642_fromb64, rfc1642_tob64, rfc1642_shift, (unsigned char)'+', (unsigned char)'-' }; /* tables for RFC2060- IMAP4rev1 Mail Box Name */ PRIVATE int16 rfc2060_fromb64[128] = { /* 0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 30 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 40 */ -1, -1, -1, 62, 63, -1, -1, -1, 52, 53, /* 50 */ 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, /* 60 */ -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, /* 70 */ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* 80 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, /* 90 */ 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, /* 100 */ 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, /* 110 */ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, /* 120 */ 49, 50, 51, -1, -1, -1, -1, -1 }; PRIVATE unsigned char rfc2060_tob64[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; PRIVATE unsigned char rfc2060_shift[128] = { /* 0 1 2 3 4 5 6 7 */ /* 8 9 A B C D E F */ /* 0x00 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, /* 0x08 */ TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, /* 0x10 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, /* 0x18 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, /* 0x20 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, /* 0x28 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x30 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x38 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x40 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x48 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, /* 0x50 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x58 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x60 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x68 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x70 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, /* 0x78 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE }; PRIVATE utf7_encoding_method_data rfc2060_utf7 = { rfc2060_fromb64, rfc2060_tob64, rfc2060_shift, (unsigned char)'&', (unsigned char)'-' }; MODULE_PRIVATE UNICVTAPI unsigned char * mz_utf72utf8( CCCDataObject obj, const unsigned char *utf7buf, /* UTF-7 buf for conv */ int32 utf7bufsz) /* UTF-7 buf size in bytes */ { return intl_utf72utf8(obj,utf7buf, utf7bufsz, &rfc1642_utf7); } MODULE_PRIVATE UNICVTAPI unsigned char * mz_imap4utf72utf8( CCCDataObject obj, const unsigned char *utf7buf, /* UTF-7 buf for conv */ int32 utf7bufsz) /* UTF-7 buf size in bytes */ { return intl_utf72utf8(obj,utf7buf, utf7bufsz, &rfc2060_utf7); } PRIVATE unsigned char * intl_utf72utf8( CCCDataObject obj, const unsigned char *utf7buf, /* UTF-7 buf for conv */ int32 utf7bufsz, /* UTF-7 buf size in bytes */ utf7_encoding_method_data* opt) { unsigned char *tobuf = NULL; int32 tobufsz; unsigned char *tobufp, *utf7p; /* current byte in bufs */ unsigned char *tobufendp, *utf7endp; /* end of buffers */ int32 uncvtlen; uint16 oneUCS2char; unsigned char onechar; int16 numoctets; int16 mustnotshift = 0; int16 inShiftSequence; uint32 buffer; uint32 buffertemp = 0; int16 bufferBitCount; unsigned char *uncvtbuf = INTL_GetCCCUncvtbuf(obj); /* set up table to convert ASCII values of base64 chars to * their base 64 value. If there is no conversion, use -1 as sentinel. 
*/ /* initialize data saved from previous stream */ int32 flag = INTL_GetCCCCvtflag(obj); inShiftSequence = flag & 1; buffer = 0xFFFF0000 & flag; bufferBitCount = (uint16) ((0x0000FF00 & flag) >> 8); #define utf8bufsz tobufsz #define utf8buf tobuf #define utf8p tobufp #define utf8endp tobufendp /* Allocate a dest buffer: */ /* UTF-7 characters that are directly encoded will be one octet UTF-8 * chars. Shifted chars will take 2.7 octets (plus shift in or out chars) * to make 2 or 3 octet UTF-8 chars. So in the worst input, all the UTF-7 * data would convert to 3 octet UTF-8 data, and we would need 1/9th as * many UTF-7 characters, plus 1 to round up, plus 1 for NULL termination. */ uncvtlen = strlen((char *)uncvtbuf); tobufsz = (int32) (1.2*(utf7bufsz + uncvtlen) + 2); if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL) { INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY); return(NULL); } /* Initialize pointers, etc. */ utf7p = (unsigned char *)utf7buf; utf7endp = utf7p + utf7bufsz - 1; #define uncvtp tobufp /* use tobufp as temp index for uncvtbuf */ /* If prev. unconverted chars, append unconverted * chars w/new chars and try to process. */ if (uncvtbuf[0] != '\0') { uncvtp = uncvtbuf + uncvtlen; while (uncvtp < (uncvtbuf + UNCVTBUF_SIZE) && utf7p <= utf7endp) *uncvtp++ = *utf7p++; *uncvtp = '\0'; /* nul terminate as sentinel */ utf7p = uncvtbuf; /* process unconverted first */ utf7endp = uncvtp - 1; } #undef uncvtp tobufp = tobuf; tobufendp = tobufp + tobufsz - 2; WHILELOOP: while( (tobufp <= tobufendp) && (utf7p <= utf7endp) ) { onechar = *utf7p++; /* If I'm not in the shift sequence, and I have the start symbol, * absorb it and loop again. Otherwise, if I have a legal character * for a non-shifted sequence, (ASCII) write it directly. This is * ok, because ASCII is just ASCII in UTF-8, so don't need to worry * about UCS-2 conversion. 
*/ if(!inShiftSequence) { if(onechar == opt->startshift) { if(*utf7p == opt->endshift) { *tobufp++ = opt->startshift; utf7p++; } else inShiftSequence = TRUE; continue; } if(onechar <= MAX_ASCII) *tobufp++ = onechar; else continue; } else { /* inShiftSequence is TRUE */ /* onechar is not a base64 allowable char if it is non-ASCII or * if it is a non-base64 char from the ASCII set. */ mustnotshift = (onechar > MAX_ASCII || (opt->fromb64[onechar] == NOT_BASE64)); /* If I'm in the shift sequence, and get the opt->endshift character, * I want to absorb it and turn off shifting. If I get another * non-shiftable character, I want to write it and turn off shifting. * If I get an illegal character, I discard it and keep looping. */ if(mustnotshift) { if(!(onechar == opt->endshift)) { if(onechar > MAX_ASCII) continue; *tobufp++ = onechar; } inShiftSequence = FALSE; buffer = 0; /* flush buffer at end of shift sequence */ bufferBitCount = 0; } else { buffertemp = opt->fromb64[onechar] & 0x0000003F; /* grab 6-bit base64 char */ buffer |= buffertemp << (26 - bufferBitCount); /* 26 is 32 - 6 bits */ bufferBitCount += 6; /* Flush the buffer of a UCS-2 character (won't be more than one) */ if(bufferBitCount > 15) { oneUCS2char = (int16) ((buffer & 0xFFFF0000) >> 16); numoctets = one_ucs2_to_utf8_char(tobufp, tobufendp, oneUCS2char); if(numoctets == -1) break; /* out of space in tobuf */ tobufp += numoctets; bufferBitCount -= 16; buffer <<= 16; } } } /* end of inShiftSequence == TRUE */ } /* end of conversion while loop */ if(uncvtbuf[0] != '\0') { /* Just processed unconverted chars. * ucsp points to 1st unprocessed char * in ucsbuf. Some may have been * processed while processing unconverted * chars, so setup ptrs. not to process * them twice. */ /* If nothing was converted, there wasn't * enough UCS-2 data. Stop and get more * data. 
*/ if(utf7p == uncvtbuf) { /* nothing was converted */ *tobufp = '\0'; INTL_SetCCCLen(obj, 0); return(NULL); } /* set up to read ucsbuf */ utf7endp = (unsigned char *) utf7buf + utf7bufsz - 1; utf7p = (unsigned char *) utf7buf + (utf7p - uncvtbuf - uncvtlen); uncvtbuf[0] = '\0'; /* No more unconverted chars.*/ goto WHILELOOP; /* Process new data */ } *tobufp = '\0'; /* NULL terminate dest. data */ INTL_SetCCCLen(obj, tobufp - tobuf); /* length of processed data, in bytes */ /* If we're in a shift sequence, we need to save away our buffer * and the buffer bit count (although if all that's left in the buffer * is padding 0's, we don't need to worry about it and should reset * the bitCount to 0.) */ INTL_SetCCCCvtflag(obj,((inShiftSequence ? 1 : 0 ) | (buffer & 0xFFFF0000) | ((bufferBitCount << 8) & 0x0000FF00))); /* Now check for unconverted data from utf7p */ if(utf7p <= utf7endp) { int l = utf7endp - utf7p + 1; memcpy(uncvtbuf, utf7p, l); uncvtbuf[l] = '\0'; } #undef utf8bufsz #undef utf8buf #undef utf8p #undef utf8endp return(tobuf); } /* UTF-8 to UTF-7 */ /* * mz_utf82utf7 * ------------ * * This function takes a CCCDataObject, a buffer of UTF-8 data, and the * size of that buffer. It allocates and returns a buffer of the * corresponding UTF-7 data (returning the size as a field in the * CCCDataObject). The caller is responsible for freeing the returned * data. If there are extra data at the end of the UTF-8 buffer which * cannot be translated into UTF-7 (ie, an incomplete character), it * will be saved in the uncvtbuf of the CCCDataObject and used on the * next call. * * UTF-7 is a variant of base-64, and like base-64, it accumulates * bits in a bit buffer, transforming them to UTF-7 chars when it * has multiples of 6 bits. If the UTF-8 data being translated does * not happen to terminate with a multiple of 6 bits, the final * char will be padded with 0's, and the shift sequence terminated. 
* For this reason, we will *never* be inside a shift sequence in * between chunks of data. This may mean that the final stream of * data has sequences that look like +[some UTF-7 data]-+[more data]-, * with a plus immediately following a -. Although unconventional, * this is in fact legal UTF-7. * * Finally, there are two formats of UTF-7, one extremely conservative * fashion which shifts every character which could possibly be * considered unsafe, and another which is somewhat more lax. Which * of these is used is determined by obj->cvtflag. By default (cvtflag == 0) * we employ the safer form of conversion. The differing characters * are: !\"#$%&*;<=>@[]^_`{|} */ /* Tables */ MODULE_PRIVATE UNICVTAPI unsigned char * mz_utf82utf7( CCCDataObject obj, const unsigned char *utf8buf, /* UTF-8 buf for conv */ int32 utf8bufsz) /* UTF-8 buf size in bytes */ { return intl_utf82utf7(obj,utf8buf, utf8bufsz, &rfc1642_utf7); } MODULE_PRIVATE UNICVTAPI unsigned char * mz_utf82imap4utf7( CCCDataObject obj, const unsigned char *utf8buf, /* UTF-8 buf for conv */ int32 utf8bufsz) /* UTF-8 buf size in bytes */ { return intl_utf82utf7(obj,utf8buf, utf8bufsz, &rfc2060_utf7); } PRIVATE unsigned char * intl_utf82utf7( CCCDataObject obj, const unsigned char *utf8buf, /* UTF-8 buf for conv */ int32 utf8bufsz, /* UTF-8 buf size in bytes */ utf7_encoding_method_data* opt) { unsigned char *tobuf = NULL; int32 tobufsz; unsigned char *tobufp, *utf8p; /* current byte in bufs */ unsigned char *tobufendp, *utf8endp; /* end of buffers */ int32 uncvtlen; unsigned char *uncvtbuf = INTL_GetCCCUncvtbuf(obj); uint16 onechar; int16 numoctets; int16 inShiftSequence = FALSE; int16 needToShift = FALSE; uint32 buffer = 0; uint32 buffertemp = 0; int16 bufferBitCount = 0; unsigned char oneBase64char; #define utf7bufsz tobufsz #define utf7buf tobuf #define utf7p tobufp #define utf7endp tobufendp /* Allocate a dest buffer: */ uncvtlen = strlen((char *)uncvtbuf); tobufsz = 3*(utf8bufsz + uncvtlen) +1; if 
(!tobufsz) { return NULL; } if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL) { INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY); return(NULL); } /* Initialize pointers, etc. */ utf8p = (unsigned char *)utf8buf; utf8endp = utf8p + utf8bufsz - 1; /* leave room for NULL termination (as sentinel?)*/ #define uncvtp tobufp /* use tobufp as temp index for uncvtbuf */ /* If prev. unconverted chars, append unconverted * chars w/new chars and try to process. */ if (uncvtbuf[0] != '\0') { uncvtp = uncvtbuf + uncvtlen; /* This is not leaving space for a NULL !!!!!!!!!!!! */ while (uncvtp < (uncvtbuf + UNCVTBUF_SIZE) && utf8p <= utf8endp) *uncvtp++ = *utf8p++; *uncvtp = '\0'; /* nul terminate as sentinel */ utf8p = uncvtbuf; /* process unconverted first */ utf8endp = uncvtp - 1; } #undef uncvtp tobufp = tobuf; tobufendp = tobufp + tobufsz - 2; /* save space for terminating null*/ WHILELOOP: while( (tobufp <= tobufendp) && (utf8p <= utf8endp) ) { /* convert one char's worth of utf8 to ucs2 */ numoctets = one_utf8_to_ucs2_char(utf8p, utf8endp, &onechar); if(numoctets == -1) break; /* out of input*/ utf8p += numoctets; /* we need to be shifted if the character is non-ASCII or * is an ASCII character that should be shifted. 
*/ needToShift = (onechar > MAX_ASCII) || (opt->shift[onechar]); if(!needToShift && inShiftSequence) { if(bufferBitCount > 0) { if((tobufp+2) > tobufendp) break; bufferBitCount = pad_and_write(buffer, tobufp, bufferBitCount, opt); if (!bufferBitCount) { /* buffer successfully flushed */ tobufp+=2; buffer = 0; } } else { if((tobufp+1) > tobufendp) break; *tobufp++ = opt->endshift; } inShiftSequence = FALSE; /* now just fallthrough to next case*/ } if(!needToShift && !inShiftSequence) { if((tobufp+1) > tobufendp) break; *tobufp++ = (char) onechar; } if(needToShift && !inShiftSequence) { *tobufp++ = opt->startshift; if(onechar == opt->startshift) { /* special-case behavior if onechar is a + */ if((tobufp+1) > tobufendp) break; *tobufp++ = opt->endshift; } else inShiftSequence = TRUE; } if(needToShift && inShiftSequence) { buffertemp = onechar & 0x0000FFFF; buffer |= buffertemp << (16 - bufferBitCount); /* ^--16 is the size of the int32 minus * the size of onechar */ bufferBitCount += 16; /* Flush the buffer of as many base64 characters as we can form */ while(bufferBitCount>5) { if(tobufp > tobufendp) break; oneBase64char = (char) ((buffer & 0xFC000000) >> 26); *tobufp++ = opt->tob64[oneBase64char]; buffer <<= 6; bufferBitCount -= 6; } } } /* end of while loop */ if(uncvtbuf[0] != '\0') { /* Just processed unconverted chars. * ucsp points to 1st unprocessed char * in ucsbuf. Some may have been * processed while processing unconverted * chars, so setup ptrs. not to process * them twice. */ /* If nothing was converted, there wasn't * enough UTF-8 data. Stop and get more * data. */ if(utf8p == uncvtbuf) { /* nothing was converted */ *tobufp = '\0'; return(NULL); } utf8endp = (unsigned char *) utf8buf + utf8bufsz - 1; utf8p = (unsigned char *) utf8buf + (utf8p - uncvtbuf - uncvtlen); uncvtbuf[0] = '\0'; /* No more unconverted chars.*/ goto WHILELOOP; /* Process new data */ } /* Anything left in the buffer at this point should be padded with 0's * and appended to tobuf. 
*/ if(inShiftSequence) { if(bufferBitCount > 0) { if((tobufp+2) <= tobufendp) { bufferBitCount = pad_and_write(buffer, tobufp, bufferBitCount, opt); if (!bufferBitCount) { /* buffer successfully flushed */ tobufp+=2; buffer = 0; } } } else { if((tobufp+1) <= tobufendp) *tobufp++ = opt->endshift; } inShiftSequence = FALSE; } *tobufp = '\0'; /* NULL terminate dest. data */ INTL_SetCCCLen(obj, tobufp - tobuf); /* length of processed data, in bytes */ if(utf8p <= utf8endp) { /* unconverted utf8 left? */ tobufp = uncvtbuf; /* just using tobufp as a temp index. */ while (utf8p <= utf8endp) *tobufp++ = *utf8p++; *tobufp = '\0'; /* NULL terminate, as a sentinel if nothing else.*/ } #undef utf7bufsz #undef utf7buf #undef utf7p #undef utf7endp return(tobuf); } /* Function: one_ucs2_to_utf8_char * * Function takes one UCS-2 char and writes it to a UTF-8 buffer. * We need a UTF-8 buffer because we don't know before this * function how many bytes of utf-8 data will be written. It also * takes a pointer to the end of the UTF-8 buffer so that we don't * overwrite data. This function returns the number of UTF-8 bytes * of data written, or -1 if the buffer would have been overrun. 
*/ #define LINE_SEPARATOR 0x2028 #define PARAGRAPH_SEPARATOR 0x2029 PRIVATE int16 one_ucs2_to_utf8_char(unsigned char *tobufp, unsigned char *tobufendp, uint16 onechar) { int16 numUTF8bytes = 0; if((onechar == LINE_SEPARATOR)||(onechar == PARAGRAPH_SEPARATOR)) { strcpy((char*)tobufp, "\n"); return strlen((char*)tobufp);; } if (onechar < 0x80) { numUTF8bytes = 1; } else if (onechar < 0x800) { numUTF8bytes = 2; } else if (onechar <= MAX_UCS2) { numUTF8bytes = 3; } else { numUTF8bytes = 2; onechar = DEFAULT_CHAR; } tobufp += numUTF8bytes; /* return error if we don't have space for the whole character */ if (tobufp > tobufendp) { return(-1); } switch(numUTF8bytes) { case 3: *--tobufp = (onechar | BYTE_MARK) & BYTE_MASK; onechar >>=6; *--tobufp = (onechar | BYTE_MARK) & BYTE_MASK; onechar >>=6; *--tobufp = onechar | THREE_OCTET_BASE; break; case 2: *--tobufp = (onechar | BYTE_MARK) & BYTE_MASK; onechar >>=6; *--tobufp = onechar | TWO_OCTET_BASE; break; case 1: *--tobufp = (unsigned char)onechar; break; } return(numUTF8bytes); } /* * utf8_to_ucs2_char * * Convert a utf8 multibyte character to ucs2 * * inputs: pointer to utf8 character(s) * length of utf8 buffer ("read" length limit) * pointer to return ucs2 character * * outputs: number of bytes in the utf8 character * -1 if not a valid utf8 character sequence * -2 if the buffer is too short */ MODULE_PRIVATE UNICVTAPI int16 utf8_to_ucs2_char(const unsigned char *utf8p, int16 buflen, uint16 *ucs2p) { uint16 lead, cont1, cont2; /* * Check for minimum buffer length */ if ((buflen < 1) || (utf8p == NULL)) { return -2; } lead = (uint16) (*utf8p); /* * Check for a one octet sequence */ if (IS_UTF8_1ST_OF_1(lead)) { *ucs2p = lead & ONE_OCTET_MASK; return 1; } /* * Check for a two octet sequence */ if (IS_UTF8_1ST_OF_2(*utf8p)) { if (buflen < 2) return -2; cont1 = (uint16) *(utf8p+1); if (!IS_UTF8_2ND_THRU_6TH(cont1)) return -1; *ucs2p = (lead & TWO_OCTET_MASK) << 6; *ucs2p |= cont1 & CONTINUING_OCTET_MASK; return 2; } /* * 
Check for a three octet sequence */ else if (IS_UTF8_1ST_OF_3(lead)) { if (buflen < 3) return -2; cont1 = (uint16) *(utf8p+1); cont2 = (uint16) *(utf8p+2); if ( (!IS_UTF8_2ND_THRU_6TH(cont1)) || (!IS_UTF8_2ND_THRU_6TH(cont2))) return -1; *ucs2p = (lead & THREE_OCTET_MASK) << 12; *ucs2p |= (cont1 & CONTINUING_OCTET_MASK) << 6; *ucs2p |= cont2 & CONTINUING_OCTET_MASK; return 3; } else { /* not a valid utf8/ucs2 character */ return -1; } } UNICVTAPI int32 INTL_NumUTF8Chars(const unsigned char *utf8p) { int num_chars = 0; while (*utf8p) { /* * Check for a one octet sequence */ if (IS_UTF8_1ST_OF_1(*utf8p)) { num_chars += 1; utf8p += 1; continue; } /* * Check for a two octet sequence */ else if (IS_UTF8_1ST_OF_2(*utf8p) && IS_UTF8_2ND_THRU_6TH(*(utf8p+1))) { num_chars += 2; utf8p += 2; continue; } /* * Check for a three octet sequence */ else if (IS_UTF8_1ST_OF_3(*utf8p) && IS_UTF8_2ND_THRU_6TH(*(utf8p+1)) && IS_UTF8_2ND_THRU_6TH(*(utf8p+2))) { num_chars += 3; utf8p += 3; continue; } /* * Not UTF8 : just muddle forward */ else { num_chars += 1; utf8p += 1; } } return num_chars; } PUBLIC UNICVTAPI uint16 * INTL_UTF8ToUCS2(const unsigned char *utf8p, int32 *num_chars) { uint16 *ucs2_chars; int32 num_utf8_chars, ucs2_len, num_ucs2_chars; int parse_cnt, inval_cnt; /* * Figure the number of chars */ num_utf8_chars = INTL_NumUTF8Chars(utf8p); ucs2_len = num_utf8_chars*2; ucs2_chars = (uint16 *)XP_ALLOC_PRIV(ucs2_len + 2); if (!ucs2_chars) return NULL; /* * Do the conversion */ num_ucs2_chars = utf8_to_ucs2_buffer(utf8p, strlen((char*)utf8p), &parse_cnt, &inval_cnt, ucs2_chars, ucs2_len); ucs2_chars[num_ucs2_chars] = 0; /* null terminator */ /* * return the result */ if (num_ucs2_chars > 0) *num_chars = num_ucs2_chars; else *num_chars = 0; return ucs2_chars; } PUBLIC UNICVTAPI unsigned char * INTL_UCS2ToUTF8(const uint16 *ucs2p, int32 num_chars) { unsigned char *utf8_chars; int32 num_utf8_bytes, num_bytes_written, dummy; int i; /* * Figure the number of bytes for the utf8 
string */ num_utf8_bytes =0; for (i=0; i<num_chars; i++) { if (ucs2p[i] <= 0x7F) /* 0-0x7f only need one byte */ num_utf8_bytes += 1; else if (ucs2p[i] <= 0x3FF) /* 0x80-0x3ff only need two bytes */ num_utf8_bytes += 2; else /* 0x400-0xffff need three bytes */ num_utf8_bytes += 3; } utf8_chars = (unsigned char *)XP_ALLOC_PRIV(num_utf8_bytes + 1); if (!utf8_chars) return NULL; XP_MEMSET(utf8_chars, 0, num_utf8_bytes + 1); /* * Do the conversion */ num_bytes_written = ucs2_to_utf8_buffer(ucs2p, num_chars, utf8_chars, num_utf8_bytes, &dummy); /* * return the result */ return utf8_chars; } /* * ucs2_to_utf8_buffer * * Convert a ucs2 buffer to a utf8 multibyte character string * * inputs: * pointer to return ucs2 buffer * length of ucs2 buffer ("read" length limit) * pointer to utf8 character(s) * length of utf8 buffer ("write" length limit) * * outputs: returns number of charecters "read" from the ucs2 string * sets *num_bytes_written to # of utf8 characters "written" */ int32 ucs2_to_utf8_buffer(const uint16 *ucs2p, int32 num_chars, unsigned char *utf8p, int32 num_utf8_bytes, int32 *utf8_bytes_written) { int i; /* * Init values */ *utf8_bytes_written = 0; /* * Convert the data */ for (i=0; i<num_chars; i++) { if (ucs2p[i] <= 0x7F) { /* 0-0x7f only need one byte */ if (num_utf8_bytes < 1) break; utf8p[*utf8_bytes_written] = (unsigned char)ucs2p[i]; num_utf8_bytes -= 1; *utf8_bytes_written += 1; } else if (ucs2p[i] <= 0x3FF) { /* 0x80-0x3ff only need two bytes */ if (num_utf8_bytes < 2) break; utf8p[*utf8_bytes_written+0] = (unsigned char) (TWO_OCTET_BASE | ((ucs2p[i]>>6)&TWO_OCTET_MASK)); utf8p[*utf8_bytes_written+1] = (unsigned char) (CONTINUING_OCTET_BASE | (ucs2p[i]&CONTINUING_OCTET_MASK)); num_utf8_bytes -= 2; *utf8_bytes_written += 2; } else { /* 0x400-0xffff need three bytes */ if (num_utf8_bytes < 3) break; utf8p[*utf8_bytes_written+0] = (unsigned char) (THREE_OCTET_BASE | ((ucs2p[i]>>12)&THREE_OCTET_MASK)); utf8p[*utf8_bytes_written+1] = (unsigned char) 
(CONTINUING_OCTET_BASE | ((ucs2p[i]>>6)&CONTINUING_OCTET_MASK)); utf8p[*utf8_bytes_written+2] = (unsigned char) (CONTINUING_OCTET_BASE | (ucs2p[i]&CONTINUING_OCTET_MASK)); num_utf8_bytes -= 3; *utf8_bytes_written += 3; } } return i; } /* * utf8_to_ucs2_buffer * * Convert a utf8 multibyte character string and place in a ucs2 buffer * * inputs: pointer to utf8 character(s) * length of utf8 buffer ("read" length limit) * pointer to return ucs2 buffer * length of ucs2 buffer ("write" length limit) * pointer to return count of invalid bytes * * outputs: returns number of bytes "read" from the utf8 string * sets *invalid_cnt to # of invalid utf8 characters "read" */ UNICVTAPI int32 utf8_to_ucs2_buffer(const unsigned char *utf8p, int16 utf8len, int *parsed_cnt, int *invalid_cnt, uint16 *ucs2p, int32 ucs2len) { int read_len, write_len; int char_len; /* * Init the return values */ *parsed_cnt = 0; *invalid_cnt = 0; /* * Check for minimum buffer lengths */ if ((utf8len < 1) || (utf8p == NULL) || (ucs2len < 1) || (ucs2p == NULL)) { return 0; } /* * Do the conversion */ for (read_len=0,write_len=0; (read_len<utf8len) && (write_len<ucs2len); read_len +=char_len) { char_len = utf8_to_ucs2_char(utf8p+read_len, utf8len-read_len, (uint16*)ucs2p+write_len); if (char_len == -1) { /* invalid character */ *invalid_cnt += 1; char_len = 1; /* try to resynchronize */ *(ucs2p+write_len) = *(utf8p+read_len); } else if (char_len == -2) { /* buffer too short for last char */ /* return with what we have so far */ break; } /* * Note we converted one */ *parsed_cnt += char_len; write_len += 1; } return write_len; } /* Function: one_utf8_to_ucs2_char * * Converts one UTF8 char to one UCS2 char. Needs to get UTF-8 from a * buffer of utf8 data, because we don't know how many octets it will * be, not before this function is called. Take a pointer to the end of that * buffer to make sure we don't run past it. Put the resulting UCS-2 * char into an int16 we're given a pointer to. 
Returns the number of * octets used in the utf-8 char we converted, and returns -1 if it * runs out of utf-8 data without a complete UCS-2 character. */ PRIVATE int16 one_utf8_to_ucs2_char(const unsigned char *utf8p, const unsigned char *utf8endp, uint16 *onecharp) { int16 i, numoctets; uint32 ucs4 = 0; *onecharp = 0; if(*utf8p >= THREE_OCTET_BASE) numoctets = 3; else if (*utf8p >= TWO_OCTET_BASE) numoctets = 2; else numoctets = 1; /* See if all the data for the char is there */ if ((utf8p + numoctets - 1) > utf8endp) { return (-1); } for(i=numoctets; i>0; i--) { ucs4 += *utf8p++; if (i == 1) break; ucs4 <<= 6; } switch(numoctets) { case 3: ucs4 -= 0x000E2080UL; break; /* truncating... */ case 2: ucs4 -= 0x00003080UL; break; } *onecharp= (uint16)(ucs4 & 0x0000FFFFUL); return(numoctets); } /* * Internal Function: pad_and_write * Checks to make sure there is less than one full base64 character in the * buffer, pad it with 0 to make up a full base64 character, write that * to tobuf, and write the shift termination character. (-) */ PRIVATE uint16 pad_and_write(uint32 buffer, unsigned char *tobufp, int16 bufferBitCount, utf7_encoding_method_data* opt) { int16 oneBase64char; if(bufferBitCount >= 6) return(bufferBitCount); oneBase64char = ((unsigned char) (buffer >> 26)); *tobufp++ = opt->tob64[oneBase64char]; *tobufp = opt->endshift; return(0); } /* Function: swap_ucs2_bytes * * Takes a buffer of ucs2 chars, and its size in *bytes*. * * This function is meant to cope with the problem that sometimes * UCS-2 data (because of the big-endian, little-endian problem?) * comes in in reversed order, and needs to be swapped to be * dealt with appropriately. * * This case can be detected at the very beginning of the stream, * because the first two bytes of any UCS-2 stream should be the * Byte Order Mark, or 0xFEFF. If instead you see 0xFFFE, you know * you need to swap. 
Neither of these are legal UCS-2 characters * otherwise, so you know that there is no danger of accidentally * triggering swapping with a legitimate UCS-2 stream. * Unfortunately, this marker is only present at the very beginning * of a stream; future chunks of the stream won't have the marker. * So if we ever detect that a stream needs to be swapped, we * save that information by turning on the obj->cvtflag. If, on * future chunks, we see that that flag is turned on, we'll go * ahead and swap. * Notice that if swapping is unnecessary, this function has * no effect whatsoever. */ PRIVATE void swap_ucs2_bytes(unsigned char *ucsbuf, int32 ucsbufsz) { int32 i; unsigned char swapTemp = 0; if(ucsbufsz%2) ucsbufsz--; for(i=0; i<ucsbufsz; i+=2) { swapTemp = ucsbuf[i]; ucsbuf[i] = ucsbuf[i+1]; ucsbuf[i+1] = swapTemp; } return; } /* UCS-2 to UTF-7 jliu */ /* * mz_ucs2utf7 * ------------ * * This function takes a CCCDataObject, a buffer of UCS-2 data, and the * size of that buffer. It allocates and returns a buffer of the * corresponding UTF-7 data (returning the size as a field in the * CCCDataObject). The caller is responsible for freeing the returned * data. If there are extra data at the end of the UTF-8 buffer which * cannot be translated into UTF-7 (ie, an incomplete character), it * will be saved in the uncvtbuf of the CCCDataObject and used on the * next call. * * UTF-7 is a variant of base-64, and like base-64, it accumulates * bits in a bit buffer, transforming them to UTF-7 chars when it * has multiples of 6 bits. If the UTF-8 data being translated does * not happen to terminate with a multiple of 6 bits, the final * char will be padded with 0's, and the shift sequence terminated. * For this reason, we will *never* be inside a shift sequence in * between chunks of data. This may mean that the final stream of * data has sequences that look like +[some UTF-7 data]-+[more data]-, * with a plus immediately following a -. 
Although unconventional,
 * this is in fact legal UTF-7.
 *
 * Finally, there are two formats of UTF-7, one extremely conservative
 * fashion which shifts every character which could possibly be
 * considered unsafe, and another which is somewhat more lax. Which
 * of these is used is determined by obj->cvtflag. By default (cvtflag == 0)
 * we employ the safer form of conversion. The differing characters
 * are: !\"#$%&*;<=>@[]^_`{|}
 */
/* Tables */

/*
 * Implementation notes:
 *
 *  - NOTE(review): this function always encodes with the rfc1642_utf7
 *    table and never reads obj->cvtflag, whatever the text above says;
 *    confirm which description is intended.
 *  - Input octets are combined high octet first.  A 0xFFFE / 0xFEFF
 *    marker switches the object's "from" charset to CS_UCS2_SWAP /
 *    CS_UCS2 and is not copied to the output; while swapping is on, the
 *    two octets of every character are exchanged.
 *  - uncvtbuf[0] is the number (0 or 1) of octets carried over from the
 *    previous call and uncvtbuf[1] is that octet.  The carried octet
 *    counts as the first half of the next character, so one fresh octet
 *    is enough to complete it.  If the new chunk is empty the carried
 *    octet stays where it is for the following call.  (The earlier loop
 *    demanded two fresh octets, which dropped or overwrote the carried
 *    octet when handed an empty or one-octet chunk.)
 *  - On allocation failure the object's return value is set to
 *    MK_OUT_OF_MEMORY and NULL is returned.
 */
MODULE_PRIVATE UNICVTAPI unsigned char *
mz_ucs2utf7(    CCCDataObject       obj,
                const unsigned char *ucs2buf,   /* UCS-2 buf for conv */
                int32               ucs2bufsz)  /* UCS-2 buf size in bytes */
{
    utf7_encoding_method_data* opt = &rfc1642_utf7;

    unsigned char   *tobuf = NULL;
    int32           tobufsz;
    unsigned char   *tobufp, *ucs2p;        /* current byte in bufs */
    unsigned char   *tobufendp, *ucs2limitp;/* end of buffers (ucs2limitp is
                                             * one past the last input byte) */
    int32           uncvtlen = 0;
    unsigned char   *uncvtbuf = INTL_GetCCCUncvtbuf(obj);

    uint16          onechar;
    int16           inShiftSequence = FALSE;
    int16           needToShift = FALSE;
    uint32          buffer = 0;             /* bit buffer, filled from the top */
    uint32          buffertemp = 0;
    int16           bufferBitCount = 0;     /* pending bits in buffer */
    unsigned char   oneBase64char;

    XP_Bool         needToSwap = FALSE;

    if( INTL_GetCCCFromCSID( obj ) == CS_UCS2_SWAP )
        needToSwap = TRUE;

    if (ucs2bufsz < 0)
        ucs2bufsz = 0;

    /* Allocate a dest buffer:
    ** in the worst case, every Unicode character will cost 2+4 = 6 octetes
    */
    uncvtlen = uncvtbuf[0];
    tobufsz = 6*( (ucs2bufsz + uncvtlen)/2 + 1 ) + 1;

    if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL) {
        INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY);
        return(NULL);
    }

    /* Initialize pointers, etc. */
    ucs2p = (unsigned char *)ucs2buf;
    ucs2limitp = ucs2p + ucs2bufsz;

    tobufp = tobuf;
    tobufendp = tobufp + tobufsz - 2;       /* save space for terminating null */

    /* Run while there is output room and a whole character is available:
     * two fresh octets, or the carried octet plus one fresh octet. */
    while( (tobufp <= tobufendp) &&
           (((ucs2limitp - ucs2p) + ((uncvtbuf[0] != 0) ? 1 : 0)) >= 2) ) {

        if( uncvtbuf[0] != 0 ){
            onechar = uncvtbuf[1];
            uncvtbuf[0] = 0;
        } else
            onechar = *ucs2p++;
        onechar <<= 8;
        onechar |= *ucs2p++;

        /* do the swap stuff */
        if( onechar == NEEDS_SWAP_MARK ){
            INTL_SetCCCFromCSID( obj, CS_UCS2_SWAP );
            needToSwap = TRUE;
            continue;
        } else if( onechar == BYTE_ORDER_MARK ){
            INTL_SetCCCFromCSID( obj, CS_UCS2 );
            needToSwap = FALSE;
            continue;
        }

        if( needToSwap ){
            onechar = ( onechar << 8 ) | ( onechar >> 8 );
        }

        /* we need to be shifted if the character is non-ASCII or
         * is an ASCII character that should be shifted.
         * (The || keeps the shift[] lookup to ASCII values only.)
         */
        needToShift = (onechar > MAX_ASCII) || (opt->shift[onechar]);

        if(!needToShift && inShiftSequence) {
            /* leaving a shift sequence: flush pending bits, then '-' */
            if(bufferBitCount > 0) {
                if((tobufp+2) > tobufendp) break;
                bufferBitCount = pad_and_write(buffer, tobufp, bufferBitCount, opt);
                if (!bufferBitCount) {  /* buffer successfully flushed */
                    tobufp+=2;
                    buffer = 0;
                }
            } else {
                if((tobufp+1) > tobufendp) break;
                *tobufp++ = opt->endshift;
            }
            inShiftSequence = FALSE;
            /* now just fallthrough to next case*/
        }

        if(!needToShift && !inShiftSequence) {
            /* directly representable character */
            if((tobufp+1) > tobufendp) break;
            *tobufp++ = (char) onechar;
        }

        if(needToShift && !inShiftSequence) {
            *tobufp++ = opt->startshift;
            if(onechar == opt->startshift) {
                /* special-case behavior if onechar is a + */
                if((tobufp+1) > tobufendp) break;
                *tobufp++ = opt->endshift;
            } else
                inShiftSequence = TRUE;
        }

        if(needToShift && inShiftSequence) {
            buffertemp = onechar & 0x0000FFFF;
            buffer |= buffertemp << (16 - bufferBitCount);
                                /*  ^--16 is the size of the int32 minus
                                 *     the size of onechar */
            bufferBitCount += 16;

            /* Flush the buffer of as many base64 characters as we can form */
            while(bufferBitCount>5) {
                if(tobufp > tobufendp) break;
                oneBase64char = (char) ((buffer & 0xFC000000) >> 26);
                *tobufp++ = opt->tob64[oneBase64char];
                buffer <<= 6;
                bufferBitCount -= 6;
            }
        }
    } /* end of while loop */

    /* Anything left in the buffer at this point should be padded with 0's
     * and appended to tobuf.
     */
    if(inShiftSequence) {
        if(bufferBitCount > 0) {
            if((tobufp+2) <= tobufendp) {
                bufferBitCount = pad_and_write(buffer, tobufp, bufferBitCount, opt);
                if (!bufferBitCount) {  /* buffer successfully flushed */
                    tobufp+=2;
                    buffer = 0;
                }
            }
        } else {
            if((tobufp+1) <= tobufendp)
                *tobufp++ = opt->endshift;
        }
        inShiftSequence = FALSE;
    }

    *tobufp = '\0';                         /* NULL terminate dest. data */
    INTL_SetCCCLen(obj, tobufp - tobuf);    /* length of processed data, in bytes */

    if(uncvtbuf[0] != 0) {
        /* The carried octet never found its partner (the chunk was
         * empty): leave it in place for the next call. */
    } else if(ucs2p < ucs2limitp) {         /* unconverted ucs2 left? */
        uncvtbuf[0] = 1;
        uncvtbuf[1] = *(ucs2limitp - 1);    /* the odd trailing octet */
    } else
        uncvtbuf[0] = 0;

    return(tobuf);
}