home *** CD-ROM | disk | FTP | other *** search
- /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
- *
- * The contents of this file are subject to the Netscape Public License
- * Version 1.0 (the "NPL"); you may not use this file except in
- * compliance with the NPL. You may obtain a copy of the NPL at
- * http://www.mozilla.org/NPL/
- *
- * Software distributed under the NPL is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
- * for the specific language governing rights and limitations under the
- * NPL.
- *
- * The Initial Developer of this code under the NPL is Netscape
- * Communications Corporation. Portions created by Netscape are
- * Copyright (C) 1998 Netscape Communications Corporation. All Rights
- * Reserved.
- */
- /* unicvrt.c
- * ---------
- *
- *
- * This file implements conversions from one Unicode format to another
- * Unicode format.
- *
- * There are no conversions to/from other encodings.
- *
- * There are streams conversion between UTF8 and UCS2, and UTF8 and UTF7.
- * It generates a DLL on Win 32, and at present, normal libraries on mac, X, and
- * Win16.
- */
-
- #define _UNICVT_DLL_ 1
-
- #include "intlpriv.h"
- #include "unicpriv.h"
- #include "xp.h"
- #include <string.h>
-
- #ifdef XP_WIN32
- #define XP_ALLOC_PRIV malloc
- #else
- #define XP_ALLOC_PRIV XP_ALLOC
- #endif
-
- typedef struct utf7_encoding_method_data {
- int16 *fromb64;
- unsigned char *tob64;
- unsigned char *shift;
- unsigned char startshift;
- unsigned char endshift;
- } utf7_encoding_method_data;
-
-
- int32
- ucs2_to_utf8_buffer(const uint16 *ucs2p, int32 num_chars,
- unsigned char *utf8p, int32 num_utf8_bytes, int32 *utf8_bytes_written);
-
-
- /* Private Helper function prototypes */
-
- PRIVATE int16 one_utf8_to_ucs2_char(const unsigned char *utf8p, const unsigned char *utf8endp,
- uint16 *onecharp);
-
- PRIVATE int16 one_ucs2_to_utf8_char(unsigned char *tobufp,
- unsigned char *tobufendp, uint16 onechar);
-
- PRIVATE unsigned char *intl_utf72utf8( CCCDataObject obj,
- const unsigned char *utf7buf,
- int32 utf7bufsz,
- utf7_encoding_method_data* opt
- );
- PRIVATE unsigned char *intl_utf82utf7( CCCDataObject obj,
- const unsigned char *utf8buf,
- int32 utf8bufsz,
- utf7_encoding_method_data* opt
- );
-
-
- PRIVATE uint16 pad_and_write(uint32 buffer, unsigned char *tobufp,
- int16 bufferBitCount, utf7_encoding_method_data* opt);
-
- PRIVATE void swap_ucs2_bytes(unsigned char *ucsbuf, int32 ucsbufsz);
-
-
- /* Private constants */
-
- #define MAX_UCS2 0xFFFF
- #define DEFAULT_CHAR 0x003F /* Default char is "?" */
- #define BYTE_MASK 0xBF
- #define BYTE_MARK 0x80
-
- #define MAX_ASCII 0x7F
- #define NOT_BASE64 -1
-
-
-
-
-
- /* Take care of different API for different platforms */
-
-
- #ifdef XP_WIN32
-
- /* UNICVTAPI def now accomplished in libi18n.h */
- /*#define UNICVTAPI __declspec(dllexport)*/
-
-
- /* THIS #define IS VERY BAD AND SHOULD BE CHANGED WHEN WE REVISIT
- * THE ERROR HANDLING STUFF AND MOVE IT ALL OUT OF XPSTR.H
- * THE CALL SHOULD BE: extern int MK_OUT_OF_MEMORY; BUT WE HAVE
- * CHICKEN AND EGG LINKING PROBLEMS ON WIN32 BECAUSE THE DLL
- * MUST BE COMPILED BEFORE THE int IS DECLARED.
- */
-
- #define MK_OUT_OF_MEMORY -207
-
- #else /* !XP_WIN32 */
-
- /* UNICVTAPI def now accomplished in libi18n.h */
- /*#define UNICVTAPI*/
-
- extern int MK_OUT_OF_MEMORY;
-
- #endif /*!XP_WIN32 */
-
-
-
- /* UCS-2 to UTF-8 conversion routines */
-
- /*
- * mz_ucs2utf8
- * -----------
- *
- * Takes a CCCDataObject, a buffer of UCS-2 data, and the size of that buffer.
- * Allocates and returns the translation of the UCS-2 data in UTF-8. The caller
- * is responsible for freeing the allocated memory. If the UCS-2 data is not
- * complete, and ends on a character boundary, the extra byte of data is stored
- * in uncvtbuf, and will be used the next time this function is called.
- *
- * Note about swapping: UCS-2 data can come in big-endian or little-endian
- * order, so we need to be aware of the need to potentially swap the data.
- * On the very first block of the stream we will discover (because UCS-2
- * always begins with a byte order mark) whether the data is of the same or
- * opposite endian-ness from us.
- * The information is store in FromCSID
- * The use of uncvtbuf:
- * uncvtbuf[0] is 0 or 1
- * uncvtbuf[0] == 0 - there are no left over last time
- * uncvtbuf[0] == 1 - there one byte left over last time stored in uncvtbuf[1]
- *
- */
- MODULE_PRIVATE UNICVTAPI unsigned char *
- mz_ucs2utf8( CCCDataObject obj,
- const unsigned char *ucsbuf, /* UCS-2 buf for conv */
- int32 ucsbufsz) /* UCS-2 buf size in bytes */
- {
- int32 tobufsz;
- unsigned char *tobuf = NULL;
- unsigned char *tobufp, *tobufendp,*ucsp, *ucsendp;
- int16 numUTF8bytes;
- uint16 onechar;
- XP_Bool needToSwap = FALSE;
- int scanstate = 0;
- unsigned p1=0, p2;
- unsigned char *uncvtbuf =INTL_GetCCCUncvtbuf(obj);
-
-
- if(INTL_GetCCCFromCSID(obj) == CS_UCS2_SWAP)
- needToSwap = TRUE;
-
- /* Allocate Memory */
- /* In the worst case, one UCS2 could expand to three byte */
- /* so, the ration is 2:3 */
- tobufsz = (3*(ucsbufsz + 1)) / 2 + 2;
- if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL)
- {
- INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY);
- return(NULL);
- }
-
- /* do the set up */
- tobufendp = tobuf + tobufsz; /* point to the end of buffer */
- tobufp = tobuf; /* point to the begining of buffer */
- ucsp = (unsigned char *)ucsbuf;
- ucsendp = (unsigned char *)ucsbuf + ucsbufsz;
-
- /* Get the unconvert byte */
- if(uncvtbuf[0] > 0)
- {
- p1 = uncvtbuf[1];
- scanstate++;
- }
- /* Do the conversion */
- while( ucsp < ucsendp )
- {
- if(scanstate++ == 0)
- {
- p1 = *ucsp;
- }
- else
- {
- p2 = *ucsp;
- scanstate = 0;
- onechar = (p1 << 8) | (p2);
- /* Look for (and strip) BYTE_ORDER_MARK */
- if(onechar == NEEDS_SWAP_MARK)
- {
- INTL_SetCCCFromCSID(obj, CS_UCS2_SWAP);
- needToSwap = TRUE;
- }
- else if(onechar == BYTE_ORDER_MARK)
- {
- INTL_SetCCCFromCSID(obj, CS_UCS2);
- needToSwap = FALSE;
- }
- else
- {
- if(needToSwap)
- numUTF8bytes = one_ucs2_to_utf8_char(tobufp, tobufendp,
- (uint16)((p2 << 8) | (p1)));
- else
- numUTF8bytes = one_ucs2_to_utf8_char(tobufp, tobufendp, onechar);
-
- if(numUTF8bytes == -1)
- break; /* out of space in tobuf */
-
- tobufp += numUTF8bytes;
- }
- }
- ucsp ++;
- }
- *tobufp = '\0'; /* NULL terminate dest. data */
- INTL_SetCCCLen(obj, tobufp - tobuf); /* length of processed data, in bytes */
-
- /* If there are left over, set it to uncvtbuf[1] */
- if((uncvtbuf[0] = scanstate) != 0)
- uncvtbuf[1] = p1;
- return(tobuf);
- }
-
- /* UTF-8 to UCS-2 */
-
- /*
- * mz_utf82ucs
- * -----------
- *
- * This function takes a streams object, a buffer of utf8 data, and the
- * size of that buffer. It allocates, fills, and returns a buffer of the
- * equivalent UCS-2 data. The caller is responsible for freeing that
- * data. If the UTF-8 data cannot be completely converted, the unconverted
- * final bytes will be stored in uncvtbuf and used on the next call.
- *
- * Note: UCS-2 data must always begin with a byte order mark, so we
- * must write that at the beginning of our stream. This function
- * employs obj->cvtflag to determine if it is indeed at the beginning
- * of the stream. obj->cvtflag starts at 0, and we switch it to 1
- * as we write the byte order mark.
- *
- * A note on endian-ness: This function will return UCS-2 data of the
- * same endian-ness as the machine we are running on. To generate data
- * of the opposite endian-ness, use mz_utf82ucsswap.
- */
-
-
- MODULE_PRIVATE UNICVTAPI unsigned char *
- mz_utf82ucs( CCCDataObject obj,
- const unsigned char *utf8buf, /* UTF-8 buf for conv */
- int32 utf8bufsz) /* UTF-8 buf size in bytes */
-
-
- {
-
- unsigned char *tobuf = NULL;
- int32 tobufsz;
- unsigned char *tobufp, *utf8p; /* current byte in bufs */
- unsigned char *tobufendp, *utf8endp; /* end of buffers */
- int32 uncvtlen;
- unsigned char *uncvtbuf = INTL_GetCCCUncvtbuf(obj);
-
-
-
- uint16 onechar;
- int16 numoctets;
-
-
- #define ucsbufsz tobufsz
- #define ucsbuf tobuf
- #define ucsp tobufp
- #define ucsendp tobufendp
- /* Allocate a dest buffer: */
-
-
- /* At worst, all the octets are ASCII, and each 1 byte of UTF 8
- * will take 2 bytes of UCS-2, plus 2 for NULL termination (and
- * possibly 2 for byte order mark)
- */
-
- uncvtlen = strlen((char *)uncvtbuf);
- tobufsz = 2*(utf8bufsz + uncvtlen) + 4;
- if (!tobufsz) {
- return NULL;
- }
-
- if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL) {
- INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY);
- return(NULL);
- }
-
-
- /* Initialize pointers, etc. */
- utf8p = (unsigned char *)utf8buf;
- utf8endp = utf8p + utf8bufsz - 1; /* leave room for NULL termination (as sentinel?)*/
-
- #define uncvtp tobufp /* use tobufp as temp index for uncvtbuf */
- /* If prev. unconverted chars, append unconverted
- * chars w/new chars and try to process.
- */
-
- if (uncvtbuf[0] != '\0') {
- uncvtp = uncvtbuf + uncvtlen;
- while (uncvtp < (uncvtbuf + UNCVTBUF_SIZE) &&
- utf8p <= utf8endp)
- *uncvtp++ = *utf8p++;
-
- *uncvtp = '\0'; /* nul terminate as sentinel */
- utf8p = uncvtbuf; /* process unconverted first */
- utf8endp = uncvtp - 1;
-
- }
-
- #undef uncvtp
-
- tobufp = tobuf;
- tobufendp = tobufp + tobufsz - 3; /* save space for terminating null */
-
- /* write byte order mark */
-
- if(!(INTL_GetCCCCvtflag(obj))) {
- *((uint16 *) tobufp) = (uint16) BYTE_ORDER_MARK;
- tobufp += 2;
- INTL_SetCCCCvtflag(obj, TRUE);
- }
-
- WHILELOOP:
-
- while( (tobufp <= tobufendp) && (utf8p <= utf8endp) ) {
-
-
- numoctets = one_utf8_to_ucs2_char(utf8p, utf8endp, &onechar);
- if(numoctets == -1) break; /* not enought utf8 data */
- utf8p += numoctets;
-
-
-
- /* Check to make sure there's space to write onechar */
- if((tobufp+2) >= tobufendp) break;
-
- *((uint16 *) tobufp) = (onechar <= MAX_UCS2 ? onechar : DEFAULT_CHAR);
-
- tobufp +=2;
-
- }
- if(uncvtbuf[0] != '\0') { /* Just processed unconverted chars.
- * ucsp points to 1st unprocessed char
- * in ucsbuf. Some may have been
- * processed while processing unconverted
- * chars, so setup ptrs. not to process
- * them twice.
- */
-
- /* If nothing was converted, there wasn't
- * enough UCS-2 data. Stop and get more
- * data.
- */
-
- if(utf8p == uncvtbuf) { /* nothing was converted */
- *tobufp = '\0';
- return(NULL);
- }
- utf8endp = (unsigned char *) utf8buf + utf8bufsz - 1;
- utf8p = (unsigned char *) utf8buf + (utf8p - uncvtbuf - uncvtlen);
- uncvtbuf[0] = '\0'; /* No more unconverted chars.*/
- goto WHILELOOP; /* Process new data */
- }
-
- *tobufp = '\0'; /* NULL terminate dest. data */
-
- INTL_SetCCCLen(obj, tobufp - tobuf); /* length of processed data, in bytes */
-
- if(utf8p <= utf8endp) { /* unconverted utf8 left? */
- tobufp = uncvtbuf; /* just using tobufp as a temp index. */
- while (utf8p <= utf8endp)
- *tobufp++ = *utf8p++;
- *tobufp = '\0'; /* NULL terminate, as a sentinel */
- }
-
-
- #undef ucsbufsz
- #undef ucsbuf
- #undef ucsp
- #undef ucsendp
-
-
- return(tobuf);
- }
-
-
-
- /*
- * mz_utf82ucsswap
- * ---------------
- *
- * mz_utf82ucs will convert the UTF-8 data to UCS-2 data of the same
- * endian-ness of the platform the client is running on. Occasionally,
- * this is not what is desired. mz_utf82ucsswap converts the UTF-8
- * data to UCS-2 of the opposite endian-ness.
- */
-
-
- MODULE_PRIVATE UNICVTAPI unsigned char *
- mz_utf82ucsswap( CCCDataObject obj,
- const unsigned char *utf8buf, /* UTF-8 buf for conv */
- int32 utf8bufsz) /* UTF-8 buf size in bytes */
- {
-
- unsigned char *result;
-
- result = mz_utf82ucs(obj, utf8buf, utf8bufsz);
- swap_ucs2_bytes(result, INTL_GetCCCLen(obj));
- return(result);
-
- }
-
-
- /* UTF-7 to UTF-8 conversion routines */
-
-
-
-
- /* mz_utf72utf8
- * ------------
- *
- * Takes a streams object, a buffer of UTF-7 data, and the size of
- * that buffer. Allocates, fills, and returns a buffer of UTF-8
- * data. (Its size is returned in the CCCDataObject.) The caller
- * is responsible for freeing the returned buffer.
- *
- * Note: UTF-7 has the property that multiple characters of UTF-7
- * may make up a single character of UTF-8. Also, a single UTF-7 char
- * may contribute bits to more than one utf8 character. If such a
- * UTF-7 character is involved at the end of the current chunk, it won't
- * be save-able in uncvtbuf. For this reason, we also need to
- * save the bit buffer. It turns out that we also need to save the
- * fact that we are within a shifted sequence, because there is no
- * other way for that information to persist between chunks of a
- * stream. If we save a buffer, then we are certainly in the middle
- * of a shifted sequence, but even if there is no buffer to save, we
- * may still be in a shifted sequence.
- *
- * The streams module gives me one int32 - obj->cvtflag - in which
- * to save my state. This means that to save all my data, I'll need
- * to do a few bit-wise operations.
- *
- * Arbitrarily, the top two bytes will hold the buffer, the next byte
- * holds the count of relevant bits in the buffer, and the low order
- * byte will hold 0 if we are not in a shiftSequence, 1 if we are.
- *
- * Since we will only save a buffer and bufferBitCount if we are
- * in a shift sequence when this chunk terminates, obj->cvtflag == 0
- * when we do not terminate in a shift sequence.
- */
-
-
- /*
- tables for RFC1642- UTF7
- */
-
- PRIVATE int16 rfc1642_fromb64[128] =
- {
- /* 0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- /* 10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- /* 20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- /* 30 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- /* 40 */ -1, -1, -1, 62, -1, -1, -1, 63, 52, 53,
- /* 50 */ 54, 55, 56, 57, 58, 59, 60, 61, -1, -1,
- /* 60 */ -1, -1, -1, -1, -1, 0, 1, 2, 3, 4,
- /* 70 */ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- /* 80 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
- /* 90 */ 25, -1, -1, -1, -1, -1, -1, 26, 27, 28,
- /* 100 */ 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
- /* 110 */ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
- /* 120 */ 49, 50, 51, -1, -1, -1, -1, -1
- };
- PRIVATE unsigned char rfc1642_tob64[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
- PRIVATE unsigned char rfc1642_shift[128] = {
- /* 0 1 2 3 4 5 6 7 */
- /* 8 9 A B C D E F */
- /* 0x00 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
- /* 0x08 */ TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE,
- /* 0x10 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
- /* 0x18 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
- /* 0x20 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x28 */ FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE,
- /* 0x30 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x38 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x40 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x48 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x50 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x58 */ FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
- /* 0x60 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x68 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x70 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x78 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE
- };
-
- PRIVATE utf7_encoding_method_data rfc1642_utf7 = {
- rfc1642_fromb64,
- rfc1642_tob64,
- rfc1642_shift,
- (unsigned char)'+',
- (unsigned char)'-'
- };
-
-
- /*
- tables for RFC2060- IMAP4rev1 Mail Box Name
- */
- PRIVATE int16 rfc2060_fromb64[128] =
- {
- /* 0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- /* 10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- /* 20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- /* 30 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- /* 40 */ -1, -1, -1, 62, 63, -1, -1, -1, 52, 53,
- /* 50 */ 54, 55, 56, 57, 58, 59, 60, 61, -1, -1,
- /* 60 */ -1, -1, -1, -1, -1, 0, 1, 2, 3, 4,
- /* 70 */ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- /* 80 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
- /* 90 */ 25, -1, -1, -1, -1, -1, -1, 26, 27, 28,
- /* 100 */ 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
- /* 110 */ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
- /* 120 */ 49, 50, 51, -1, -1, -1, -1, -1
- };
- PRIVATE unsigned char rfc2060_tob64[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
- PRIVATE unsigned char rfc2060_shift[128] = {
- /* 0 1 2 3 4 5 6 7 */
- /* 8 9 A B C D E F */
- /* 0x00 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
- /* 0x08 */ TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE,
- /* 0x10 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
- /* 0x18 */ TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
- /* 0x20 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE,
- /* 0x28 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x30 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x38 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x40 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x48 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x50 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x58 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x60 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x68 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x70 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- /* 0x78 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE
- };
-
- PRIVATE utf7_encoding_method_data rfc2060_utf7 = {
- rfc2060_fromb64,
- rfc2060_tob64,
- rfc2060_shift,
- (unsigned char)'&',
- (unsigned char)'-'
- };
-
- MODULE_PRIVATE UNICVTAPI unsigned char *
- mz_utf72utf8( CCCDataObject obj,
- const unsigned char *utf7buf, /* UTF-7 buf for conv */
- int32 utf7bufsz) /* UTF-7 buf size in bytes */
- {
- return intl_utf72utf8(obj,utf7buf, utf7bufsz, &rfc1642_utf7);
- }
- MODULE_PRIVATE UNICVTAPI unsigned char *
- mz_imap4utf72utf8( CCCDataObject obj,
- const unsigned char *utf7buf, /* UTF-7 buf for conv */
- int32 utf7bufsz) /* UTF-7 buf size in bytes */
- {
- return intl_utf72utf8(obj,utf7buf, utf7bufsz, &rfc2060_utf7);
- }
-
- PRIVATE unsigned char *
- intl_utf72utf8( CCCDataObject obj,
- const unsigned char *utf7buf, /* UTF-7 buf for conv */
- int32 utf7bufsz, /* UTF-7 buf size in bytes */
- utf7_encoding_method_data* opt)
-
- {
-
- unsigned char *tobuf = NULL;
- int32 tobufsz;
- unsigned char *tobufp, *utf7p; /* current byte in bufs */
- unsigned char *tobufendp, *utf7endp; /* end of buffers */
- int32 uncvtlen;
-
- uint16 oneUCS2char;
- unsigned char onechar;
- int16 numoctets;
- int16 mustnotshift = 0;
- int16 inShiftSequence;
-
- uint32 buffer;
- uint32 buffertemp = 0;
- int16 bufferBitCount;
- unsigned char *uncvtbuf = INTL_GetCCCUncvtbuf(obj);
-
- /* set up table to convert ASCII values of base64 chars to
- * their base 64 value. If there is no conversion, use -1 as sentinel.
- */
-
-
- /* initialize data saved from previous stream */
-
- int32 flag = INTL_GetCCCCvtflag(obj);
- inShiftSequence = flag & 1;
- buffer = 0xFFFF0000 & flag;
- bufferBitCount = (uint16) ((0x0000FF00 & flag) >> 8);
-
- #define utf8bufsz tobufsz
- #define utf8buf tobuf
- #define utf8p tobufp
- #define utf8endp tobufendp
- /* Allocate a dest buffer: */
-
-
- /* UTF-7 characters that are directly encoded will be one octet UTF-8
- * chars. Shifted chars will take 2.7 octets (plus shift in or out chars)
- * to make 2 or 3 octet UTF-8 chars. So in the worst input, all the UTF-7
- * data would convert to 3 octet UTF-8 data, and we would need 1/9th as
- * many UTF-7 characters, plus 1 to round up, plus 1 for NULL termination.
- */
-
- uncvtlen = strlen((char *)uncvtbuf);
- tobufsz = (int32) (1.2*(utf7bufsz + uncvtlen) + 2);
-
- if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL)
- {
- INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY);
- return(NULL);
- }
- /* Initialize pointers, etc. */
- utf7p = (unsigned char *)utf7buf;
- utf7endp = utf7p + utf7bufsz - 1;
-
- #define uncvtp tobufp /* use tobufp as temp index for uncvtbuf */
- /* If prev. unconverted chars, append unconverted
- * chars w/new chars and try to process.
- */
-
- if (uncvtbuf[0] != '\0')
- {
- uncvtp = uncvtbuf + uncvtlen;
- while (uncvtp < (uncvtbuf + UNCVTBUF_SIZE) &&
- utf7p <= utf7endp)
- *uncvtp++ = *utf7p++;
-
-
- *uncvtp = '\0'; /* nul terminate as sentinel */
- utf7p = uncvtbuf; /* process unconverted first */
- utf7endp = uncvtp - 1;
- }
- #undef uncvtp
-
- tobufp = tobuf;
- tobufendp = tobufp + tobufsz - 2;
-
- WHILELOOP:
-
- while( (tobufp <= tobufendp) && (utf7p <= utf7endp) )
- {
-
-
- onechar = *utf7p++;
-
-
- /* If I'm not in the shift sequence, and I have the start symbol,
- * absorb it and loop again. Otherwise, if I have a legal character
- * for a non-shifted sequence, (ASCII) write it directly. This is
- * ok, because ASCII is just ASCII in UTF-8, so don't need to worry
- * about UCS-2 conversion.
- */
-
- if(!inShiftSequence)
- {
-
- if(onechar == opt->startshift)
- {
- if(*utf7p == opt->endshift)
- {
- *tobufp++ = opt->startshift;
- utf7p++;
- } else inShiftSequence = TRUE;
- continue;
- }
-
- if(onechar <= MAX_ASCII) *tobufp++ = onechar;
- else continue;
-
- }
- else
- { /* inShiftSequence is TRUE */
-
- /* onechar is not a base64 allowable char if it is non-ASCII or
- * if it is a non-base64 char from the ASCII set.
- */
- mustnotshift = (onechar > MAX_ASCII ||
- (opt->fromb64[onechar] == NOT_BASE64));
-
- /* If I'm in the shift sequence, and get the opt->endshift character,
- * I want to absorb it and turn off shifting. If I get another
- * non-shiftable character, I want to write it and turn off shifting.
- * If I get an illegal character, I discard it and keep looping.
- */
-
- if(mustnotshift)
- {
-
- if(!(onechar == opt->endshift))
- {
-
- if(onechar > MAX_ASCII)
- continue;
-
- *tobufp++ = onechar;
- }
-
- inShiftSequence = FALSE;
- buffer = 0; /* flush buffer at end of shift sequence */
- bufferBitCount = 0;
-
-
- }
- else
- {
-
- buffertemp = opt->fromb64[onechar] & 0x0000003F; /* grab 6-bit base64 char */
- buffer |= buffertemp << (26 - bufferBitCount); /* 26 is 32 - 6 bits */
- bufferBitCount += 6;
-
- /* Flush the buffer of a UCS-2 character (won't be more than one) */
-
- if(bufferBitCount > 15)
- {
-
- oneUCS2char = (int16) ((buffer & 0xFFFF0000) >> 16);
- numoctets = one_ucs2_to_utf8_char(tobufp, tobufendp, oneUCS2char);
- if(numoctets == -1) break; /* out of space in tobuf */
- tobufp += numoctets;
- bufferBitCount -= 16;
- buffer <<= 16;
- }
-
- }
-
- } /* end of inShiftSequence == TRUE */
-
- } /* end of conversion while loop */
-
-
-
- if(uncvtbuf[0] != '\0')
- { /* Just processed unconverted chars.
- * ucsp points to 1st unprocessed char
- * in ucsbuf. Some may have been
- * processed while processing unconverted
- * chars, so setup ptrs. not to process
- * them twice.
- */
-
- /* If nothing was converted, there wasn't
- * enough UCS-2 data. Stop and get more
- * data.
- */
-
- if(utf7p == uncvtbuf)
- { /* nothing was converted */
- *tobufp = '\0';
- INTL_SetCCCLen(obj, 0);
- return(NULL);
- }
-
- /* set up to read ucsbuf */
- utf7endp = (unsigned char *) utf7buf + utf7bufsz - 1;
- utf7p = (unsigned char *) utf7buf + (utf7p - uncvtbuf - uncvtlen);
- uncvtbuf[0] = '\0'; /* No more unconverted chars.*/
- goto WHILELOOP; /* Process new data */
- }
-
-
- *tobufp = '\0'; /* NULL terminate dest. data */
- INTL_SetCCCLen(obj, tobufp - tobuf); /* length of processed data, in bytes */
-
- /* If we're in a shift sequence, we need to save away our buffer
- * and the buffer bit count (although if all that's left in the buffer
- * is padding 0's, we don't need to worry about it and should reset
- * the bitCount to 0.)
- */
-
- INTL_SetCCCCvtflag(obj,((inShiftSequence ? 1 : 0 ) |
- (buffer & 0xFFFF0000) |
- ((bufferBitCount << 8) & 0x0000FF00)));
-
- /* Now check for unconverted data from utf7p */
- if(utf7p <= utf7endp)
- {
- int l = utf7endp - utf7p + 1;
- memcpy(uncvtbuf, utf7p, l);
- uncvtbuf[l] = '\0';
- }
-
- #undef utf8bufsz
- #undef utf8buf
- #undef utf8p
- #undef utf8endp
-
- return(tobuf);
-
- }
-
-
- /* UTF-8 to UTF-7 */
-
-
- /*
- * mz_utf82utf7
- * ------------
- *
- * This function takes a CCCDataObject, a buffer of UTF-8 data, and the
- * size of that buffer. It allocates and returns a buffer of the
- * corresponding UTF-7 data (returning the size as a field in the
- * CCCDataObject). The caller is responsible for freeing the returned
- * data. If there are extra data at the end of the UTF-8 buffer which
- * cannot be translated into UTF-7 (ie, an incomplete character), it
- * will be saved in the uncvtbuf of the CCCDataObject and used on the
- * next call.
- *
- * UTF-7 is a variant of base-64, and like base-64, it accumulates
- * bits in a bit buffer, transforming them to UTF-7 chars when it
- * has multiples of 6 bits. If the UTF-8 data being translated does
- * not happen to terminate with a multiple of 6 bits, the final
- * char will be padded with 0's, and the shift sequence terminated.
- * For this reason, we will *never* be inside a shift sequence in
- * between chunks of data. This may mean that the final stream of
- * data has sequences that look like +[some UTF-7 data]-+[more data]-,
- * with a plus immediately following a -. Although unconventional,
- * this is in fact legal UTF-7.
- *
- * Finally, there are two formats of UTF-7, one extremely conservative
- * fashion which shifts every character which could possibly be
- * considered unsafe, and another which is somewhat more lax. Which
- * of these is used is determined by obj->cvtflag. By default (cvtflag == 0)
- * we employ the safer form of conversion. The differing characters
- * are: !\"#$%&*;<=>@[]^_`{|}
- */
- /* Tables */
-
-
- MODULE_PRIVATE UNICVTAPI unsigned char *
- mz_utf82utf7( CCCDataObject obj,
- const unsigned char *utf8buf, /* UTF-8 buf for conv */
- int32 utf8bufsz) /* UTF-8 buf size in bytes */
- {
- return intl_utf82utf7(obj,utf8buf, utf8bufsz, &rfc1642_utf7);
- }
- MODULE_PRIVATE UNICVTAPI unsigned char *
- mz_utf82imap4utf7( CCCDataObject obj,
- const unsigned char *utf8buf, /* UTF-8 buf for conv */
- int32 utf8bufsz) /* UTF-8 buf size in bytes */
- {
- return intl_utf82utf7(obj,utf8buf, utf8bufsz, &rfc2060_utf7);
- }
- PRIVATE unsigned char *
- intl_utf82utf7( CCCDataObject obj,
- const unsigned char *utf8buf, /* UTF-8 buf for conv */
- int32 utf8bufsz, /* UTF-8 buf size in bytes */
- utf7_encoding_method_data* opt)
- {
-
-
- unsigned char *tobuf = NULL;
- int32 tobufsz;
- unsigned char *tobufp, *utf8p; /* current byte in bufs */
- unsigned char *tobufendp, *utf8endp; /* end of buffers */
- int32 uncvtlen;
- unsigned char *uncvtbuf = INTL_GetCCCUncvtbuf(obj);
-
-
- uint16 onechar;
- int16 numoctets;
- int16 inShiftSequence = FALSE;
- int16 needToShift = FALSE;
- uint32 buffer = 0;
- uint32 buffertemp = 0;
- int16 bufferBitCount = 0;
- unsigned char oneBase64char;
-
-
-
- #define utf7bufsz tobufsz
- #define utf7buf tobuf
- #define utf7p tobufp
- #define utf7endp tobufendp
-
-
- /* Allocate a dest buffer: */
-
- uncvtlen = strlen((char *)uncvtbuf);
- tobufsz = 3*(utf8bufsz + uncvtlen) +1;
- if (!tobufsz) {
- return NULL;
- }
-
- if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL) {
- INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY);
- return(NULL);
- }
- /* Initialize pointers, etc. */
- utf8p = (unsigned char *)utf8buf;
- utf8endp = utf8p + utf8bufsz - 1; /* leave room for NULL termination (as sentinel?)*/
-
- #define uncvtp tobufp /* use tobufp as temp index for uncvtbuf */
- /* If prev. unconverted chars, append unconverted
- * chars w/new chars and try to process.
- */
-
- if (uncvtbuf[0] != '\0') {
- uncvtp = uncvtbuf + uncvtlen;
- /* This is not leaving space for a NULL !!!!!!!!!!!! */
- while (uncvtp < (uncvtbuf + UNCVTBUF_SIZE) &&
- utf8p <= utf8endp)
- *uncvtp++ = *utf8p++;
-
- *uncvtp = '\0'; /* nul terminate as sentinel */
- utf8p = uncvtbuf; /* process unconverted first */
- utf8endp = uncvtp - 1;
- }
- #undef uncvtp
-
-
- tobufp = tobuf;
- tobufendp = tobufp + tobufsz - 2; /* save space for terminating null*/
-
-
-
-
- WHILELOOP:
-
- while( (tobufp <= tobufendp) && (utf8p <= utf8endp) ) {
-
- /* convert one char's worth of utf8 to ucs2 */
- numoctets = one_utf8_to_ucs2_char(utf8p, utf8endp, &onechar);
- if(numoctets == -1) break; /* out of input*/
- utf8p += numoctets;
-
- /* we need to be shifted if the character is non-ASCII or
- * is an ASCII character that should be shifted.
- */
- needToShift = (onechar > MAX_ASCII) || (opt->shift[onechar]);
-
-
- if(!needToShift && inShiftSequence) {
-
- if(bufferBitCount > 0) {
- if((tobufp+2) > tobufendp) break;
- bufferBitCount = pad_and_write(buffer, tobufp, bufferBitCount, opt);
- if (!bufferBitCount) { /* buffer successfully flushed */
- tobufp+=2;
- buffer = 0;
- }
-
- } else {
- if((tobufp+1) > tobufendp) break;
- *tobufp++ = opt->endshift;
- }
- inShiftSequence = FALSE; /* now just fallthrough to next case*/
- }
-
- if(!needToShift && !inShiftSequence) {
- if((tobufp+1) > tobufendp) break;
- *tobufp++ = (char) onechar;
- }
-
- if(needToShift && !inShiftSequence) {
- *tobufp++ = opt->startshift;
- if(onechar == opt->startshift) { /* special-case behavior if onechar is a + */
- if((tobufp+1) > tobufendp) break;
- *tobufp++ = opt->endshift;
- }
- else inShiftSequence = TRUE;
- }
-
- if(needToShift && inShiftSequence) {
-
- buffertemp = onechar & 0x0000FFFF;
- buffer |= buffertemp << (16 - bufferBitCount);
- /* ^--16 is the size of the int32 minus
- * the size of onechar */
- bufferBitCount += 16;
-
-
- /* Flush the buffer of as many base64 characters as we can form */
- while(bufferBitCount>5) {
- if(tobufp > tobufendp) break;
- oneBase64char = (char) ((buffer & 0xFC000000) >> 26);
- *tobufp++ = opt->tob64[oneBase64char];
- buffer <<= 6;
- bufferBitCount -= 6;
- }
- }
-
-
- } /* end of while loop */
-
-
-
- if(uncvtbuf[0] != '\0') { /* Just processed unconverted chars.
- * ucsp points to 1st unprocessed char
- * in ucsbuf. Some may have been
- * processed while processing unconverted
- * chars, so setup ptrs. not to process
- * them twice.
- */
-
- /* If nothing was converted, there wasn't
- * enough UTF-8 data. Stop and get more
- * data.
- */
-
- if(utf8p == uncvtbuf) { /* nothing was converted */
- *tobufp = '\0';
- return(NULL);
- }
- utf8endp = (unsigned char *) utf8buf + utf8bufsz - 1;
- utf8p = (unsigned char *) utf8buf + (utf8p - uncvtbuf - uncvtlen);
- uncvtbuf[0] = '\0'; /* No more unconverted chars.*/
- goto WHILELOOP; /* Process new data */
- }
-
-
- /* Anything left in the buffer at this point should be padded with 0's
- * and appended to tobuf. */
-
- if(inShiftSequence) {
-
- if(bufferBitCount > 0) {
-
- if((tobufp+2) <= tobufendp) {
- bufferBitCount = pad_and_write(buffer, tobufp, bufferBitCount, opt);
- if (!bufferBitCount) { /* buffer successfully flushed */
- tobufp+=2;
- buffer = 0;
- }
- }
-
- } else {
- if((tobufp+1) <= tobufendp) *tobufp++ = opt->endshift;
- }
-
- inShiftSequence = FALSE;
- }
-
-
- *tobufp = '\0'; /* NULL terminate dest. data */
-
-
- INTL_SetCCCLen(obj, tobufp - tobuf); /* length of processed data, in bytes */
-
- if(utf8p <= utf8endp) { /* unconverted utf8 left? */
- tobufp = uncvtbuf; /* just using tobufp as a temp index. */
- while (utf8p <= utf8endp)
- *tobufp++ = *utf8p++;
- *tobufp = '\0'; /* NULL terminate, as a sentinel if nothing else.*/
- }
-
-
- #undef utf7bufsz
- #undef utf7buf
- #undef utf7p
- #undef utf7endp
-
-
- return(tobuf);
- }
-
-
- /* Function: one_ucs2_to_utf8_char
- *
- * Function takes one UCS-2 char and writes it to a UTF-8 buffer.
- * We need a UTF-8 buffer because we don't know before this
- * function how many bytes of utf-8 data will be written. It also
- * takes a pointer to the end of the UTF-8 buffer so that we don't
- * overwrite data. This function returns the number of UTF-8 bytes
- * of data written, or -1 if the buffer would have been overrun.
- */
-
- #define LINE_SEPARATOR 0x2028
- #define PARAGRAPH_SEPARATOR 0x2029
- PRIVATE int16 one_ucs2_to_utf8_char(unsigned char *tobufp,
- unsigned char *tobufendp, uint16 onechar)
-
- {
-
- int16 numUTF8bytes = 0;
-
- if((onechar == LINE_SEPARATOR)||(onechar == PARAGRAPH_SEPARATOR))
- {
- strcpy((char*)tobufp, "\n");
- return strlen((char*)tobufp);;
- }
-
- if (onechar < 0x80) { numUTF8bytes = 1;
- } else if (onechar < 0x800) { numUTF8bytes = 2;
- } else if (onechar <= MAX_UCS2) { numUTF8bytes = 3;
- } else { numUTF8bytes = 2;
- onechar = DEFAULT_CHAR;
- }
-
- tobufp += numUTF8bytes;
-
- /* return error if we don't have space for the whole character */
- if (tobufp > tobufendp) {
- return(-1);
- }
-
-
- switch(numUTF8bytes) {
-
- case 3: *--tobufp = (onechar | BYTE_MARK) & BYTE_MASK; onechar >>=6;
- *--tobufp = (onechar | BYTE_MARK) & BYTE_MASK; onechar >>=6;
- *--tobufp = onechar | THREE_OCTET_BASE;
- break;
-
- case 2: *--tobufp = (onechar | BYTE_MARK) & BYTE_MASK; onechar >>=6;
- *--tobufp = onechar | TWO_OCTET_BASE;
- break;
- case 1: *--tobufp = (unsigned char)onechar; break;
- }
-
- return(numUTF8bytes);
- }
-
-
- /*
- * utf8_to_ucs2_char
- *
- * Convert a utf8 multibyte character to ucs2
- *
- * inputs: pointer to utf8 character(s)
- * length of utf8 buffer ("read" length limit)
- * pointer to return ucs2 character
- *
- * outputs: number of bytes in the utf8 character
- * -1 if not a valid utf8 character sequence
- * -2 if the buffer is too short
- */
- MODULE_PRIVATE UNICVTAPI int16
- utf8_to_ucs2_char(const unsigned char *utf8p, int16 buflen, uint16 *ucs2p)
- {
- uint16 lead, cont1, cont2;
-
- /*
- * Check for minimum buffer length
- */
- if ((buflen < 1) || (utf8p == NULL)) {
- return -2;
- }
- lead = (uint16) (*utf8p);
-
- /*
- * Check for a one octet sequence
- */
- if (IS_UTF8_1ST_OF_1(lead)) {
- *ucs2p = lead & ONE_OCTET_MASK;
- return 1;
- }
-
- /*
- * Check for a two octet sequence
- */
- if (IS_UTF8_1ST_OF_2(*utf8p)) {
- if (buflen < 2)
- return -2;
- cont1 = (uint16) *(utf8p+1);
- if (!IS_UTF8_2ND_THRU_6TH(cont1))
- return -1;
- *ucs2p = (lead & TWO_OCTET_MASK) << 6;
- *ucs2p |= cont1 & CONTINUING_OCTET_MASK;
- return 2;
- }
-
- /*
- * Check for a three octet sequence
- */
- else if (IS_UTF8_1ST_OF_3(lead)) {
- if (buflen < 3)
- return -2;
- cont1 = (uint16) *(utf8p+1);
- cont2 = (uint16) *(utf8p+2);
- if ( (!IS_UTF8_2ND_THRU_6TH(cont1))
- || (!IS_UTF8_2ND_THRU_6TH(cont2)))
- return -1;
- *ucs2p = (lead & THREE_OCTET_MASK) << 12;
- *ucs2p |= (cont1 & CONTINUING_OCTET_MASK) << 6;
- *ucs2p |= cont2 & CONTINUING_OCTET_MASK;
- return 3;
- }
- else { /* not a valid utf8/ucs2 character */
- return -1;
- }
- }
-
- UNICVTAPI int32
- INTL_NumUTF8Chars(const unsigned char *utf8p)
- {
- int num_chars = 0;
-
- while (*utf8p) {
- /*
- * Check for a one octet sequence
- */
- if (IS_UTF8_1ST_OF_1(*utf8p)) {
- num_chars += 1;
- utf8p += 1;
- continue;
- }
-
- /*
- * Check for a two octet sequence
- */
- else if (IS_UTF8_1ST_OF_2(*utf8p)
- && IS_UTF8_2ND_THRU_6TH(*(utf8p+1))) {
- num_chars += 2;
- utf8p += 2;
- continue;
- }
-
- /*
- * Check for a three octet sequence
- */
- else if (IS_UTF8_1ST_OF_3(*utf8p)
- && IS_UTF8_2ND_THRU_6TH(*(utf8p+1))
- && IS_UTF8_2ND_THRU_6TH(*(utf8p+2))) {
- num_chars += 3;
- utf8p += 3;
- continue;
- }
-
- /*
- * Not UTF8 : just muddle forward
- */
- else {
- num_chars += 1;
- utf8p += 1;
- }
-
- }
-
- return num_chars;
- }
-
- PUBLIC UNICVTAPI uint16 *
- INTL_UTF8ToUCS2(const unsigned char *utf8p, int32 *num_chars)
- {
- uint16 *ucs2_chars;
- int32 num_utf8_chars, ucs2_len, num_ucs2_chars;
- int parse_cnt, inval_cnt;
-
- /*
- * Figure the number of chars
- */
- num_utf8_chars = INTL_NumUTF8Chars(utf8p);
- ucs2_len = num_utf8_chars*2;
- ucs2_chars = (uint16 *)XP_ALLOC_PRIV(ucs2_len + 2);
- if (!ucs2_chars) return NULL;
- /*
-
- * Do the conversion
- */
- num_ucs2_chars = utf8_to_ucs2_buffer(utf8p, strlen((char*)utf8p),
- &parse_cnt, &inval_cnt, ucs2_chars, ucs2_len);
- ucs2_chars[num_ucs2_chars] = 0; /* null terminator */
-
- /*
- * return the result
- */
- if (num_ucs2_chars > 0)
- *num_chars = num_ucs2_chars;
- else
- *num_chars = 0;
- return ucs2_chars;
- }
-
- PUBLIC UNICVTAPI unsigned char *
- INTL_UCS2ToUTF8(const uint16 *ucs2p, int32 num_chars)
- {
- unsigned char *utf8_chars;
- int32 num_utf8_bytes, num_bytes_written, dummy;
- int i;
-
- /*
- * Figure the number of bytes for the utf8 string
- */
- num_utf8_bytes =0;
- for (i=0; i<num_chars; i++) {
- if (ucs2p[i] <= 0x7F) /* 0-0x7f only need one byte */
- num_utf8_bytes += 1;
- else if (ucs2p[i] <= 0x3FF) /* 0x80-0x3ff only need two bytes */
- num_utf8_bytes += 2;
- else /* 0x400-0xffff need three bytes */
- num_utf8_bytes += 3;
- }
- utf8_chars = (unsigned char *)XP_ALLOC_PRIV(num_utf8_bytes + 1);
- if (!utf8_chars) return NULL;
- XP_MEMSET(utf8_chars, 0, num_utf8_bytes + 1);
-
- /*
- * Do the conversion
- */
- num_bytes_written = ucs2_to_utf8_buffer(ucs2p, num_chars, utf8_chars,
- num_utf8_bytes, &dummy);
- /*
- * return the result
- */
- return utf8_chars;
- }
-
- /*
- * ucs2_to_utf8_buffer
- *
- * Convert a ucs2 buffer to a utf8 multibyte character string
- *
- * inputs:
- * pointer to return ucs2 buffer
- * length of ucs2 buffer ("read" length limit)
- * pointer to utf8 character(s)
- * length of utf8 buffer ("write" length limit)
- *
- * outputs: returns number of charecters "read" from the ucs2 string
- * sets *num_bytes_written to # of utf8 characters "written"
- */
- int32
- ucs2_to_utf8_buffer(const uint16 *ucs2p, int32 num_chars,
- unsigned char *utf8p, int32 num_utf8_bytes, int32 *utf8_bytes_written)
- {
- int i;
-
- /*
- * Init values
- */
- *utf8_bytes_written = 0;
-
-
- /*
- * Convert the data
- */
- for (i=0; i<num_chars; i++) {
- if (ucs2p[i] <= 0x7F) { /* 0-0x7f only need one byte */
- if (num_utf8_bytes < 1)
- break;
- utf8p[*utf8_bytes_written] = (unsigned char)ucs2p[i];
- num_utf8_bytes -= 1;
- *utf8_bytes_written += 1;
- }
- else if (ucs2p[i] <= 0x3FF) { /* 0x80-0x3ff only need two bytes */
- if (num_utf8_bytes < 2)
- break;
- utf8p[*utf8_bytes_written+0] = (unsigned char)
- (TWO_OCTET_BASE | ((ucs2p[i]>>6)&TWO_OCTET_MASK));
- utf8p[*utf8_bytes_written+1] = (unsigned char)
- (CONTINUING_OCTET_BASE | (ucs2p[i]&CONTINUING_OCTET_MASK));
- num_utf8_bytes -= 2;
- *utf8_bytes_written += 2;
- }
- else { /* 0x400-0xffff need three bytes */
- if (num_utf8_bytes < 3)
- break;
- utf8p[*utf8_bytes_written+0] = (unsigned char)
- (THREE_OCTET_BASE | ((ucs2p[i]>>12)&THREE_OCTET_MASK));
- utf8p[*utf8_bytes_written+1] = (unsigned char)
- (CONTINUING_OCTET_BASE | ((ucs2p[i]>>6)&CONTINUING_OCTET_MASK));
- utf8p[*utf8_bytes_written+2] = (unsigned char)
- (CONTINUING_OCTET_BASE | (ucs2p[i]&CONTINUING_OCTET_MASK));
- num_utf8_bytes -= 3;
- *utf8_bytes_written += 3;
- }
- }
-
- return i;
- }
-
- /*
- * utf8_to_ucs2_buffer
- *
- * Convert a utf8 multibyte character string and place in a ucs2 buffer
- *
- * inputs: pointer to utf8 character(s)
- * length of utf8 buffer ("read" length limit)
- * pointer to return ucs2 buffer
- * length of ucs2 buffer ("write" length limit)
- * pointer to return count of invalid bytes
- *
- * outputs: returns number of bytes "read" from the utf8 string
- * sets *invalid_cnt to # of invalid utf8 characters "read"
- */
- UNICVTAPI int32
- utf8_to_ucs2_buffer(const unsigned char *utf8p, int16 utf8len,
- int *parsed_cnt, int *invalid_cnt,
- uint16 *ucs2p, int32 ucs2len)
- {
- int read_len, write_len;
- int char_len;
-
- /*
- * Init the return values
- */
- *parsed_cnt = 0;
- *invalid_cnt = 0;
-
- /*
- * Check for minimum buffer lengths
- */
- if ((utf8len < 1) || (utf8p == NULL)
- || (ucs2len < 1) || (ucs2p == NULL)) {
- return 0;
- }
-
- /*
- * Do the conversion
- */
- for (read_len=0,write_len=0;
- (read_len<utf8len) && (write_len<ucs2len);
- read_len +=char_len)
- {
- char_len = utf8_to_ucs2_char(utf8p+read_len, utf8len-read_len,
- (uint16*)ucs2p+write_len);
- if (char_len == -1) { /* invalid character */
- *invalid_cnt += 1;
- char_len = 1; /* try to resynchronize */
- *(ucs2p+write_len) = *(utf8p+read_len);
- }
- else if (char_len == -2) { /* buffer too short for last char */
- /* return with what we have so far */
- break;
- }
- /*
- * Note we converted one
- */
- *parsed_cnt += char_len;
- write_len += 1;
- }
- return write_len;
- }
-
- /* Function: one_utf8_to_ucs2_char
- *
- * Converts one UTF8 char to one UCS2 char. Needs to get UTF-8 from a
- * buffer of utf8 data, because we don't know how many octets it will
- * be, not before this function is called. Take a pointer to the end of that
- * buffer to make sure we don't run past it. Put the resulting UCS-2
- * char into an int16 we're given a pointer to. Returns the number of
- * octets used in the utf-8 char we converted, and returns -1 if it
- * runs out of utf-8 data without a complete UCS-2 character.
- */
- PRIVATE int16 one_utf8_to_ucs2_char(const unsigned char *utf8p, const unsigned char *utf8endp,
- uint16 *onecharp)
- {
-
- int16 i, numoctets;
- uint32 ucs4 = 0;
- *onecharp = 0;
-
- if(*utf8p >= THREE_OCTET_BASE) numoctets = 3;
- else if (*utf8p >= TWO_OCTET_BASE) numoctets = 2;
- else numoctets = 1;
-
- /* See if all the data for the char is there */
- if ((utf8p + numoctets - 1) > utf8endp) {
- return (-1);
- }
-
-
- for(i=numoctets; i>0; i--) {
- ucs4 += *utf8p++;
- if (i == 1) break;
- ucs4 <<= 6;
- }
-
- switch(numoctets) {
-
- case 3: ucs4 -= 0x000E2080UL; break; /* truncating... */
- case 2: ucs4 -= 0x00003080UL; break;
- }
- *onecharp= (uint16)(ucs4 & 0x0000FFFFUL);
- return(numoctets);
- }
-
-
- /*
- * Internal Function: pad_and_write
- * Checks to make sure there is less than one full base64 character in the
- * buffer, pad it with 0 to make up a full base64 character, write that
- * to tobuf, and write the shift termination character. (-)
- */
-
- PRIVATE uint16 pad_and_write(uint32 buffer, unsigned char *tobufp,
- int16 bufferBitCount, utf7_encoding_method_data* opt)
-
-
- {
- int16 oneBase64char;
-
- if(bufferBitCount >= 6) return(bufferBitCount);
- oneBase64char = ((unsigned char) (buffer >> 26));
- *tobufp++ = opt->tob64[oneBase64char];
- *tobufp = opt->endshift;
- return(0);
- }
-
-
- /* Function: swap_ucs2_bytes
- *
- * Takes a buffer of ucs2 chars, and its size in *bytes*.
- *
- * This function is meant to cope with the problem that sometimes
- * UCS-2 data (because of the big-endian, little-endian problem?)
- * comes in in reversed order, and needs to be swapped to be
- * dealt with appropriately.
- *
- * This case can be detected at the very beginning of the stream,
- * because the first two bytes of any UCS-2 stream should be the
- * Byte Order Mark, or 0xFEFF. If instead you see 0xFFFE, you know
- * you need to swap. Neither of these are legal UCS-2 characters
- * otherwise, so you know that there is no danger of accidentally
- * triggering swapping with a legitimate UCS-2 stream.
- * Unfortunately, this marker is only present at the very beginning
- * of a stream; future chunks of the stream won't have the marker.
- * So if we ever detect that a stream needs to be swapped, we
- * save that information by turning on the obj->cvtflag. If, on
- * future chunks, we see that that flag is turned on, we'll go
- * ahead and swap.
- * Notice that if swapping is unnecessary, this function has
- * no effect whatsoever.
- */
- PRIVATE void swap_ucs2_bytes(unsigned char *ucsbuf, int32 ucsbufsz)
- {
-
- int32 i;
- unsigned char swapTemp = 0;
-
- if(ucsbufsz%2) ucsbufsz--;
-
- for(i=0; i<ucsbufsz; i+=2) {
-
- swapTemp = ucsbuf[i];
- ucsbuf[i] = ucsbuf[i+1];
- ucsbuf[i+1] = swapTemp;
-
- }
- return;
- }
-
-
-
-
-
-
-
- /* UCS-2 to UTF-7 jliu */
-
-
- /*
- * mz_ucs2utf7
- * ------------
- *
- * This function takes a CCCDataObject, a buffer of UCS-2 data, and the
- * size of that buffer. It allocates and returns a buffer of the
- * corresponding UTF-7 data (returning the size as a field in the
- * CCCDataObject). The caller is responsible for freeing the returned
- * data. If there are extra data at the end of the UTF-8 buffer which
- * cannot be translated into UTF-7 (ie, an incomplete character), it
- * will be saved in the uncvtbuf of the CCCDataObject and used on the
- * next call.
- *
- * UTF-7 is a variant of base-64, and like base-64, it accumulates
- * bits in a bit buffer, transforming them to UTF-7 chars when it
- * has multiples of 6 bits. If the UTF-8 data being translated does
- * not happen to terminate with a multiple of 6 bits, the final
- * char will be padded with 0's, and the shift sequence terminated.
- * For this reason, we will *never* be inside a shift sequence in
- * between chunks of data. This may mean that the final stream of
- * data has sequences that look like +[some UTF-7 data]-+[more data]-,
- * with a plus immediately following a -. Although unconventional,
- * this is in fact legal UTF-7.
- *
- * Finally, there are two formats of UTF-7, one extremely conservative
- * fashion which shifts every character which could possibly be
- * considered unsafe, and another which is somewhat more lax. Which
- * of these is used is determined by obj->cvtflag. By default (cvtflag == 0)
- * we employ the safer form of conversion. The differing characters
- * are: !\"#$%&*;<=>@[]^_`{|}
- */
- /* Tables */
-
-
- MODULE_PRIVATE UNICVTAPI unsigned char *
- mz_ucs2utf7( CCCDataObject obj,
- const unsigned char *ucs2buf, /* UTF-8 buf for conv */
- int32 ucs2bufsz) /* UTF-8 buf size in bytes */
- {
- utf7_encoding_method_data* opt = &rfc1642_utf7;
- unsigned char *tobuf = NULL;
- int32 tobufsz;
- unsigned char *tobufp, *ucs2p; /* current byte in bufs */
- unsigned char *tobufendp, *ucs2endp; /* end of buffers */
- int32 uncvtlen = 0;
- unsigned char *uncvtbuf = INTL_GetCCCUncvtbuf(obj);
-
-
- uint16 onechar;
- int16 inShiftSequence = FALSE;
- int16 needToShift = FALSE;
- uint32 buffer = 0;
- uint32 buffertemp = 0;
- int16 bufferBitCount = 0;
- unsigned char oneBase64char;
- XP_Bool needToSwap = FALSE;
-
-
- if( INTL_GetCCCFromCSID( obj ) == CS_UCS2_SWAP )
- needToSwap = TRUE;
-
-
- /* Allocate a dest buffer:
- ** in the worst case, every Unicode character will cost 2+4 = 6 octetes
- */
-
- uncvtlen = uncvtbuf[0];
- tobufsz = 6*( (ucs2bufsz + uncvtlen)/2 + 1 ) + 1;
- if (!tobufsz) {
- return NULL;
- }
-
- if ((tobuf = (unsigned char *)XP_ALLOC_PRIV(tobufsz)) == (unsigned char *)NULL) {
- INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY);
- return(NULL);
- }
- /* Initialize pointers, etc. */
- ucs2p = (unsigned char *)ucs2buf;
- ucs2endp = ucs2p + ucs2bufsz - 1; /* leave room for NULL termination (as sentinel?)*/
-
- tobufp = tobuf;
- tobufendp = tobufp + tobufsz - 2; /* save space for terminating null*/
-
-
- while( (tobufp <= tobufendp) && (ucs2p < ucs2endp) ) {
-
- if( uncvtbuf[0] != 0 ){
- onechar = uncvtbuf[1];
- uncvtbuf[0] = 0;
- } else
- onechar = *ucs2p++;
- onechar <<= 8;
- onechar |= *ucs2p++;
-
- /* do the swap stuff */
-
- if( onechar == NEEDS_SWAP_MARK ){
- INTL_SetCCCFromCSID( obj, CS_UCS2_SWAP );
- needToSwap = TRUE;
- continue;
- } else if( onechar == BYTE_ORDER_MARK ){
- INTL_SetCCCFromCSID( obj, CS_UCS2 );
- needToSwap = FALSE;
- continue;
- }
-
- if( needToSwap ){
- onechar = ( onechar << 8 ) | ( onechar >> 8 );
- }
-
- /* we need to be shifted if the character is non-ASCII or
- * is an ASCII character that should be shifted.
- */
- needToShift = (onechar > MAX_ASCII) || (opt->shift[onechar]);
-
-
- if(!needToShift && inShiftSequence) {
-
- if(bufferBitCount > 0) {
- if((tobufp+2) > tobufendp) break;
- bufferBitCount = pad_and_write(buffer, tobufp, bufferBitCount, opt);
- if (!bufferBitCount) { /* buffer successfully flushed */
- tobufp+=2;
- buffer = 0;
- }
-
- } else {
- if((tobufp+1) > tobufendp) break;
- *tobufp++ = opt->endshift;
- }
- inShiftSequence = FALSE; /* now just fallthrough to next case*/
- }
-
- if(!needToShift && !inShiftSequence) {
- if((tobufp+1) > tobufendp) break;
- *tobufp++ = (char) onechar;
- }
-
- if(needToShift && !inShiftSequence) {
- *tobufp++ = opt->startshift;
- if(onechar == opt->startshift) { /* special-case behavior if onechar is a + */
- if((tobufp+1) > tobufendp) break;
- *tobufp++ = opt->endshift;
- }
- else inShiftSequence = TRUE;
- }
-
- if(needToShift && inShiftSequence) {
-
- buffertemp = onechar & 0x0000FFFF;
- buffer |= buffertemp << (16 - bufferBitCount);
- /* ^--16 is the size of the int32 minus
- * the size of onechar */
- bufferBitCount += 16;
-
-
- /* Flush the buffer of as many base64 characters as we can form */
- while(bufferBitCount>5) {
- if(tobufp > tobufendp) break;
- oneBase64char = (char) ((buffer & 0xFC000000) >> 26);
- *tobufp++ = opt->tob64[oneBase64char];
- buffer <<= 6;
- bufferBitCount -= 6;
- }
- }
-
-
- } /* end of while loop */
-
-
-
- /* Anything left in the buffer at this point should be padded with 0's
- * and appended to tobuf. */
-
- if(inShiftSequence) {
-
- if(bufferBitCount > 0) {
-
- if((tobufp+2) <= tobufendp) {
- bufferBitCount = pad_and_write(buffer, tobufp, bufferBitCount, opt);
- if (!bufferBitCount) { /* buffer successfully flushed */
- tobufp+=2;
- buffer = 0;
- }
- }
-
- } else {
- if((tobufp+1) <= tobufendp) *tobufp++ = opt->endshift;
- }
-
- inShiftSequence = FALSE;
- }
-
-
- *tobufp = '\0'; /* NULL terminate dest. data */
-
-
- INTL_SetCCCLen(obj, tobufp - tobuf); /* length of processed data, in bytes */
-
- if(ucs2p <= ucs2endp) { /* unconverted ucs2 left? */
- uncvtbuf[0] = 1;
- uncvtbuf[1] = *ucs2endp;
- } else
- uncvtbuf[0] = 0;
-
-
- return(tobuf);
- }
-
-