home *** CD-ROM | disk | FTP | other *** search
- /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
- *
- * The contents of this file are subject to the Netscape Public License
- * Version 1.0 (the "NPL"); you may not use this file except in
- * compliance with the NPL. You may obtain a copy of the NPL at
- * http://www.mozilla.org/NPL/
- *
- * Software distributed under the NPL is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
- * for the specific language governing rights and limitations under the
- * NPL.
- *
- * The Initial Developer of this code under the NPL is Netscape
- * Communications Corporation. Portions created by Netscape are
- * Copyright (C) 1998 Netscape Communications Corporation. All Rights
- * Reserved.
- */
- /* autodetect.c */
-
/*
 *  CODESET   1st Byte     2nd Byte     3rd Byte
 *  JIS       0x21-0x7E    0x21-0x7E    n/a
 *  SJIS      0xA1-0xDF    n/a          n/a
 *            0x81-0x9F    0x40-0xFC    n/a
 *            0xE0-0xEF    0x40-0xFC    n/a
 *  EUCJP     0x8E (SS2)   0xA1-0xDF    n/a
 *            0xA1-0xFE    0xA1-0xFE    n/a
 *            0x8F (SS3)   0xA1-0xFE    0xA1-0xFE
 *  Invalid   7F,80,A0,FF
 */
-
- #include "intlpriv.h"
-
- #define ALLOW_NBSP 1
-
- /*
- * JIS X 0201-Roman ESC ( J
- * Half-width Katakana ESC ( I
- * JIS X 0208-1978 ESC $ @
- * JIS X 0208-1983 ESC $ B
- * JIS X 0212-1990 ESC $ ( D
- */
- #define IsJISEscSeq(cp, len) \
- ((cp[0] == ESC) && ((len) > 2) && ( \
- ((cp[1] == '$') && (cp[2] == 'B')) || \
- ((cp[1] == '$') && (cp[2] == '@')) || \
- ((cp[1] == '(') && (cp[2] == 'J')) || \
- ((cp[1] == '(') && (cp[2] == 'I')) || \
- (((len) > 3) && (cp[1] == '$') && (cp[2] == '(') && (cp[3] == 'D')) ) )
-
- #define IsRoman(c) ((c) < 0x80)
- #define IsSJIS2ndByte(c) (((c) > 0x3F) && ((c) < 0xFD))
- #define IsLoSJIS2ndByte(c) (((c) > 0x3F) && ((c) < 0xA1))
- #define IsHiSJIS2ndByte(c) (((c) > 0xA0) && ((c) < 0xFD))
- #define IsEUCJPKana(b1) (((b1) > 0xA0) && ((b1) < 0xE0))
- #define IsEUCJPKanji(b1or2) (((b1or2) > 0xA0) && ((b1or2) < 0xFF))
-
- #define YES 1
- #define NO 0
- #define MAYBE -1
-
- PRIVATE int
- isSJIS(const unsigned char *cp, int32 len)
- {
- while (len) {
- if (IsRoman(*cp)) {
- cp++, len--;
- } else if (*cp == 0x80) { /* illegal SJIS 1st byte */
- return NO;
- } else if ((*cp < 0xA0)) { /* byte 1 of 2byte SJIS 1st range */
- if (len > 1) {
- if (IsSJIS2ndByte(cp[1])) {
- if ((*cp != 0x8E && *cp != 0x8F) || (*(cp+1) <= 0xA0))
- return YES;
- cp += 2, len -= 2; /* valid 2 byte SJIS */
- } else {
- return NO; /* invalid SJIS 2nd byte */
- }
- } else
- break; /* buffer ended w/1of2 byte SJIS */
- } else if (*cp == 0xA0) { /* illegal EUCJP byte */
- #if ALLOW_NBSP
- cp++, len--; /* allow nbsp */
- #endif
- } else if (*cp < 0xE0) { /* SJIS half-width kana */
- cp++, len--;
- } else if (*cp < 0xF0) { /* byte 1 of 2byte SJIS 2nd range */
- if (len > 1) {
- if (IsSJIS2ndByte(cp[1])) {
- cp += 2, len -= 2; /* valid 2 byte SJIS */
- } else {
- return NO; /* invalid SJIS */
- }
- } else
- break; /* buffer ended w/1of2 byte SJIS */
- } else {
- return NO; /* invalid SJIS 1st byte */
- }
- }
- return MAYBE; /* No illegal SJIS values found */
- }
-
- PRIVATE int
- isEUCJP(const unsigned char *cp, int32 len)
- {
- while (len) {
- if (IsRoman(*cp)) { /* Roman */
- cp++, len--;
- } else if (*cp == SS2) { /* EUCJP JIS201 half-width kana */
- if (len > 1) {
- if (IsEUCJPKana(cp[1]))
- cp += 2, len -= 2; /* valid half-width kana */
- else
- return NO; /* invalid 2of3 byte EUC */
- } else
- break; /* buffer ended w/1of2 byte EUC */
- } else if (*cp == SS3) { /* EUCJP JIS212 */
- if (len > 1) {
- if (IsEUCJPKanji(cp[1])) {
- if (len > 2) {
- if (IsEUCJPKanji(cp[2]))
- cp += 2, len -= 2; /* valid 3 byte EUCJP */
- else
- return NO; /* invalid 3of3 byte EUCJP */
- } else
- break; /* buffer ended w/2of3 byte EUCJP */
- } else
- return NO; /* invalid 2of3 byte EUCJP */
- } else
- break; /* buffer ended w/1of3 byte EUCJP */
- } else if (*cp == 0xA0) { /* illegal EUCJP byte */
- #if ALLOW_NBSP
- cp++, len--; /* allow nbsp */
- #else
- return NO;
- #endif
- } else if (*cp < 0xF0) { /* EUCJP JIS208 (overlaps SJIS) */
- if (len > 1) {
- if (IsEUCJPKanji(cp[1]))
- cp += 2, len -= 2; /* valid 2 byte EUCJP */
- else
- return NO; /* invalid 2of2 byte EUCJP */
- } else
- break; /* buffer ended w/1of2 byte EUCJP */
- } else if (*cp < 0xFF) { /* EUCJP JIS208 only: */
- if (len > 1) {
- if (IsEUCJPKanji(cp[1]))
- return YES; /* valid 2 byte EUCJP, invalid SJIS */
- else
- return NO; /* invalid 2of2 byte EUCJP */
- } else
- break; /* buffer ended w/1of2 byte EUCJP */
- } else {
- return NO; /* invalid EUCJP 1st byte: 0xFF */
- }
- }
- return MAYBE;
- }
-
- MODULE_PRIVATE int16
- intl_detect_JCSID (uint16 defaultCSID, const unsigned char *buf, int32 len)
- {
- register const unsigned char *cp = buf;
- int sjisFlag;
- int eucjpFlag;
-
- /* JIS is 7bit. Scan to end of 7bit data or legitimate JIS ESC sequence. */
- while (len && (IsRoman(*cp) || (*cp == 0xA0))) { /* allow nbsp */
- if (IsJISEscSeq(cp, len))
- return CS_JIS;
- cp++, len--;
- }
-
- /* If len > 0, must be either SJIS or EUC because there's 8bit data */
- while (len) {
- if (*cp == 0x80) {
- return CS_DEFAULT;/* illegal byte1 (SJIS & EUCJP) */
- }
- if (*cp < 0x8E)
- return CS_SJIS; /* Illegal EUCJP 1st byte */
- if (*cp == 0xA0) {
- #if ALLOW_NBSP
- cp++; len--;
- continue; /* allow nbsp */
- #else
- return CS_DEFAULT;/* illegal byte1 (SJIS & EUCJP) */
- #endif
- }
- if ( (*cp > 0xEF) && (*cp < 0xFF) ) /* illegal SJIS 1st byte */
- return CS_EUCJP;
- if (*cp == 0xFF) {
- return CS_DEFAULT;/* illegal byte1 (SJIS & EUCJP) */
- }
-
- /* At this point. 1st byte is 0x8E, 0x8F, or 0xA1-0xEF. */
- /* If 1st Byte is 0xE0-0xEF inclusive, then it's 2byte SJIS or EUC */
- if ((*cp > 0xDF) && (*cp < 0xF0)) {
- if (len > 1) {
- if (cp[1] < 0x41) { /* illegal byte2 (SJIS & EUCJP) */
- return CS_DEFAULT;
- }
- if (cp[1] < 0xA1)
- return CS_SJIS; /* Illegal EUCJP 2nd byte */
- if (cp[1] > 0xFC)
- return CS_EUCJP; /* illegal SJIS 2nd byte */
- cp += 2, len -= 2; /* Skip 2 byte character */
-
- /* Gobble up single byte characters and continue outer loop */
- while (len && IsRoman(*cp)) {
- cp++, len--;
- }
- continue;
- } else {
- len = 0;
- break; /* No more chars in buffer */
- }
- }
- /* 1st Byte is 0xA1-DF inclusive:
- * 1byte SJIS kana or 1of2 byte SJIS or EUC
- */
- break; /* break and handle ambiguous cases */
- }
-
- if (len) {
- eucjpFlag = isEUCJP(cp, len);
- if (YES == eucjpFlag)
- return CS_EUCJP;
-
- sjisFlag = isSJIS(cp, len);
- if (YES == sjisFlag)
- return CS_SJIS;
-
- /* Neither one is YES, look at NO : MAYBE Pair */
- if ((NO == eucjpFlag) && (MAYBE == sjisFlag))
- return CS_SJIS;
- if ((MAYBE == eucjpFlag) && (NO == sjisFlag))
- return CS_EUCJP;
- }
-
- /* Some servers relied upon the previous Nav3.0 default for ambiguous SJIS/EUC encoding. */
- #define USE_ACKBAR_LOGIC 1
-
- /* Now, both are NO or both are MAYBE, look at default */
- if (len) { /* Must be ambiguous -- EUC or SJIS */
- #if USE_ACKBAR_LOGIC
-
- #ifdef XP_MAC
- defaultCSID = CS_SJIS_AUTO; /* simulate Akbar old charset hints */
- #else
- defaultCSID = CS_JIS;
- #endif
- if (defaultCSID == CS_SJIS) {
- eucjpFlag = isEUCJP(cp, len);
- if (eucjpFlag == YES)
- return CS_EUCJP;
- else
- return CS_SJIS;
- } else if (defaultCSID == CS_EUCJP) {
- sjisFlag = isSJIS(cp, len);
- if (sjisFlag == YES)
- return CS_SJIS;
- else
- return CS_EUCJP;
- } else { /* default is JIS */
- sjisFlag = isSJIS(cp, len);
- if (sjisFlag == YES)
- return CS_SJIS;
- eucjpFlag = isEUCJP(cp, len);
- if (eucjpFlag == YES)
- return CS_EUCJP;
- if (sjisFlag == NO) {
- if (eucjpFlag != NO) /* SJIS-NO, EUCJP-MAYBE */
- return CS_EUCJP;
- } else {
- if (eucjpFlag == NO) /* SJIS-MAYBE, EUCJP-NO */
- return CS_SJIS;
- else { /* both MAYBE */
- return CS_EUCJP; /* have to pick one... */
- }
- }
- }
- #else
- if (CS_SJIS == defaultCSID) {
- if (MAYBE == sjisFlag)
- return CS_SJIS;
- } else if (CS_EUCJP == defaultCSID) {
- if (MAYBE == eucjpFlag)
- return CS_EUCJP;
- } else { /* default is JIS */
- if ((MAYBE == eucjpFlag) && (MAYBE == sjisFlag)) /* pick one- EUC */
- return CS_EUCJP;
- }
- #endif
- }
- return CS_ASCII; /* Could be any of the 3... */
- }
-
- /* Auto Detect Japanese Char Code Conversion */
- MODULE_PRIVATE unsigned char *
- autoJCCC (CCCDataObject obj, const unsigned char *s, int32 l)
- {
- int16 doc_csid = 0;
- uint16 detected_doc_csid;
-
- /* try to determine doc Japanese CSID. */
- doc_csid = intl_detect_JCSID((uint16)(INTL_GetCCCDefaultCSID(obj)&~CS_AUTO),
- (const unsigned char *) s,l);
- if (doc_csid == CS_ASCII) { /* return s unconverted and */
- INTL_SetCCCLen(obj, l);
- return (unsigned char *)s; /* autodetect next block of stream data */
- }
- if (doc_csid == CS_DEFAULT) { /* found unexpected chars */
- doc_csid = INTL_GetCCCDefaultCSID(obj) & ~CS_AUTO;
- detected_doc_csid = CS_DEFAULT;
- } else {
- detected_doc_csid = doc_csid | CS_AUTO;
- }
- /* Setup converter function for success streams data blocks */
- (void) INTL_GetCharCodeConverter(doc_csid, INTL_GetCCCToCSID(obj), obj);
- INTL_CallCCCReportAutoDetect(obj, detected_doc_csid);
-
- /* If no conversion needed, change put_block module for successive
- * data blocks. For current data block, return unmodified buffer.
- */
- if (INTL_GetCCCCvtfunc(obj) == NULL) {
- INTL_SetCCCLen(obj, l);
- return((unsigned char *) s);
- }
- /* For initial block, must call converter directly. Success calls
- * to the converter will be called directly from net_CharCodeConv()
- */
- return (unsigned char *)(INTL_GetCCCCvtfunc(obj)) (obj, (const unsigned char *)s, l);
- }
-
-
-
-