home *** CD-ROM | disk | FTP | other *** search
- To: vim-dev@vim.org
- Subject: Patch 6.2.506 (extra)
- Fcc: outbox
- From: Bram Moolenaar <Bram@moolenaar.net>
- Mime-Version: 1.0
- Content-Type: text/plain; charset=ISO-8859-1
- Content-Transfer-Encoding: 8bit
- ------------
-
- Patch 6.2.506 (extra)
- Problem: Win32: When 'encoding' is a codepage, reading a utf-8 file
- only works when iconv is available. Writing a file in another
- codepage uses the wrong kind of conversion.
- Solution: Use internal conversion functions. Enable reading and writing
- files with 'fileencoding' different from 'encoding' for all valid
- codepages and utf-8 without the need for iconv.
- Files: src/fileio.c, src/testdir/Make_dos.mak, src/testdir/test52.in,
- src/testdir/test52.ok
-
-
- *** ../vim-6.2.505/src/fileio.c Sun Apr 25 16:26:29 2004
- --- src/fileio.c Tue Apr 27 15:31:34 2004
- ***************
- *** 939,947 ****
-
- # ifdef WIN3264
- /*
- ! * Conversion from an MS-Windows codepage to UTF-8 is handled here.
- */
- ! if (fio_flags == 0 && enc_utf8)
- fio_flags = get_win_fio_flags(fenc);
- # endif
-
- --- 939,948 ----
-
- # ifdef WIN3264
- /*
- ! * Conversion from an MS-Windows codepage to UTF-8 or another codepage
- ! * is handled with MultiByteToWideChar().
- */
- ! if (fio_flags == 0)
- fio_flags = get_win_fio_flags(fenc);
- # endif
-
- ***************
- *** 1329,1388 ****
- if (fio_flags & FIO_CODEPAGE)
- {
- /*
- ! * Conversion from an MS-Windows codepage to UTF-8, using
- ! * standard MS-Windows functions.
- */
- char_u *ucsp;
- ! size_t from_size;
- int needed;
- char_u *p;
- int u8c;
-
- /*
- ! * We can't tell if the last byte of an MBCS string is valid
- ! * and MultiByteToWideChar() returns zero if it isn't.
- ! * Try the whole string, and if that fails, bump the last byte
- ! * into conv_rest and try again.
- */
- ! from_size = size;
- ! needed = MultiByteToWideChar(FIO_GET_CP(fio_flags),
- ! MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size,
- ! NULL, 0);
- ! if (needed == 0)
- {
- ! conv_rest[0] = ptr[from_size - 1];
- ! conv_restlen = 1;
- ! --from_size;
- needed = MultiByteToWideChar(FIO_GET_CP(fio_flags),
- ! MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size,
- NULL, 0);
- ! }
-
- ! /* If there really is a conversion error, try using another
- ! * conversion. */
- ! if (needed == 0)
- ! goto rewind_retry;
-
- ! /* Put the result of conversion to UCS-2 at the end of the
- ! * buffer, then convert from UCS-2 to UTF-8 into the start of
- ! * the buffer. If there is not enough space just fail, there
- ! * is probably something wrong. */
- ucsp = ptr + real_size - (needed * sizeof(WCHAR));
- if (ucsp < ptr + size)
- goto rewind_retry;
- ! needed = MultiByteToWideChar(FIO_GET_CP(fio_flags),
- MB_ERR_INVALID_CHARS, (LPCSTR)ptr,
- from_size, (LPWSTR)ucsp, needed);
-
- ! /* Now go from UCS-2 to UTF-8. */
- ! p = ptr;
- ! for (; needed > 0; --needed)
- ! {
- ! u8c = *ucsp++;
- ! u8c += (*ucsp++ << 8);
- ! p += utf_char2bytes(u8c, p);
- }
- - size = p - ptr;
- }
- else
- # endif
- --- 1330,1462 ----
- if (fio_flags & FIO_CODEPAGE)
- {
- /*
- ! * Conversion from an MS-Windows codepage or UTF-8 to UTF-8 or
- ! * a codepage, using standard MS-Windows functions.
- ! * 1. find out how many ucs-2 characters there are.
- ! * 2. convert from 'fileencoding' to ucs-2
- ! * 3. convert from ucs-2 to 'encoding'
- */
- char_u *ucsp;
- ! size_t from_size = size;
- int needed;
- char_u *p;
- int u8c;
- + int l, len;
-
- /*
- ! * 1. find out how many ucs-2 characters there are.
- */
- ! if (FIO_GET_CP(fio_flags) == CP_UTF8)
- {
- ! /* Handle CP_UTF8 ourselves to be able to handle trailing
- ! * bytes properly. First find out the number of
- ! * characters and check for trailing bytes. */
- ! needed = 0;
- ! p = ptr;
- ! for (len = from_size; len > 0; len -= l)
- ! {
- ! l = utf_ptr2len_check_len(p, len);
- ! if (l > len) /* incomplete char */
- ! {
- ! if (l > CONV_RESTLEN)
- ! /* weird overlong byte sequence */
- ! goto rewind_retry;
- ! mch_memmove(conv_rest, p, len);
- ! conv_restlen = len;
- ! from_size -= len;
- ! break;
- ! }
- ! if (l == 1 && *p >= 0x80) /* illegal byte */
- ! goto rewind_retry;
- ! ++needed;
- ! p += l;
- ! }
- ! }
- ! else
- ! {
- ! /* We can't tell if the last byte of an MBCS string is
- ! * valid and MultiByteToWideChar() returns zero if it
- ! * isn't. Try the whole string, and if that fails, bump
- ! * the last byte into conv_rest and try again. */
- needed = MultiByteToWideChar(FIO_GET_CP(fio_flags),
- ! MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size,
- NULL, 0);
- ! if (needed == 0)
- ! {
- ! conv_rest[0] = ptr[from_size - 1];
- ! conv_restlen = 1;
- ! --from_size;
- ! needed = MultiByteToWideChar(FIO_GET_CP(fio_flags),
- ! MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size,
- ! NULL, 0);
- ! }
-
- ! /* If there really is a conversion error, try using another
- ! * conversion. */
- ! if (needed == 0)
- ! goto rewind_retry;
- ! }
-
- ! /*
- ! * 2. convert from 'fileencoding' to ucs-2
- ! *
- ! * Put the result of conversion to UCS-2 at the end of the
- ! * buffer, then convert from UCS-2 to UTF-8 or "enc_codepage"
- ! * into the start of the buffer. If there is not enough space
- ! * just fail, there is probably something wrong.
- ! */
- ucsp = ptr + real_size - (needed * sizeof(WCHAR));
- if (ucsp < ptr + size)
- goto rewind_retry;
- !
- ! if (FIO_GET_CP(fio_flags) == CP_UTF8)
- ! {
- ! /* Convert from utf-8 to ucs-2. */
- ! needed = 0;
- ! p = ptr;
- ! for (len = from_size; len > 0; len -= l)
- ! {
- ! l = utf_ptr2len_check_len(p, len);
- ! u8c = utf_ptr2char(p);
- ! ucsp[needed * 2] = (u8c & 0xff);
- ! ucsp[needed * 2 + 1] = (u8c >> 8);
- ! ++needed;
- ! p += l;
- ! }
- ! }
- ! else
- ! needed = MultiByteToWideChar(FIO_GET_CP(fio_flags),
- MB_ERR_INVALID_CHARS, (LPCSTR)ptr,
- from_size, (LPWSTR)ucsp, needed);
-
- ! /*
- ! * 3. convert from ucs-2 to 'encoding'
- ! */
- ! if (enc_utf8)
- ! {
- ! /* From UCS-2 to UTF-8. Cannot fail. */
- ! p = ptr;
- ! for (; needed > 0; --needed)
- ! {
- ! u8c = *ucsp++;
- ! u8c += (*ucsp++ << 8);
- ! p += utf_char2bytes(u8c, p);
- ! }
- ! size = p - ptr;
- ! }
- ! else
- ! {
- ! BOOL bad = FALSE;
- !
- ! /* From UCS-2 to "enc_codepage". If the conversion uses
- ! * the default character "?", the data doesn't fit in this
- ! * encoding, so fail (unless forced). */
- ! size = WideCharToMultiByte(enc_codepage, 0,
- ! (LPCWSTR)ucsp, needed,
- ! (LPSTR)ptr, real_size, "?", &bad);
- ! if (bad && !keep_dest_enc)
- ! goto rewind_retry;
- }
- }
- else
- # endif
- ***************
- *** 3442,3451 ****
- }
-
- # ifdef WIN3264
- ! if (converted && wb_flags == 0 && get_win_fio_flags(fenc))
- {
- - wb_flags = get_win_fio_flags(fenc);
- -
- /* Convert UTF-8 -> UCS-2 and UCS-2 -> DBCS. Worst-case * 4: */
- write_info.bw_conv_buflen = bufsize * 4;
- write_info.bw_conv_buf
- --- 3516,3523 ----
- }
-
- # ifdef WIN3264
- ! if (converted && wb_flags == 0 && (wb_flags = get_win_fio_flags(fenc)) != 0)
- {
- /* Convert UTF-8 -> UCS-2 and UCS-2 -> DBCS. Worst-case * 4: */
- write_info.bw_conv_buflen = bufsize * 4;
- write_info.bw_conv_buf
- ***************
- *** 4474,4486 ****
- else if (flags & FIO_CODEPAGE)
- {
- /*
- ! * Convert UTF-8 to UCS-2 and then to MS-Windows codepage.
- */
- char_u *from;
- size_t fromlen;
- char_u *to;
- int u8c;
- BOOL bad = FALSE;
-
- if (ip->bw_restlen > 0)
- {
- --- 4546,4560 ----
- else if (flags & FIO_CODEPAGE)
- {
- /*
- ! * Convert UTF-8 or codepage to UCS-2 and then to MS-Windows
- ! * codepage.
- */
- char_u *from;
- size_t fromlen;
- char_u *to;
- int u8c;
- BOOL bad = FALSE;
- + int needed;
-
- if (ip->bw_restlen > 0)
- {
- ***************
- *** 4498,4535 ****
- fromlen = len;
- }
-
- - /* Convert from UTF-8 to UCS-2, to the start of the buffer.
- - * The buffer has been allocated to be big enough. */
- to = ip->bw_conv_buf;
- ! while (fromlen > 0)
- {
- ! n = utf_ptr2len_check_len(from, fromlen);
- ! if (n > (int)fromlen)
- ! break;
- ! u8c = utf_ptr2char(from);
- ! *to++ = (u8c & 0xff);
- ! *to++ = (u8c >> 8);
- ! fromlen -= n;
- ! from += n;
- ! }
-
- ! /* copy remainder to ip->bw_rest[] to be used for the next call. */
- ! mch_memmove(ip->bw_rest, from, fromlen);
- ! ip->bw_restlen = fromlen;
-
- - /* Convert from UCS-2 to the codepage, using the remainder of the
- - * conversion buffer. If the conversion uses the default
- - * character "0", the data doesn't fit in this encoding, so fail. */
- fromlen = to - ip->bw_conv_buf;
- ! len = WideCharToMultiByte(FIO_GET_CP(flags), 0,
- ! (LPCWSTR)ip->bw_conv_buf, (int)fromlen / sizeof(WCHAR),
- ! (LPSTR)to, ip->bw_conv_buflen - fromlen, 0, &bad);
- ! if (bad)
- {
- ! ip->bw_conv_error = TRUE;
- ! return FAIL;
- }
- - buf = to;
- }
- # endif
-
- --- 4572,4675 ----
- fromlen = len;
- }
-
- to = ip->bw_conv_buf;
- ! if (enc_utf8)
- {
- ! /* Convert from UTF-8 to UCS-2, to the start of the buffer.
- ! * The buffer has been allocated to be big enough. */
- ! while (fromlen > 0)
- ! {
- ! n = utf_ptr2len_check_len(from, fromlen);
- ! if (n > (int)fromlen) /* incomplete byte sequence */
- ! break;
- ! u8c = utf_ptr2char(from);
- ! *to++ = (u8c & 0xff);
- ! *to++ = (u8c >> 8);
- ! fromlen -= n;
- ! from += n;
- ! }
-
- ! /* Copy remainder to ip->bw_rest[] to be used for the next
- ! * call. */
- ! if (fromlen > CONV_RESTLEN)
- ! {
- ! /* weird overlong sequence */
- ! ip->bw_conv_error = TRUE;
- ! return FAIL;
- ! }
- ! mch_memmove(ip->bw_rest, from, fromlen);
- ! ip->bw_restlen = fromlen;
- ! }
- ! else
- ! {
- ! /* Convert from enc_codepage to UCS-2, to the start of the
- ! * buffer. The buffer has been allocated to be big enough. */
- ! ip->bw_restlen = 0;
- ! needed = MultiByteToWideChar(enc_codepage,
- ! MB_ERR_INVALID_CHARS, (LPCSTR)from, fromlen,
- ! NULL, 0);
- ! if (needed == 0)
- ! {
- ! /* When conversion fails there may be a trailing byte. */
- ! ip->bw_restlen = 1;
- ! needed = MultiByteToWideChar(enc_codepage,
- ! MB_ERR_INVALID_CHARS, (LPCSTR)from, fromlen,
- ! NULL, 0);
- ! if (needed == 0)
- ! {
- ! /* Conversion doesn't work. */
- ! ip->bw_conv_error = TRUE;
- ! return FAIL;
- ! }
- ! /* Save the trailing byte for the next call. */
- ! *ip->bw_rest = from[fromlen - 1];
- ! }
- ! needed = MultiByteToWideChar(enc_codepage, MB_ERR_INVALID_CHARS,
- ! (LPCSTR)from, fromlen - ip->bw_restlen,
- ! (LPWSTR)to, needed);
- ! if (needed == 0)
- ! {
- ! /* Safety check: Conversion doesn't work. */
- ! ip->bw_conv_error = TRUE;
- ! return FAIL;
- ! }
- ! to += needed * 2;
- ! }
-
- fromlen = to - ip->bw_conv_buf;
- ! buf = to;
- ! if (FIO_GET_CP(flags) == CP_UTF8)
- {
- ! /* Convert from UCS-2 to UTF-8, using the remainder of the
- ! * conversion buffer. Fails when out of space. */
- ! for (from = ip->bw_conv_buf; fromlen > 1; fromlen -= 2)
- ! {
- ! u8c = *from++;
- ! u8c += (*from++ << 8);
- ! to += utf_char2bytes(u8c, to);
- ! if (to + 6 >= ip->bw_conv_buf + ip->bw_conv_buflen)
- ! {
- ! ip->bw_conv_error = TRUE;
- ! return FAIL;
- ! }
- ! }
- ! len = to - buf;
- ! }
- ! else
- ! {
- ! /* Convert from UCS-2 to the codepage, using the remainder of
- ! * the conversion buffer. If the conversion has to fall back to
- ! * the default character (0 is passed, so the system default is
- ! * used), the data doesn't fit in this encoding, so fail. */
- ! len = WideCharToMultiByte(FIO_GET_CP(flags), 0,
- ! (LPCWSTR)ip->bw_conv_buf, (int)fromlen / sizeof(WCHAR),
- ! (LPSTR)to, ip->bw_conv_buflen - fromlen, 0, &bad);
- ! if (bad)
- ! {
- ! ip->bw_conv_error = TRUE;
- ! return FAIL;
- ! }
- }
- }
- # endif
-
- ***************
- *** 4775,4789 ****
- #ifdef WIN3264
- /*
- * Check "ptr" for a MS-Windows codepage name and return the FIO_ flags needed
- ! * for the conversion MS-Windows can do for us.
- */
- static int
- get_win_fio_flags(ptr)
- char_u *ptr;
- {
- ! if (ptr[0] == 'c' && ptr[1] == 'p' && VIM_ISDIGIT(ptr[2]))
- ! return FIO_PUT_CP(atoi(ptr + 2)) | FIO_CODEPAGE;
- ! return 0;
- }
- #endif
-
- --- 4915,4942 ----
- #ifdef WIN3264
- /*
- * Check "ptr" for a MS-Windows codepage name and return the FIO_ flags needed
- ! * for the conversion MS-Windows can do for us. Also accept "utf-8".
- ! * Used for conversion between 'encoding' and 'fileencoding'.
- */
- static int
- get_win_fio_flags(ptr)
- char_u *ptr;
- {
- ! int cp;
- !
- ! /* Cannot do this when 'encoding' is not utf-8 and not a codepage. */
- ! if (!enc_utf8 && enc_codepage <= 0)
- ! return 0;
- !
- ! cp = encname2codepage(ptr);
- ! if (cp == 0)
- ! {
- ! if (STRCMP(ptr, "utf-8") == 0)
- ! cp = CP_UTF8;
- ! else
- ! return 0;
- ! }
- ! return FIO_PUT_CP(cp) | FIO_CODEPAGE;
- }
- #endif
-
- *** ../vim-6.2.505/src/testdir/Make_dos.mak Mon Mar 22 17:28:47 2004
- --- src/testdir/Make_dos.mak Tue Apr 27 15:51:03 2004
- ***************
- *** 24,30 ****
- test15.out test17.out test18.out test21.out test26.out \
- test30.out test31.out test32.out test33.out test34.out \
- test37.out test38.out test39.out test40.out test41.out \
- ! test42.out
-
- SCRIPTS32 = test50.out
-
- --- 24,30 ----
- test15.out test17.out test18.out test21.out test26.out \
- test30.out test31.out test32.out test33.out test34.out \
- test37.out test38.out test39.out test40.out test41.out \
- ! test42.out test52.out
-
- SCRIPTS32 = test50.out
-
- ***************
- *** 51,56 ****
- --- 51,57 ----
- -del tiny.vim
- -del mbyte.vim
- -del X*
- + -del viminfo
-
- .in.out:
- copy $*.ok test.ok
- ***************
- *** 60,62 ****
- --- 61,64 ----
- rename test.out $*.out
- -del X*
- -del test.ok
- + -del viminfo
- *** ../vim-6.2.505/src/testdir/test52.in Tue Apr 27 16:24:44 2004
- --- src/testdir/test52.in Tue Apr 27 16:20:18 2004
- ***************
- *** 0 ****
- --- 1,65 ----
- + Tests for reading and writing files with conversion for Win32.
- +
- + STARTTEST
- + :so mbyte.vim
- + :" make this a dummy test for non-Win32 systems
- + :if !has("win32") | e! testk.ok | wq! test.out | endif
- + :"
- + :" write tests:
- + :" combine three values for 'encoding' with three values for 'fileencoding'
- + :" also write files for read tests
- + /^1
- + :set encoding=utf-8
- + :.w! ++enc=utf-8 test.out
- + :.w ++enc=cp1251 >>test.out
- + :.w ++enc=cp866 >>test.out
- + :.w! ++enc=utf-8 Xutf8
- + /^2
- + :set encoding=cp1251
- + :.w ++enc=utf-8 >>test.out
- + :.w ++enc=cp1251 >>test.out
- + :.w ++enc=cp866 >>test.out
- + :.w! ++enc=cp1251 Xcp1251
- + /^3
- + :set encoding=cp866
- + :.w ++enc=utf-8 >>test.out
- + :.w ++enc=cp1251 >>test.out
- + :.w ++enc=cp866 >>test.out
- + :.w! ++enc=cp866 Xcp866
- + :"
- + :" read three 'fileencoding's with utf-8 'encoding'
- + :set encoding=utf-8 fencs=utf-8,cp1251
- + :e Xutf8
- + :.w ++enc=utf-8 >>test.out
- + :e Xcp1251
- + :.w ++enc=utf-8 >>test.out
- + :set fencs=utf-8,cp866
- + :e Xcp866
- + :.w ++enc=utf-8 >>test.out
- + :"
- + :" read three 'fileencoding's with cp1251 'encoding'
- + :set encoding=utf-8 fencs=utf-8,cp1251
- + :e Xutf8
- + :.w ++enc=cp1251 >>test.out
- + :e Xcp1251
- + :.w ++enc=cp1251 >>test.out
- + :set fencs=utf-8,cp866
- + :e Xcp866
- + :.w ++enc=cp1251 >>test.out
- + :"
- + :" read three 'fileencoding's with cp866 'encoding'
- + :set encoding=cp866 fencs=utf-8,cp1251
- + :e Xutf8
- + :.w ++enc=cp866 >>test.out
- + :e Xcp1251
- + :.w ++enc=cp866 >>test.out
- + :set fencs=utf-8,cp866
- + :e Xcp866
- + :.w ++enc=cp866 >>test.out
- + :"
- + :qa!
- + ENDTEST
- +
- + 1 utf-8 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01
- + 2 cp1251 text: ─δ Vim version 6.2. ╧ε±δσΣφσσ Φτ∞σφσφΦσ: 1970 Jan 01
- + 3 cp866 text: ä½∩ Vim version 6.2. ū߽Ññ¡ÑÑ ¿º¼Ñ¡Ñ¡¿Ñ: 1970 Jan 01
- *** ../vim-6.2.505/src/testdir/test52.ok Tue Apr 27 16:24:44 2004
- --- src/testdir/test52.ok Tue Apr 27 16:20:56 2004
- ***************
- *** 0 ****
- --- 1,18 ----
- + 1 utf-8 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01
- + 1 utf-8 text: ─δ Vim version 6.2. ╧ε±δσΣφσσ Φτ∞σφσφΦσ: 1970 Jan 01
- + 1 utf-8 text: ä½∩ Vim version 6.2. ū߽Ññ¡ÑÑ ¿º¼Ñ¡Ñ¡¿Ñ: 1970 Jan 01
- + 2 cp1251 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01
- + 2 cp1251 text: ─δ Vim version 6.2. ╧ε±δσΣφσσ Φτ∞σφσφΦσ: 1970 Jan 01
- + 2 cp1251 text: ä½∩ Vim version 6.2. ū߽Ññ¡ÑÑ ¿º¼Ñ¡Ñ¡¿Ñ: 1970 Jan 01
- + 3 cp866 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01
- + 3 cp866 text: ─δ Vim version 6.2. ╧ε±δσΣφσσ Φτ∞σφσφΦσ: 1970 Jan 01
- + 3 cp866 text: ä½∩ Vim version 6.2. ū߽Ññ¡ÑÑ ¿º¼Ñ¡Ñ¡¿Ñ: 1970 Jan 01
- + 1 utf-8 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01
- + 2 cp1251 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01
- + 3 cp866 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01
- + 1 utf-8 text: ─δ Vim version 6.2. ╧ε±δσΣφσσ Φτ∞σφσφΦσ: 1970 Jan 01
- + 2 cp1251 text: ─δ Vim version 6.2. ╧ε±δσΣφσσ Φτ∞σφσφΦσ: 1970 Jan 01
- + 3 cp866 text: ─δ Vim version 6.2. ╧ε±δσΣφσσ Φτ∞σφσφΦσ: 1970 Jan 01
- + 1 utf-8 text: ä½∩ Vim version 6.2. ū߽Ññ¡ÑÑ ¿º¼Ñ¡Ñ¡¿Ñ: 1970 Jan 01
- + 2 cp1251 text: ä½∩ Vim version 6.2. ū߽Ññ¡ÑÑ ¿º¼Ñ¡Ñ¡¿Ñ: 1970 Jan 01
- + 3 cp866 text: ä½∩ Vim version 6.2. ū߽Ññ¡ÑÑ ¿º¼Ñ¡Ñ¡¿Ñ: 1970 Jan 01
- *** ../vim-6.2.505/src/version.c Tue Apr 27 10:03:32 2004
- --- src/version.c Tue Apr 27 16:23:35 2004
- ***************
- *** 639,640 ****
- --- 639,642 ----
- { /* Add new patch number below this line */
- + /**/
- + 506,
- /**/
-
- --
- hundred-and-one symptoms of being an internet addict:
- 34. You laugh at people with 14400 baud modems.
-
- /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\
- /// Sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
- \\\ Project leader for A-A-P -- http://www.A-A-P.org ///
- \\\ Buy at Amazon and help AIDS victims -- http://ICCF.nl/click1.html ///
-