updated for version 7.0169

author Bram Moolenaar <Bram@vim.org>

Tue, 13 Dec 2005 20:02:15 +0000 (20:02 +0000)

committer Bram Moolenaar <Bram@vim.org>

Tue, 13 Dec 2005 20:02:15 +0000 (20:02 +0000)
author Bram Moolenaar <Bram@vim.org>
Tue, 13 Dec 2005 20:02:15 +0000 (20:02 +0000)
committer Bram Moolenaar <Bram@vim.org>
Tue, 13 Dec 2005 20:02:15 +0000 (20:02 +0000)
diff --git a/runtime/doc/editing.txt b/runtime/doc/editing.txt

index 5f75a0c7498da5bed3673c6fdfc018bdd1dfef19..e850dac015f9c4485036a1889af594c327acbd59 100644 (file)
--- a/runtime/doc/editing.txt
+++ b/runtime/doc/editing.txt
@@ -1,4 +1,4 @@
-*editing.txt*   For Vim version 7.0aa.  Last change: 2005 Jul 25
+*editing.txt*   For Vim version 7.0aa.  Last change: 2005 Dec 13
  
  
                   VIM REFERENCE MANUAL    by Bram Moolenaar
@@ -398,8 +398,9 @@ The expression can contain just about anything, thus this can also be used to
  avoid the special meaning of '"', '|', '%' and '#'.
  
                                                         *++opt* *[++opt]*
-The [++opt] argument can be used to force the value of 'fileformat' or
-'fileencoding' to a value for one command.  The form is: >
+The [++opt] argument can be used to force the value of 'fileformat',
+'fileencoding' or 'binary' to a value for one command, and to specify the
+behavior for bad characters.  The form is: >
         ++{optname}={value}
  
  Where {optname} is one of:             *++ff* *++enc* *++bin* *++nobin*
@@ -407,6 +408,7 @@ Where {optname} is one of:          *++ff* *++enc* *++bin* *++nobin*
      enc    or  encoding            overrides 'fileencoding'
      bin    or  binary      sets 'binary'
      nobin  or  nobinary            resets 'binary'
+    bad                     specifies behavior for bad characters
  
  {value} cannot contain white space.  It can be any valid value for these
  options.  Examples: >
@@ -416,12 +418,24 @@ This edits the same file again with 'fileformat' set to "unix". >
         :w ++enc=latin1 newfile
  This writes the current buffer to "newfile" in latin1 format.
  
+There may be several ++opt arguments, separated by white space.  They must all
+appear before any |+cmd| argument.
+
+                                                               *++bad*
+The argument of "++bad=" specifies what happens with characters that can't be
+converted and illegal bytes.  It can be one of three things:
+    ++bad=X      A single-byte character that replaces each bad character.
+    ++bad=keep   Keep bad characters without conversion.  Note that this may
+                result in illegal bytes in your text!
+    ++bad=drop   Remove the bad characters.
+
+The default is like "++bad=?": Replace each bad character with a question
+mark.
+
  Note that when reading, the 'fileformat' and 'fileencoding' options will be
  set to the used format.  When writing this doesn't happen, thus a next write
  will use the old value of the option.  Same for the 'binary' option.
  
-There may be several ++opt arguments, separated by white space.  They must all
-appear before any |+cmd| argument.
  
                                                         *+cmd* *[+cmd]*
  The [+cmd] argument can be used to position the cursor in the newly opened
diff --git a/src/ex_docmd.c b/src/ex_docmd.c

index 6977904b2a35d06fb7672174ce2cc96905578b8f..d8aa8e52f7cf9b305857a584a554fb9ad755d57d 100644 (file)
--- a/src/ex_docmd.c
+++ b/src/ex_docmd.c
@@ -4549,6 +4549,11 @@ getargopt(eap)
         arg += 8;
         pp = &eap->force_enc;
      }
+    else if (STRNCMP(arg, "bad", 3) == 0)
+    {
+       arg += 3;
+       pp = &eap->bad_char;
+    }
  #endif
  
      if (pp == NULL || *arg != '=')
@@ -4568,12 +4573,26 @@ getargopt(eap)
             return FAIL;
  #ifdef FEAT_MBYTE
      }
-    else
+    else if (pp == &eap->force_enc)
      {
         /* Make 'fileencoding' lower case. */
         for (p = eap->cmd + eap->force_enc; *p != NUL; ++p)
             *p = TOLOWER_ASC(*p);
      }
+    else
+    {
+       /* Check ++bad= argument.  Must be a single-byte character, "keep" or
+        * "drop". */
+       p = eap->cmd + eap->bad_char;
+       if (STRICMP(p, "keep") == 0)
+           eap->bad_char = BAD_KEEP;
+       else if (STRICMP(p, "drop") == 0)
+           eap->bad_char = BAD_DROP;
+       else if (MB_BYTE2LEN(*p) == 1 && p[1] == NUL)
+           eap->bad_char = *p;
+       else
+           return FAIL;
+    }
  #endif
  
      return OK;
diff --git a/src/fileio.c b/src/fileio.c

index 4c415ab4429a6f57c4ea95cd16a931a930e0f7e9..0cfbc2222709394f9830bf2762e2a7ec11e2aaaa 100644 (file)
--- a/src/fileio.c
+++ b/src/fileio.c
@@ -124,6 +124,7 @@ struct bw_info
  static int  buf_write_bytes __ARGS((struct bw_info *ip));
  
  #ifdef FEAT_MBYTE
+static linenr_T readfile_linenr __ARGS((linenr_T linecnt, char_u *p, char_u *endp));
  static int ucs2bytes __ARGS((unsigned c, char_u **pp, int flags));
  static int same_encoding __ARGS((char_u *a, char_u *b));
  static int get_fio_flags __ARGS((char_u *ptr));
@@ -138,6 +139,7 @@ static int get_mac_fio_flags __ARGS((char_u *ptr));
  #endif
  static int move_lines __ARGS((buf_T *frombuf, buf_T *tobuf));
  
+
      void
  filemess(buf, name, s, attr)
      buf_T      *buf;
@@ -257,10 +259,13 @@ readfile(fname, sfname, from, lines_to_skip, lines_to_read, eap, flags)
      int                file_rewind = FALSE;
  #ifdef FEAT_MBYTE
      int                can_retry;
-    int                conv_error = FALSE;     /* conversion error detected */
+    linenr_T   conv_error = 0;         /* line nr with conversion error */
+    linenr_T   illegal_byte = 0;       /* line nr with illegal byte */
      int                keep_dest_enc = FALSE;  /* don't retry when char doesn't fit
                                            in destination encoding */
-    linenr_T   illegal_byte = 0;       /* line nr with illegal byte */
+    int                bad_char_behavior = BAD_REPLACE;
+                                       /* BAD_KEEP, BAD_DROP or character to
+                                        * replace with */
      char_u     *tmpname = NULL;        /* name of 'charconvert' output file */
      int                fio_flags = 0;
      char_u     *fenc;                  /* fileencoding to use */
@@ -754,13 +759,18 @@ readfile(fname, sfname, from, lines_to_skip, lines_to_read, eap, flags)
      linecnt = curbuf->b_ml.ml_line_count;
  
  #ifdef FEAT_MBYTE
+    /* "++bad=" argument. */
+    if (eap != NULL && eap->bad_char != 0)
+       bad_char_behavior = eap->bad_char;
+
      /*
-     * Decide which 'encoding' to use first.
+     * Decide which 'encoding' to use, or which one to try first.
       */
      if (eap != NULL && eap->force_enc != 0)
      {
         fenc = enc_canonize(eap->cmd + eap->force_enc);
         fenc_alloced = TRUE;
+       keep_dest_enc = TRUE;
      }
      else if (curbuf->b_p_bin)
      {
@@ -864,7 +874,7 @@ retry:
  #ifdef FEAT_MBYTE
         if (newfile)
             curbuf->b_p_bomb = FALSE;
-       conv_error = FALSE;
+       conv_error = 0;
  #endif
      }
  
@@ -908,7 +918,7 @@ retry:
             /* Conversion given with "++cc=" wasn't possible, read
              * without conversion. */
             notconverted = TRUE;
-           conv_error = FALSE;
+           conv_error = 0;
             if (fenc_alloced)
                 vim_free(fenc);
             fenc = (char_u *)"";
@@ -1043,11 +1053,10 @@ retry:
         }
      }
  
-    /* Set can_retry when it's possible to rewind the file and try with
+    /* Set "can_retry" when it's possible to rewind the file and try with
       * another "fenc" value.  It's FALSE when no other "fenc" to try, reading
-     * stdin or "fenc" was specified with "++enc=". */
-    can_retry = (*fenc != NUL && !read_stdin
-                                    && (eap == NULL || eap->force_enc == 0));
+     * stdin or fixed at a specific encoding. */
+    can_retry = (*fenc != NUL && !read_stdin && !keep_dest_enc);
  #endif
  
      if (!skip_read)
@@ -1229,8 +1238,30 @@ retry:
                         error = TRUE;
  #ifdef FEAT_MBYTE
                     else if (conv_restlen > 0)
-                       /* some trailing bytes unconverted */
-                       conv_error = TRUE;
+                   {
+                       /* Reached end-of-file but some trailing bytes could
+                        * not be converted.  Truncated file? */
+                       if (conv_error == 0)
+                           conv_error = linecnt;
+                       if (bad_char_behavior != BAD_DROP)
+                       {
+                           fio_flags = 0;      /* don't convert this */
+                           if (bad_char_behavior == BAD_KEEP)
+                           {
+                               /* Keep the trailing bytes as-is. */
+                               size = conv_restlen;
+                               ptr -= conv_restlen;
+                           }
+                           else
+                           {
+                               /* Replace the trailing bytes with the
+                                * replacement character. */
+                               size = 1;
+                               *--ptr = bad_char_behavior;
+                           }
+                           conv_restlen = 0;
+                       }
+                   }
  #endif
                 }
  
@@ -1349,16 +1380,25 @@ retry:
                             == (size_t)-1 && ICONV_ERRNO != ICONV_EINVAL)
                                                   || from_size > CONV_RESTLEN)
                 {
-                   if (!keep_dest_enc && can_retry)
+                   if (can_retry)
                         goto rewind_retry;
-                   if (!keep_dest_enc)
-                       conv_error = TRUE;
+                   if (conv_error == 0)
+                       conv_error = readfile_linenr(linecnt,
+                                                         ptr, (char_u *)top);
  
-                   /* Ignore a byte and try again. */
+                   /* Deal with a bad byte and continue with the next. */
                     ++fromp;
                     --from_size;
-                   *top++ = '?';
-                   --to_size;
+                   if (bad_char_behavior == BAD_KEEP)
+                   {
+                       *top++ = *(fromp - 1);
+                       --to_size;
+                   }
+                   else if (bad_char_behavior != BAD_DROP)
+                   {
+                       *top++ = bad_char_behavior;
+                       --to_size;
+                   }
                 }
  
                 if (from_size > 0)
@@ -1379,141 +1419,167 @@ retry:
  # ifdef WIN3264
             if (fio_flags & FIO_CODEPAGE)
             {
+               char_u  *src, *dst;
+               int     u8c;
+               WCHAR   ucs2buf[3];
+               int     ucs2len;
+               int     codepage = FIO_GET_CP(fio_flags);
+               int     bytelen;
+               int     found_bad;
+               char    replstr[2];
+
                 /*
                  * Conversion from an MS-Windows codepage or UTF-8 to UTF-8 or
-                * a codepage, using standard MS-Windows functions.
-                * 1. find out how many ucs-2 characters there are.
-                * 2. convert from 'fileencoding' to ucs-2
-                * 3. convert from ucs-2 to 'encoding'
+                * a codepage, using standard MS-Windows functions.  This
+                * requires two steps:
+                * 1. convert from 'fileencoding' to ucs-2
+                * 2. convert from ucs-2 to 'encoding'
+                *
+                * Because there may be illegal bytes AND an incomplete byte
+                * sequence at the end, we may have to do the conversion one
+                * character at a time to get it right.
                  */
-               char_u  *ucsp;
-               size_t  from_size = size;
-               int     needed;
-               char_u  *p;
-               int     u8c;
+
+               /* Replacement string for WideCharToMultiByte(). */
+               if (bad_char_behavior > 0)
+                   replstr[0] = bad_char_behavior;
+               else
+                   replstr[0] = '?';
+               replstr[1] = NUL;
  
                 /*
-                * 1. find out how many ucs-2 characters there are.
+                * Move the bytes to the end of the buffer, so that we have
+                * room to put the result at the start.
                  */
-#  ifdef CP_UTF8       /* VC 4.1 doesn't define CP_UTF8 */
-               if (FIO_GET_CP(fio_flags) == CP_UTF8)
+               src = ptr + real_size - size;
+               mch_memmove(src, ptr, size);
+
+               /*
+                * Do the conversion.
+                */
+               dst = ptr;
+               size = size;
+               while (size > 0)
                 {
-                   int         l, flen;
-
-                   /* Handle CP_UTF8 ourselves to be able to handle trailing
-                    * bytes properly.  First find out the number of
-                    * characters and check for trailing bytes. */
-                   needed = 0;
-                   p = ptr;
-                   for (flen = from_size; flen > 0; flen -= l)
+                   found_bad = FALSE;
+
+#  ifdef CP_UTF8       /* VC 4.1 doesn't define CP_UTF8 */
+                   if (codepage == CP_UTF8)
                     {
-                       l = utf_ptr2len_len(p, flen);
-                       if (l > flen)                   /* incomplete char */
+                       /* Handle CP_UTF8 input ourselves to be able to handle
+                        * trailing bytes properly.
+                        * Get one UTF-8 character from src. */
+                       bytelen = utf_ptr2len_len(src, size);
+                       if (bytelen > size)
                         {
-                           if (l > CONV_RESTLEN)
-                               /* weird overlong byte sequence */
-                               goto rewind_retry;
-                           mch_memmove(conv_rest, p, flen);
-                           conv_restlen = flen;
-                           from_size -= flen;
-                           break;
+                           /* Only got some bytes of a character.  Normally
+                            * it's put in "conv_rest", but if it's too long
+                            * deal with it as if they were illegal bytes. */
+                           if (bytelen <= CONV_RESTLEN)
+                               break;
+
+                           /* weird overlong byte sequence */
+                           bytelen = size;
+                           found_bad = TRUE;
+                       }
+                       else
+                       {
+                           u8c = utf_ptr2char(src);
+                           if (u8c > 0xffff)
+                               found_bad = TRUE;
+                           ucs2buf[0] = u8c;
+                           ucs2len = 1;
                         }
-                       if (l == 1 && *p >= 0x80)       /* illegal byte */
-                           goto rewind_retry;
-                       ++needed;
-                       p += l;
                     }
-               }
-               else
+                   else
  #  endif
-               {
-                   /* We can't tell if the last byte of an MBCS string is
-                    * valid and MultiByteToWideChar() returns zero if it
-                    * isn't.  Try the whole string, and if that fails, bump
-                    * the last byte into conv_rest and try again. */
-                   needed = MultiByteToWideChar(FIO_GET_CP(fio_flags),
-                                MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size,
-                                                                    NULL, 0);
-                   if (needed == 0)
                     {
-                       conv_rest[0] = ptr[from_size - 1];
-                       conv_restlen = 1;
-                       --from_size;
-                       needed = MultiByteToWideChar(FIO_GET_CP(fio_flags),
-                                MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size,
-                                                                    NULL, 0);
+                       /* We don't know how long the byte sequence is, try
+                        * from one to three bytes. */
+                       for (bytelen = 1; bytelen <= size && bytelen <= 3;
+                                                                   ++bytelen)
+                       {
+                           ucs2len = MultiByteToWideChar(codepage,
+                                                        MB_ERR_INVALID_CHARS,
+                                                        (LPCSTR)src, bytelen,
+                                                                  ucs2buf, 3);
+                           if (ucs2len > 0)
+                               break;
+                       }
+                       if (ucs2len == 0)
+                       {
+                           /* If we have only one byte then it's probably an
+                            * incomplete byte sequence.  Otherwise discard
+                            * one byte as a bad character. */
+                           if (size == 1)
+                               break;
+                           found_bad = TRUE;
+                           bytelen = 1;
+                       }
                     }
  
-                   /* If there really is a conversion error, try using another
-                    * conversion. */
-                   if (needed == 0)
-                       goto rewind_retry;
-               }
-
-               /*
-                * 2. convert from 'fileencoding' to ucs-2
-                *
-                * Put the result of conversion to UCS-2 at the end of the
-                * buffer, then convert from UCS-2 to UTF-8 or "enc_codepage"
-                * into the start of the buffer.  If there is not enough space
-                * just fail, there is probably something wrong.
-                */
-               ucsp = ptr + real_size - (needed * sizeof(WCHAR));
-               if (ucsp < ptr + size)
-                   goto rewind_retry;
-
-#  ifdef CP_UTF8       /* VC 4.1 doesn't define CP_UTF8 */
-               if (FIO_GET_CP(fio_flags) == CP_UTF8)
-               {
-                   int         l, flen;
-
-                   /* Convert from utf-8 to ucs-2. */
-                   needed = 0;
-                   p = ptr;
-                   for (flen = from_size; flen > 0; flen -= l)
+                   if (!found_bad)
                     {
-                       l = utf_ptr2len_len(p, flen);
-                       u8c = utf_ptr2char(p);
-                       ucsp[needed * 2] = (u8c & 0xff);
-                       ucsp[needed * 2 + 1] = (u8c >> 8);
-                       ++needed;
-                       p += l;
+                       int     i;
+
+                       /* Convert "ucs2buf[ucs2len]" to 'enc' in "dst". */
+                       if (enc_utf8)
+                       {
+                           /* From UCS-2 to UTF-8.  Cannot fail. */
+                           for (i = 0; i < ucs2len; ++i)
+                               dst += utf_char2bytes(ucs2buf[i], dst);
+                       }
+                       else
+                       {
+                           BOOL        bad = FALSE;
+                           int         dstlen;
+
+                           /* From UCS-2 to "enc_codepage".  If the
+                            * conversion uses the default character "?",
+                            * the data doesn't fit in this encoding. */
+                           dstlen = WideCharToMultiByte(enc_codepage, 0,
+                                   (LPCWSTR)ucs2buf, ucs2len,
+                                   (LPSTR)dst, (src - dst),
+                                   replstr, &bad);
+                           if (bad)
+                               found_bad = TRUE;
+                           else
+                               dst += dstlen;
+                       }
                     }
-               }
-               else
-#  endif
-                   needed = MultiByteToWideChar(FIO_GET_CP(fio_flags),
-                                           MB_ERR_INVALID_CHARS, (LPCSTR)ptr,
-                                            from_size, (LPWSTR)ucsp, needed);
  
-               /*
-                * 3. convert from ucs-2 to 'encoding'
-                */
-               if (enc_utf8)
-               {
-                   /* From UCS-2 to UTF-8.  Cannot fail. */
-                   p = ptr;
-                   for (; needed > 0; --needed)
+                   if (found_bad)
                     {
-                       u8c = *ucsp++;
-                       u8c += (*ucsp++ << 8);
-                       p += utf_char2bytes(u8c, p);
+                       /* Deal with bytes we can't convert. */
+                       if (can_retry)
+                           goto rewind_retry;
+                       if (conv_error == 0)
+                           conv_error = readfile_linenr(linecnt, ptr, dst);
+                       if (bad_char_behavior != BAD_DROP)
+                       {
+                           if (bad_char_behavior == BAD_KEEP)
+                           {
+                               mch_memmove(dst, src, bytelen);
+                               dst += bytelen;
+                           }
+                           else
+                               *dst++ = bad_char_behavior;
+                       }
                     }
-                   size = p - ptr;
+
+                   src += bytelen;
+                   size -= bytelen;
                 }
-               else
+
+               if (size > 0)
                 {
-                   BOOL        bad = FALSE;
-
-                   /* From UCS-2 to "enc_codepage". If the conversion uses
-                    * the default character "?", the data doesn't fit in this
-                    * encoding, so fail (unless forced). */
-                   size = WideCharToMultiByte(enc_codepage, 0,
-                                                       (LPCWSTR)ucsp, needed,
-                                           (LPSTR)ptr, real_size, "?", &bad);
-                   if (bad && !keep_dest_enc)
-                       goto rewind_retry;
+                   /* An incomplete byte sequence remaining. */
+                   mch_memmove(conv_rest, src, size);
+                   conv_restlen = size;
                 }
+
+               /* The new size is equal to how much "dst" was advanced. */
+               size = dst - ptr;
             }
             else
  # endif
@@ -1628,7 +1694,13 @@ retry:
                                 /* Missing leading word. */
                                 if (can_retry)
                                     goto rewind_retry;
-                               conv_error = TRUE;
+                               if (conv_error == 0)
+                                   conv_error = readfile_linenr(linecnt,
+                                                                     ptr, p);
+                               if (bad_char_behavior == BAD_DROP)
+                                   continue;
+                               if (bad_char_behavior != BAD_KEEP)
+                                   u8c = bad_char_behavior;
                             }
  
                             /* found second word of double-word, get the first
@@ -1643,15 +1715,22 @@ retry:
                                 u16c = *--p;
                                 u16c += (*--p << 8);
                             }
+                           u8c = 0x10000 + ((u16c & 0x3ff) << 10)
+                                                             + (u8c & 0x3ff);
+
                             /* Check if the word is indeed a leading word. */
                             if (u16c < 0xd800 || u16c > 0xdbff)
                             {
                                 if (can_retry)
                                     goto rewind_retry;
-                               conv_error = TRUE;
+                               if (conv_error == 0)
+                                   conv_error = readfile_linenr(linecnt,
+                                                                     ptr, p);
+                               if (bad_char_behavior == BAD_DROP)
+                                   continue;
+                               if (bad_char_behavior != BAD_KEEP)
+                                   u8c = bad_char_behavior;
                             }
-                           u8c = 0x10000 + ((u16c & 0x3ff) << 10)
-                                                             + (u8c & 0x3ff);
                         }
                     }
                     else if (fio_flags & FIO_UCS4)
@@ -1678,6 +1757,8 @@ retry:
                         else
                         {
                             len = utf_head_off(ptr, p);
+                           p -= len;
+                           u8c = utf_ptr2char(p);
                             if (len == 0)
                             {
                                 /* Not a valid UTF-8 character, retry with
@@ -1685,10 +1766,14 @@ retry:
                                  * report the error. */
                                 if (can_retry)
                                     goto rewind_retry;
-                               conv_error = TRUE;
+                               if (conv_error == 0)
+                                   conv_error = readfile_linenr(linecnt,
+                                                                     ptr, p);
+                               if (bad_char_behavior == BAD_DROP)
+                                   continue;
+                               if (bad_char_behavior != BAD_KEEP)
+                                   u8c = bad_char_behavior;
                             }
-                           p -= len;
-                           u8c = utf_ptr2char(p);
                         }
                     }
                     if (enc_utf8)       /* produce UTF-8 */
@@ -1704,10 +1789,18 @@ retry:
                             /* character doesn't fit in latin1, retry with
                              * another fenc when possible, otherwise just
                              * report the error. */
-                           if (can_retry && !keep_dest_enc)
+                           if (can_retry)
                                 goto rewind_retry;
-                           *dest = 0xBF;
-                           conv_error = TRUE;
+                           if (conv_error == 0)
+                               conv_error = readfile_linenr(linecnt, ptr, p);
+                           if (bad_char_behavior == BAD_DROP)
+                               ++dest;
+                           else if (bad_char_behavior == BAD_KEEP)
+                               *dest = u8c;
+                           else if (eap != NULL && eap->bad_char != 0)
+                               *dest = bad_char_behavior;
+                           else
+                               *dest = 0xBF;
                         }
                         else
                             *dest = u8c;
@@ -1720,63 +1813,76 @@ retry:
                 size = (long)((ptr + real_size) - dest);
                 ptr = dest;
             }
-           else if (enc_utf8 && !conv_error && !curbuf->b_p_bin)
+           else if (enc_utf8 && conv_error == 0 && !curbuf->b_p_bin)
             {
                 /* Reading UTF-8: Check if the bytes are valid UTF-8.
                  * Need to start before "ptr" when part of the character was
                  * read in the previous read() call. */
-               for (p = ptr - utf_head_off(buffer, ptr); p < ptr + size; ++p)
+               for (p = ptr - utf_head_off(buffer, ptr); ; ++p)
                 {
+                   int  todo = (ptr + size) - p;
+                   int  l;
+
+                   if (todo <= 0)
+                       break;
                     if (*p >= 0x80)
                     {
-                       len = utf_ptr2len(p);
                         /* A length of 1 means it's an illegal byte.  Accept
                          * an incomplete character at the end though, the next
                          * read() will get the next bytes, we'll check it
                          * then. */
-                       if (len == 1)
+                       l = utf_ptr2len_len(p, todo);
+                       if (l > todo)
                         {
-                           p += utf_byte2len(*p) - 1;
+                           /* Incomplete byte sequence, the next read()
+                            * should get them and check the bytes. */
+                           p += todo;
                             break;
                         }
-                       p += len - 1;
+                       if (l == 1)
+                       {
+                           /* Illegal byte.  If we can try another encoding
+                            * do that. */
+                           if (can_retry)
+                               break;
+
+                           /* Remember the first linenr with an illegal byte */
+                           if (illegal_byte == 0)
+                               illegal_byte = readfile_linenr(linecnt, ptr, p);
+# ifdef USE_ICONV
+                           /* When we did a conversion report an error. */
+                           if (iconv_fd != (iconv_t)-1 && conv_error == 0)
+                               conv_error = readfile_linenr(linecnt, ptr, p);
+# endif
+
+                           /* Drop, keep or replace the bad byte. */
+                           if (bad_char_behavior == BAD_DROP)
+                           {
+                               mch_memmove(p, p+1, todo - 1);
+                               --p;
+                               --size;
+                           }
+                           else if (bad_char_behavior != BAD_KEEP)
+                               *p = bad_char_behavior;
+                       }
+                       p += l - 1;
                     }
                 }
                 if (p < ptr + size)
                 {
                     /* Detected a UTF-8 error. */
-                   if (can_retry)
-                   {
  rewind_retry:
-                       /* Retry reading with another conversion. */
+                   /* Retry reading with another conversion. */
  # if defined(FEAT_EVAL) && defined(USE_ICONV)
-                       if (*p_ccv != NUL && iconv_fd != (iconv_t)-1)
-                           /* iconv() failed, try 'charconvert' */
-                           did_iconv = TRUE;
-                       else
-# endif
-                           /* use next item from 'fileencodings' */
-                           advance_fenc = TRUE;
-                       file_rewind = TRUE;
-                       goto retry;
-                   }
-
-                   /* There is no alternative fenc, just report the error. */
-# ifdef USE_ICONV
-                   if (iconv_fd != (iconv_t)-1)
-                       conv_error = TRUE;
+                   if (*p_ccv != NUL && iconv_fd != (iconv_t)-1)
+                       /* iconv() failed, try 'charconvert' */
+                       did_iconv = TRUE;
                     else
  # endif
-                   if (illegal_byte == 0)  /* Keep the first linenr */
-                   {
-                       char_u          *s;
-
-                       /* Estimate the line number. */
-                       illegal_byte = curbuf->b_ml.ml_line_count - linecnt + 1;
-                       for (s = ptr; s < p; ++s)
-                           if (*s == '\n')
-                               ++illegal_byte;
-                   }
+                       /* use next item from 'fileencodings' */
+                       advance_fenc = TRUE;
+                   file_rewind = TRUE;
+                   goto retry;
                 }
             }
  #endif
@@ -2159,9 +2265,10 @@ failed:
             }
  #endif
  #ifdef FEAT_MBYTE
-           if (conv_error)
+           if (conv_error != 0)
             {
-               STRCAT(IObuff, _("[CONVERSION ERROR]"));
+               sprintf((char *)IObuff + STRLEN(IObuff),
+                      _("[CONVERSION ERROR in line %ld]"), (long)conv_error);
                 c = TRUE;
             }
             else if (illegal_byte > 0)
@@ -2215,7 +2322,7 @@ failed:
         /* with errors writing the file requires ":w!" */
         if (newfile && (error
  #ifdef FEAT_MBYTE
-                   || conv_error
+                   || conv_error != 0
  #endif
                     ))
             curbuf->b_p_ro = TRUE;
@@ -2297,6 +2404,30 @@ failed:
      return OK;
  }
  
+#ifdef FEAT_MBYTE
+
+/*
+ * Estimate the current line number: the line count of curbuf minus "linecnt"
+ * plus one, plus the number of NL bytes in the range "p" .. "endp" - 1.
+ * Used for error messages that include a line number.
+ */
+    static linenr_T
+readfile_linenr(linecnt, p, endp)
+    linenr_T   linecnt;        /* line count before reading more bytes */
+    char_u     *p;             /* start of more bytes read */
+    char_u     *endp;          /* end of more bytes read (exclusive) */
+{
+    char_u     *s;
+    linenr_T   lnum;
+
+    lnum = curbuf->b_ml.ml_line_count - linecnt + 1;
+    for (s = p; s < endp; ++s)
+       if (*s == '\n')
+           ++lnum;
+    return lnum;
+}
+#endif
+
  /*
   * Fill "*eap" to force the 'fileencoding' and 'fileformat' to be equal to the
   * buffer "buf".  Used for calling readfile().
diff --git a/src/keymap.h b/src/keymap.h

index 7bc6c47cecfb8e58e4840155d45db825b76222de..88855f1f7ddc94d8ee35360363780610db9e2867 100644 (file)
--- a/src/keymap.h
+++ b/src/keymap.h
@@ -6,18 +6,18 @@
   * Do ":help credits" in Vim to see a list of people who contributed.
   */
  
-/*
- * For MSDOS some keys produce codes larger than 0xff. They are split into two
- * chars, the first one is K_NUL (same value used in term.h).
- */
-#define K_NUL                  (0xce)  /* for MSDOS: special key follows */
-
  /*
   * Keycode definitions for special keys.
   *
   * Any special key code sequences are replaced by these codes.
   */
  
+/*
+ * For MSDOS some keys produce codes larger than 0xff. They are split into two
+ * chars, the first one is K_NUL (same value used in term.h).
+ */
+#define K_NUL                  (0xce)  /* for MSDOS: special key follows */
+
  /*
   * K_SPECIAL is the first byte of a special key code and is always followed by
   * two bytes.
diff --git a/src/normal.c b/src/normal.c

index 346cc8b5e09158788ed76520e31d60d37ae943a5..cee008e63e2d7f2d577646a04bc0dd3602ea6e71 100644 (file)
--- a/src/normal.c
+++ b/src/normal.c
@@ -2110,6 +2110,7 @@ op_colon(oap)
  /*
   * Handle the "gy" operator: call 'operatorfunc'.
   */
+/*ARGSUSED*/
      void
  op_function(oap)
      oparg_T    *oap;
diff --git a/src/vim.h b/src/vim.h

index 273d10f08eebdcdf0dc98425c3868e793b01c11f..69092ee7b00a7fb9873e5954a46f3eb7d8f7ae7d 100644 (file)
--- a/src/vim.h
+++ b/src/vim.h
@@ -1923,4 +1923,9 @@ typedef int proftime_T;       /* dummy for function prototypes */
  #define VGR_GLOBAL     1
  #define VGR_NOJUMP     2
  
+/* behavior for bad character, "++bad=" argument */
+#define BAD_REPLACE    '?'     /* replace it with '?' (default) */
+#define BAD_KEEP       -1      /* leave it */
+#define BAD_DROP       -2      /* erase it */
+
  #endif /* VIM__H */
author	Bram Moolenaar <Bram@vim.org>
	Tue, 13 Dec 2005 20:02:15 +0000 (20:02 +0000)
committer	Bram Moolenaar <Bram@vim.org>
	Tue, 13 Dec 2005 20:02:15 +0000 (20:02 +0000)
runtime/doc/editing.txt		patch \| blob \| history
src/ex_docmd.c		patch \| blob \| history
src/fileio.c		patch \| blob \| history
src/keymap.h		patch \| blob \| history
src/normal.c		patch \| blob \| history
src/vim.h		patch \| blob \| history