granicus.if.org Git - postgresql/blob - src/backend/utils/mb/wchar.c

   1 /*
   2  * conversion functions between pg_wchar and multibyte streams.
   3  * Tatsuo Ishii
   4  * src/backend/utils/mb/wchar.c
   5  *
   6  */
   7 /* can be used in either frontend or backend */
   8 #ifdef FRONTEND
   9 #include "postgres_fe.h"
  10 #else
  11 #include "postgres.h"
  12 #endif
  13
  14 #include "mb/pg_wchar.h"
  15
  16
  17 /*
  18  * Operations on multi-byte encodings are driven by a table of helper
  19  * functions.
  20  *
  21  * To add an encoding support, define mblen(), dsplen() and verifier() for
  22  * the encoding.  For server-encodings, also define mb2wchar() and wchar2mb()
  23  * conversion functions.
  24  *
  25  * These functions generally assume that their input is validly formed.
  26  * The "verifier" functions, further down in the file, have to be more
  27  * paranoid.
  28  *
  29  * We expect that mblen() does not need to examine more than the first byte
  30  * of the character to discover the correct length.  GB18030 is an exception
  31  * to that rule, though, as it also looks at second byte.  But even that
  32  * behaves in a predictable way, if you only pass the first byte: it will
  33  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
  34  * good enough for all current uses.
  35  *
  36  * Note: for the display output of psql to work properly, the return values
  37  * of the dsplen functions must conform to the Unicode standard. In particular
  38  * the NUL character is zero width and control characters are generally
  39  * width -1. It is recommended that non-ASCII encodings refer their ASCII
  40  * subset to the ASCII routines to ensure consistency.
  41  */
  42
  43 /*
  44  * SQL/ASCII
  45  */
  46 static int
  47 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  48 {
  49         int                     cnt = 0;
  50
  51         while (len > 0 && *from)
  52         {
  53                 *to++ = *from++;
  54                 len--;
  55                 cnt++;
  56         }
  57         *to = 0;
  58         return cnt;
  59 }
  60
  61 static int
  62 pg_ascii_mblen(const unsigned char *s)
  63 {
  64         return 1;
  65 }
  66
  67 static int
  68 pg_ascii_dsplen(const unsigned char *s)
  69 {
  70         if (*s == '\0')
  71                 return 0;
  72         if (*s < 0x20 || *s == 0x7f)
  73                 return -1;
  74
  75         return 1;
  76 }
  77
  78 /*
  79  * EUC
  80  */
  81 static int
  82 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  83 {
  84         int                     cnt = 0;
  85
  86         while (len > 0 && *from)
  87         {
  88                 if (*from == SS2 && len >= 2)   /* JIS X 0201 (so called "1 byte
  89                                                                                  * KANA") */
  90                 {
  91                         from++;
  92                         *to = (SS2 << 8) | *from++;
  93                         len -= 2;
  94                 }
  95                 else if (*from == SS3 && len >= 3)      /* JIS X 0212 KANJI */
  96                 {
  97                         from++;
  98                         *to = (SS3 << 16) | (*from++ << 8);
  99                         *to |= *from++;
 100                         len -= 3;
 101                 }
 102                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
 103                 {
 104                         *to = *from++ << 8;
 105                         *to |= *from++;
 106                         len -= 2;
 107                 }
 108                 else                                    /* must be ASCII */
 109                 {
 110                         *to = *from++;
 111                         len--;
 112                 }
 113                 to++;
 114                 cnt++;
 115         }
 116         *to = 0;
 117         return cnt;
 118 }
 119
 120 static inline int
 121 pg_euc_mblen(const unsigned char *s)
 122 {
 123         int                     len;
 124
 125         if (*s == SS2)
 126                 len = 2;
 127         else if (*s == SS3)
 128                 len = 3;
 129         else if (IS_HIGHBIT_SET(*s))
 130                 len = 2;
 131         else
 132                 len = 1;
 133         return len;
 134 }
 135
 136 static inline int
 137 pg_euc_dsplen(const unsigned char *s)
 138 {
 139         int                     len;
 140
 141         if (*s == SS2)
 142                 len = 2;
 143         else if (*s == SS3)
 144                 len = 2;
 145         else if (IS_HIGHBIT_SET(*s))
 146                 len = 2;
 147         else
 148                 len = pg_ascii_dsplen(s);
 149         return len;
 150 }
 151
 152 /*
 153  * EUC_JP
 154  */
 155 static int
 156 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 157 {
 158         return pg_euc2wchar_with_len(from, to, len);
 159 }
 160
 161 static int
 162 pg_eucjp_mblen(const unsigned char *s)
 163 {
 164         return pg_euc_mblen(s);
 165 }
 166
 167 static int
 168 pg_eucjp_dsplen(const unsigned char *s)
 169 {
 170         int                     len;
 171
 172         if (*s == SS2)
 173                 len = 1;
 174         else if (*s == SS3)
 175                 len = 2;
 176         else if (IS_HIGHBIT_SET(*s))
 177                 len = 2;
 178         else
 179                 len = pg_ascii_dsplen(s);
 180         return len;
 181 }
 182
 183 /*
 184  * EUC_KR
 185  */
 186 static int
 187 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 188 {
 189         return pg_euc2wchar_with_len(from, to, len);
 190 }
 191
 192 static int
 193 pg_euckr_mblen(const unsigned char *s)
 194 {
 195         return pg_euc_mblen(s);
 196 }
 197
 198 static int
 199 pg_euckr_dsplen(const unsigned char *s)
 200 {
 201         return pg_euc_dsplen(s);
 202 }
 203
 204 /*
 205  * EUC_CN
 206  *
 207  */
 208 static int
 209 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 210 {
 211         int                     cnt = 0;
 212
 213         while (len > 0 && *from)
 214         {
 215                 if (*from == SS2 && len >= 3)   /* code set 2 (unused?) */
 216                 {
 217                         from++;
 218                         *to = (SS2 << 16) | (*from++ << 8);
 219                         *to |= *from++;
 220                         len -= 3;
 221                 }
 222                 else if (*from == SS3 && len >= 3)      /* code set 3 (unused ?) */
 223                 {
 224                         from++;
 225                         *to = (SS3 << 16) | (*from++ << 8);
 226                         *to |= *from++;
 227                         len -= 3;
 228                 }
 229                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
 230                 {
 231                         *to = *from++ << 8;
 232                         *to |= *from++;
 233                         len -= 2;
 234                 }
 235                 else
 236                 {
 237                         *to = *from++;
 238                         len--;
 239                 }
 240                 to++;
 241                 cnt++;
 242         }
 243         *to = 0;
 244         return cnt;
 245 }
 246
 247 static int
 248 pg_euccn_mblen(const unsigned char *s)
 249 {
 250         int                     len;
 251
 252         if (IS_HIGHBIT_SET(*s))
 253                 len = 2;
 254         else
 255                 len = 1;
 256         return len;
 257 }
 258
 259 static int
 260 pg_euccn_dsplen(const unsigned char *s)
 261 {
 262         int                     len;
 263
 264         if (IS_HIGHBIT_SET(*s))
 265                 len = 2;
 266         else
 267                 len = pg_ascii_dsplen(s);
 268         return len;
 269 }
 270
 271 /*
 272  * EUC_TW
 273  *
 274  */
 275 static int
 276 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 277 {
 278         int                     cnt = 0;
 279
 280         while (len > 0 && *from)
 281         {
 282                 if (*from == SS2 && len >= 4)   /* code set 2 */
 283                 {
 284                         from++;
 285                         *to = (((uint32) SS2) << 24) | (*from++ << 16);
 286                         *to |= *from++ << 8;
 287                         *to |= *from++;
 288                         len -= 4;
 289                 }
 290                 else if (*from == SS3 && len >= 3)      /* code set 3 (unused?) */
 291                 {
 292                         from++;
 293                         *to = (SS3 << 16) | (*from++ << 8);
 294                         *to |= *from++;
 295                         len -= 3;
 296                 }
 297                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
 298                 {
 299                         *to = *from++ << 8;
 300                         *to |= *from++;
 301                         len -= 2;
 302                 }
 303                 else
 304                 {
 305                         *to = *from++;
 306                         len--;
 307                 }
 308                 to++;
 309                 cnt++;
 310         }
 311         *to = 0;
 312         return cnt;
 313 }
 314
 315 static int
 316 pg_euctw_mblen(const unsigned char *s)
 317 {
 318         int                     len;
 319
 320         if (*s == SS2)
 321                 len = 4;
 322         else if (*s == SS3)
 323                 len = 3;
 324         else if (IS_HIGHBIT_SET(*s))
 325                 len = 2;
 326         else
 327                 len = 1;
 328         return len;
 329 }
 330
 331 static int
 332 pg_euctw_dsplen(const unsigned char *s)
 333 {
 334         int                     len;
 335
 336         if (*s == SS2)
 337                 len = 2;
 338         else if (*s == SS3)
 339                 len = 2;
 340         else if (IS_HIGHBIT_SET(*s))
 341                 len = 2;
 342         else
 343                 len = pg_ascii_dsplen(s);
 344         return len;
 345 }
 346
 347 /*
 348  * Convert pg_wchar to EUC_* encoding.
 349  * caller must allocate enough space for "to", including a trailing zero!
 350  * len: length of from.
 351  * "from" not necessarily null terminated.
 352  */
 353 static int
 354 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
 355 {
 356         int                     cnt = 0;
 357
 358         while (len > 0 && *from)
 359         {
 360                 unsigned char c;
 361
 362                 if ((c = (*from >> 24)))
 363                 {
 364                         *to++ = c;
 365                         *to++ = (*from >> 16) & 0xff;
 366                         *to++ = (*from >> 8) & 0xff;
 367                         *to++ = *from & 0xff;
 368                         cnt += 4;
 369                 }
 370                 else if ((c = (*from >> 16)))
 371                 {
 372                         *to++ = c;
 373                         *to++ = (*from >> 8) & 0xff;
 374                         *to++ = *from & 0xff;
 375                         cnt += 3;
 376                 }
 377                 else if ((c = (*from >> 8)))
 378                 {
 379                         *to++ = c;
 380                         *to++ = *from & 0xff;
 381                         cnt += 2;
 382                 }
 383                 else
 384                 {
 385                         *to++ = *from;
 386                         cnt++;
 387                 }
 388                 from++;
 389                 len--;
 390         }
 391         *to = 0;
 392         return cnt;
 393 }
 394
 395
 396 /*
 397  * JOHAB
 398  */
 399 static int
 400 pg_johab_mblen(const unsigned char *s)
 401 {
 402         return pg_euc_mblen(s);
 403 }
 404
 405 static int
 406 pg_johab_dsplen(const unsigned char *s)
 407 {
 408         return pg_euc_dsplen(s);
 409 }
 410
 411 /*
 412  * convert UTF8 string to pg_wchar (UCS-4)
 413  * caller must allocate enough space for "to", including a trailing zero!
 414  * len: length of from.
 415  * "from" not necessarily null terminated.
 416  */
 417 static int
 418 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 419 {
 420         int                     cnt = 0;
 421         uint32          c1,
 422                                 c2,
 423                                 c3,
 424                                 c4;
 425
 426         while (len > 0 && *from)
 427         {
 428                 if ((*from & 0x80) == 0)
 429                 {
 430                         *to = *from++;
 431                         len--;
 432                 }
 433                 else if ((*from & 0xe0) == 0xc0)
 434                 {
 435                         if (len < 2)
 436                                 break;                  /* drop trailing incomplete char */
 437                         c1 = *from++ & 0x1f;
 438                         c2 = *from++ & 0x3f;
 439                         *to = (c1 << 6) | c2;
 440                         len -= 2;
 441                 }
 442                 else if ((*from & 0xf0) == 0xe0)
 443                 {
 444                         if (len < 3)
 445                                 break;                  /* drop trailing incomplete char */
 446                         c1 = *from++ & 0x0f;
 447                         c2 = *from++ & 0x3f;
 448                         c3 = *from++ & 0x3f;
 449                         *to = (c1 << 12) | (c2 << 6) | c3;
 450                         len -= 3;
 451                 }
 452                 else if ((*from & 0xf8) == 0xf0)
 453                 {
 454                         if (len < 4)
 455                                 break;                  /* drop trailing incomplete char */
 456                         c1 = *from++ & 0x07;
 457                         c2 = *from++ & 0x3f;
 458                         c3 = *from++ & 0x3f;
 459                         c4 = *from++ & 0x3f;
 460                         *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
 461                         len -= 4;
 462                 }
 463                 else
 464                 {
 465                         /* treat a bogus char as length 1; not ours to raise error */
 466                         *to = *from++;
 467                         len--;
 468                 }
 469                 to++;
 470                 cnt++;
 471         }
 472         *to = 0;
 473         return cnt;
 474 }
 475
 476
 477 /*
 478  * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
 479  * space allocated.
 480  */
 481 unsigned char *
 482 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
 483 {
 484         if (c <= 0x7F)
 485         {
 486                 utf8string[0] = c;
 487         }
 488         else if (c <= 0x7FF)
 489         {
 490                 utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
 491                 utf8string[1] = 0x80 | (c & 0x3F);
 492         }
 493         else if (c <= 0xFFFF)
 494         {
 495                 utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
 496                 utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
 497                 utf8string[2] = 0x80 | (c & 0x3F);
 498         }
 499         else
 500         {
 501                 utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
 502                 utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
 503                 utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
 504                 utf8string[3] = 0x80 | (c & 0x3F);
 505         }
 506
 507         return utf8string;
 508 }
 509
 510 /*
 511  * Trivial conversion from pg_wchar to UTF-8.
 512  * caller should allocate enough space for "to"
 513  * len: length of from.
 514  * "from" not necessarily null terminated.
 515  */
 516 static int
 517 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
 518 {
 519         int                     cnt = 0;
 520
 521         while (len > 0 && *from)
 522         {
 523                 int                     char_len;
 524
 525                 unicode_to_utf8(*from, to);
 526                 char_len = pg_utf_mblen(to);
 527                 cnt += char_len;
 528                 to += char_len;
 529                 from++;
 530                 len--;
 531         }
 532         *to = 0;
 533         return cnt;
 534 }
 535
 536 /*
 537  * Return the byte length of a UTF8 character pointed to by s
 538  *
 539  * Note: in the current implementation we do not support UTF8 sequences
 540  * of more than 4 bytes; hence do NOT return a value larger than 4.
 541  * We return "1" for any leading byte that is either flat-out illegal or
 542  * indicates a length larger than we support.
 543  *
 544  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 545  * other places would need to be fixed to change this.
 546  */
 547 int
 548 pg_utf_mblen(const unsigned char *s)
 549 {
 550         int                     len;
 551
 552         if ((*s & 0x80) == 0)
 553                 len = 1;
 554         else if ((*s & 0xe0) == 0xc0)
 555                 len = 2;
 556         else if ((*s & 0xf0) == 0xe0)
 557                 len = 3;
 558         else if ((*s & 0xf8) == 0xf0)
 559                 len = 4;
 560 #ifdef NOT_USED
 561         else if ((*s & 0xfc) == 0xf8)
 562                 len = 5;
 563         else if ((*s & 0xfe) == 0xfc)
 564                 len = 6;
 565 #endif
 566         else
 567                 len = 1;
 568         return len;
 569 }
 570
 571 /*
 572  * This is an implementation of wcwidth() and wcswidth() as defined in
 573  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
 574  * <http://www.unix.org/online.html>
 575  *
 576  * Markus Kuhn -- 2001-09-08 -- public domain
 577  *
 578  * customised for PostgreSQL
 579  *
 580  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 581  */
 582
 583 struct mbinterval
 584 {
 585         unsigned short first;
 586         unsigned short last;
 587 };
 588
 589 /* auxiliary function for binary search in interval table */
 590 static int
 591 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
 592 {
 593         int                     min = 0;
 594         int                     mid;
 595
 596         if (ucs < table[0].first || ucs > table[max].last)
 597                 return 0;
 598         while (max >= min)
 599         {
 600                 mid = (min + max) / 2;
 601                 if (ucs > table[mid].last)
 602                         min = mid + 1;
 603                 else if (ucs < table[mid].first)
 604                         max = mid - 1;
 605                 else
 606                         return 1;
 607         }
 608
 609         return 0;
 610 }
 611
 612
 613 /* The following functions define the column width of an ISO 10646
 614  * character as follows:
 615  *
 616  *        - The null character (U+0000) has a column width of 0.
 617  *
 618  *        - Other C0/C1 control characters and DEL will lead to a return
 619  *              value of -1.
 620  *
 621  *        - Non-spacing and enclosing combining characters (general
 622  *              category code Mn or Me in the Unicode database) have a
 623  *              column width of 0.
 624  *
 625  *        - Other format characters (general category code Cf in the Unicode
 626  *              database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
 627  *
 628  *        - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
 629  *              have a column width of 0.
 630  *
 631  *        - Spacing characters in the East Asian Wide (W) or East Asian
 632  *              FullWidth (F) category as defined in Unicode Technical
 633  *              Report #11 have a column width of 2.
 634  *
 635  *        - All remaining characters (including all printable
 636  *              ISO 8859-1 and WGL4 characters, Unicode control characters,
 637  *              etc.) have a column width of 1.
 638  *
 639  * This implementation assumes that wchar_t characters are encoded
 640  * in ISO 10646.
 641  */
 642
 643 static int
 644 ucs_wcwidth(pg_wchar ucs)
 645 {
 646         /* sorted list of non-overlapping intervals of non-spacing characters */
 647         static const struct mbinterval combining[] = {
 648                 {0x0300, 0x036F}, {0x0483, 0x0489}, {0x0591, 0x05BD},
 649                 {0x05BF, 0x05BF}, {0x05C1, 0x05C2}, {0x05C4, 0x05C5},
 650                 {0x05C7, 0x05C7}, {0x0610, 0x061A}, {0x064B, 0x065F},
 651                 {0x0670, 0x0670}, {0x06D6, 0x06DC}, {0x06DF, 0x06E4},
 652                 {0x06E7, 0x06E8}, {0x06EA, 0x06ED}, {0x0711, 0x0711},
 653                 {0x0730, 0x074A}, {0x07A6, 0x07B0}, {0x07EB, 0x07F3},
 654                 {0x07FD, 0x07FD}, {0x0816, 0x0819}, {0x081B, 0x0823},
 655                 {0x0825, 0x0827}, {0x0829, 0x082D}, {0x0859, 0x085B},
 656                 {0x08D3, 0x08E1}, {0x08E3, 0x0902}, {0x093A, 0x093A},
 657                 {0x093C, 0x093C}, {0x0941, 0x0948}, {0x094D, 0x094D},
 658                 {0x0951, 0x0957}, {0x0962, 0x0963}, {0x0981, 0x0981},
 659                 {0x09BC, 0x09BC}, {0x09C1, 0x09C4}, {0x09CD, 0x09CD},
 660                 {0x09E2, 0x09E3}, {0x09FE, 0x0A02}, {0x0A3C, 0x0A3C},
 661                 {0x0A41, 0x0A51}, {0x0A70, 0x0A71}, {0x0A75, 0x0A75},
 662                 {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC8},
 663                 {0x0ACD, 0x0ACD}, {0x0AE2, 0x0AE3}, {0x0AFA, 0x0B01},
 664                 {0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B44},
 665                 {0x0B4D, 0x0B56}, {0x0B62, 0x0B63}, {0x0B82, 0x0B82},
 666                 {0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C00, 0x0C00},
 667                 {0x0C04, 0x0C04}, {0x0C3E, 0x0C40}, {0x0C46, 0x0C56},
 668                 {0x0C62, 0x0C63}, {0x0C81, 0x0C81}, {0x0CBC, 0x0CBC},
 669                 {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
 670                 {0x0CE2, 0x0CE3}, {0x0D00, 0x0D01}, {0x0D3B, 0x0D3C},
 671                 {0x0D41, 0x0D44}, {0x0D4D, 0x0D4D}, {0x0D62, 0x0D63},
 672                 {0x0DCA, 0x0DCA}, {0x0DD2, 0x0DD6}, {0x0E31, 0x0E31},
 673                 {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
 674                 {0x0EB4, 0x0EBC}, {0x0EC8, 0x0ECD}, {0x0F18, 0x0F19},
 675                 {0x0F35, 0x0F35}, {0x0F37, 0x0F37}, {0x0F39, 0x0F39},
 676                 {0x0F71, 0x0F7E}, {0x0F80, 0x0F84}, {0x0F86, 0x0F87},
 677                 {0x0F8D, 0x0FBC}, {0x0FC6, 0x0FC6}, {0x102D, 0x1030},
 678                 {0x1032, 0x1037}, {0x1039, 0x103A}, {0x103D, 0x103E},
 679                 {0x1058, 0x1059}, {0x105E, 0x1060}, {0x1071, 0x1074},
 680                 {0x1082, 0x1082}, {0x1085, 0x1086}, {0x108D, 0x108D},
 681                 {0x109D, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714},
 682                 {0x1732, 0x1734}, {0x1752, 0x1753}, {0x1772, 0x1773},
 683                 {0x17B4, 0x17B5}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
 684                 {0x17C9, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D},
 685                 {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x1922},
 686                 {0x1927, 0x1928}, {0x1932, 0x1932}, {0x1939, 0x193B},
 687                 {0x1A17, 0x1A18}, {0x1A1B, 0x1A1B}, {0x1A56, 0x1A56},
 688                 {0x1A58, 0x1A60}, {0x1A62, 0x1A62}, {0x1A65, 0x1A6C},
 689                 {0x1A73, 0x1A7F}, {0x1AB0, 0x1B03}, {0x1B34, 0x1B34},
 690                 {0x1B36, 0x1B3A}, {0x1B3C, 0x1B3C}, {0x1B42, 0x1B42},
 691                 {0x1B6B, 0x1B73}, {0x1B80, 0x1B81}, {0x1BA2, 0x1BA5},
 692                 {0x1BA8, 0x1BA9}, {0x1BAB, 0x1BAD}, {0x1BE6, 0x1BE6},
 693                 {0x1BE8, 0x1BE9}, {0x1BED, 0x1BED}, {0x1BEF, 0x1BF1},
 694                 {0x1C2C, 0x1C33}, {0x1C36, 0x1C37}, {0x1CD0, 0x1CD2},
 695                 {0x1CD4, 0x1CE0}, {0x1CE2, 0x1CE8}, {0x1CED, 0x1CED},
 696                 {0x1CF4, 0x1CF4}, {0x1CF8, 0x1CF9}, {0x1DC0, 0x1DFF},
 697                 {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F},
 698                 {0x2DE0, 0x2DFF}, {0x302A, 0x302D}, {0x3099, 0x309A},
 699                 {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F},
 700                 {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806},
 701                 {0xA80B, 0xA80B}, {0xA825, 0xA826}, {0xA8C4, 0xA8C5},
 702                 {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D},
 703                 {0xA947, 0xA951}, {0xA980, 0xA982}, {0xA9B3, 0xA9B3},
 704                 {0xA9B6, 0xA9B9}, {0xA9BC, 0xA9BD}, {0xA9E5, 0xA9E5},
 705                 {0xAA29, 0xAA2E}, {0xAA31, 0xAA32}, {0xAA35, 0xAA36},
 706                 {0xAA43, 0xAA43}, {0xAA4C, 0xAA4C}, {0xAA7C, 0xAA7C},
 707                 {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8},
 708                 {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEC, 0xAAED},
 709                 {0xAAF6, 0xAAF6}, {0xABE5, 0xABE5}, {0xABE8, 0xABE8},
 710                 {0xABED, 0xABED}, {0xFB1E, 0xFB1E}, {0xFE00, 0xFE0F},
 711                 {0xFE20, 0xFE2F},
 712         };
 713
 714         /* test for 8-bit control characters */
 715         if (ucs == 0)
 716                 return 0;
 717
 718         if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
 719                 return -1;
 720
 721         /* binary search in table of non-spacing characters */
 722         if (mbbisearch(ucs, combining,
 723                                    sizeof(combining) / sizeof(struct mbinterval) - 1))
 724                 return 0;
 725
 726         /*
 727          * if we arrive here, ucs is not a combining or C0/C1 control character
 728          */
 729
 730         return 1 +
 731                 (ucs >= 0x1100 &&
 732                  (ucs <= 0x115f ||              /* Hangul Jamo init. consonants */
 733                   (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
 734                    ucs != 0x303f) ||    /* CJK ... Yi */
 735                   (ucs >= 0xac00 && ucs <= 0xd7a3) ||   /* Hangul Syllables */
 736                   (ucs >= 0xf900 && ucs <= 0xfaff) ||   /* CJK Compatibility
 737                                                                                                  * Ideographs */
 738                   (ucs >= 0xfe30 && ucs <= 0xfe6f) ||   /* CJK Compatibility Forms */
 739                   (ucs >= 0xff00 && ucs <= 0xff5f) ||   /* Fullwidth Forms */
 740                   (ucs >= 0xffe0 && ucs <= 0xffe6) ||
 741                   (ucs >= 0x20000 && ucs <= 0x2ffff)));
 742 }
 743
 744 /*
 745  * Convert a UTF-8 character to a Unicode code point.
 746  * This is a one-character version of pg_utf2wchar_with_len.
 747  *
 748  * No error checks here, c must point to a long-enough string.
 749  */
 750 pg_wchar
 751 utf8_to_unicode(const unsigned char *c)
 752 {
 753         if ((*c & 0x80) == 0)
 754                 return (pg_wchar) c[0];
 755         else if ((*c & 0xe0) == 0xc0)
 756                 return (pg_wchar) (((c[0] & 0x1f) << 6) |
 757                                                    (c[1] & 0x3f));
 758         else if ((*c & 0xf0) == 0xe0)
 759                 return (pg_wchar) (((c[0] & 0x0f) << 12) |
 760                                                    ((c[1] & 0x3f) << 6) |
 761                                                    (c[2] & 0x3f));
 762         else if ((*c & 0xf8) == 0xf0)
 763                 return (pg_wchar) (((c[0] & 0x07) << 18) |
 764                                                    ((c[1] & 0x3f) << 12) |
 765                                                    ((c[2] & 0x3f) << 6) |
 766                                                    (c[3] & 0x3f));
 767         else
 768                 /* that is an invalid code on purpose */
 769                 return 0xffffffff;
 770 }
 771
 772 static int
 773 pg_utf_dsplen(const unsigned char *s)
 774 {
 775         return ucs_wcwidth(utf8_to_unicode(s));
 776 }
 777
 778 /*
 779  * convert mule internal code to pg_wchar
 780  * caller should allocate enough space for "to"
 781  * len: length of from.
 782  * "from" not necessarily null terminated.
 783  */
 784 static int
 785 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 786 {
 787         int                     cnt = 0;
 788
 789         while (len > 0 && *from)
 790         {
 791                 if (IS_LC1(*from) && len >= 2)
 792                 {
 793                         *to = *from++ << 16;
 794                         *to |= *from++;
 795                         len -= 2;
 796                 }
 797                 else if (IS_LCPRV1(*from) && len >= 3)
 798                 {
 799                         from++;
 800                         *to = *from++ << 16;
 801                         *to |= *from++;
 802                         len -= 3;
 803                 }
 804                 else if (IS_LC2(*from) && len >= 3)
 805                 {
 806                         *to = *from++ << 16;
 807                         *to |= *from++ << 8;
 808                         *to |= *from++;
 809                         len -= 3;
 810                 }
 811                 else if (IS_LCPRV2(*from) && len >= 4)
 812                 {
 813                         from++;
 814                         *to = *from++ << 16;
 815                         *to |= *from++ << 8;
 816                         *to |= *from++;
 817                         len -= 4;
 818                 }
 819                 else
 820                 {                                               /* assume ASCII */
 821                         *to = (unsigned char) *from++;
 822                         len--;
 823                 }
 824                 to++;
 825                 cnt++;
 826         }
 827         *to = 0;
 828         return cnt;
 829 }
 830
 831 /*
 832  * convert pg_wchar to mule internal code
 833  * caller should allocate enough space for "to"
 834  * len: length of from.
 835  * "from" not necessarily null terminated.
 836  */
 837 static int
 838 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
 839 {
 840         int                     cnt = 0;
 841
 842         while (len > 0 && *from)
 843         {
 844                 unsigned char lb;
 845
 846                 lb = (*from >> 16) & 0xff;
 847                 if (IS_LC1(lb))
 848                 {
 849                         *to++ = lb;
 850                         *to++ = *from & 0xff;
 851                         cnt += 2;
 852                 }
 853                 else if (IS_LC2(lb))
 854                 {
 855                         *to++ = lb;
 856                         *to++ = (*from >> 8) & 0xff;
 857                         *to++ = *from & 0xff;
 858                         cnt += 3;
 859                 }
 860                 else if (IS_LCPRV1_A_RANGE(lb))
 861                 {
 862                         *to++ = LCPRV1_A;
 863                         *to++ = lb;
 864                         *to++ = *from & 0xff;
 865                         cnt += 3;
 866                 }
 867                 else if (IS_LCPRV1_B_RANGE(lb))
 868                 {
 869                         *to++ = LCPRV1_B;
 870                         *to++ = lb;
 871                         *to++ = *from & 0xff;
 872                         cnt += 3;
 873                 }
 874                 else if (IS_LCPRV2_A_RANGE(lb))
 875                 {
 876                         *to++ = LCPRV2_A;
 877                         *to++ = lb;
 878                         *to++ = (*from >> 8) & 0xff;
 879                         *to++ = *from & 0xff;
 880                         cnt += 4;
 881                 }
 882                 else if (IS_LCPRV2_B_RANGE(lb))
 883                 {
 884                         *to++ = LCPRV2_B;
 885                         *to++ = lb;
 886                         *to++ = (*from >> 8) & 0xff;
 887                         *to++ = *from & 0xff;
 888                         cnt += 4;
 889                 }
 890                 else
 891                 {
 892                         *to++ = *from & 0xff;
 893                         cnt += 1;
 894                 }
 895                 from++;
 896                 len--;
 897         }
 898         *to = 0;
 899         return cnt;
 900 }
 901
 902 int
 903 pg_mule_mblen(const unsigned char *s)
 904 {
 905         int                     len;
 906
 907         if (IS_LC1(*s))
 908                 len = 2;
 909         else if (IS_LCPRV1(*s))
 910                 len = 3;
 911         else if (IS_LC2(*s))
 912                 len = 3;
 913         else if (IS_LCPRV2(*s))
 914                 len = 4;
 915         else
 916                 len = 1;                                /* assume ASCII */
 917         return len;
 918 }
 919
 920 static int
 921 pg_mule_dsplen(const unsigned char *s)
 922 {
 923         int                     len;
 924
 925         /*
 926          * Note: it's not really appropriate to assume that all multibyte charsets
 927          * are double-wide on screen.  But this seems an okay approximation for
 928          * the MULE charsets we currently support.
 929          */
 930
 931         if (IS_LC1(*s))
 932                 len = 1;
 933         else if (IS_LCPRV1(*s))
 934                 len = 1;
 935         else if (IS_LC2(*s))
 936                 len = 2;
 937         else if (IS_LCPRV2(*s))
 938                 len = 2;
 939         else
 940                 len = 1;                                /* assume ASCII */
 941
 942         return len;
 943 }
 944
 945 /*
 946  * ISO8859-1
 947  */
 948 static int
 949 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 950 {
 951         int                     cnt = 0;
 952
 953         while (len > 0 && *from)
 954         {
 955                 *to++ = *from++;
 956                 len--;
 957                 cnt++;
 958         }
 959         *to = 0;
 960         return cnt;
 961 }
 962
 963 /*
 964  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
 965  * high bits.
 966  * caller should allocate enough space for "to"
 967  * len: length of from.
 968  * "from" not necessarily null terminated.
 969  */
 970 static int
 971 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
 972 {
 973         int                     cnt = 0;
 974
 975         while (len > 0 && *from)
 976         {
 977                 *to++ = *from++;
 978                 len--;
 979                 cnt++;
 980         }
 981         *to = 0;
 982         return cnt;
 983 }
 984
 985 static int
 986 pg_latin1_mblen(const unsigned char *s)
 987 {
 988         return 1;
 989 }
 990
 991 static int
 992 pg_latin1_dsplen(const unsigned char *s)
 993 {
 994         return pg_ascii_dsplen(s);
 995 }
 996
 997 /*
 998  * SJIS
 999  */
/*
 * Return the byte length of the SJIS character starting at *s.
 *
 * A first byte in 0xa1..0xdf is treated as a one-byte kana; any other byte
 * satisfying IS_HIGHBIT_SET starts a two-byte character (kanji); the rest
 * should be ASCII and is one byte long.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	unsigned char first = *s;

	if (first >= 0xa1 && first <= 0xdf)
		return 1;				/* 1 byte kana? */

	/* kanji if the high bit is set, otherwise should be ASCII */
	return IS_HIGHBIT_SET(first) ? 2 : 1;
}
1013
/*
 * Return the display width of the SJIS character starting at *s.
 *
 * A first byte in 0xa1..0xdf (one-byte kana) has width 1; any other byte
 * satisfying IS_HIGHBIT_SET (kanji) has width 2; everything else should be
 * ASCII and is handed to pg_ascii_dsplen().
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	unsigned char first = *s;

	if (first >= 0xa1 && first <= 0xdf)
		return 1;				/* 1 byte kana? */
	if (IS_HIGHBIT_SET(first))
		return 2;				/* kanji? */

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
1027
1028 /*
1029  * Big5
1030  */
/*
 * Return the byte length of the Big5 character starting at *s:
 * 2 when the first byte satisfies IS_HIGHBIT_SET, otherwise 1 (should be
 * ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1042
/*
 * Return the display width of the Big5 character starting at *s:
 * 2 when the first byte satisfies IS_HIGHBIT_SET, otherwise whatever
 * pg_ascii_dsplen() reports (the byte should be ASCII).
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1054
1055 /*
1056  * GBK
1057  */
/*
 * Return the byte length of the GBK character starting at *s:
 * 2 when the first byte satisfies IS_HIGHBIT_SET, otherwise 1 (should be
 * ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1069
/*
 * Return the display width of the GBK character starting at *s:
 * 2 when the first byte satisfies IS_HIGHBIT_SET, otherwise whatever
 * pg_ascii_dsplen() reports (the byte should be ASCII).
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1081
1082 /*
1083  * UHC
1084  */
/*
 * Return the byte length of the UHC character starting at *s:
 * 2 when the first byte satisfies IS_HIGHBIT_SET, otherwise 1 (should be
 * ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1096
/*
 * Return the display width of the UHC character starting at *s:
 * 2 when the first byte satisfies IS_HIGHBIT_SET, otherwise whatever
 * pg_ascii_dsplen() reports (the byte should be ASCII).
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1108
1109 /*
1110  * GB18030
1111  *      Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1112  */
1113
/*
 * Return the byte length of the GB18030 character starting at *s.
 *
 * Unlike all other mblen() functions, this one also inspects the second
 * byte: a second byte in 0x30..0x39 marks a 4-byte character, anything else
 * a 2-byte character.  The second byte is read only when the first byte
 * satisfies IS_HIGHBIT_SET; otherwise the character is ASCII, length 1.
 *
 * If the caller supplies just the first byte of a multi-byte string
 * followed by \0, the result is still predictable: a 4-byte character is
 * reported as two 2-byte characters.  That is good enough for all current
 * uses, as a client-only encoding.  It works because in any valid 4-byte
 * GB18030 character, the third and fourth bytes, taken on their own, look
 * like a 2-byte encoded character.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	unsigned char second;

	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	second = s[1];
	return (second >= 0x30 && second <= 0x39) ? 4 : 2;
}
1137
/*
 * Return the display width of the GB18030 character starting at *s:
 * 2 when the first byte satisfies IS_HIGHBIT_SET, otherwise whatever
 * pg_ascii_dsplen() reports for the ASCII byte.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1149
1150 /*
1151  *-------------------------------------------------------------------
1152  * multibyte sequence validators
1153  *
1154  * These functions accept "s", a pointer to the first byte of a string,
1155  * and "len", the remaining length of the string.  If there is a validly
1156  * encoded character beginning at *s, return its length in bytes; else
1157  * return -1.
1158  *
1159  * The functions can assume that len > 0 and that *s != '\0', but they must
1160  * test for and reject zeroes in any additional bytes of a multibyte character.
1161  *
1162  * Note that this definition allows the function for a single-byte
1163  * encoding to be just "return 1".
1164  *-------------------------------------------------------------------
1165  */
1166
/*
 * Verifier for SQL_ASCII: every byte is accepted as a one-byte character,
 * so neither the byte value nor the remaining length is looked at.
 */
static int
pg_ascii_verifier(const unsigned char *s, int len)
{
	(void) s;
	(void) len;

	return 1;
}
1172
1173 #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
1174
1175 static int
1176 pg_eucjp_verifier(const unsigned char *s, int len)
1177 {
1178         int                     l;
1179         unsigned char c1,
1180                                 c2;
1181
1182         c1 = *s++;
1183
1184         switch (c1)
1185         {
1186                 case SS2:                               /* JIS X 0201 */
1187                         l = 2;
1188                         if (l > len)
1189                                 return -1;
1190                         c2 = *s++;
1191                         if (c2 < 0xa1 || c2 > 0xdf)
1192                                 return -1;
1193                         break;
1194
1195                 case SS3:                               /* JIS X 0212 */
1196                         l = 3;
1197                         if (l > len)
1198                                 return -1;
1199                         c2 = *s++;
1200                         if (!IS_EUC_RANGE_VALID(c2))
1201                                 return -1;
1202                         c2 = *s++;
1203                         if (!IS_EUC_RANGE_VALID(c2))
1204                                 return -1;
1205                         break;
1206
1207                 default:
1208                         if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1209                         {
1210                                 l = 2;
1211                                 if (l > len)
1212                                         return -1;
1213                                 if (!IS_EUC_RANGE_VALID(c1))
1214                                         return -1;
1215                                 c2 = *s++;
1216                                 if (!IS_EUC_RANGE_VALID(c2))
1217                                         return -1;
1218                         }
1219                         else
1220                                 /* must be ASCII */
1221                         {
1222                                 l = 1;
1223                         }
1224                         break;
1225         }
1226
1227         return l;
1228 }
1229
/*
 * Verify one EUC-KR character starting at *s, with "len" bytes remaining.
 *
 * A first byte without the high bit is ASCII (length 1).  Otherwise the
 * character is two bytes long and both bytes must satisfy
 * IS_EUC_RANGE_VALID.  Returns the length, or -1 if the sequence is
 * truncated or malformed.
 */
static int
pg_euckr_verifier(const unsigned char *s, int len)
{
	unsigned char lead = s[0];

	if (!IS_HIGHBIT_SET(lead))
		return 1;				/* must be ASCII */

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(lead) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1258
 1259 /* EUC-CN byte sequences are exactly the same as EUC-KR */
1260 #define pg_euccn_verifier       pg_euckr_verifier
1261
1262 static int
1263 pg_euctw_verifier(const unsigned char *s, int len)
1264 {
1265         int                     l;
1266         unsigned char c1,
1267                                 c2;
1268
1269         c1 = *s++;
1270
1271         switch (c1)
1272         {
1273                 case SS2:                               /* CNS 11643 Plane 1-7 */
1274                         l = 4;
1275                         if (l > len)
1276                                 return -1;
1277                         c2 = *s++;
1278                         if (c2 < 0xa1 || c2 > 0xa7)
1279                                 return -1;
1280                         c2 = *s++;
1281                         if (!IS_EUC_RANGE_VALID(c2))
1282                                 return -1;
1283                         c2 = *s++;
1284                         if (!IS_EUC_RANGE_VALID(c2))
1285                                 return -1;
1286                         break;
1287
1288                 case SS3:                               /* unused */
1289                         return -1;
1290
1291                 default:
1292                         if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1293                         {
1294                                 l = 2;
1295                                 if (l > len)
1296                                         return -1;
1297                                 /* no further range check on c1? */
1298                                 c2 = *s++;
1299                                 if (!IS_EUC_RANGE_VALID(c2))
1300                                         return -1;
1301                         }
1302                         else
1303                                 /* must be ASCII */
1304                         {
1305                                 l = 1;
1306                         }
1307                         break;
1308         }
1309         return l;
1310 }
1311
/*
 * Verify one JOHAB character starting at *s, with "len" bytes remaining.
 *
 * The expected length comes from pg_johab_mblen().  A first byte without
 * the high bit is accepted as is; otherwise every byte after the first
 * must satisfy IS_EUC_RANGE_VALID.  Returns the length, or -1 if the
 * sequence is truncated or malformed.
 */
static int
pg_johab_verifier(const unsigned char *s, int len)
{
	int			mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1335
/*
 * Verify one MULE internal code character starting at *s, with "len" bytes
 * remaining.
 *
 * The expected length comes from pg_mule_mblen(); every byte after the
 * first must have its high bit set.  Returns the length, or -1 if the
 * sequence is truncated or malformed.
 */
static int
pg_mule_verifier(const unsigned char *s, int len)
{
	int			mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1356
/*
 * Verifier for single-byte encodings: every byte is accepted as a one-byte
 * character, so neither the byte value nor the remaining length is looked at.
 */
static int
pg_latin1_verifier(const unsigned char *s, int len)
{
	(void) s;
	(void) len;

	return 1;
}
1362
/*
 * Verify one SJIS character starting at *s, with "len" bytes remaining.
 *
 * The expected length comes from pg_sjis_mblen().  One-byte characters need
 * no further checking; a two-byte character must pass ISSJISHEAD on its
 * first byte and ISSJISTAIL on its second.  Returns the length, or -1 if
 * the sequence is truncated or malformed.
 */
static int
pg_sjis_verifier(const unsigned char *s, int len)
{
	int			mbl = pg_sjis_mblen(s);

	if (len < mbl)
		return -1;

	if (mbl == 1)
		return mbl;				/* pg_sjis_mblen already verified it */

	return (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1])) ? mbl : -1;
}
1385
/*
 * Verify one Big5 character starting at *s, with "len" bytes remaining.
 *
 * The expected length comes from pg_big5_mblen(); the only further check is
 * that no byte after the first is zero.  Returns the length, or -1 if the
 * sequence is truncated or contains an embedded zero byte.
 */
static int
pg_big5_verifier(const unsigned char *s, int len)
{
	int			mbl = pg_big5_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1405
/*
 * Verify one GBK character starting at *s, with "len" bytes remaining.
 *
 * The expected length comes from pg_gbk_mblen(); the only further check is
 * that no byte after the first is zero.  Returns the length, or -1 if the
 * sequence is truncated or contains an embedded zero byte.
 */
static int
pg_gbk_verifier(const unsigned char *s, int len)
{
	int			mbl = pg_gbk_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1425
/*
 * Verify one UHC character starting at *s, with "len" bytes remaining.
 *
 * The expected length comes from pg_uhc_mblen(); the only further check is
 * that no byte after the first is zero.  Returns the length, or -1 if the
 * sequence is truncated or contains an embedded zero byte.
 */
static int
pg_uhc_verifier(const unsigned char *s, int len)
{
	int			mbl = pg_uhc_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1445
/*
 * Verify one GB18030 character starting at *s, with "len" bytes remaining.
 *
 * Accepted forms:
 *	 1 byte:  first byte without the high bit (ASCII)
 *	 4 bytes: 0x81..0xfe, 0x30..0x39, 0x81..0xfe, 0x30..0x39
 *	 2 bytes: 0x81..0xfe, then 0x40..0x7e or 0x80..0xfe
 *
 * The 4-byte form is tried whenever at least four bytes remain and the
 * second byte is in 0x30..0x39; such input is never re-interpreted as a
 * 2-byte character.  Returns the length, or -1 for anything else.
 */
static int
pg_gb18030_verifier(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (s[0] >= 0x81 && s[0] <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1476
/*
 * Verify one UTF-8 character starting at *s, with "len" bytes remaining.
 *
 * The expected length comes from pg_utf_mblen(); the sequence is valid if
 * that many bytes are available and pg_utf8_islegal() accepts them.
 * Returns the length, or -1 otherwise.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	int			mbl = pg_utf_mblen(s);

	/* islegal must only be called once we know enough bytes are present */
	if (len < mbl || !pg_utf8_islegal(s, mbl))
		return -1;

	return mbl;
}
1490
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This directly implements the rules in RFC3629.  The second byte has a
 * narrower permitted range after certain first bytes; those restrictions
 * ensure that there is only one encoding of a given Unicode code point.
 * That is, a longer-than-necessary byte sequence with high order zero bits
 * may not be used to represent a character that would fit in fewer bytes.
 * Allowing it would create security hazards (eg, an apparent non-ASCII
 * character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Lengths outside 1..4 are always rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char first = source[0];
	int			i;

	/* reject lengths 5 and 6 for now (and anything nonsensical) */
	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes, if any, are ordinary continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	/* the second byte's permitted range depends on the first byte */
	if (length >= 2)
	{
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (first == 0xE0)
			lo = 0xA0;
		else if (first == 0xED)
			hi = 0x9F;
		else if (first == 0xF0)
			lo = 0x90;
		else if (first == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* finally, the first byte itself: 0x80..0xC1 and above 0xF4 are out */
	if (first >= 0x80 && first < 0xC2)
		return false;
	if (first > 0xF4)
		return false;

	return true;
}
1561
1562 #ifndef FRONTEND
1563
1564 /*
1565  * Generic character incrementer function.
1566  *
1567  * Not knowing anything about the properties of the encoding in use, we just
1568  * keep incrementing the last byte until we get a validly-encoded result,
1569  * or we run out of values to try.  We don't bother to try incrementing
1570  * higher-order bytes, so there's no growth in runtime for wider characters.
1571  * (If we did try to do that, we'd need to consider the likelihood that 255
1572  * is not a valid final byte in the encoding.)
1573  */
1574 static bool
1575 pg_generic_charinc(unsigned char *charptr, int len)
1576 {
1577         unsigned char *lastbyte = charptr + len - 1;
1578         mbverifier      mbverify;
1579
1580         /* We can just invoke the character verifier directly. */
1581         mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
1582
1583         while (*lastbyte < (unsigned char) 255)
1584         {
1585                 (*lastbyte)++;
1586                 if ((*mbverify) (charptr, len) == len)
1587                         return true;
1588         }
1589
1590         return false;
1591 }
1592
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC2 and 0xF4.  We increment
 * the last byte that's not already at its maximum value.  If we can't find a
 * byte that's less than the maximum allowable value, we simply fail.  We also
 * need some special-case logic to skip regions used for surrogate pair
 * handling, as those should not occur in valid UTF-8.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 * Because the lower-order bytes are kept, bumping the first byte must not
 * land on a lead byte whose narrower second-byte range excludes the second
 * byte we already have:
 *	 - 0xED only allows a second byte of 0x80..0x9F (beyond that lie the
 *	   surrogates), so from 0xEC we step straight to 0xEE instead;
 *	 - 0xF4 only allows a second byte of 0x80..0x8F (beyond that lies the
 *	   region above U+10FFFF), so from 0xF3 there is nowhere to go.
 *
 * charptr: the character to modify in place.
 * length:	its length in bytes; anything outside 1..4 is rejected.
 *
 * Returns true if the character was incremented.  Returns false, leaving
 * the bytes untouched, if no greater character could be produced.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char a;
	unsigned char limit;

	switch (length)
	{
		default:
			/* reject lengths 5 and 6 for now */
			return false;
		case 4:
			a = charptr[3];
			if (a < 0xBF)
			{
				charptr[3]++;
				break;
			}
			/* FALL THRU */
		case 3:
			a = charptr[2];
			if (a < 0xBF)
			{
				charptr[2]++;
				break;
			}
			/* FALL THRU */
		case 2:
			a = charptr[1];
			/* the second byte's ceiling depends on the first byte */
			switch (*charptr)
			{
				case 0xED:
					limit = 0x9F;
					break;
				case 0xF4:
					limit = 0x8F;
					break;
				default:
					limit = 0xBF;
					break;
			}
			if (a < limit)
			{
				charptr[1]++;
				break;
			}
			/* FALL THRU */
		case 1:
			a = *charptr;
			/* last first byte of the 1-, 2-, 3- and 4-byte forms */
			if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
				return false;
			if (length > 1)
			{
				/*
				 * The second byte stays as it is, so make sure it is still
				 * acceptable after the new first byte.
				 */
				if (a == 0xF3 && charptr[1] > 0x8F)
					return false;	/* would exceed U+10FFFF */
				if (a == 0xEC && charptr[1] > 0x9F)
				{
					charptr[0] = 0xEE;	/* hop over the surrogate range */
					break;
				}
			}
			charptr[0]++;
			break;
	}

	return true;
}
1665
1666 /*
1667  * EUC-JP character incrementer function.
1668  *
1669  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1670  * representing JIS X 0201 characters with the second byte ranging between
1671  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
1672  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1673  *
1674  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1675  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
1676  * is incremented if possible, otherwise the second-to-last byte.
1677  *
1678  * If the sequence starts with a value other than the above and its MSB
1679  * is set, it must be a two-byte sequence representing JIS X 0208 characters
1680  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
1681  * incremented if possible, otherwise the second-to-last byte.
1682  *
1683  * Otherwise, the sequence is a single-byte ASCII character. It is
1684  * incremented up to 0x7f.
1685  */
1686 static bool
1687 pg_eucjp_increment(unsigned char *charptr, int length)
1688 {
1689         unsigned char c1,
1690                                 c2;
1691         int                     i;
1692
1693         c1 = *charptr;
1694
1695         switch (c1)
1696         {
1697                 case SS2:                               /* JIS X 0201 */
1698                         if (length != 2)
1699                                 return false;
1700
1701                         c2 = charptr[1];
1702
1703                         if (c2 >= 0xdf)
1704                                 charptr[0] = charptr[1] = 0xa1;
1705                         else if (c2 < 0xa1)
1706                                 charptr[1] = 0xa1;
1707                         else
1708                                 charptr[1]++;
1709                         break;
1710
1711                 case SS3:                               /* JIS X 0212 */
1712                         if (length != 3)
1713                                 return false;
1714
1715                         for (i = 2; i > 0; i--)
1716                         {
1717                                 c2 = charptr[i];
1718                                 if (c2 < 0xa1)
1719                                 {
1720                                         charptr[i] = 0xa1;
1721                                         return true;
1722                                 }
1723                                 else if (c2 < 0xfe)
1724                                 {
1725                                         charptr[i]++;
1726                                         return true;
1727                                 }
1728                         }
1729
1730                         /* Out of 3-byte code region */
1731                         return false;
1732
1733                 default:
1734                         if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1735                         {
1736                                 if (length != 2)
1737                                         return false;
1738
1739                                 for (i = 1; i >= 0; i--)
1740                                 {
1741                                         c2 = charptr[i];
1742                                         if (c2 < 0xa1)
1743                                         {
1744                                                 charptr[i] = 0xa1;
1745                                                 return true;
1746                                         }
1747                                         else if (c2 < 0xfe)
1748                                         {
1749                                                 charptr[i]++;
1750                                                 return true;
1751                                         }
1752                                 }
1753
1754                                 /* Out of 2 byte code region */
1755                                 return false;
1756                         }
1757                         else
1758                         {                                       /* ASCII, single byte */
1759                                 if (c1 > 0x7e)
1760                                         return false;
1761                                 (*charptr)++;
1762                         }
1763                         break;
1764         }
1765
1766         return true;
1767 }
1768 #endif                                                  /* !FRONTEND */
1769
1770
1771 /*
1772  *-------------------------------------------------------------------
1773  * encoding info table
1774  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
1775  *-------------------------------------------------------------------
1776  */
1777 const pg_wchar_tbl pg_wchar_table[] = {
1778         {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
1779         {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},        /* PG_EUC_JP */
1780         {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2},        /* PG_EUC_CN */
1781         {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},        /* PG_EUC_KR */
1782         {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4},        /* PG_EUC_TW */
1783         {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},        /* PG_EUC_JIS_2004 */
1784         {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4},       /* PG_UTF8 */
1785         {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4},   /* PG_MULE_INTERNAL */
1786         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
1787         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
1788         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
1789         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
1790         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
1791         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
1792         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
1793         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
1794         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
1795         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
1796         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
1797         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
1798         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
1799         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
1800         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
1801         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
1802         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
1803         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
1804         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
1805         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
1806         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
1807         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
1808         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
1809         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
1810         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
1811         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
1812         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
1813         {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
1814         {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
1815         {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},        /* PG_GBK */
1816         {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},        /* PG_UHC */
1817         {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4},    /* PG_GB18030 */
1818         {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3},  /* PG_JOHAB */
1819         {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}      /* PG_SHIFT_JIS_2004 */
1820 };
1821
/*
 * pg_mic_mblen: byte length of the character at mbstr, treated as mule
 * internal code.  This simply forwards to pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	int			nbytes = pg_mule_mblen(mbstr);

	return nbytes;
}
1828
1829 /*
1830  * Returns the byte length of a multibyte character.
1831  */
1832 int
1833 pg_encoding_mblen(int encoding, const char *mbstr)
1834 {
1835         return (PG_VALID_ENCODING(encoding) ?
1836                         pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1837                         pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1838 }
1839
1840 /*
1841  * Returns the display length of a multibyte character.
1842  */
1843 int
1844 pg_encoding_dsplen(int encoding, const char *mbstr)
1845 {
1846         return (PG_VALID_ENCODING(encoding) ?
1847                         pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1848                         pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1849 }
1850
1851 /*
1852  * Verify the first multibyte character of the given string.
1853  * Return its byte length if good, -1 if bad.  (See comments above for
1854  * full details of the mbverify API.)
1855  */
1856 int
1857 pg_encoding_verifymb(int encoding, const char *mbstr, int len)
1858 {
1859         return (PG_VALID_ENCODING(encoding) ?
1860                         pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
1861                         pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
1862 }
1863
1864 /*
1865  * fetch maximum length of a given encoding
1866  */
1867 int
1868 pg_encoding_max_length(int encoding)
1869 {
1870         Assert(PG_VALID_ENCODING(encoding));
1871
1872         return pg_wchar_table[encoding].maxmblen;
1873 }
1874
1875 #ifndef FRONTEND
1876
1877 /*
1878  * fetch maximum length of the encoding for the current database
1879  */
1880 int
1881 pg_database_encoding_max_length(void)
1882 {
1883         return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1884 }
1885
1886 /*
1887  * get the character incrementer for the encoding for the current database
1888  */
1889 mbcharacter_incrementer
1890 pg_database_encoding_character_incrementer(void)
1891 {
1892         /*
1893          * Eventually it might be best to add a field to pg_wchar_table[], but for
1894          * now we just use a switch.
1895          */
1896         switch (GetDatabaseEncoding())
1897         {
1898                 case PG_UTF8:
1899                         return pg_utf8_increment;
1900
1901                 case PG_EUC_JP:
1902                         return pg_eucjp_increment;
1903
1904                 default:
1905                         return pg_generic_charinc;
1906         }
1907 }
1908
/*
 * pg_verifymbstr: check that mbstr (len bytes) is validly encoded in the
 * encoding reported by GetDatabaseEncoding().
 *
 * Other than the choice of encoding, this behaves like pg_verify_mbstr():
 * the result is true when pg_verify_mbstr_len() reports a non-negative
 * character count.
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			nchars;

	nchars = pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError);

	return nchars >= 0;
}
1919
/*
 * pg_verify_mbstr: check that mbstr (len bytes) is validly encoded in the
 * specified encoding.
 *
 * The result is true when pg_verify_mbstr_len() reports a non-negative
 * character count; see that function for the meaning of noError.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	int			nchars = pg_verify_mbstr_len(encoding, mbstr, len, noError);

	return nchars >= 0;
}
1929
1930 /*
1931  * Verify mbstr to make sure that it is validly encoded in the specified
1932  * encoding.
1933  *
1934  * mbstr is not necessarily zero terminated; length of mbstr is
1935  * specified by len.
1936  *
1937  * If OK, return length of string in the encoding.
1938  * If a problem is found, return -1 when noError is
1939  * true; when noError is false, ereport() a descriptive message.
1940  */
1941 int
1942 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1943 {
1944         mbverifier      mbverify;
1945         int                     mb_len;
1946
1947         Assert(PG_VALID_ENCODING(encoding));
1948
1949         /*
1950          * In single-byte encodings, we need only reject nulls (\0).
1951          */
1952         if (pg_encoding_max_length(encoding) <= 1)
1953         {
1954                 const char *nullpos = memchr(mbstr, 0, len);
1955
1956                 if (nullpos == NULL)
1957                         return len;
1958                 if (noError)
1959                         return -1;
1960                 report_invalid_encoding(encoding, nullpos, 1);
1961         }
1962
1963         /* fetch function pointer just once */
1964         mbverify = pg_wchar_table[encoding].mbverify;
1965
1966         mb_len = 0;
1967
1968         while (len > 0)
1969         {
1970                 int                     l;
1971
1972                 /* fast path for ASCII-subset characters */
1973                 if (!IS_HIGHBIT_SET(*mbstr))
1974                 {
1975                         if (*mbstr != '\0')
1976                         {
1977                                 mb_len++;
1978                                 mbstr++;
1979                                 len--;
1980                                 continue;
1981                         }
1982                         if (noError)
1983                                 return -1;
1984                         report_invalid_encoding(encoding, mbstr, len);
1985                 }
1986
1987                 l = (*mbverify) ((const unsigned char *) mbstr, len);
1988
1989                 if (l < 0)
1990                 {
1991                         if (noError)
1992                                 return -1;
1993                         report_invalid_encoding(encoding, mbstr, len);
1994                 }
1995
1996                 mbstr += l;
1997                 len -= l;
1998                 mb_len++;
1999         }
2000         return mb_len;
2001 }
2002
2003 /*
2004  * check_encoding_conversion_args: check arguments of a conversion function
2005  *
2006  * "expected" arguments can be either an encoding ID or -1 to indicate that
2007  * the caller will check whether it accepts the ID.
2008  *
2009  * Note: the errors here are not really user-facing, so elog instead of
2010  * ereport seems sufficient.  Also, we trust that the "expected" encoding
2011  * arguments are valid encoding IDs, but we don't trust the actuals.
2012  */
2013 void
2014 check_encoding_conversion_args(int src_encoding,
2015                                                            int dest_encoding,
2016                                                            int len,
2017                                                            int expected_src_encoding,
2018                                                            int expected_dest_encoding)
2019 {
2020         if (!PG_VALID_ENCODING(src_encoding))
2021                 elog(ERROR, "invalid source encoding ID: %d", src_encoding);
2022         if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
2023                 elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
2024                          pg_enc2name_tbl[expected_src_encoding].name,
2025                          pg_enc2name_tbl[src_encoding].name);
2026         if (!PG_VALID_ENCODING(dest_encoding))
2027                 elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
2028         if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
2029                 elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
2030                          pg_enc2name_tbl[expected_dest_encoding].name,
2031                          pg_enc2name_tbl[dest_encoding].name);
2032         if (len < 0)
2033                 elog(ERROR, "encoding conversion length must not be negative");
2034 }
2035
2036 /*
2037  * report_invalid_encoding: complain about invalid multibyte character
2038  *
2039  * note: len is remaining length of string, not length of character;
2040  * len must be greater than zero, as we always examine the first byte.
2041  */
2042 void
2043 report_invalid_encoding(int encoding, const char *mbstr, int len)
2044 {
2045         int                     l = pg_encoding_mblen(encoding, mbstr);
2046         char            buf[8 * 5 + 1];
2047         char       *p = buf;
2048         int                     j,
2049                                 jlimit;
2050
2051         jlimit = Min(l, len);
2052         jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
2053
2054         for (j = 0; j < jlimit; j++)
2055         {
2056                 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2057                 if (j < jlimit - 1)
2058                         p += sprintf(p, " ");
2059         }
2060
2061         ereport(ERROR,
2062                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
2063                          errmsg("invalid byte sequence for encoding \"%s\": %s",
2064                                         pg_enc2name_tbl[encoding].name,
2065                                         buf)));
2066 }
2067
2068 /*
2069  * report_untranslatable_char: complain about untranslatable character
2070  *
2071  * note: len is remaining length of string, not length of character;
2072  * len must be greater than zero, as we always examine the first byte.
2073  */
2074 void
2075 report_untranslatable_char(int src_encoding, int dest_encoding,
2076                                                    const char *mbstr, int len)
2077 {
2078         int                     l = pg_encoding_mblen(src_encoding, mbstr);
2079         char            buf[8 * 5 + 1];
2080         char       *p = buf;
2081         int                     j,
2082                                 jlimit;
2083
2084         jlimit = Min(l, len);
2085         jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
2086
2087         for (j = 0; j < jlimit; j++)
2088         {
2089                 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2090                 if (j < jlimit - 1)
2091                         p += sprintf(p, " ");
2092         }
2093
2094         ereport(ERROR,
2095                         (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
2096                          errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
2097                                         buf,
2098                                         pg_enc2name_tbl[src_encoding].name,
2099                                         pg_enc2name_tbl[dest_encoding].name)));
2100 }
2101
2102 #endif                                                  /* !FRONTEND */