granicus.if.org Git - postgresql/blob - src/backend/utils/mb/mbutils.c

   1 /*
   2  * This file contains public functions for conversion between
   3  * client encoding and server (database) encoding.
   4  *
   5  * Tatsuo Ishii
   6  *
   7  * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.85 2009/04/08 09:50:48 heikki Exp $
   8  */
   9 #include "postgres.h"
  10
  11 #include "access/xact.h"
  12 #include "catalog/namespace.h"
  13 #include "mb/pg_wchar.h"
  14 #include "utils/builtins.h"
  15 #include "utils/memutils.h"
  16 #include "utils/pg_locale.h"
  17 #include "utils/syscache.h"
  18
  19 /*
  20  * When converting strings between different encodings, we assume that space
  21  * for converted result is 4-to-1 growth in the worst case. The rate for
  22  * currently supported encoding pairs is within 3 (SJIS JIS X0201 half width
  23  * kana -> UTF8 is the worst case).  So "4" should be enough for the moment.
  24  *
  25  * Note that this is not the same as the maximum character width in any
  26  * particular encoding.
  27  */
  28 #define MAX_CONVERSION_GROWTH  4
  29
  30 /*
  31  * We maintain a simple linked list caching the fmgr lookup info for the
  32  * currently selected conversion functions, as well as any that have been
  33  * selected previously in the current session.  (We remember previous
  34  * settings because we must be able to restore a previous setting during
  35  * transaction rollback, without doing any fresh catalog accesses.)
  36  *
  37  * Since we'll never release this data, we just keep it in TopMemoryContext.
  38  */
  39 typedef struct ConvProcInfo
  40 {
  41         int                     s_encoding;             /* server (database) encoding ID this entry was built for */
  42         int                     c_encoding;             /* client encoding ID this entry was built for */
  43         FmgrInfo        to_server_info; /* fmgr lookup info for the client-to-server conversion proc */
  44         FmgrInfo        to_client_info; /* fmgr lookup info for the server-to-client conversion proc */
  45 } ConvProcInfo;
  46
  47 static List *ConvProcList = NIL;        /* List of ConvProcInfo pointers; SetClientEncoding() pushes new entries at the head */
  48
  49 /*
  50  * These variables point to the currently active conversion functions,
  51  * or are NULL when no conversion is needed.
  52  */
  53 static FmgrInfo *ToServerConvProc = NULL;	/* client -> server conversion, or NULL */
  54 static FmgrInfo *ToClientConvProc = NULL;	/* server -> client conversion, or NULL */
  55
  56 /*
  57  * These variables track the currently selected FE and BE encodings.
  58  */
  59 static pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];	/* frontend encoding; starts as SQL_ASCII */
  60 static pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];	/* backend encoding; starts as SQL_ASCII */
  61
  62 /*
  63  * During backend startup we can't set client encoding because we (a)
  64  * can't look up the conversion functions, and (b) may not know the database
  65  * encoding yet either.  So SetClientEncoding() just accepts anything and
  66  * remembers it for InitializeClientEncoding() to apply later.
  67  */
  68 static bool backend_startup_complete = false;	/* set true by InitializeClientEncoding() */
  69 static int      pending_client_encoding = PG_SQL_ASCII;	/* encoding requested during startup, applied by InitializeClientEncoding() */
  70
  71
  72 /* Internal functions */
  73 static char *perform_default_encoding_conversion(const char *src,
  74                                                                         int len, bool is_client_to_server);
  75 static int      cliplen(const char *str, int len, int limit);
  76
  77
  78 /*
  79  * Set the client encoding and save fmgrinfo for the conversion
  80  * function if necessary.  Returns 0 if okay, -1 if not (bad encoding
  81  * or can't support conversion)
  82  */
  83 int
  84 SetClientEncoding(int encoding, bool doit)
  85 {
  86         int                     current_server_encoding;
  87         ListCell   *lc;
  88
  89         if (!PG_VALID_FE_ENCODING(encoding))
  90                 return -1;
  91
  92         /* Can't do anything during startup, per notes above */
  93         if (!backend_startup_complete)
  94         {
  95                 if (doit)
  96                         pending_client_encoding = encoding;
  97                 return 0;
  98         }
  99
 100         current_server_encoding = GetDatabaseEncoding();
 101
 102         /*
 103          * Check for cases that require no conversion function.
 104          */
 105         if (current_server_encoding == encoding ||
 106                 current_server_encoding == PG_SQL_ASCII ||
 107                 encoding == PG_SQL_ASCII)
 108         {
 109                 if (doit)
 110                 {
 111                         ClientEncoding = &pg_enc2name_tbl[encoding];
 112                         ToServerConvProc = NULL;
 113                         ToClientConvProc = NULL;
 114                 }
 115                 return 0;
 116         }
 117
 118         if (IsTransactionState())
 119         {
 120                 /*
 121                  * If we're in a live transaction, it's safe to access the catalogs,
 122                  * so look up the functions.  We repeat the lookup even if the info
 123                  * is already cached, so that we can react to changes in the contents
 124                  * of pg_conversion.
 125                  */
 126                 Oid                     to_server_proc,
 127                                         to_client_proc;
 128                 ConvProcInfo *convinfo;
 129                 MemoryContext oldcontext;
 130
 131                 to_server_proc = FindDefaultConversionProc(encoding,
 132                                                                                                    current_server_encoding);
 133                 if (!OidIsValid(to_server_proc))
 134                         return -1;
 135                 to_client_proc = FindDefaultConversionProc(current_server_encoding,
 136                                                                                                    encoding);
 137                 if (!OidIsValid(to_client_proc))
 138                         return -1;
 139
 140                 /*
 141                  * Done if not wanting to actually apply setting.
 142                  */
 143                 if (!doit)
 144                         return 0;
 145
 146                 /*
 147                  * Load the fmgr info into TopMemoryContext (could still fail here)
 148                  */
 149                 convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
 150                                                                                                            sizeof(ConvProcInfo));
 151                 convinfo->s_encoding = current_server_encoding;
 152                 convinfo->c_encoding = encoding;
 153                 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
 154                                           TopMemoryContext);
 155                 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
 156                                           TopMemoryContext);
 157
 158                 /* Attach new info to head of list */
 159                 oldcontext = MemoryContextSwitchTo(TopMemoryContext);
 160                 ConvProcList = lcons(convinfo, ConvProcList);
 161                 MemoryContextSwitchTo(oldcontext);
 162
 163                 /*
 164                  * Everything is okay, so apply the setting.
 165                  */
 166                 ClientEncoding = &pg_enc2name_tbl[encoding];
 167                 ToServerConvProc = &convinfo->to_server_info;
 168                 ToClientConvProc = &convinfo->to_client_info;
 169
 170                 /*
 171                  * Remove any older entry for the same encoding pair (this is just
 172                  * to avoid memory leakage).
 173                  */
 174                 foreach(lc, ConvProcList)
 175                 {
 176                         ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
 177
 178                         if (oldinfo == convinfo)
 179                                 continue;
 180                         if (oldinfo->s_encoding == convinfo->s_encoding &&
 181                                 oldinfo->c_encoding == convinfo->c_encoding)
 182                         {
 183                                 ConvProcList = list_delete_ptr(ConvProcList, oldinfo);
 184                                 pfree(oldinfo);
 185                                 break;                  /* need not look further */
 186                         }
 187                 }
 188
 189                 return 0;                               /* success */
 190         }
 191         else
 192         {
 193                 /*
 194                  * If we're not in a live transaction, the only thing we can do
 195                  * is restore a previous setting using the cache.  This covers all
 196                  * transaction-rollback cases.  The only case it might not work for
 197                  * is trying to change client_encoding on the fly by editing
 198                  * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
 199                  * thing to do anyway.
 200                  */
 201                 foreach(lc, ConvProcList)
 202                 {
 203                         ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
 204
 205                         if (oldinfo->s_encoding == current_server_encoding &&
 206                                 oldinfo->c_encoding == encoding)
 207                         {
 208                                 if (doit)
 209                                 {
 210                                         ClientEncoding = &pg_enc2name_tbl[encoding];
 211                                         ToServerConvProc = &oldinfo->to_server_info;
 212                                         ToClientConvProc = &oldinfo->to_client_info;
 213                                 }
 214                                 return 0;
 215                         }
 216                 }
 217
 218                 return -1;                              /* it's not cached, so fail */
 219         }
 220 }
 221
 222 /*
 223  * Initialize client encoding if necessary.
 224  *              called from InitPostgres() once during backend startup.
 225  */
 226 void
 227 InitializeClientEncoding(void)
 228 {
 229         Assert(!backend_startup_complete);
 230         backend_startup_complete = true;
 231
 232         if (SetClientEncoding(pending_client_encoding, true) < 0)
 233         {
 234                 /*
 235                  * Oops, the requested conversion is not available. We couldn't fail
 236                  * before, but we can now.
 237                  */
 238                 ereport(FATAL,
 239                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 240                                  errmsg("conversion between %s and %s is not supported",
 241                                                 pg_enc2name_tbl[pending_client_encoding].name,
 242                                                 GetDatabaseEncodingName())));
 243         }
 244 }
 245
 246 /*
 247  * returns the current client encoding
 248  */
 249 int
 250 pg_get_client_encoding(void)
 251 {
 252         Assert(ClientEncoding);
 253         return ClientEncoding->encoding;
 254 }
 255
 256 /*
 257  * returns the current client encoding name
 258  */
 259 const char *
 260 pg_get_client_encoding_name(void)
 261 {
 262         Assert(ClientEncoding);
 263         return ClientEncoding->name;
 264 }
 265
 266 /*
 267  * Apply encoding conversion on src and return it. The encoding
 268  * conversion function is chosen from the pg_conversion system catalog
 269  * marked as "default". If it is not found in the schema search path,
 270  * it's taken from pg_catalog schema. If it even is not in the schema,
 271  * warn and return src.
 272  *
 273  * If conversion occurs, a palloc'd null-terminated string is returned.
 274  * In the case of no conversion, src is returned.
 275  *
 276  * CAUTION: although the presence of a length argument means that callers
 277  * can pass non-null-terminated strings, care is required because the same
 278  * string will be passed back if no conversion occurs.  Such callers *must*
 279  * check whether result == src and handle that case differently.
 280  *
 281  * Note: we try to avoid raising error, since that could get us into
 282  * infinite recursion when this function is invoked during error message
 283  * sending.  It should be OK to raise error for overlength strings though,
 284  * since the recursion will come with a shorter message.
 285  */
 286 unsigned char *
 287 pg_do_encoding_conversion(unsigned char *src, int len,
 288                                                   int src_encoding, int dest_encoding)
 289 {
 290         unsigned char *result;
 291         Oid                     proc;
 292
 293         if (!IsTransactionState())
 294                 return src;
 295
 296         if (src_encoding == dest_encoding)
 297                 return src;
 298
 299         if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII)
 300                 return src;
 301
 302         if (len <= 0)
 303                 return src;
 304
 305         proc = FindDefaultConversionProc(src_encoding, dest_encoding);
 306         if (!OidIsValid(proc))
 307         {
 308                 ereport(LOG,
 309                                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
 310                                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
 311                                                 pg_encoding_to_char(src_encoding),
 312                                                 pg_encoding_to_char(dest_encoding))));
 313                 return src;
 314         }
 315
 316         /*
 317          * XXX we should avoid throwing errors in OidFunctionCall. Otherwise we
 318          * are going into infinite loop!  So we have to make sure that the
 319          * function exists before calling OidFunctionCall.
 320          */
 321         if (!SearchSysCacheExists(PROCOID,
 322                                                           ObjectIdGetDatum(proc),
 323                                                           0, 0, 0))
 324         {
 325                 elog(LOG, "cache lookup failed for function %u", proc);
 326                 return src;
 327         }
 328
 329         /*
 330          * Allocate space for conversion result, being wary of integer overflow
 331          */
 332         if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 333                 ereport(ERROR,
 334                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 335                                  errmsg("out of memory"),
 336                  errdetail("String of %d bytes is too long for encoding conversion.",
 337                                    len)));
 338
 339         result = palloc(len * MAX_CONVERSION_GROWTH + 1);
 340
 341         OidFunctionCall5(proc,
 342                                          Int32GetDatum(src_encoding),
 343                                          Int32GetDatum(dest_encoding),
 344                                          CStringGetDatum(src),
 345                                          CStringGetDatum(result),
 346                                          Int32GetDatum(len));
 347         return result;
 348 }
 349
 350 /*
 351  * Convert string using encoding_name. The source
 352  * encoding is the DB encoding.
 353  *
 354  * BYTEA convert_to(TEXT string, NAME encoding_name) */
 355 Datum
 356 pg_convert_to(PG_FUNCTION_ARGS)
 357 {
 358         Datum           string = PG_GETARG_DATUM(0);
 359         Datum           dest_encoding_name = PG_GETARG_DATUM(1);
 360         Datum           src_encoding_name = DirectFunctionCall1(namein,
 361                                                                         CStringGetDatum(DatabaseEncoding->name));
 362         Datum           result;
 363
 364         /*
 365          * pg_convert expects a bytea as its first argument. We're passing it a
 366          * text argument here, relying on the fact that they are both in fact
 367          * varlena types, and thus structurally identical.
 368          */
 369         result = DirectFunctionCall3(pg_convert, string,
 370                                                                  src_encoding_name, dest_encoding_name);
 371
 372         PG_RETURN_DATUM(result);
 373 }
 374
 375 /*
 376  * Convert string using encoding_name. The destination
 377  * encoding is the DB encoding.
 378  *
 379  * TEXT convert_from(BYTEA string, NAME encoding_name) */
 380 Datum
 381 pg_convert_from(PG_FUNCTION_ARGS)
 382 {
 383         Datum           string = PG_GETARG_DATUM(0);
 384         Datum           src_encoding_name = PG_GETARG_DATUM(1);
 385         Datum           dest_encoding_name = DirectFunctionCall1(namein,
 386                                                                         CStringGetDatum(DatabaseEncoding->name));
 387         Datum           result;
 388
 389         result = DirectFunctionCall3(pg_convert, string,
 390                                                                  src_encoding_name, dest_encoding_name);
 391
 392         /*
 393          * pg_convert returns a bytea, which we in turn return as text, relying on
 394          * the fact that they are both in fact varlena types, and thus
 395          * structurally identical. Although not all bytea values are valid text,
 396          * in this case it will be because we've told pg_convert to return one
 397          * that is valid as text in the current database encoding.
 398          */
 399         PG_RETURN_DATUM(result);
 400 }
 401
 402 /*
 403  * Convert string using encoding_names.
 404  *
 405  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
 406  */
 407 Datum
 408 pg_convert(PG_FUNCTION_ARGS)
 409 {
 410         bytea      *string = PG_GETARG_BYTEA_P(0);
 411         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 412         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 413         char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
 414         int                     dest_encoding = pg_char_to_encoding(dest_encoding_name);
 415         unsigned char *result;
 416         bytea      *retval;
 417         unsigned char *str;
 418         int                     len;
 419
 420         if (src_encoding < 0)
 421                 ereport(ERROR,
 422                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 423                                  errmsg("invalid source encoding name \"%s\"",
 424                                                 src_encoding_name)));
 425         if (dest_encoding < 0)
 426                 ereport(ERROR,
 427                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 428                                  errmsg("invalid destination encoding name \"%s\"",
 429                                                 dest_encoding_name)));
 430
 431         /* make sure that source string is valid and null terminated */
 432         len = VARSIZE(string) - VARHDRSZ;
 433         pg_verify_mbstr(src_encoding, VARDATA(string), len, false);
 434         str = palloc(len + 1);
 435         memcpy(str, VARDATA(string), len);
 436         *(str + len) = '\0';
 437
 438         result = pg_do_encoding_conversion(str, len, src_encoding, dest_encoding);
 439
 440         /*
 441          * build bytea data type structure.
 442          */
 443         len = strlen((char *) result) + VARHDRSZ;
 444         retval = palloc(len);
 445         SET_VARSIZE(retval, len);
 446         memcpy(VARDATA(retval), result, len - VARHDRSZ);
 447
 448         if (result != str)
 449                 pfree(result);
 450         pfree(str);
 451
 452         /* free memory if allocated by the toaster */
 453         PG_FREE_IF_COPY(string, 0);
 454
 455         PG_RETURN_BYTEA_P(retval);
 456 }
 457
 458 /*
 459  * get the length of the string considered as text in the specified
 460  * encoding. Raises an error if the data is not valid in that
 461  * encoding.
 462  *
 463  * INT4 length (BYTEA string, NAME src_encoding_name)
 464  */
 465 Datum
 466 length_in_encoding(PG_FUNCTION_ARGS)
 467 {
 468         bytea      *string = PG_GETARG_BYTEA_P(0);
 469         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 470         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 471         int                     len = VARSIZE(string) - VARHDRSZ;
 472         int                     retval;
 473
 474         if (src_encoding < 0)
 475                 ereport(ERROR,
 476                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 477                                  errmsg("invalid encoding name \"%s\"",
 478                                                 src_encoding_name)));
 479
 480         retval = pg_verify_mbstr_len(src_encoding, VARDATA(string), len, false);
 481         PG_RETURN_INT32(retval);
 482
 483 }
 484
 485 /*
 486  * convert client encoding to server encoding.
 487  */
 488 char *
 489 pg_client_to_server(const char *s, int len)
 490 {
 491         Assert(DatabaseEncoding);
 492         Assert(ClientEncoding);
 493
 494         if (len <= 0)
 495                 return (char *) s;
 496
 497         if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
 498                 ClientEncoding->encoding == PG_SQL_ASCII)
 499         {
 500                 /*
 501                  * No conversion is needed, but we must still validate the data.
 502                  */
 503                 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
 504                 return (char *) s;
 505         }
 506
 507         if (DatabaseEncoding->encoding == PG_SQL_ASCII)
 508         {
 509                 /*
 510                  * No conversion is possible, but we must still validate the data,
 511                  * because the client-side code might have done string escaping using
 512                  * the selected client_encoding.  If the client encoding is ASCII-safe
 513                  * then we just do a straight validation under that encoding.  For an
 514                  * ASCII-unsafe encoding we have a problem: we dare not pass such data
 515                  * to the parser but we have no way to convert it.      We compromise by
 516                  * rejecting the data if it contains any non-ASCII characters.
 517                  */
 518                 if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
 519                         (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
 520                 else
 521                 {
 522                         int                     i;
 523
 524                         for (i = 0; i < len; i++)
 525                         {
 526                                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
 527                                         ereport(ERROR,
 528                                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 529                                          errmsg("invalid byte value for encoding \"%s\": 0x%02x",
 530                                                         pg_enc2name_tbl[PG_SQL_ASCII].name,
 531                                                         (unsigned char) s[i])));
 532                         }
 533                 }
 534                 return (char *) s;
 535         }
 536
 537         return perform_default_encoding_conversion(s, len, true);
 538 }
 539
 540 /*
 541  * convert server encoding to client encoding.
 542  */
 543 char *
 544 pg_server_to_client(const char *s, int len)
 545 {
 546         Assert(DatabaseEncoding);
 547         Assert(ClientEncoding);
 548
 549         if (len <= 0)
 550                 return (char *) s;
 551
 552         if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
 553                 ClientEncoding->encoding == PG_SQL_ASCII ||
 554                 DatabaseEncoding->encoding == PG_SQL_ASCII)
 555                 return (char *) s;              /* assume data is valid */
 556
 557         return perform_default_encoding_conversion(s, len, false);
 558 }
 559
 560 /*
 561  *      Perform default encoding conversion using cached FmgrInfo. Since
 562  *      this function does not access database at all, it is safe to call
 563  *      outside transactions.  If the conversion has not been set up by
 564  *      SetClientEncoding(), no conversion is performed.
 565  */
 566 static char *
 567 perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
 568 {
 569         char       *result;
 570         int                     src_encoding,
 571                                 dest_encoding;
 572         FmgrInfo   *flinfo;
 573
 574         if (is_client_to_server)
 575         {
 576                 src_encoding = ClientEncoding->encoding;
 577                 dest_encoding = DatabaseEncoding->encoding;
 578                 flinfo = ToServerConvProc;
 579         }
 580         else
 581         {
 582                 src_encoding = DatabaseEncoding->encoding;
 583                 dest_encoding = ClientEncoding->encoding;
 584                 flinfo = ToClientConvProc;
 585         }
 586
 587         if (flinfo == NULL)
 588                 return (char *) src;
 589
 590         /*
 591          * Allocate space for conversion result, being wary of integer overflow
 592          */
 593         if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 594                 ereport(ERROR,
 595                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 596                                  errmsg("out of memory"),
 597                  errdetail("String of %d bytes is too long for encoding conversion.",
 598                                    len)));
 599
 600         result = palloc(len * MAX_CONVERSION_GROWTH + 1);
 601
 602         FunctionCall5(flinfo,
 603                                   Int32GetDatum(src_encoding),
 604                                   Int32GetDatum(dest_encoding),
 605                                   CStringGetDatum(src),
 606                                   CStringGetDatum(result),
 607                                   Int32GetDatum(len));
 608         return result;
 609 }
 610
 611
 612
 613 #ifdef USE_WIDE_UPPER_LOWER
 614
 615 /*
 616  * wchar2char --- convert wide characters to multibyte format
 617  *
 618  * This has the same API as the standard wcstombs() function; in particular,
 619  * tolen is the maximum number of bytes to store at *to, and *from must be
 620  * zero-terminated.  The output will be zero-terminated iff there is room.
 621  */
 622 size_t
 623 wchar2char(char *to, const wchar_t *from, size_t tolen)
 624 {
 625         size_t result;
 626
 627         if (tolen == 0)
 628                 return 0;
 629
 630 #ifdef WIN32
 631         /*
 632          * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding,
 633          * and for some reason mbstowcs and wcstombs won't do this for us,
 634          * so we use MultiByteToWideChar().
 635          */
 636         if (GetDatabaseEncoding() == PG_UTF8)
 637         {
 638                 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
 639                                                                 NULL, NULL);
 640                 /* A zero return is failure */
 641                 if (result <= 0)
 642                         result = -1;
 643                 else
 644                 {
 645                         Assert(result <= tolen);
 646                         /* Microsoft counts the zero terminator in the result */
 647                         result--;
 648                 }
 649         }
 650         else
 651 #endif   /* WIN32 */
 652         {
 653                 Assert( !lc_ctype_is_c() );
 654                 result = wcstombs(to, from, tolen);
 655         }
 656         return result;
 657 }
 658
 659 /*
 660  * char2wchar --- convert multibyte characters to wide characters
 661  *
 662  * This has almost the API of mbstowcs(), except that *from need not be
 663  * null-terminated; instead, the number of input bytes is specified as
 664  * fromlen.  Also, we ereport() rather than returning -1 for invalid
 665  * input encoding.      tolen is the maximum number of wchar_t's to store at *to.
 666  * The output will be zero-terminated iff there is room.
 667  */
 668 size_t
 669 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
 670 {
 671         size_t          result;
 672
 673         if (tolen == 0)
 674                 return 0;
 675
 676 #ifdef WIN32
 677         /* See WIN32 "Unicode" comment above */
 678         if (GetDatabaseEncoding() == PG_UTF8)
 679         {
 680                 /* Win32 API does not work for zero-length input */
 681                 if (fromlen == 0)
 682                         result = 0;
 683                 else
 684                 {
 685                         result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
 686                         /* A zero return is failure */
 687                         if (result == 0)
 688                                 result = -1;
 689                 }
 690
 691                 if (result != -1)
 692                 {
 693                         Assert(result < tolen);
 694                         /* Append trailing null wchar (MultiByteToWideChar() does not) */
 695                         to[result] = 0;
 696                 }
 697         }
 698         else
 699 #endif   /* WIN32 */
 700         {
 701                 /* mbstowcs requires ending '\0' */
 702                 char       *str = pnstrdup(from, fromlen);
 703
 704                 Assert( !lc_ctype_is_c() );
 705                 result = mbstowcs(to, str, tolen);
 706                 pfree(str);
 707         }
 708
 709         if (result == -1)
 710         {
 711                 /*
 712                  * Invalid multibyte character encountered.  We try to give a useful
 713                  * error message by letting pg_verifymbstr check the string.  But it's
 714                  * possible that the string is OK to us, and not OK to mbstowcs ---
 715                  * this suggests that the LC_CTYPE locale is different from the
 716                  * database encoding.  Give a generic error message if verifymbstr
 717                  * can't find anything wrong.
 718                  */
 719                 pg_verifymbstr(from, fromlen, false);   /* might not return */
 720                 /* but if it does ... */
 721                 ereport(ERROR,
 722                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 723                                  errmsg("invalid multibyte character for locale"),
 724                                  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
 725         }
 726
 727         return result;
 728 }
 729
 730 #endif
 731
 732 /* convert a multibyte string to a wchar */
 733 int
 734 pg_mb2wchar(const char *from, pg_wchar *to)
 735 {
 736         return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
 737 }
 738
 739 /* convert a multibyte string to a wchar with a limited length */
 740 int
 741 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
 742 {
 743         return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
 744 }
 745
 746 /* same, with any encoding */
 747 int
 748 pg_encoding_mb2wchar_with_len(int encoding,
 749                                                           const char *from, pg_wchar *to, int len)
 750 {
 751         return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
 752 }
 753
 754 /* returns the byte length of a multibyte character */
 755 int
 756 pg_mblen(const char *mbstr)
 757 {
 758         return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
 759 }
 760
 761 /* returns the display length of a multibyte character */
 762 int
 763 pg_dsplen(const char *mbstr)
 764 {
 765         return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
 766 }
 767
 768 /* returns the length (counted in wchars) of a multibyte string */
 769 int
 770 pg_mbstrlen(const char *mbstr)
 771 {
 772         int                     len = 0;
 773
 774         /* optimization for single byte encoding */
 775         if (pg_database_encoding_max_length() == 1)
 776                 return strlen(mbstr);
 777
 778         while (*mbstr)
 779         {
 780                 mbstr += pg_mblen(mbstr);
 781                 len++;
 782         }
 783         return len;
 784 }
 785
 786 /* returns the length (counted in wchars) of a multibyte string
 787  * (not necessarily NULL terminated)
 788  */
 789 int
 790 pg_mbstrlen_with_len(const char *mbstr, int limit)
 791 {
 792         int                     len = 0;
 793
 794         /* optimization for single byte encoding */
 795         if (pg_database_encoding_max_length() == 1)
 796                 return limit;
 797
 798         while (limit > 0 && *mbstr)
 799         {
 800                 int                     l = pg_mblen(mbstr);
 801
 802                 limit -= l;
 803                 mbstr += l;
 804                 len++;
 805         }
 806         return len;
 807 }
 808
 809 /*
 810  * returns the byte length of a multibyte string
 811  * (not necessarily NULL terminated)
 812  * that is no longer than limit.
 813  * this function does not break multibyte character boundary.
 814  */
 815 int
 816 pg_mbcliplen(const char *mbstr, int len, int limit)
 817 {
 818         return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
 819                                                                  len, limit);
 820 }
 821
 822 /*
 823  * pg_mbcliplen with specified encoding
 824  */
 825 int
 826 pg_encoding_mbcliplen(int encoding, const char *mbstr,
 827                                           int len, int limit)
 828 {
 829         mblen_converter mblen_fn;
 830         int                     clen = 0;
 831         int                     l;
 832
 833         /* optimization for single byte encoding */
 834         if (pg_encoding_max_length(encoding) == 1)
 835                 return cliplen(mbstr, len, limit);
 836
 837         mblen_fn = pg_wchar_table[encoding].mblen;
 838
 839         while (len > 0 && *mbstr)
 840         {
 841                 l = (*mblen_fn) ((const unsigned char *) mbstr);
 842                 if ((clen + l) > limit)
 843                         break;
 844                 clen += l;
 845                 if (clen == limit)
 846                         break;
 847                 len -= l;
 848                 mbstr += l;
 849         }
 850         return clen;
 851 }
 852
 853 /*
 854  * Similar to pg_mbcliplen except the limit parameter specifies the
 855  * character length, not the byte length.
 856  */
 857 int
 858 pg_mbcharcliplen(const char *mbstr, int len, int limit)
 859 {
 860         int                     clen = 0;
 861         int                     nch = 0;
 862         int                     l;
 863
 864         /* optimization for single byte encoding */
 865         if (pg_database_encoding_max_length() == 1)
 866                 return cliplen(mbstr, len, limit);
 867
 868         while (len > 0 && *mbstr)
 869         {
 870                 l = pg_mblen(mbstr);
 871                 nch++;
 872                 if (nch > limit)
 873                         break;
 874                 clen += l;
 875                 len -= l;
 876                 mbstr += l;
 877         }
 878         return clen;
 879 }
 880
 881 /* mbcliplen for any single-byte encoding */
 882 static int
 883 cliplen(const char *str, int len, int limit)
 884 {
 885         int                     l = 0;
 886
 887         len = Min(len, limit);
 888         while (l < len && str[l])
 889                 l++;
 890         return l;
 891 }
 892
#if defined(ENABLE_NLS)
/*
 * Table pairing backend encoding IDs with the codeset name that
 * pg_bind_textdomain_codeset() passes to bind_textdomain_codeset().
 *
 * The table is searched linearly for the database encoding; an encoding
 * with no entry here simply results in no binding being made.
 *
 * NOTE(review): a few spellings ("ISO_8859-6", "LATIN-9") differ from their
 * neighbours.  Presumably they were chosen to match names the underlying
 * gettext/iconv implementation recognizes --- confirm before normalizing.
 */
static const struct codeset_map {
        int     encoding;               /* backend encoding ID (PG_xxx) */
        const char *codeset;    /* codeset name handed to gettext */
} codeset_map_array[] = {
        {PG_UTF8, "UTF-8"},
        {PG_LATIN1, "LATIN1"},
        {PG_LATIN2, "LATIN2"},
        {PG_LATIN3, "LATIN3"},
        {PG_LATIN4, "LATIN4"},
        {PG_ISO_8859_5, "ISO-8859-5"},
        {PG_ISO_8859_6, "ISO_8859-6"},
        {PG_ISO_8859_7, "ISO-8859-7"},
        {PG_ISO_8859_8, "ISO-8859-8"},
        {PG_LATIN5, "LATIN5"},
        {PG_LATIN6, "LATIN6"},
        {PG_LATIN7, "LATIN7"},
        {PG_LATIN8, "LATIN8"},
        {PG_LATIN9, "LATIN-9"},
        {PG_LATIN10, "LATIN10"},
        {PG_KOI8R, "KOI8-R"},
        {PG_KOI8U, "KOI8-U"},
        {PG_WIN1250, "CP1250"},
        {PG_WIN1251, "CP1251"},
        {PG_WIN1252, "CP1252"},
        {PG_WIN1253, "CP1253"},
        {PG_WIN1254, "CP1254"},
        {PG_WIN1255, "CP1255"},
        {PG_WIN1256, "CP1256"},
        {PG_WIN1257, "CP1257"},
        {PG_WIN1258, "CP1258"},
        {PG_WIN866, "CP866"},
        {PG_WIN874, "CP874"},
        {PG_EUC_CN, "EUC-CN"},
        {PG_EUC_JP, "EUC-JP"},
        {PG_EUC_KR, "EUC-KR"},
        {PG_EUC_TW, "EUC-TW"},
        /* EUC_JIS_2004 is bound to the same codeset name as EUC_JP */
        {PG_EUC_JIS_2004, "EUC-JP"}
};
#endif /* ENABLE_NLS */
 933
 934 void
 935 SetDatabaseEncoding(int encoding)
 936 {
 937         if (!PG_VALID_BE_ENCODING(encoding))
 938                 elog(ERROR, "invalid database encoding: %d", encoding);
 939
 940         DatabaseEncoding = &pg_enc2name_tbl[encoding];
 941         Assert(DatabaseEncoding->encoding == encoding);
 942 }
 943
 944 /*
 945  * Bind gettext to the codeset equivalent with the database encoding.
 946  */
 947 void
 948 pg_bind_textdomain_codeset(const char *domainname)
 949 {
 950 #if defined(ENABLE_NLS)
 951         int             encoding = GetDatabaseEncoding();
 952         int     i;
 953
 954         /*
 955          * gettext() uses the codeset specified by LC_CTYPE by default,
 956          * so if that matches the database encoding we don't need to do
 957          * anything. In CREATE DATABASE, we enforce or trust that the
 958          * locale's codeset matches database encoding, except for the C
 959          * locale. In C locale, we bind gettext() explicitly to the right
 960          * codeset.
 961          *
 962          * On Windows, though, gettext() tends to get confused so we always
 963          * bind it.
 964          */
 965 #ifndef WIN32
 966         const char *ctype = setlocale(LC_CTYPE, NULL);
 967
 968         if (pg_strcasecmp(ctype, "C") != 0 && pg_strcasecmp(ctype, "POSIX") != 0)
 969                 return;
 970 #endif
 971
 972         for (i = 0; i < lengthof(codeset_map_array); i++)
 973         {
 974                 if (codeset_map_array[i].encoding == encoding)
 975                 {
 976                         if (bind_textdomain_codeset(domainname,
 977                                                                                 codeset_map_array[i].codeset) == NULL)
 978                                 elog(LOG, "bind_textdomain_codeset failed");
 979                         break;
 980                 }
 981         }
 982 #endif
 983 }
 984
 985 int
 986 GetDatabaseEncoding(void)
 987 {
 988         Assert(DatabaseEncoding);
 989         return DatabaseEncoding->encoding;
 990 }
 991
 992 const char *
 993 GetDatabaseEncodingName(void)
 994 {
 995         Assert(DatabaseEncoding);
 996         return DatabaseEncoding->name;
 997 }
 998
 999 Datum
1000 getdatabaseencoding(PG_FUNCTION_ARGS)
1001 {
1002         Assert(DatabaseEncoding);
1003         return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1004 }
1005
1006 Datum
1007 pg_client_encoding(PG_FUNCTION_ARGS)
1008 {
1009         Assert(ClientEncoding);
1010         return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1011 }