granicus.if.org Git - postgresql/blob - src/backend/utils/mb/mbutils.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * mbutils.c
   4  *        This file contains functions for encoding conversion.
   5  *
   6  * The string-conversion functions in this file share some API quirks.
   7  * Note the following:
   8  *
   9  * The functions return a palloc'd, null-terminated string if conversion
  10  * is required.  However, if no conversion is performed, the given source
  11  * string pointer is returned as-is.
  12  *
  13  * Although the presence of a length argument means that callers can pass
  14  * non-null-terminated strings, care is required because the same string
  15  * will be passed back if no conversion occurs.  Such callers *must* check
  16  * whether result == src and handle that case differently.
  17  *
  18  * If the source and destination encodings are the same, the source string
  19  * is returned without any verification; it's assumed to be valid data.
  20  * If that might not be the case, the caller is responsible for validating
  21  * the string using a separate call to pg_verify_mbstr().  Whenever the
  22  * source and destination encodings are different, the functions ensure that
  23  * the result is validly encoded according to the destination encoding.
  24  *
  25  *
  26  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  27  * Portions Copyright (c) 1994, Regents of the University of California
  28  *
  29  *
  30  * IDENTIFICATION
  31  *        src/backend/utils/mb/mbutils.c
  32  *
  33  *-------------------------------------------------------------------------
  34  */
  35 #include "postgres.h"
  36
  37 #include "access/xact.h"
  38 #include "catalog/namespace.h"
  39 #include "mb/pg_wchar.h"
  40 #include "utils/builtins.h"
  41 #include "utils/memutils.h"
  42 #include "utils/syscache.h"
  43
  44 /*
  45  * When converting strings between different encodings, we assume that the
  46  * converted result needs at most 4-to-1 growth in the worst case.  The rates
  47  * for currently supported encoding pairs are within 3 (SJIS JIS X0201 half
  48  * width kana -> UTF8 is the worst case), so "4" should be enough for now.
  49  *
  50  * Note that this is not the same as the maximum character width in any
  51  * particular encoding.
  52  */
  53 #define MAX_CONVERSION_GROWTH  4
  54
  55 /*
  56  * We maintain a simple linked list caching the fmgr lookup info for the
  57  * currently selected conversion functions, as well as any that have been
  58  * selected previously in the current session.  (We remember previous
  59  * settings because we must be able to restore a previous setting during
  60  * transaction rollback, without doing any fresh catalog accesses.)
  61  *
  62  * Since we'll never release this data, we just keep it in TopMemoryContext.
  63  */
  64 typedef struct ConvProcInfo
  65 {
  66         int                     s_encoding;             /* server and client encoding IDs */
  67         int                     c_encoding;
  68         FmgrInfo        to_server_info; /* lookup info for conversion procs */
  69         FmgrInfo        to_client_info;
  70 } ConvProcInfo;
  71
  72 static List *ConvProcList = NIL;        /* List of ConvProcInfo */
  73
  74 /*
  75  * These variables point to the currently active conversion functions,
  76  * or are NULL when no conversion is needed.
  77  */
  78 static FmgrInfo *ToServerConvProc = NULL;
  79 static FmgrInfo *ToClientConvProc = NULL;
  80
  81 /*
  82  * These variables track the currently-selected encodings.
  83  */
  84 static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
  85 static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
  86 static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
  87
  88 /*
  89  * During backend startup we can't set client encoding because we (a)
  90  * can't look up the conversion functions, and (b) may not know the database
  91  * encoding yet either.  So SetClientEncoding() just accepts anything and
  92  * remembers it for InitializeClientEncoding() to apply later.
  93  */
  94 static bool backend_startup_complete = false;
  95 static int      pending_client_encoding = PG_SQL_ASCII;
  96
  97
  98 /* Internal functions */
  99 static char *perform_default_encoding_conversion(const char *src,
 100                                                                         int len, bool is_client_to_server);
 101 static int      cliplen(const char *str, int len, int limit);
 102
 103
 104 /*
 105  * Prepare for a future call to SetClientEncoding.  Success should mean
 106  * that SetClientEncoding is guaranteed to succeed for this encoding request.
 107  *
 108  * (But note that success before backend_startup_complete does not guarantee
 109  * success after ...)
 110  *
 111  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
 112  */
 113 int
 114 PrepareClientEncoding(int encoding)
 115 {
 116         int                     current_server_encoding;
 117         ListCell   *lc;
 118
 119         if (!PG_VALID_FE_ENCODING(encoding))
 120                 return -1;
 121
 122         /* Can't do anything during startup, per notes above */
 123         if (!backend_startup_complete)
 124                 return 0;
 125
 126         current_server_encoding = GetDatabaseEncoding();
 127
 128         /*
 129          * Check for cases that require no conversion function.
 130          */
 131         if (current_server_encoding == encoding ||
 132                 current_server_encoding == PG_SQL_ASCII ||
 133                 encoding == PG_SQL_ASCII)
 134                 return 0;
 135
 136         if (IsTransactionState())
 137         {
 138                 /*
 139                  * If we're in a live transaction, it's safe to access the catalogs,
 140                  * so look up the functions.  We repeat the lookup even if the info is
 141                  * already cached, so that we can react to changes in the contents of
 142                  * pg_conversion.
 143                  */
 144                 Oid                     to_server_proc,
 145                                         to_client_proc;
 146                 ConvProcInfo *convinfo;
 147                 MemoryContext oldcontext;
 148
 149                 to_server_proc = FindDefaultConversionProc(encoding,
 150                                                                                                    current_server_encoding);
 151                 if (!OidIsValid(to_server_proc))
 152                         return -1;
 153                 to_client_proc = FindDefaultConversionProc(current_server_encoding,
 154                                                                                                    encoding);
 155                 if (!OidIsValid(to_client_proc))
 156                         return -1;
 157
 158                 /*
 159                  * Load the fmgr info into TopMemoryContext (could still fail here)
 160                  */
 161                 convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
 162                                                                                                            sizeof(ConvProcInfo));
 163                 convinfo->s_encoding = current_server_encoding;
 164                 convinfo->c_encoding = encoding;
 165                 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
 166                                           TopMemoryContext);
 167                 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
 168                                           TopMemoryContext);
 169
 170                 /* Attach new info to head of list */
 171                 oldcontext = MemoryContextSwitchTo(TopMemoryContext);
 172                 ConvProcList = lcons(convinfo, ConvProcList);
 173                 MemoryContextSwitchTo(oldcontext);
 174
 175                 /*
 176                  * We cannot yet remove any older entry for the same encoding pair,
 177                  * since it could still be in use.  SetClientEncoding will clean up.
 178                  */
 179
 180                 return 0;                               /* success */
 181         }
 182         else
 183         {
 184                 /*
 185                  * If we're not in a live transaction, the only thing we can do is
 186                  * restore a previous setting using the cache.  This covers all
 187                  * transaction-rollback cases.  The only case it might not work for is
 188                  * trying to change client_encoding on the fly by editing
 189                  * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
 190                  * thing to do anyway.
 191                  */
 192                 foreach(lc, ConvProcList)
 193                 {
 194                         ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
 195
 196                         if (oldinfo->s_encoding == current_server_encoding &&
 197                                 oldinfo->c_encoding == encoding)
 198                                 return 0;
 199                 }
 200
 201                 return -1;                              /* it's not cached, so fail */
 202         }
 203 }
 204
 205 /*
 206  * Set the active client encoding and set up the conversion-function pointers.
 207  * PrepareClientEncoding should have been called previously for this encoding.
 208  *
 209  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
 210  */
 211 int
 212 SetClientEncoding(int encoding)
 213 {
 214         int                     current_server_encoding;
 215         bool            found;
 216         ListCell   *lc;
 217         ListCell   *prev;
 218         ListCell   *next;
 219
 220         if (!PG_VALID_FE_ENCODING(encoding))
 221                 return -1;
 222
 223         /* Can't do anything during startup, per notes above */
 224         if (!backend_startup_complete)
 225         {
 226                 pending_client_encoding = encoding;
 227                 return 0;
 228         }
 229
 230         current_server_encoding = GetDatabaseEncoding();
 231
 232         /*
 233          * Check for cases that require no conversion function.
 234          */
 235         if (current_server_encoding == encoding ||
 236                 current_server_encoding == PG_SQL_ASCII ||
 237                 encoding == PG_SQL_ASCII)
 238         {
 239                 ClientEncoding = &pg_enc2name_tbl[encoding];
 240                 ToServerConvProc = NULL;
 241                 ToClientConvProc = NULL;
 242                 return 0;
 243         }
 244
 245         /*
 246          * Search the cache for the entry previously prepared by
 247          * PrepareClientEncoding; if there isn't one, we lose.  While at it,
 248          * release any duplicate entries so that repeated Prepare/Set cycles don't
 249          * leak memory.
 250          */
 251         found = false;
 252         prev = NULL;
 253         for (lc = list_head(ConvProcList); lc; lc = next)
 254         {
 255                 ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
 256
 257                 next = lnext(lc);
 258
 259                 if (convinfo->s_encoding == current_server_encoding &&
 260                         convinfo->c_encoding == encoding)
 261                 {
 262                         if (!found)
 263                         {
 264                                 /* Found newest entry, so set up */
 265                                 ClientEncoding = &pg_enc2name_tbl[encoding];
 266                                 ToServerConvProc = &convinfo->to_server_info;
 267                                 ToClientConvProc = &convinfo->to_client_info;
 268                                 found = true;
 269                         }
 270                         else
 271                         {
 272                                 /* Duplicate entry, release it */
 273                                 ConvProcList = list_delete_cell(ConvProcList, lc, prev);
 274                                 pfree(convinfo);
 275                                 continue;               /* prev mustn't advance */
 276                         }
 277                 }
 278
 279                 prev = lc;
 280         }
 281
 282         if (found)
 283                 return 0;                               /* success */
 284         else
 285                 return -1;                              /* it's not cached, so fail */
 286 }
 287
 288 /*
 289  * Initialize client encoding conversions.
 290  *              Called from InitPostgres() once during backend startup.
 291  */
 292 void
 293 InitializeClientEncoding(void)
 294 {
 295         Assert(!backend_startup_complete);
 296         backend_startup_complete = true;
 297
 298         if (PrepareClientEncoding(pending_client_encoding) < 0 ||
 299                 SetClientEncoding(pending_client_encoding) < 0)
 300         {
 301                 /*
 302                  * Oops, the requested conversion is not available. We couldn't fail
 303                  * before, but we can now.
 304                  */
 305                 ereport(FATAL,
 306                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 307                                  errmsg("conversion between %s and %s is not supported",
 308                                                 pg_enc2name_tbl[pending_client_encoding].name,
 309                                                 GetDatabaseEncodingName())));
 310         }
 311 }
 312
 313 /*
 314  * returns the current client encoding
 315  */
 316 int
 317 pg_get_client_encoding(void)
 318 {
 319         return ClientEncoding->encoding;
 320 }
 321
 322 /*
 323  * returns the current client encoding name
 324  */
 325 const char *
 326 pg_get_client_encoding_name(void)
 327 {
 328         return ClientEncoding->name;
 329 }
 330
 331 /*
 332  * Convert src string to another encoding (general case).
 333  *
 334  * See the notes about string conversion functions at the top of this file.
 335  */
 336 unsigned char *
 337 pg_do_encoding_conversion(unsigned char *src, int len,
 338                                                   int src_encoding, int dest_encoding)
 339 {
 340         unsigned char *result;
 341         Oid                     proc;
 342
 343         if (len <= 0)
 344                 return src;                             /* empty string is always valid */
 345
 346         if (src_encoding == dest_encoding)
 347                 return src;                             /* no conversion required, assume valid */
 348
 349         if (dest_encoding == PG_SQL_ASCII)
 350                 return src;                             /* any string is valid in SQL_ASCII */
 351
 352         if (src_encoding == PG_SQL_ASCII)
 353         {
 354                 /* No conversion is possible, but we must validate the result */
 355                 (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
 356                 return src;
 357         }
 358
 359         if (!IsTransactionState())      /* shouldn't happen */
 360                 elog(ERROR, "cannot perform encoding conversion outside a transaction");
 361
 362         proc = FindDefaultConversionProc(src_encoding, dest_encoding);
 363         if (!OidIsValid(proc))
 364                 ereport(ERROR,
 365                                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
 366                                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
 367                                                 pg_encoding_to_char(src_encoding),
 368                                                 pg_encoding_to_char(dest_encoding))));
 369
 370         /*
 371          * Allocate space for conversion result, being wary of integer overflow
 372          */
 373         if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 374                 ereport(ERROR,
 375                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 376                                  errmsg("out of memory"),
 377                  errdetail("String of %d bytes is too long for encoding conversion.",
 378                                    len)));
 379
 380         result = palloc(len * MAX_CONVERSION_GROWTH + 1);
 381
 382         OidFunctionCall5(proc,
 383                                          Int32GetDatum(src_encoding),
 384                                          Int32GetDatum(dest_encoding),
 385                                          CStringGetDatum(src),
 386                                          CStringGetDatum(result),
 387                                          Int32GetDatum(len));
 388         return result;
 389 }
 390
 391 /*
 392  * Convert string to encoding encoding_name. The source
 393  * encoding is the DB encoding.
 394  *
 395  * BYTEA convert_to(TEXT string, NAME encoding_name) */
 396 Datum
 397 pg_convert_to(PG_FUNCTION_ARGS)
 398 {
 399         Datum           string = PG_GETARG_DATUM(0);
 400         Datum           dest_encoding_name = PG_GETARG_DATUM(1);
 401         Datum           src_encoding_name = DirectFunctionCall1(namein,
 402                                                                         CStringGetDatum(DatabaseEncoding->name));
 403         Datum           result;
 404
 405         /*
 406          * pg_convert expects a bytea as its first argument. We're passing it a
 407          * text argument here, relying on the fact that they are both in fact
 408          * varlena types, and thus structurally identical.
 409          */
 410         result = DirectFunctionCall3(pg_convert, string,
 411                                                                  src_encoding_name, dest_encoding_name);
 412
 413         PG_RETURN_DATUM(result);
 414 }
 415
 416 /*
 417  * Convert string from encoding encoding_name. The destination
 418  * encoding is the DB encoding.
 419  *
 420  * TEXT convert_from(BYTEA string, NAME encoding_name) */
 421 Datum
 422 pg_convert_from(PG_FUNCTION_ARGS)
 423 {
 424         Datum           string = PG_GETARG_DATUM(0);
 425         Datum           src_encoding_name = PG_GETARG_DATUM(1);
 426         Datum           dest_encoding_name = DirectFunctionCall1(namein,
 427                                                                         CStringGetDatum(DatabaseEncoding->name));
 428         Datum           result;
 429
 430         result = DirectFunctionCall3(pg_convert, string,
 431                                                                  src_encoding_name, dest_encoding_name);
 432
 433         /*
 434          * pg_convert returns a bytea, which we in turn return as text, relying on
 435          * the fact that they are both in fact varlena types, and thus
 436          * structurally identical. Although not all bytea values are valid text,
 437          * in this case it will be because we've told pg_convert to return one
 438          * that is valid as text in the current database encoding.
 439          */
 440         PG_RETURN_DATUM(result);
 441 }
 442
 443 /*
 444  * Convert string between two arbitrary encodings.
 445  *
 446  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
 447  */
 448 Datum
 449 pg_convert(PG_FUNCTION_ARGS)
 450 {
 451         bytea      *string = PG_GETARG_BYTEA_PP(0);
 452         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 453         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 454         char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
 455         int                     dest_encoding = pg_char_to_encoding(dest_encoding_name);
 456         const char *src_str;
 457         char       *dest_str;
 458         bytea      *retval;
 459         int                     len;
 460
 461         if (src_encoding < 0)
 462                 ereport(ERROR,
 463                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 464                                  errmsg("invalid source encoding name \"%s\"",
 465                                                 src_encoding_name)));
 466         if (dest_encoding < 0)
 467                 ereport(ERROR,
 468                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 469                                  errmsg("invalid destination encoding name \"%s\"",
 470                                                 dest_encoding_name)));
 471
 472         /* make sure that source string is valid */
 473         len = VARSIZE_ANY_EXHDR(string);
 474         src_str = VARDATA_ANY(string);
 475         pg_verify_mbstr_len(src_encoding, src_str, len, false);
 476
 477         /* perform conversion */
 478         dest_str = (char *) pg_do_encoding_conversion((unsigned char *) src_str,
 479                                                                                                   len,
 480                                                                                                   src_encoding,
 481                                                                                                   dest_encoding);
 482
 483         /* update len if conversion actually happened */
 484         if (dest_str != src_str)
 485                 len = strlen(dest_str);
 486
 487         /*
 488          * build bytea data type structure.
 489          */
 490         retval = (bytea *) palloc(len + VARHDRSZ);
 491         SET_VARSIZE(retval, len + VARHDRSZ);
 492         memcpy(VARDATA(retval), dest_str, len);
 493
 494         if (dest_str != src_str)
 495                 pfree(dest_str);
 496
 497         /* free memory if allocated by the toaster */
 498         PG_FREE_IF_COPY(string, 0);
 499
 500         PG_RETURN_BYTEA_P(retval);
 501 }
 502
 503 /*
 504  * get the length of the string considered as text in the specified
 505  * encoding. Raises an error if the data is not valid in that
 506  * encoding.
 507  *
 508  * INT4 length (BYTEA string, NAME src_encoding_name)
 509  */
 510 Datum
 511 length_in_encoding(PG_FUNCTION_ARGS)
 512 {
 513         bytea      *string = PG_GETARG_BYTEA_PP(0);
 514         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 515         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 516         const char *src_str;
 517         int                     len;
 518         int                     retval;
 519
 520         if (src_encoding < 0)
 521                 ereport(ERROR,
 522                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 523                                  errmsg("invalid encoding name \"%s\"",
 524                                                 src_encoding_name)));
 525
 526         len = VARSIZE_ANY_EXHDR(string);
 527         src_str = VARDATA_ANY(string);
 528
 529         retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
 530
 531         PG_RETURN_INT32(retval);
 532 }
 533
 534 /*
 535  * Get maximum multibyte character length in the specified encoding.
 536  *
 537  * Note encoding is specified numerically, not by name as above.
 538  */
 539 Datum
 540 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
 541 {
 542         int                     encoding = PG_GETARG_INT32(0);
 543
 544         if (PG_VALID_ENCODING(encoding))
 545                 PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
 546         else
 547                 PG_RETURN_NULL();
 548 }
 549
 550 /*
 551  * Convert client encoding to server encoding.
 552  *
 553  * See the notes about string conversion functions at the top of this file.
 554  */
 555 char *
 556 pg_client_to_server(const char *s, int len)
 557 {
 558         return pg_any_to_server(s, len, ClientEncoding->encoding);
 559 }
 560
 561 /*
 562  * Convert any encoding to server encoding.
 563  *
 564  * See the notes about string conversion functions at the top of this file.
 565  *
 566  * Unlike the other string conversion functions, this will apply validation
 567  * even if encoding == DatabaseEncoding->encoding.  This is because this is
 568  * used to process data coming in from outside the database, and we never
 569  * want to just assume validity.
 570  */
 571 char *
 572 pg_any_to_server(const char *s, int len, int encoding)
 573 {
 574         if (len <= 0)
 575                 return (char *) s;              /* empty string is always valid */
 576
 577         if (encoding == DatabaseEncoding->encoding ||
 578                 encoding == PG_SQL_ASCII)
 579         {
 580                 /*
 581                  * No conversion is needed, but we must still validate the data.
 582                  */
 583                 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
 584                 return (char *) s;
 585         }
 586
 587         if (DatabaseEncoding->encoding == PG_SQL_ASCII)
 588         {
 589                 /*
 590                  * No conversion is possible, but we must still validate the data,
 591                  * because the client-side code might have done string escaping using
 592                  * the selected client_encoding.  If the client encoding is ASCII-safe
 593                  * then we just do a straight validation under that encoding.  For an
 594                  * ASCII-unsafe encoding we have a problem: we dare not pass such data
 595                  * to the parser but we have no way to convert it.  We compromise by
 596                  * rejecting the data if it contains any non-ASCII characters.
 597                  */
 598                 if (PG_VALID_BE_ENCODING(encoding))
 599                         (void) pg_verify_mbstr(encoding, s, len, false);
 600                 else
 601                 {
 602                         int                     i;
 603
 604                         for (i = 0; i < len; i++)
 605                         {
 606                                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
 607                                         ereport(ERROR,
 608                                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 609                                          errmsg("invalid byte value for encoding \"%s\": 0x%02x",
 610                                                         pg_enc2name_tbl[PG_SQL_ASCII].name,
 611                                                         (unsigned char) s[i])));
 612                         }
 613                 }
 614                 return (char *) s;
 615         }
 616
 617         /* Fast path if we can use cached conversion function */
 618         if (encoding == ClientEncoding->encoding)
 619                 return perform_default_encoding_conversion(s, len, true);
 620
 621         /* General case ... will not work outside transactions */
 622         return (char *) pg_do_encoding_conversion((unsigned char *) s,
 623                                                                                           len,
 624                                                                                           encoding,
 625                                                                                           DatabaseEncoding->encoding);
 626 }
 627
 628 /*
 629  * Convert server encoding to client encoding.
 630  *
 631  * See the notes about string conversion functions at the top of this file.
 632  */
 633 char *
 634 pg_server_to_client(const char *s, int len)
 635 {
 636         return pg_server_to_any(s, len, ClientEncoding->encoding);
 637 }
 638
 639 /*
 640  * Convert server encoding to any encoding.
 641  *
 642  * See the notes about string conversion functions at the top of this file.
 643  */
 644 char *
 645 pg_server_to_any(const char *s, int len, int encoding)
 646 {
 647         if (len <= 0)
 648                 return (char *) s;              /* empty string is always valid */
 649
 650         if (encoding == DatabaseEncoding->encoding ||
 651                 encoding == PG_SQL_ASCII)
 652                 return (char *) s;              /* assume data is valid */
 653
 654         if (DatabaseEncoding->encoding == PG_SQL_ASCII)
 655         {
 656                 /* No conversion is possible, but we must validate the result */
 657                 (void) pg_verify_mbstr(encoding, s, len, false);
 658                 return (char *) s;
 659         }
 660
 661         /* Fast path if we can use cached conversion function */
 662         if (encoding == ClientEncoding->encoding)
 663                 return perform_default_encoding_conversion(s, len, false);
 664
 665         /* General case ... will not work outside transactions */
 666         return (char *) pg_do_encoding_conversion((unsigned char *) s,
 667                                                                                           len,
 668                                                                                           DatabaseEncoding->encoding,
 669                                                                                           encoding);
 670 }
 671
 672 /*
 673  *      Perform default encoding conversion using cached FmgrInfo. Since
 674  *      this function does not access database at all, it is safe to call
 675  *      outside transactions.  If the conversion has not been set up by
 676  *      SetClientEncoding(), no conversion is performed.
 677  */
 678 static char *
 679 perform_default_encoding_conversion(const char *src, int len,
 680                                                                         bool is_client_to_server)
 681 {
 682         char       *result;
 683         int                     src_encoding,
 684                                 dest_encoding;
 685         FmgrInfo   *flinfo;
 686
 687         if (is_client_to_server)
 688         {
 689                 src_encoding = ClientEncoding->encoding;
 690                 dest_encoding = DatabaseEncoding->encoding;
 691                 flinfo = ToServerConvProc;
 692         }
 693         else
 694         {
 695                 src_encoding = DatabaseEncoding->encoding;
 696                 dest_encoding = ClientEncoding->encoding;
 697                 flinfo = ToClientConvProc;
 698         }
 699
 700         if (flinfo == NULL)
 701                 return (char *) src;
 702
 703         /*
 704          * Allocate space for conversion result, being wary of integer overflow
 705          */
 706         if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 707                 ereport(ERROR,
 708                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 709                                  errmsg("out of memory"),
 710                  errdetail("String of %d bytes is too long for encoding conversion.",
 711                                    len)));
 712
 713         result = palloc(len * MAX_CONVERSION_GROWTH + 1);
 714
 715         FunctionCall5(flinfo,
 716                                   Int32GetDatum(src_encoding),
 717                                   Int32GetDatum(dest_encoding),
 718                                   CStringGetDatum(src),
 719                                   CStringGetDatum(result),
 720                                   Int32GetDatum(len));
 721         return result;
 722 }
 723
 724
 725 /* convert a multibyte string to a wchar */
 726 int
 727 pg_mb2wchar(const char *from, pg_wchar *to)
 728 {
 729         return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
 730 }
 731
 732 /* convert a multibyte string to a wchar with a limited length */
 733 int
 734 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
 735 {
 736         return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
 737 }
 738
 739 /* same, with any encoding */
 740 int
 741 pg_encoding_mb2wchar_with_len(int encoding,
 742                                                           const char *from, pg_wchar *to, int len)
 743 {
 744         return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
 745 }
 746
 747 /* convert a wchar string to a multibyte */
 748 int
 749 pg_wchar2mb(const pg_wchar *from, char *to)
 750 {
 751         return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, pg_wchar_strlen(from));
 752 }
 753
 754 /* convert a wchar string to a multibyte with a limited length */
 755 int
 756 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
 757 {
 758         return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
 759 }
 760
 761 /* same, with any encoding */
 762 int
 763 pg_encoding_wchar2mb_with_len(int encoding,
 764                                                           const pg_wchar *from, char *to, int len)
 765 {
 766         return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
 767 }
 768
 769 /* returns the byte length of a multibyte character */
 770 int
 771 pg_mblen(const char *mbstr)
 772 {
 773         return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
 774 }
 775
 776 /* returns the display length of a multibyte character */
 777 int
 778 pg_dsplen(const char *mbstr)
 779 {
 780         return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
 781 }
 782
 783 /* returns the length (counted in wchars) of a multibyte string */
 784 int
 785 pg_mbstrlen(const char *mbstr)
 786 {
 787         int                     len = 0;
 788
 789         /* optimization for single byte encoding */
 790         if (pg_database_encoding_max_length() == 1)
 791                 return strlen(mbstr);
 792
 793         while (*mbstr)
 794         {
 795                 mbstr += pg_mblen(mbstr);
 796                 len++;
 797         }
 798         return len;
 799 }
 800
 801 /* returns the length (counted in wchars) of a multibyte string
 802  * (not necessarily NULL terminated)
 803  */
 804 int
 805 pg_mbstrlen_with_len(const char *mbstr, int limit)
 806 {
 807         int                     len = 0;
 808
 809         /* optimization for single byte encoding */
 810         if (pg_database_encoding_max_length() == 1)
 811                 return limit;
 812
 813         while (limit > 0 && *mbstr)
 814         {
 815                 int                     l = pg_mblen(mbstr);
 816
 817                 limit -= l;
 818                 mbstr += l;
 819                 len++;
 820         }
 821         return len;
 822 }
 823
 824 /*
 825  * returns the byte length of a multibyte string
 826  * (not necessarily NULL terminated)
 827  * that is no longer than limit.
 828  * this function does not break multibyte character boundary.
 829  */
 830 int
 831 pg_mbcliplen(const char *mbstr, int len, int limit)
 832 {
 833         return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
 834                                                                  len, limit);
 835 }
 836
 837 /*
 838  * pg_mbcliplen with specified encoding
 839  */
 840 int
 841 pg_encoding_mbcliplen(int encoding, const char *mbstr,
 842                                           int len, int limit)
 843 {
 844         mblen_converter mblen_fn;
 845         int                     clen = 0;
 846         int                     l;
 847
 848         /* optimization for single byte encoding */
 849         if (pg_encoding_max_length(encoding) == 1)
 850                 return cliplen(mbstr, len, limit);
 851
 852         mblen_fn = pg_wchar_table[encoding].mblen;
 853
 854         while (len > 0 && *mbstr)
 855         {
 856                 l = (*mblen_fn) ((const unsigned char *) mbstr);
 857                 if ((clen + l) > limit)
 858                         break;
 859                 clen += l;
 860                 if (clen == limit)
 861                         break;
 862                 len -= l;
 863                 mbstr += l;
 864         }
 865         return clen;
 866 }
 867
 868 /*
 869  * Similar to pg_mbcliplen except the limit parameter specifies the
 870  * character length, not the byte length.
 871  */
 872 int
 873 pg_mbcharcliplen(const char *mbstr, int len, int limit)
 874 {
 875         int                     clen = 0;
 876         int                     nch = 0;
 877         int                     l;
 878
 879         /* optimization for single byte encoding */
 880         if (pg_database_encoding_max_length() == 1)
 881                 return cliplen(mbstr, len, limit);
 882
 883         while (len > 0 && *mbstr)
 884         {
 885                 l = pg_mblen(mbstr);
 886                 nch++;
 887                 if (nch > limit)
 888                         break;
 889                 clen += l;
 890                 len -= l;
 891                 mbstr += l;
 892         }
 893         return clen;
 894 }
 895
 896 /* mbcliplen for any single-byte encoding */
 897 static int
 898 cliplen(const char *str, int len, int limit)
 899 {
 900         int                     l = 0;
 901
 902         len = Min(len, limit);
 903         while (l < len && str[l])
 904                 l++;
 905         return l;
 906 }
 907
 908 void
 909 SetDatabaseEncoding(int encoding)
 910 {
 911         if (!PG_VALID_BE_ENCODING(encoding))
 912                 elog(ERROR, "invalid database encoding: %d", encoding);
 913
 914         DatabaseEncoding = &pg_enc2name_tbl[encoding];
 915         Assert(DatabaseEncoding->encoding == encoding);
 916 }
 917
 918 void
 919 SetMessageEncoding(int encoding)
 920 {
 921         /* Some calls happen before we can elog()! */
 922         Assert(PG_VALID_ENCODING(encoding));
 923
 924         MessageEncoding = &pg_enc2name_tbl[encoding];
 925         Assert(MessageEncoding->encoding == encoding);
 926 }
 927
 928 #ifdef ENABLE_NLS
 929 /*
 930  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
 931  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
 932  * fail for gettext-internal causes like out-of-memory.
 933  */
 934 static bool
 935 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
 936 {
 937         bool            elog_ok = (CurrentMemoryContext != NULL);
 938         int                     i;
 939
 940         for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
 941         {
 942                 if (pg_enc2gettext_tbl[i].encoding == encoding)
 943                 {
 944                         if (bind_textdomain_codeset(domainname,
 945                                                                                 pg_enc2gettext_tbl[i].name) != NULL)
 946                                 return true;
 947
 948                         if (elog_ok)
 949                                 elog(LOG, "bind_textdomain_codeset failed");
 950                         else
 951                                 write_stderr("bind_textdomain_codeset failed");
 952
 953                         break;
 954                 }
 955         }
 956
 957         return false;
 958 }
 959
 960 /*
 961  * Bind a gettext message domain to the codeset corresponding to the database
 962  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
 963  * Return the MessageEncoding implied by the new settings.
 964  *
 965  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
 966  * When that matches the database encoding, we don't need to do anything.  In
 967  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
 968  * database encoding, except for the C locale.  (On Windows, we also permit a
 969  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
 970  * gettext to the right codeset.
 971  *
 972  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
 973  * convenient departure for software that passes the strings to Windows ANSI
 974  * APIs, but we don't do that.  Compel gettext to use database encoding or,
 975  * failing that, the LC_CTYPE encoding as it would on other platforms.
 976  *
 977  * This function is called before elog() and palloc() are usable.
 978  */
 979 int
 980 pg_bind_textdomain_codeset(const char *domainname)
 981 {
 982         bool            elog_ok = (CurrentMemoryContext != NULL);
 983         int                     encoding = GetDatabaseEncoding();
 984         int                     new_msgenc;
 985
 986 #ifndef WIN32
 987         const char *ctype = setlocale(LC_CTYPE, NULL);
 988
 989         if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
 990 #endif
 991                 if (encoding != PG_SQL_ASCII &&
 992                         raw_pg_bind_textdomain_codeset(domainname, encoding))
 993                         return encoding;
 994
 995         new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
 996         if (new_msgenc < 0)
 997                 new_msgenc = PG_SQL_ASCII;
 998
 999 #ifdef WIN32
1000         if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1001                 /* On failure, the old message encoding remains valid. */
1002                 return GetMessageEncoding();
1003 #endif
1004
1005         return new_msgenc;
1006 }
1007 #endif
1008
1009 /*
1010  * The database encoding, also called the server encoding, represents the
1011  * encoding of data stored in text-like data types.  Affected types include
1012  * cstring, text, varchar, name, xml, and json.
1013  */
1014 int
1015 GetDatabaseEncoding(void)
1016 {
1017         return DatabaseEncoding->encoding;
1018 }
1019
1020 const char *
1021 GetDatabaseEncodingName(void)
1022 {
1023         return DatabaseEncoding->name;
1024 }
1025
1026 Datum
1027 getdatabaseencoding(PG_FUNCTION_ARGS)
1028 {
1029         return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1030 }
1031
1032 Datum
1033 pg_client_encoding(PG_FUNCTION_ARGS)
1034 {
1035         return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1036 }
1037
1038 /*
1039  * gettext() returns messages in this encoding.  This often matches the
1040  * database encoding, but it differs for SQL_ASCII databases, for processes
1041  * not attached to a database, and under a database encoding lacking iconv
1042  * support (MULE_INTERNAL).
1043  */
1044 int
1045 GetMessageEncoding(void)
1046 {
1047         return MessageEncoding->encoding;
1048 }
1049
1050 #ifdef WIN32
1051 /*
1052  * Result is palloc'ed null-terminated utf16 string. The character length
1053  * is also passed to utf16len if not null. Returns NULL iff failed.
1054  */
1055 WCHAR *
1056 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1057 {
1058         WCHAR      *utf16;
1059         int                     dstlen;
1060         UINT            codepage;
1061
1062         codepage = pg_enc2name_tbl[GetMessageEncoding()].codepage;
1063
1064         /*
1065          * Use MultiByteToWideChar directly if there is a corresponding codepage,
1066          * or double conversion through UTF8 if not.
1067          */
1068         if (codepage != 0)
1069         {
1070                 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1071                 dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1072                 utf16[dstlen] = (WCHAR) 0;
1073         }
1074         else
1075         {
1076                 char       *utf8;
1077
1078                 utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1079                                                                                                   len,
1080                                                                                                   GetMessageEncoding(),
1081                                                                                                   PG_UTF8);
1082                 if (utf8 != str)
1083                         len = strlen(utf8);
1084
1085                 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1086                 dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1087                 utf16[dstlen] = (WCHAR) 0;
1088
1089                 if (utf8 != str)
1090                         pfree(utf8);
1091         }
1092
1093         if (dstlen == 0 && len > 0)
1094         {
1095                 pfree(utf16);
1096                 return NULL;                    /* error */
1097         }
1098
1099         if (utf16len)
1100                 *utf16len = dstlen;
1101         return utf16;
1102 }
1103
1104 #endif