2 * This file contains public functions for conversion between
3 * client encoding and server internal encoding.
4 * (currently mule internal code (mic) is used)
7 * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.61 2006/12/24 00:57:48 tgl Exp $
11 #include "access/xact.h"
12 #include "catalog/namespace.h"
13 #include "mb/pg_wchar.h"
14 #include "utils/builtins.h"
15 #include "utils/memutils.h"
16 #include "utils/syscache.h"
19 * For the current frontend (FE) and backend (BE) encodings we keep both the
20 * encoding identifier and the encoding name. This avoids searching for, and
21 * converting to, the encoding name in getdatabaseencoding() and other routines.
/* Table entry for the current client (frontend) encoding; starts as SQL_ASCII. */
23 static pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
/* Table entry for the current database (backend) encoding; starts as SQL_ASCII. */
24 static pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
27 * Caches for conversion function info. These values are allocated in
28 * MbProcContext. That context is a child of TopMemoryContext,
29 * which allows these values to survive across transactions. See
30 * SetClientEncoding() for more details.
/* Memory context holding the cached FmgrInfo structs below; NULL until first created. */
32 static MemoryContext MbProcContext = NULL;
/* Cached client-to-server conversion function info; NULL when none is installed. */
33 static FmgrInfo *ToServerConvProc = NULL;
/* Cached server-to-client conversion function info; NULL when none is installed. */
34 static FmgrInfo *ToClientConvProc = NULL;
37 * During backend startup we can't set client encoding because we (a)
38 * can't look up the conversion functions, and (b) may not know the database
39 * encoding yet either. So SetClientEncoding() just accepts anything and
40 * remembers it for InitializeClientEncoding() to apply later.
/* Flipped to true by InitializeClientEncoding(); tested by SetClientEncoding(). */
42 static bool backend_startup_complete = false;
/* Encoding recorded by SetClientEncoding() while startup is incomplete; applied by InitializeClientEncoding(). */
43 static int pending_client_encoding = PG_SQL_ASCII;
46 /* Internal functions */
/* Conversion through the cached FmgrInfo; is_client_to_server selects the direction. */
47 static char *perform_default_encoding_conversion(const char *src,
48 int len, bool is_client_to_server);
/* Helper called by pg_mbcliplen() and pg_mbcharcliplen() when the database encoding is single-byte. */
49 static int cliplen(const char *str, int len, int limit);
53 * Set the client encoding and save fmgrinfo for the conversion
54 * function if necessary. Returns 0 if okay, -1 if not (bad encoding
55 * or can't support conversion)
/*
 * SetClientEncoding: validate the requested client encoding and install it,
 * together with cached conversion function info when conversion is needed.
 *
 * Flow visible in this listing:
 *  - the encoding is checked with PG_VALID_FE_ENCODING;
 *  - while backend_startup_complete is false, the request is only recorded
 *    in pending_client_encoding;
 *  - when client and server encodings are equal, or either one is SQL_ASCII,
 *    no conversion function is required: ClientEncoding is updated, both
 *    cached conversion pointers are cleared and MbProcContext is reset;
 *  - otherwise a transaction is required (catalog lookups), both default
 *    conversion procedures are looked up with FindDefaultConversionProc(),
 *    and their FmgrInfo is allocated inside MbProcContext, a child of
 *    TopMemoryContext, before ClientEncoding and the cached pointers are set.
 *
 * NOTE(review): several original lines are absent from this listing (braces,
 * return statements, the test on "doit", some local declarations), so the
 * exact position of the early exits cannot be confirmed from here. Per the
 * comment preceding this function the result is 0 on success and -1 on
 * failure.
 */
58 SetClientEncoding(int encoding, bool doit)
60 int current_server_encoding;
65 MemoryContext oldcontext;
67 if (!PG_VALID_FE_ENCODING(encoding))
70 /* Can't do anything during startup, per notes above */
71 if (!backend_startup_complete)
74 pending_client_encoding = encoding;
78 current_server_encoding = GetDatabaseEncoding();
81 * Check for cases that require no conversion function.
83 if (current_server_encoding == encoding ||
84 current_server_encoding == PG_SQL_ASCII ||
85 encoding == PG_SQL_ASCII)
89 ClientEncoding = &pg_enc2name_tbl[encoding];
90 ToServerConvProc = NULL;
91 ToClientConvProc = NULL;
93 MemoryContextReset(MbProcContext);
99 * If we're not inside a transaction then we can't do catalog lookups, so
100 * fail. After backend startup, this could only happen if we are
101 * re-reading postgresql.conf due to SIGHUP --- so basically this just
102 * constrains the ability to change client_encoding on the fly from
103 * postgresql.conf. Which would probably be a stupid thing to do anyway.
105 if (!IsTransactionState())
109 * Look up the conversion functions.
111 to_server_proc = FindDefaultConversionProc(encoding,
112 current_server_encoding);
113 if (!OidIsValid(to_server_proc))
115 to_client_proc = FindDefaultConversionProc(current_server_encoding,
117 if (!OidIsValid(to_client_proc))
121 * Done if not wanting to actually apply setting.
126 /* Before loading the new fmgr info, remove the old info, if any */
127 ToServerConvProc = NULL;
128 ToClientConvProc = NULL;
129 if (MbProcContext != NULL)
131 MemoryContextReset(MbProcContext);
136 * This is the first time through, so create the context. Make it a
137 * child of TopMemoryContext so that these values survive across
140 MbProcContext = AllocSetContextCreate(TopMemoryContext,
142 ALLOCSET_SMALL_MINSIZE,
143 ALLOCSET_SMALL_INITSIZE,
144 ALLOCSET_SMALL_MAXSIZE);
147 /* Load the fmgr info into MbProcContext */
148 oldcontext = MemoryContextSwitchTo(MbProcContext);
149 to_server = palloc(sizeof(FmgrInfo));
150 to_client = palloc(sizeof(FmgrInfo));
151 fmgr_info(to_server_proc, to_server);
152 fmgr_info(to_client_proc, to_client);
153 MemoryContextSwitchTo(oldcontext);
155 ClientEncoding = &pg_enc2name_tbl[encoding];
156 ToServerConvProc = to_server;
157 ToClientConvProc = to_client;
163 * Initialize the client encoding if necessary.
164 * Called from InitPostgres() once during backend startup.
/*
 * InitializeClientEncoding: mark backend startup as complete and apply the
 * client encoding recorded earlier in pending_client_encoding.
 *
 * The Assert documents that this must run before the flag has been set.
 * When SetClientEncoding() returns a negative value, an error carrying
 * ERRCODE_FEATURE_NOT_SUPPORTED is reported, naming the requested client
 * encoding and the database encoding.
 * NOTE(review): the line holding the error level is absent from this listing.
 */
167 InitializeClientEncoding(void)
169 Assert(!backend_startup_complete);
170 backend_startup_complete = true;
172 if (SetClientEncoding(pending_client_encoding, true) < 0)
175 * Oops, the requested conversion is not available. We couldn't fail
176 * before, but we can now.
179 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
180 errmsg("conversion between %s and %s is not supported",
181 pg_enc2name_tbl[pending_client_encoding].name,
182 GetDatabaseEncodingName())));
187 * returns the current client encoding */
/* Return the encoding field of the current ClientEncoding table entry. */
189 pg_get_client_encoding(void)
191 Assert(ClientEncoding);
192 return ClientEncoding->encoding;
196 * returns the current client encoding name
/* Return the name field of the current ClientEncoding table entry (not a copy). */
199 pg_get_client_encoding_name(void)
201 Assert(ClientEncoding);
202 return ClientEncoding->name;
206 * Apply encoding conversion on src and return it. The encoding
207 * conversion function is chosen from the pg_conversion system catalog
208 * marked as "default". If it is not found in the schema search path,
209 * it's taken from pg_catalog schema. If it is not found there either,
210 * warn and return src. We cannot raise an error, since that would cause
211 * an infinite loop in error message sending.
213 * In the case of no conversion, src is returned.
215 * XXX We assume that storage for the converted result grows at most 4-to-1 in
216 * the worst case. The rate for currently supported encoding pairs is within 3
217 * (SJIS JIS X0201 half-width kana -> UTF8 is the worst case).
218 * So "4" should be enough for the moment.
/*
 * pg_do_encoding_conversion: convert len bytes at src from src_encoding to
 * dest_encoding with the default conversion procedure returned by
 * FindDefaultConversionProc().
 *
 * Three guards come before the lookup: not being inside a transaction,
 * identical source and destination encodings, and either side being
 * SQL_ASCII. The statements executed in those cases are absent from this
 * listing; presumably src is handed back unchanged, as the header comment
 * says for the no-conversion case -- confirm against the full file.
 *
 * The procedure is checked for existence in the syscache before it is
 * called, so that the call itself does not raise an error.
 *
 * The output buffer is palloc'd with len * 4 + 1 bytes.
 * NOTE(review): len is an int, so len * 4 + 1 can overflow for very large
 * inputs before palloc sees the value; consider rejecting oversized len.
 */
221 pg_do_encoding_conversion(unsigned char *src, int len,
222 int src_encoding, int dest_encoding)
224 unsigned char *result;
227 if (!IsTransactionState())
230 if (src_encoding == dest_encoding)
233 if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII)
239 proc = FindDefaultConversionProc(src_encoding, dest_encoding);
240 if (!OidIsValid(proc))
243 (errcode(ERRCODE_UNDEFINED_FUNCTION),
244 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
245 pg_encoding_to_char(src_encoding),
246 pg_encoding_to_char(dest_encoding))));
251 * XXX we should avoid throwing errors in OidFunctionCall. Otherwise we
252 * are going into infinite loop! So we have to make sure that the
253 * function exists before calling OidFunctionCall.
255 if (!SearchSysCacheExists(PROCOID,
256 ObjectIdGetDatum(proc),
259 elog(LOG, "cache lookup failed for function %u", proc);
263 result = palloc(len * 4 + 1);
265 OidFunctionCall5(proc,
266 Int32GetDatum(src_encoding),
267 Int32GetDatum(dest_encoding),
268 CStringGetDatum(src),
269 CStringGetDatum(result),
275 * Convert string using encoding_name. We assume that the string's
276 * encoding is the same as the DB encoding.
278 * TEXT convert(TEXT string, NAME encoding_name) */
/*
 * pg_convert: SQL-callable wrapper that converts argument 0 to the encoding
 * named by argument 1, taking the database encoding as the source.
 *
 * The source encoding name is built with namein() from
 * DatabaseEncoding->name, the work is delegated to pg_convert2() through
 * DirectFunctionCall3, and the temporary name is pfree'd afterwards.
 */
280 pg_convert(PG_FUNCTION_ARGS)
282 Datum string = PG_GETARG_DATUM(0);
283 Datum dest_encoding_name = PG_GETARG_DATUM(1);
284 Datum src_encoding_name = DirectFunctionCall1(
285 namein, CStringGetDatum(DatabaseEncoding->name));
288 result = DirectFunctionCall3(
289 pg_convert2, string, src_encoding_name, dest_encoding_name);
291 /* free memory allocated by namein */
292 pfree((void *) src_encoding_name);
294 PG_RETURN_TEXT_P(result);
298 * Convert string using encoding_name.
300 * TEXT convert2(TEXT string, NAME src_encoding_name, NAME dest_encoding_name)
/*
 * pg_convert2: SQL-callable conversion of a text value between two encodings
 * given by name (argument 1 = source, argument 2 = destination).
 *
 * Both names are resolved with pg_char_to_encoding(); a negative result is
 * reported with ERRCODE_INVALID_PARAMETER_VALUE. The text payload is copied
 * into a fresh buffer of len + 1 bytes so that it can be terminated, then
 * passed to pg_do_encoding_conversion(). The returned text is built by hand
 * rather than via textin().
 *
 * NOTE(review): the result length is taken with strlen(), so output that
 * contains a zero byte would be cut short at that byte.
 * NOTE(review): the lines that write the terminator, test the conversion
 * result and release the temporary buffers are absent from this listing.
 */
303 pg_convert2(PG_FUNCTION_ARGS)
305 text *string = PG_GETARG_TEXT_P(0);
306 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
307 int src_encoding = pg_char_to_encoding(src_encoding_name);
308 char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
309 int dest_encoding = pg_char_to_encoding(dest_encoding_name);
310 unsigned char *result;
315 if (src_encoding < 0)
317 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
318 errmsg("invalid source encoding name \"%s\"",
319 src_encoding_name)));
320 if (dest_encoding < 0)
322 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
323 errmsg("invalid destination encoding name \"%s\"",
324 dest_encoding_name)));
326 /* make sure that source string is null terminated */
327 len = VARSIZE(string) - VARHDRSZ;
328 str = palloc(len + 1);
329 memcpy(str, VARDATA(string), len);
332 result = pg_do_encoding_conversion(str, len, src_encoding, dest_encoding);
334 elog(ERROR, "encoding conversion failed");
337 * build text data type structure. we cannot use textin() here, since
338 * textin assumes that input string encoding is same as database encoding.
340 len = strlen((char *) result) + VARHDRSZ;
341 retval = palloc(len);
342 VARATT_SIZEP(retval) = len;
343 memcpy(VARDATA(retval), result, len - VARHDRSZ);
349 /* free memory if allocated by the toaster */
350 PG_FREE_IF_COPY(string, 0);
352 PG_RETURN_TEXT_P(retval);
356 * convert client encoding to server encoding.
/*
 * pg_client_to_server: take len bytes at s in the client encoding and
 * produce the server-encoding form.
 *
 * Cases visible in this listing:
 *  - client encoding equals database encoding, or client is SQL_ASCII:
 *    no conversion, but the bytes are verified against the database
 *    encoding with pg_verify_mbstr();
 *  - database encoding is SQL_ASCII: no conversion is possible; data is
 *    verified under the client encoding when that is also a valid backend
 *    encoding, otherwise every byte is checked and a zero byte or a byte
 *    with the high bit set is reported with
 *    ERRCODE_CHARACTER_NOT_IN_REPERTOIRE;
 *  - otherwise the cached conversion function is applied through
 *    perform_default_encoding_conversion().
 *
 * NOTE(review): the return statements of the first two cases are absent
 * from this listing.
 */
359 pg_client_to_server(const char *s, int len)
361 Assert(DatabaseEncoding);
362 Assert(ClientEncoding);
367 if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
368 ClientEncoding->encoding == PG_SQL_ASCII)
371 * No conversion is needed, but we must still validate the data.
373 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
377 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
380 * No conversion is possible, but we must still validate the data,
381 * because the client-side code might have done string escaping using
382 * the selected client_encoding. If the client encoding is ASCII-safe
383 * then we just do a straight validation under that encoding. For an
384 * ASCII-unsafe encoding we have a problem: we dare not pass such data
385 * to the parser but we have no way to convert it. We compromise by
386 * rejecting the data if it contains any non-ASCII characters.
388 if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
389 (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
394 for (i = 0; i < len; i++)
396 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
398 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
399 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
400 pg_enc2name_tbl[PG_SQL_ASCII].name,
401 (unsigned char) s[i])));
407 return perform_default_encoding_conversion(s, len, true);
411 * convert server encoding to client encoding.
/*
 * pg_server_to_client: take len bytes at s in the server encoding and
 * produce the client-encoding form.
 *
 * When the two encodings are equal, or either one is SQL_ASCII, the input
 * pointer itself is returned (cast to non-const) without any validation.
 * Otherwise the cached conversion function is applied through
 * perform_default_encoding_conversion().
 */
414 pg_server_to_client(const char *s, int len)
416 Assert(DatabaseEncoding);
417 Assert(ClientEncoding);
422 if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
423 ClientEncoding->encoding == PG_SQL_ASCII ||
424 DatabaseEncoding->encoding == PG_SQL_ASCII)
425 return (char *) s; /* assume data is valid */
427 return perform_default_encoding_conversion(s, len, false);
431 * Perform default encoding conversion using cached FmgrInfo. Since
432 * this function does not access the database at all, it is safe to call
433 * outside transactions. The client encoding must have been set explicitly
434 * before calling this function; otherwise no conversion is performed.
/*
 * perform_default_encoding_conversion: convert len bytes at src using the
 * cached conversion function for the requested direction.
 *
 * is_client_to_server selects ClientEncoding -> DatabaseEncoding with
 * ToServerConvProc; otherwise DatabaseEncoding -> ClientEncoding with
 * ToClientConvProc. The output buffer is palloc'd with len * 4 + 1 bytes and
 * handed to the conversion function through FunctionCall5.
 *
 * NOTE(review): the handling of a NULL cached function pointer and the
 * return statement are absent from this listing.
 * NOTE(review): len is an int, so len * 4 + 1 can overflow for very large
 * inputs before palloc sees the value; consider rejecting oversized len.
 */
438 perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
445 if (is_client_to_server)
447 src_encoding = ClientEncoding->encoding;
448 dest_encoding = DatabaseEncoding->encoding;
449 flinfo = ToServerConvProc;
453 src_encoding = DatabaseEncoding->encoding;
454 dest_encoding = ClientEncoding->encoding;
455 flinfo = ToClientConvProc;
461 result = palloc(len * 4 + 1);
463 FunctionCall5(flinfo,
464 Int32GetDatum(src_encoding),
465 Int32GetDatum(dest_encoding),
466 CStringGetDatum(src),
467 CStringGetDatum(result),
472 /* convert a multibyte string to a wchar */
/*
 * pg_mb2wchar: convert the string "from" into pg_wchar values stored in "to",
 * dispatching to the mb2wchar_with_len routine of the database encoding.
 * The input length is obtained with strlen(); the routine's result is returned.
 */
474 pg_mb2wchar(const char *from, pg_wchar *to)
476 return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
479 /* convert a multibyte string to a wchar with a limited length */
/*
 * pg_mb2wchar_with_len: same dispatch as pg_mb2wchar(), but the caller
 * supplies the input length len instead of relying on strlen().
 */
481 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
483 return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
486 /* same, with any encoding */
/*
 * pg_encoding_mb2wchar_with_len: dispatch to the mb2wchar_with_len routine
 * of the given encoding rather than the database encoding.
 * NOTE(review): encoding indexes pg_wchar_table directly with no range check
 * here; callers presumably pass a valid encoding id.
 */
488 pg_encoding_mb2wchar_with_len(int encoding,
489 const char *from, pg_wchar *to, int len)
491 return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
494 /* returns the byte length of a multibyte word */
/* pg_mblen: return the result of the database encoding's mblen routine for the character at mbstr. */
496 pg_mblen(const char *mbstr)
498 return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
501 /* returns the display length of a multibyte word */
/* pg_dsplen: return the result of the database encoding's dsplen routine for the character at mbstr. */
503 pg_dsplen(const char *mbstr)
505 return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
508 /* returns the length (counted in wchars) of a multibyte string */
/*
 * pg_mbstrlen: count the characters in a multibyte string.
 *
 * When the database encoding has a maximum character length of 1, strlen()
 * is the answer. Otherwise the string is walked one character at a time,
 * advancing by pg_mblen().
 * NOTE(review): the loop header, the counter and the return statement are
 * absent from this listing.
 */
510 pg_mbstrlen(const char *mbstr)
514 /* optimization for single byte encoding */
515 if (pg_database_encoding_max_length() == 1)
516 return strlen(mbstr);
520 mbstr += pg_mblen(mbstr);
526 /* returns the length (counted in wchars) of a multibyte string
527 * (not necessarily NULL terminated)
/*
 * pg_mbstrlen_with_len: count the characters in at most limit bytes of mbstr.
 *
 * The loop runs while limit is positive and the current byte is non-zero,
 * measuring each character with pg_mblen().
 * NOTE(review): the single-byte shortcut's return, the loop body after the
 * pg_mblen() call and the final return are absent from this listing.
 */
530 pg_mbstrlen_with_len(const char *mbstr, int limit)
534 /* optimization for single byte encoding */
535 if (pg_database_encoding_max_length() == 1)
538 while (limit > 0 && *mbstr)
540 int l = pg_mblen(mbstr);
550 * Returns the byte length of the longest prefix of a multibyte string
551 * (not necessarily NUL-terminated)
552 * that is no longer than limit bytes.
553 * This function never splits a multibyte character.
/*
 * pg_mbcliplen: clip a multibyte string of len bytes to a byte limit.
 *
 * Single-byte database encodings are delegated to cliplen(). Otherwise the
 * string is scanned while len is positive and the current byte is non-zero,
 * and scanning reacts once the accumulated length plus the next character
 * length would exceed limit.
 * NOTE(review): most of the loop body and the return statement are absent
 * from this listing.
 */
556 pg_mbcliplen(const char *mbstr, int len, int limit)
561 /* optimization for single byte encoding */
562 if (pg_database_encoding_max_length() == 1)
563 return cliplen(mbstr, len, limit);
565 while (len > 0 && *mbstr)
568 if ((clen + l) > limit)
580 * Similar to pg_mbcliplen except the limit parameter specifies the
581 * character length, not the byte length. */
/*
 * pg_mbcharcliplen: variant of pg_mbcliplen() whose limit is expressed in
 * characters (per the comment preceding this function).
 *
 * Single-byte database encodings are delegated to cliplen(); otherwise the
 * string is scanned while len is positive and the current byte is non-zero.
 * NOTE(review): the loop body and the return statement are absent from this
 * listing.
 */
583 pg_mbcharcliplen(const char *mbstr, int len, int limit)
589 /* optimization for single byte encoding */
590 if (pg_database_encoding_max_length() == 1)
591 return cliplen(mbstr, len, limit);
593 while (len > 0 && *mbstr)
/*
 * SetDatabaseEncoding: point DatabaseEncoding at the table entry for the
 * given encoding id. An id failing PG_VALID_BE_ENCODING raises an ERROR.
 * The Assert checks that the table entry at that index carries the same id.
 */
607 SetDatabaseEncoding(int encoding)
609 if (!PG_VALID_BE_ENCODING(encoding))
610 elog(ERROR, "invalid database encoding: %d", encoding);
612 DatabaseEncoding = &pg_enc2name_tbl[encoding];
613 Assert(DatabaseEncoding->encoding == encoding);
/*
 * SetDefaultClientEncoding: make the client encoding equal to the current
 * database encoding. Only ClientEncoding is assigned here; the cached
 * conversion function pointers are not touched.
 */
617 SetDefaultClientEncoding(void)
619 ClientEncoding = &pg_enc2name_tbl[GetDatabaseEncoding()];
/* GetDatabaseEncoding: return the encoding field of the current DatabaseEncoding table entry. */
623 GetDatabaseEncoding(void)
625 Assert(DatabaseEncoding);
626 return DatabaseEncoding->encoding;
/* GetDatabaseEncodingName: return the name field of the current DatabaseEncoding table entry (not a copy). */
630 GetDatabaseEncodingName(void)
632 Assert(DatabaseEncoding);
633 return DatabaseEncoding->name;
/* getdatabaseencoding: SQL-callable; returns the database encoding name as the Datum produced by namein(). */
637 getdatabaseencoding(PG_FUNCTION_ARGS)
639 Assert(DatabaseEncoding);
640 return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
/* pg_client_encoding: SQL-callable; returns the client encoding name as the Datum produced by namein(). */
644 pg_client_encoding(PG_FUNCTION_ARGS)
646 Assert(ClientEncoding);
647 return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
651 cliplen(const char *str, int len, int limit)
656 for (s = str; *s; s++, l++)
658 if (l >= len || l >= limit)