1 /*-------------------------------------------------------------------------
3 * SASLprep normalization, for SCRAM authentication
5 * The SASLprep algorithm is used to process a user-supplied password into
6 * canonical form. For more details, see:
8 * [RFC3454] Preparation of Internationalized Strings ("stringprep"),
9 * http://www.ietf.org/rfc/rfc3454.txt
11 * [RFC4013] SASLprep: Stringprep Profile for User Names and Passwords
12 * http://www.ietf.org/rfc/rfc4013.txt
15 * Portions Copyright (c) 2017, PostgreSQL Global Development Group
18 * src/common/saslprep.c
20 *-------------------------------------------------------------------------
25 #include "postgres_fe.h"
28 #include "common/saslprep.h"
29 #include "common/unicode_norm.h"
32 * Note: The functions in this file depend on functions from
33 * src/backend/utils/mb/wchar.c, so in order to use this in frontend
34 * code, you will need to link that in, too.
36 #include "mb/pg_wchar.h"
39 * Limit on how large passwords we will try to process. A password
40 * larger than this will be treated the same as out-of-memory.
42 #define MAX_PASSWORD_LENGTH 1024
45 * In backend, we will use palloc/pfree. In frontend, use malloc, and
46 * return SASLPREP_OOM on out-of-memory.
49 #define STRDUP(s) pstrdup(s)
50 #define ALLOC(size) palloc(size)
51 #define FREE(size) pfree(size)
53 #define STRDUP(s) strdup(s)
54 #define ALLOC(size) malloc(size)
55 #define FREE(size) free(size)
58 /* Prototypes for local functions */
59 static int codepoint_range_cmp(const void *a, const void *b);
60 static bool is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize);
61 static int pg_utf8_string_len(const char *source);
62 static bool pg_is_ascii_string(const char *p);
65 * Stringprep Mapping Tables.
67 * The stringprep specification includes a number of tables of Unicode
68 * codepoints, used in different parts of the algorithm. They are below,
69 * as arrays of codepoint ranges. Each range is a pair of codepoints,
70 * for the first and last codepoint included in the range (inclusive!).
74 * C.1.2 Non-ASCII space characters
76 * These are all mapped to the ASCII space character (U+0020).
78 static const pg_wchar non_ascii_space_ranges[] =
89 * B.1 Commonly mapped to nothing
91 * If any of these appear in the input, they are removed.
93 static const pg_wchar commonly_mapped_to_nothing_ranges[] =
106 * prohibited_output_ranges is a union of all the characters from
107 * the following tables:
109 * C.1.2 Non-ASCII space characters
110 * C.2.1 ASCII control characters
111 * C.2.2 Non-ASCII control characters
112 * C.3 Private Use characters
113 * C.4 Non-character code points
114 * C.5 Surrogate code points
115 * C.6 Inappropriate for plain text characters
116 * C.7 Inappropriate for canonical representation characters
117 * C.8 Change display properties or deprecated characters
118 * C.9 Tagging characters
120 * These are the tables that are listed as "prohibited output"
121 * characters in the SASLprep profile.
123 * The comment after each code range indicates which source table
124 * the code came from. Note that there is some overlap in the source
125 * tables, so one code might originate from multiple source tables.
126 * Adjacent ranges have also been merged together, to save space.
128 static const pg_wchar prohibited_output_ranges[] =
130 0x0000, 0x001F, /* C.2.1 */
131 0x007F, 0x00A0, /* C.1.2, C.2.1, C.2.2 */
132 0x0340, 0x0341, /* C.8 */
133 0x06DD, 0x06DD, /* C.2.2 */
134 0x070F, 0x070F, /* C.2.2 */
135 0x1680, 0x1680, /* C.1.2 */
136 0x180E, 0x180E, /* C.2.2 */
137 0x2000, 0x200F, /* C.1.2, C.2.2, C.8 */
138 0x2028, 0x202F, /* C.1.2, C.2.2, C.8 */
139 0x205F, 0x2063, /* C.1.2, C.2.2 */
140 0x206A, 0x206F, /* C.2.2, C.8 */
141 0x2FF0, 0x2FFB, /* C.7 */
142 0x3000, 0x3000, /* C.1.2 */
143 0xD800, 0xF8FF, /* C.3, C.5 */
144 0xFDD0, 0xFDEF, /* C.4 */
145 0xFEFF, 0xFEFF, /* C.2.2 */
146 0xFFF9, 0xFFFF, /* C.2.2, C.4, C.6 */
147 0x1D173, 0x1D17A, /* C.2.2 */
148 0x1FFFE, 0x1FFFF, /* C.4 */
149 0x2FFFE, 0x2FFFF, /* C.4 */
150 0x3FFFE, 0x3FFFF, /* C.4 */
151 0x4FFFE, 0x4FFFF, /* C.4 */
152 0x5FFFE, 0x5FFFF, /* C.4 */
153 0x6FFFE, 0x6FFFF, /* C.4 */
154 0x7FFFE, 0x7FFFF, /* C.4 */
155 0x8FFFE, 0x8FFFF, /* C.4 */
156 0x9FFFE, 0x9FFFF, /* C.4 */
157 0xAFFFE, 0xAFFFF, /* C.4 */
158 0xBFFFE, 0xBFFFF, /* C.4 */
159 0xCFFFE, 0xCFFFF, /* C.4 */
160 0xDFFFE, 0xDFFFF, /* C.4 */
161 0xE0001, 0xE0001, /* C.9 */
162 0xE0020, 0xE007F, /* C.9 */
163 0xEFFFE, 0xEFFFF, /* C.4 */
164 0xF0000, 0xFFFFF, /* C.3, C.4 */
165 0x100000, 0x10FFFF /* C.3, C.4 */
168 /* A.1 Unassigned code points in Unicode 3.2 */
169 static const pg_wchar unassigned_codepoint_ranges[] =
569 /* D.1 Characters with bidirectional property "R" or "AL" */
570 static const pg_wchar RandALCat_codepoint_ranges[] =
608 /* D.2 Characters with bidirectional property "L" */
609 static const pg_wchar LCat_codepoint_ranges[] =
973 /* End of stringprep tables */
976 /* Is the given Unicode codepoint in the given table of ranges? */
977 #define IS_CODE_IN_TABLE(code, map) is_code_in_table(code, map, lengthof(map))
980 codepoint_range_cmp(const void *a, const void *b)
982 const pg_wchar *key = (const pg_wchar *) a;
983 const pg_wchar *range = (const pg_wchar *) b;
986 return -1; /* less than lower bound */
988 return 1; /* greater than upper bound */
990 return 0; /* within range */
994 is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize)
996 Assert(mapsize % 2 == 0);
998 if (code < map[0] || code > map[mapsize - 1])
1001 if (bsearch(&code, map, mapsize / 2, sizeof(pg_wchar) * 2,
1002 codepoint_range_cmp))
1009 * Calculate the length in characters of a null-terminated UTF-8 string.
1011 * Returns -1 if the input is not valid UTF-8.
1014 pg_utf8_string_len(const char *source)
1016 const unsigned char *p = (const unsigned char *) source;
1022 l = pg_utf_mblen(p);
1024 if (!pg_utf8_islegal(p, l))
1035 * Returns true if the input string is pure ASCII.
1038 pg_is_ascii_string(const char *p)
1042 if (IS_HIGHBIT_SET(*p))
1051 * pg_saslprep - Normalize a password with SASLprep.
1053 * SASLprep requires the input to be in UTF-8 encoding, but PostgreSQL
1054 * supports many encodings, so we don't blindly assume that. pg_saslprep
1055 * will check if the input looks like valid UTF-8, and returns
1056 * SASLPREP_INVALID_UTF8 if not.
1058 * If the string contains prohibited characters (or more precisely, if the
1059 * output string would contain prohibited characters after normalization),
1060 * returns SASLPREP_PROHIBITED.
1062 * On success, returns SASLPREP_SUCCESS, and the normalized string in *output.
1065 * In frontend, the normalized string is malloc'd, and the caller is
1066 * responsible for freeing it. If an allocation fails, returns
1067 * SASLPREP_OOM. In backend, the normalized string is palloc'd instead,
1068 * and a failed allocation leads to ereport(ERROR).
1071 pg_saslprep(const char *input, char **output)
1073 pg_wchar *input_chars = NULL;
1074 pg_wchar *output_chars = NULL;
1080 bool contains_RandALCat;
1084 /* Check that the password isn't stupendously long */
1085 if (strlen(input) > MAX_PASSWORD_LENGTH)
1089 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1090 errmsg("password too long")));
1092 return SASLPREP_OOM;
1097 * Quick check if the input is pure ASCII. An ASCII string requires no
1098 * further processing.
1100 if (pg_is_ascii_string(input))
1102 *output = STRDUP(input);
1105 return SASLPREP_SUCCESS;
1109 * Convert the input from UTF-8 to an array of Unicode codepoints.
1111 * This also checks that the input is a legal UTF-8 string.
1113 input_size = pg_utf8_string_len(input);
1117 return SASLPREP_INVALID_UTF8;
1120 input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
1124 p = (unsigned char *) input;
1125 for (i = 0; i < input_size; i++)
1127 input_chars[i] = utf8_to_unicode(p);
1128 p += pg_utf_mblen(p);
1130 input_chars[i] = (pg_wchar) '\0';
1133 * The steps below correspond to the steps listed in [RFC3454], Section
1134 * "2. Preparation Overview"
1138 * 1) Map -- For each character in the input, check if it has a mapping
1139 * and, if so, replace it with its mapping.
1142 for (i = 0; i < input_size; i++)
1144 pg_wchar code = input_chars[i];
1146 if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges))
1147 input_chars[count++] = 0x0020;
1148 else if (IS_CODE_IN_TABLE(code, commonly_mapped_to_nothing_ranges))
1150 /* map to nothing */
1153 input_chars[count++] = code;
1155 input_chars[count] = (pg_wchar) '\0';
1158 if (input_size == 0)
1159 goto prohibited; /* don't allow empty password */
1162 * 2) Normalize -- Normalize the result of step 1 using Unicode normalization form KC.
1165 output_chars = unicode_normalize_kc(input_chars);
1170 * 3) Prohibit -- Check for any characters that are not allowed in the
1171 * output. If any are found, return an error.
1173 for (i = 0; i < input_size; i++)
1175 pg_wchar code = input_chars[i];
1177 if (IS_CODE_IN_TABLE(code, prohibited_output_ranges))
1179 if (IS_CODE_IN_TABLE(code, unassigned_codepoint_ranges))
1184 * 4) Check bidi -- Possibly check for right-to-left characters, and if
1185 * any are found, make sure that the whole string satisfies the
1186 * requirements for bidirectional strings. If the string does not satisfy
1187 * the requirements for bidirectional strings, return an error.
1189 * [RFC3454], Section "6. Bidirectional Characters" explains in more
1190 * detail what that means:
1192 * "In any profile that specifies bidirectional character handling, all
1193 * three of the following requirements MUST be met:
1195 * 1) The characters in section 5.8 MUST be prohibited.
1197 * 2) If a string contains any RandALCat character, the string MUST NOT
1198 * contain any LCat character.
1200 * 3) If a string contains any RandALCat character, a RandALCat character
1201 * MUST be the first character of the string, and a RandALCat character
1202 * MUST be the last character of the string."
1204 contains_RandALCat = false;
1205 for (i = 0; i < input_size; i++)
1207 pg_wchar code = input_chars[i];
1209 if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges))
1211 contains_RandALCat = true;
1216 if (contains_RandALCat)
1218 pg_wchar first = input_chars[0];
1219 pg_wchar last = input_chars[input_size - 1];
1221 for (i = 0; i < input_size; i++)
1223 pg_wchar code = input_chars[i];
1225 if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges))
1229 if (!IS_CODE_IN_TABLE(first, RandALCat_codepoint_ranges) ||
1230 !IS_CODE_IN_TABLE(last, RandALCat_codepoint_ranges))
1235 * Finally, convert the result back to UTF-8.
1238 for (wp = output_chars; *wp; wp++)
1240 unsigned char buf[4];
1242 unicode_to_utf8(*wp, buf);
1243 result_size += pg_utf_mblen(buf);
1246 result = ALLOC(result_size + 1);
1249 p = (unsigned char *) result;
1250 for (wp = output_chars; *wp; wp++)
1252 unicode_to_utf8(*wp, p);
1253 p += pg_utf_mblen(p);
1255 Assert((char *) p == result + result_size);
1262 return SASLPREP_SUCCESS;
1270 return SASLPREP_PROHIBITED;
1278 return SASLPREP_OOM;