1 /**********************************************************************
2 unicode.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2017 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 struct PoolPropertyNameCtype {
/* Evaluates to nonzero when the EncUNICODE_ISO_8859_1_CtypeTable entry for
 * `code` has the CTYPE_TO_BIT(ctype) bit set.
 * `code` is used directly as the array index and no range check is done here,
 * so the caller must keep it inside the table. */
37 #define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
38 ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
/* 256-entry table of ctype bit sets, one unsigned short per code value.
 * ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE tests a single bit of an entry with
 * CTYPE_TO_BIT(ctype).
 * NOTE(review): judging by the name, the entries describe the ISO-8859-1
 * range (code points 0x00-0xff) - confirm against the table generator. */
40 static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
41 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
42 0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
43 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
44 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
45 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
46 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
47 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
48 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
49 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
50 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
51 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
52 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
53 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
54 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
55 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
56 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
57 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
58 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
59 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
60 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
61 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
62 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
63 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
64 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
65 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
66 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
67 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
68 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
69 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
70 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
71 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
72 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
77 #include "unicode_fold_data.c"
/* Case-folds one encoded character.
 *
 * The character is decoded with ONIGENC_MBC_TO_CODE, its fold entry is looked
 * up with unicode_unfold_key, and the fold code point(s) are re-encoded into
 * `fold` with ONIGENC_CODE_TO_MBC.
 *
 * enc  - encoding used to decode the input and encode the result
 * flag - case-fold options; only read when
 *        USE_UNICODE_CASE_FOLD_TURKISH_AZERI is defined
 * pp   - in/out pointer to the current input position
 *        (NOTE(review): presumably advanced past the consumed character -
 *        confirm)
 * end  - end of the input buffer
 *
 * The single-code-point paths return the value of ONIGENC_CODE_TO_MBC. */
80 onigenc_unicode_mbc_case_fold(OnigEncoding enc,
81 OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
84 const struct ByUnfoldKey* buk;
90 code = ONIGENC_MBC_TO_CODE(enc, p, end);
/* Turkish/Azeri mode overrides the default folding of the letter i:
 * U+0049 is encoded as U+0131 (dotless i), and another branch yields U+0069. */
94 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
95 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
97 return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
100 if (code == 0x0049) {
101 return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
107 buk = unicode_unfold_key(code);
/* Single code point fold: encode it straight into the output. */
109 if (buk->fold_len == 1) {
110 return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
/* Multi code point fold: encode each code point of the sequence in turn. */
115 FOLDS_FOLD_ADDR_BUK(buk, addr);
117 for (i = 0; i < buk->fold_len; i++) {
118 OnigCodePoint c = addr[i];
119 len = ONIGENC_CODE_TO_MBC(enc, c, fold);
127 for (i = 0; i < len; i++) {
/* Reports every single-code-point case-fold relation stored in the FOLDS1
 * entries with index in [from, to).
 *
 * For each entry the callback f is invoked with (fold -> unfold) and
 * (unfold -> fold) for every unfold code, and additionally in both directions
 * for every pair of distinct unfold codes of the same entry.
 *
 * The walk stops at the first nonzero value returned by f, and that value is
 * returned to the caller. */
134 apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
138 for (i = from; i < to; ) {
139 OnigCodePoint fold = *FOLDS1_FOLD(i);
140 n = FOLDS1_UNFOLDS_NUM(i);
141 for (j = 0; j < n; j++) {
142 OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j];
144 r = (*f)(fold, &unfold, 1, arg);
145 if (r != 0) return r;
146 r = (*f)(unfold, &fold, 1, arg);
147 if (r != 0) return r;
/* Pair this unfold code with every earlier unfold code of the same entry. */
149 for (k = 0; k < j; k++) {
150 OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k];
151 r = (*f)(unfold, &unfold2, 1, arg);
152 if (r != 0) return r;
153 r = (*f)(unfold2, &unfold, 1, arg);
154 if (r != 0) return r;
/* Entries are variable length, so the next index comes from the table macro. */
158 i = FOLDS1_NEXT_INDEX(i);
/* Reports the case-fold relations stored in the FOLDS2 entries with index in
 * [from, to), i.e. entries whose fold is a sequence of two code points.
 *
 * For each unfold code the callback f receives (unfold -> fold[0..1]) with a
 * length of 2, followed by both directions of every pair of distinct unfold
 * codes of the same entry (length 1).
 *
 * The walk stops at the first nonzero value returned by f, and that value is
 * returned to the caller. */
165 apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
169 for (i = from; i < to; ) {
170 OnigCodePoint* fold = FOLDS2_FOLD(i);
171 n = FOLDS2_UNFOLDS_NUM(i);
172 for (j = 0; j < n; j++) {
173 OnigCodePoint unfold = FOLDS2_UNFOLDS(i)[j];
175 r = (*f)(unfold, fold, 2, arg);
176 if (r != 0) return r;
/* Pair this unfold code with every earlier unfold code of the same entry. */
178 for (k = 0; k < j; k++) {
179 OnigCodePoint unfold2 = FOLDS2_UNFOLDS(i)[k];
180 r = (*f)(unfold, &unfold2, 1, arg);
181 if (r != 0) return r;
182 r = (*f)(unfold2, &unfold, 1, arg);
183 if (r != 0) return r;
187 i = FOLDS2_NEXT_INDEX(i);
/* Reports the case-fold relations stored in the FOLDS3 entries with index in
 * [from, to), i.e. entries whose fold is a sequence of three code points.
 *
 * For each unfold code the callback f receives (unfold -> fold[0..2]) with a
 * length of 3, followed by both directions of every pair of distinct unfold
 * codes of the same entry (length 1).
 *
 * The walk stops at the first nonzero value returned by f, and that value is
 * returned to the caller. */
194 apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
198 for (i = from; i < to; ) {
199 OnigCodePoint* fold = FOLDS3_FOLD(i);
200 n = FOLDS3_UNFOLDS_NUM(i);
201 for (j = 0; j < n; j++) {
202 OnigCodePoint unfold = FOLDS3_UNFOLDS(i)[j];
204 r = (*f)(unfold, fold, 3, arg);
205 if (r != 0) return r;
/* Pair this unfold code with every earlier unfold code of the same entry. */
207 for (k = 0; k < j; k++) {
208 OnigCodePoint unfold2 = FOLDS3_UNFOLDS(i)[k];
209 r = (*f)(unfold, &unfold2, 1, arg);
210 if (r != 0) return r;
211 r = (*f)(unfold2, &unfold, 1, arg);
212 if (r != 0) return r;
216 i = FOLDS3_NEXT_INDEX(i);
/* Enumerates the Unicode case-fold relations by invoking f for each of them.
 *
 * flag - case-fold options (Turkish/Azeri mode, multi-character folds)
 * f    - callback invoked for every relation
 * arg  - opaque pointer handed through to f
 *
 * The FOLDS1, FOLDS2 and FOLDS3 tables are each walked in two parts, split at
 * their *_NORMAL_END_INDEX. Any nonzero result from a helper or from f is
 * returned immediately. */
223 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
224 OnigApplyAllCaseFoldFunc f, void* arg)
228 r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg);
229 if (r != 0) return r;
/* Turkish/Azeri mode reports explicit relations for U+0049, U+0131, U+0069
 * and U+0130. */
231 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
232 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
234 r = (*f)(0x0049, &code, 1, arg);
235 if (r != 0) return r;
237 r = (*f)(0x0131, &code, 1, arg);
238 if (r != 0) return r;
241 r = (*f)(0x0069, &code, 1, arg);
242 if (r != 0) return r;
244 r = (*f)(0x0130, &code, 1, arg);
245 if (r != 0) return r;
/* Remaining FOLDS1 entries, from the end of the "normal" part to the end of
 * the table.
 * NOTE(review): confirm whether this range is meant to be skipped when
 * Turkish/Azeri mode is active. */
249 r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
250 if (r != 0) return r;
251 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
/* Everything below deals with folds that are two or three code points long. */
255 if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
258 r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);
259 if (r != 0) return r;
/* The second part of FOLDS2 is guarded by a test that Turkish/Azeri mode is
 * off. */
261 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
262 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
264 r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);
265 if (r != 0) return r;
266 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
270 r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);
271 if (r != 0) return r;
/* Collects the case-insensitive alternatives for the text starting at p.
 *
 * enc   - encoding of the input
 * flag  - case-fold options (Turkish/Azeri mode, multi-character folds)
 * p     - start of the character to examine
 * end   - end of the input buffer
 * items - output array; each filled entry records how many input bytes it
 *         stands for (byte_len) and the replacing code point sequence
 *         (code_len, code[])
 *
 * Entries are written at index n, the running count of filled entries.
 * Three kinds of alternatives are produced:
 *   1. single code points that share a fold with the first character;
 *   2. code point sequences for a first character whose fold is two or three
 *      code points long (only with INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR);
 *   3. single code points whose fold equals the folded sequence of the first
 *      two or three input characters. */
277 onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
278 OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
279 OnigCaseFoldCodeItem items[])
281 int n, m, i, j, k, len;
282 OnigCodePoint code, codes[3];
283 const struct ByUnfoldKey* buk;
287 code = ONIGENC_MBC_TO_CODE(enc, p, end);
288 len = enclen(enc, p);
/* Turkish/Azeri mode: the four i variants get one fixed partner each
 * (U+0049 <-> U+0131 and U+0130 <-> U+0069). */
290 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
291 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
292 if (code == 0x0049) {
293 items[0].byte_len = len;
294 items[0].code_len = 1;
295 items[0].code[0] = 0x0131;
298 else if (code == 0x0130) {
299 items[0].byte_len = len;
300 items[0].code_len = 1;
301 items[0].code[0] = 0x0069;
304 else if (code == 0x0131) {
305 items[0].byte_len = len;
306 items[0].code_len = 1;
307 items[0].code[0] = 0x0049;
310 else if (code == 0x0069) {
311 items[0].byte_len = len;
312 items[0].code_len = 1;
313 items[0].code[0] = 0x0130;
319 buk = unicode_unfold_key(code);
/* The character folds to a single code point: report the fold itself, then
 * every other code point that folds to the same value. */
321 if (buk->fold_len == 1) {
323 items[0].byte_len = len;
324 items[0].code_len = 1;
325 items[0].code[0] = *FOLDS1_FOLD(buk->index);
328 un = FOLDS1_UNFOLDS_NUM(buk->index);
329 for (i = 0; i < un; i++) {
330 OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i];
331 if (unfold != code) {
332 items[n].byte_len = len;
333 items[n].code_len = 1;
334 items[n].code[0] = unfold;
338 code = items[0].code[0]; // for multi-code to unfold search.
340 else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
/* cs[fn] holds the alternatives for position fn of the fold sequence: the
 * fold code point at slot 0, then its FOLDS1 unfold codes.
 * NOTE(review): the cs[fn][i+1] writes below stay in bounds only while
 * FOLDS1_UNFOLDS_NUM(index) <= 3 - confirm the generated data guarantees it. */
341 OnigCodePoint cs[3][4];
344 if (buk->fold_len == 2) {
/* Other single code points with the same two code point fold. */
345 m = FOLDS2_UNFOLDS_NUM(buk->index);
346 for (i = 0; i < m; i++) {
347 OnigCodePoint unfold = FOLDS2_UNFOLDS(buk->index)[i];
348 if (unfold == code) continue;
350 items[n].byte_len = len;
351 items[n].code_len = 1;
352 items[n].code[0] = unfold;
356 for (fn = 0; fn < 2; fn++) {
358 cs[fn][0] = FOLDS2_FOLD(buk->index)[fn];
359 index = unicode_fold1_key(&cs[fn][0]);
/* NOTE(review): this inner `m` shadows the function-level `m`. */
361 int m = FOLDS1_UNFOLDS_NUM(index);
362 for (i = 0; i < m; i++) {
363 cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
/* Emit every combination of the per-position alternatives. */
371 for (i = 0; i < ncs[0]; i++) {
372 for (j = 0; j < ncs[1]; j++) {
373 items[n].byte_len = len;
374 items[n].code_len = 2;
375 items[n].code[0] = cs[0][i];
376 items[n].code[1] = cs[1][j];
381 else { /* fold_len == 3 */
/* Other single code points with the same three code point fold. */
382 m = FOLDS3_UNFOLDS_NUM(buk->index);
383 for (i = 0; i < m; i++) {
384 OnigCodePoint unfold = FOLDS3_UNFOLDS(buk->index)[i];
385 if (unfold == code) continue;
387 items[n].byte_len = len;
388 items[n].code_len = 1;
389 items[n].code[0] = unfold;
393 for (fn = 0; fn < 3; fn++) {
395 cs[fn][0] = FOLDS3_FOLD(buk->index)[fn];
396 index = unicode_fold1_key(&cs[fn][0]);
/* NOTE(review): this inner `m` shadows the function-level `m`. */
398 int m = FOLDS1_UNFOLDS_NUM(index);
399 for (i = 0; i < m; i++) {
400 cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
/* Emit every combination of the per-position alternatives. */
408 for (i = 0; i < ncs[0]; i++) {
409 for (j = 0; j < ncs[1]; j++) {
410 for (k = 0; k < ncs[2]; k++) {
411 items[n].byte_len = len;
412 items[n].code_len = 3;
413 items[n].code[0] = cs[0][i];
414 items[n].code[1] = cs[1][j];
415 items[n].code[2] = cs[2][k];
422 /* multi char folded code is not head of another folded multi char */
/* Treat `code` as a fold value and report every code point that folds to it. */
427 int index = unicode_fold1_key(&code);
429 int m = FOLDS1_UNFOLDS_NUM(index);
430 for (i = 0; i < m; i++) {
431 items[n].byte_len = len;
432 items[n].code_len = 1;
433 items[n].code[0] = FOLDS1_UNFOLDS(index)[i];
/* The rest looks at the following input characters and is only relevant for
 * multi-character folding. */
439 if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
/* Second character: use its single code point fold when it has one, then look
 * the pair up as a two code point fold. */
448 code = ONIGENC_MBC_TO_CODE(enc, p, end);
450 buk = unicode_unfold_key(code);
451 if (buk != 0 && buk->fold_len == 1) {
452 codes[1] = *FOLDS1_FOLD(buk->index);
457 clen = enclen(enc, p);
460 index = unicode_fold2_key(codes);
462 m = FOLDS2_UNFOLDS_NUM(index);
463 for (i = 0; i < m; i++) {
464 items[n].byte_len = len;
465 items[n].code_len = 1;
466 items[n].code[0] = FOLDS2_UNFOLDS(index)[i];
/* Third character: same procedure, looked up as a three code point fold. */
473 code = ONIGENC_MBC_TO_CODE(enc, p, end);
474 buk = unicode_unfold_key(code);
475 if (buk != 0 && buk->fold_len == 1) {
476 codes[2] = *FOLDS1_FOLD(buk->index);
481 clen = enclen(enc, p);
484 index = unicode_fold3_key(codes);
486 m = FOLDS3_UNFOLDS_NUM(index);
487 for (i = 0; i < m; i++) {
488 items[n].byte_len = len;
489 items[n].code_len = 1;
490 items[n].code[0] = FOLDS3_UNFOLDS(index)[i];
501 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
/* Result of the two-code-point grapheme cluster break check.
 * The two UNDEF values are returned by unicode_egcb_is_break_2code for pairs
 * it does not decide itself; onigenc_egcb_is_break_position resolves them by
 * scanning the characters that precede the pair. */
503 enum EGCB_BREAK_TYPE {
506 EGCB_BREAK_UNDEF_E_MODIFIER = 2,
507 EGCB_BREAK_UNDEF_RI_RI = 3
517 EGCB_Regional_Indicator = 6,
518 EGCB_SpacingMark = 7,
521 EGCB_E_Base_GAZ = 10,
522 EGCB_E_Modifier = 11,
523 EGCB_Glue_After_Zwj = 12,
537 #include "unicode_egcb_data.c"
/* Returns the grapheme cluster break type of `code`.
 *
 * EGCB_RANGES is binary-searched over its EGCB_RANGE_NUM entries, comparing
 * `code` with the `end` of the probed range. After the search, `low` is
 * accepted only if it is a valid index and `code` is not below that range's
 * `start`; otherwise the result is EGCB_Other. */
539 static enum EGCB_TYPE
540 egcb_get_type(OnigCodePoint code)
542 OnigCodePoint low, high, x;
545 for (low = 0, high = (OnigCodePoint )EGCB_RANGE_NUM; low < high; ) {
546 x = (low + high) >> 1;
547 if (code > EGCB_RANGES[x].end)
553 type = (low < (OnigCodePoint )EGCB_RANGE_NUM &&
554 code >= EGCB_RANGES[low].start) ?
555 EGCB_RANGES[low].type : EGCB_Other;
/* True when the type value lies in the inclusive range EGCB_CR..EGCB_Control.
 * NOTE(review): relies on CR, LF and Control being adjacent enumerators in
 * that order - confirm against the EGCB_TYPE definition. */
560 #define IS_CONTROL_CR_LF(code) ((code) <= EGCB_Control && (code) >= EGCB_CR)
/* True when the type value is EGCB_L or greater.
 * NOTE(review): relies on the Hangul types being the last enumerators of
 * EGCB_TYPE - confirm. */
561 #define IS_HANGUL(code) ((code) >= EGCB_L)
563 /* GB1 and GB2 are outside of this function. */
/* Decides whether a grapheme cluster boundary lies between two adjacent code
 * points, looking only at the break types of the pair.
 *
 * from_code - code point before the candidate position
 * to_code   - code point after the candidate position
 *
 * Returns EGCB_NOT_BREAK or EGCB_BREAK when the pair alone decides, or one of
 * the EGCB_BREAK_UNDEF_* values when the answer depends on earlier
 * characters. The checks appear to follow the order of the GB rules of
 * UAX #29 (NOTE(review): confirm the Unicode version the rules are taken
 * from). */
564 static enum EGCB_BREAK_TYPE
565 unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)
570 from = egcb_get_type(from_code);
571 to = egcb_get_type(to_code);
/* Fast path when both types are 0 (presumably EGCB_Other - confirm). */
574 if (from == 0 && to == 0) goto GB999;
/* CR followed by LF is never split. */
577 if (from == EGCB_CR && to == EGCB_LF) return EGCB_NOT_BREAK;
/* Otherwise always break after and before CR, LF and Control. */
579 if (IS_CONTROL_CR_LF(from)) return EGCB_BREAK;
581 if (IS_CONTROL_CR_LF(to)) return EGCB_BREAK;
/* Hangul syllable sequences. */
583 if (IS_HANGUL(from) && IS_HANGUL(to)) {
585 if (from == EGCB_L && to != EGCB_T) return EGCB_NOT_BREAK;
587 if ((from == EGCB_LV || from == EGCB_V)
588 && (to == EGCB_V || to == EGCB_T)) return EGCB_NOT_BREAK;
591 if ((from == EGCB_LVT || from == EGCB_T) && (to == EGCB_T))
592 return EGCB_NOT_BREAK;
/* No break before Extend, ZWJ or SpacingMark, nor after Prepend. */
598 if (to == EGCB_Extend || to == EGCB_ZWJ) return EGCB_NOT_BREAK;
601 if (to == EGCB_SpacingMark) return EGCB_NOT_BREAK;
603 if (from == EGCB_Prepend) return EGCB_NOT_BREAK;
/* Emoji modifier: directly after a base there is no break; after Extend the
 * decision needs the characters further back. */
606 if (to == EGCB_E_Modifier) {
607 if (from == EGCB_E_Base || from == EGCB_E_Base_GAZ) return EGCB_NOT_BREAK;
608 if (from == EGCB_Extend) return EGCB_BREAK_UNDEF_E_MODIFIER;
613 if (from == EGCB_ZWJ) {
614 if (to == EGCB_Glue_After_Zwj || to == EGCB_E_Base_GAZ) return EGCB_NOT_BREAK;
/* Two regional indicators: undecidable from the pair alone. */
619 if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator) {
620 return EGCB_BREAK_UNDEF_RI_RI;
627 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
/* Tells whether position p is an extended grapheme cluster boundary.
 *
 * enc   - encoding of the text
 * p     - candidate position
 * prev  - head of the character before p; when needed it is computed here
 *         with onigenc_get_prev_char_head
 * start - start of the text
 * end   - end of the text
 *
 * Returns 1 for a boundary and 0 where the text must not be split. The start
 * and the end of the text are always boundaries. */
630 onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
631 const UChar* start, const UChar* end)
635 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
636 enum EGCB_BREAK_TYPE btype;
641 if (p == start) return 1;
642 if (p == end) return 1;
645 prev = onigenc_get_prev_char_head(enc, start, p);
646 if (IS_NULL(prev)) return 1;
/* Code points on either side of the candidate position. */
649 from = ONIGENC_MBC_TO_CODE(enc, prev, end);
650 to = ONIGENC_MBC_TO_CODE(enc, p, end);
652 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
/* Non-Unicode encodings only get the CR LF rule. */
653 if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {
654 if (from == 0x000d && to == 0x000a) return 0;
658 btype = unicode_egcb_is_break_2code(from, to);
/* Extend before an emoji modifier: walk back over the Extend run and look for
 * an emoji base in front of it. */
667 case EGCB_BREAK_UNDEF_E_MODIFIER:
668 while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
669 from = ONIGENC_MBC_TO_CODE(enc, prev, end);
670 type = egcb_get_type(from);
671 if (type == EGCB_E_Base || type == EGCB_E_Base_GAZ)
673 if (type != EGCB_Extend)
/* Two regional indicators: walk back over the run of regional indicators;
 * the parity of n decides the result.
 * NOTE(review): n presumably counts the indicators seen - confirm. */
678 case EGCB_BREAK_UNDEF_RI_RI:
681 while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
682 from = ONIGENC_MBC_TO_CODE(enc, prev, end);
683 type = egcb_get_type(from);
684 if (type != EGCB_Regional_Indicator)
689 if ((n % 2) == 0) return 0;
/* Build without extended grapheme cluster support: only CR LF is kept
 * together. */
697 if (from == 0x000d && to == 0x000a) return 0;
699 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
704 Undefine __GNUC__ to avoid warnings in Clang.
706 ./unicode_property_data.c:26730:44: warning: static variable
707 'unicode_prop_name_pool_contents' is used in an inline function with
708 external linkage [-Wstatic-in-inline]
709 register const char *s = o + unicode_prop_name_pool;
716 #ifdef USE_UNICODE_PROPERTIES
717 #include "unicode_property_data.c"
719 #include "unicode_property_data_posix.c"
/* Upper bound on the number of properties that can be registered with
 * onig_unicode_define_user_property. */
722 #define USER_DEFINED_PROPERTY_MAX_NUM 20
726 OnigCodePoint* ranges;
727 } UserDefinedPropertyValue;
/* Number of entries of UserDefinedPropertyRanges currently in use. */
729 static int UserDefinedPropertyNum;
/* Registered entries; the entry for a user-defined ctype is found at index
 * (ctype - CODE_RANGES_NUM). */
730 static UserDefinedPropertyValue
731 UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];
/* Maps a normalized property name to its entry; remains 0 until the first
 * property is registered. */
732 static st_table* UserDefinedPropertyTable;
/* Registers a user-defined character property under `name`.
 *
 * name   - NUL-terminated property name; every character must be in the
 *          range 0x01..0x7f
 * ranges - code range data associated with the property
 *
 * The name is copied into a freshly allocated buffer, leaving out the
 * characters ' ', '-' and '_' (handling of the remaining characters happens
 * inside the copy loop). The copy is the key under which the new entry is
 * inserted into UserDefinedPropertyTable.
 *
 * Error returns visible in this function:
 *   ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS - all slots are in use
 *   ONIGERR_TOO_LONG_PROPERTY_NAME        - strlen(name) reaches
 *                                           PROPERTY_NAME_MAX_SIZE
 *   ONIGERR_MEMORY                        - returned after the allocation
 *   ONIGERR_INVALID_CHAR_PROPERTY_NAME    - a character is outside 0x01..0x7f
 *
 * NOTE(review): file-scope state is modified and no locking is visible in
 * this function - confirm whether callers must serialize registration. */
735 onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)
737 UserDefinedPropertyValue* e;
745 if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)
746 return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;
748 len = (int )strlen(name);
749 if (len >= PROPERTY_NAME_MAX_SIZE)
750 return ONIGERR_TOO_LONG_PROPERTY_NAME;
/* One extra byte beyond the name length. */
752 s = (char* )xmalloc(len + 1);
754 return ONIGERR_MEMORY;
757 for (i = 0; i < len; i++) {
/* Only 7-bit, non-NUL characters are accepted. */
759 if (c <= 0 || c >= 0x80) {
761 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
/* Separators do not take part in the stored key. */
764 if (c != ' ' && c != '-' && c != '_') {
/* The lookup table is created on first use. */
771 if (UserDefinedPropertyTable == 0) {
772 UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);
/* Take the next free slot; user ctypes are numbered from CODE_RANGES_NUM. */
775 e = UserDefinedPropertyRanges + UserDefinedPropertyNum;
776 e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;
778 r = onig_st_insert_strend(UserDefinedPropertyTable,
779 (const UChar* )s, (const UChar* )s + n,
780 (hash_data_type )((void* )e));
783 UserDefinedPropertyNum++;
/* Tests whether `code` belongs to character type `ctype`.
 *
 * Three sources are consulted:
 *   - the ISO-8859-1 flag table, on a fast path whose condition includes
 *     ctype <= ONIGENC_MAX_STD_CTYPE when USE_UNICODE_PROPERTIES is defined;
 *   - the ranges of a user-defined property when ctype >= CODE_RANGES_NUM;
 *   - the built-in CodeRanges[ctype] otherwise.
 *
 * NOTE(review): ONIGERR_TYPE_BUG is returned for a user ctype beyond the
 * registered ones; being nonzero, it would read as "true" to a caller that
 * treats the result as a boolean - confirm how callers check it. */
788 onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)
791 #ifdef USE_UNICODE_PROPERTIES
792 ctype <= ONIGENC_MAX_STD_CTYPE &&
795 return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
798 if (ctype >= CODE_RANGES_NUM) {
/* User-defined ctypes are numbered upwards from CODE_RANGES_NUM. */
799 int index = ctype - CODE_RANGES_NUM;
800 if (index < UserDefinedPropertyNum)
801 return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[index].ranges, code);
803 return ONIGERR_TYPE_BUG;
806 return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
/* Stores in *ranges the code range data for `ctype`.
 *
 * For ctype >= CODE_RANGES_NUM the data comes from the user-defined property
 * at index (ctype - CODE_RANGES_NUM); ONIGERR_TYPE_BUG is returned on the
 * path where that index is not below UserDefinedPropertyNum. For every other
 * ctype the data is CodeRanges[ctype]. */
811 onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])
813 if (ctype >= CODE_RANGES_NUM) {
814 int index = ctype - CODE_RANGES_NUM;
815 if (index < UserDefinedPropertyNum) {
816 *ranges = UserDefinedPropertyRanges[index].ranges;
820 return ONIGERR_TYPE_BUG;
823 *ranges = CodeRanges[ctype];
/* Code range lookup entry point for the UTF-16/UTF-32 encodings (per the
 * function name); the range lookup itself is delegated to
 * onigenc_unicode_ctype_code_range and its result is returned.
 *
 * ctype  - character type to look up
 * sb_out - output parameter
 *          (NOTE(review): presumably the single-byte range boundary -
 *          confirm what value is stored)
 * ranges - receives the code range data */
828 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
829 const OnigCodePoint* ranges[])
832 return onigenc_unicode_ctype_code_range(ctype, ranges);
/* Translates a property name into its ctype number.
 *
 * enc  - encoding of the name
 * name - start of the name
 * end  - end of the name
 *
 * The name is decoded character by character and copied into a local buffer
 * without the characters ' ', '-' and '_'. The user-defined property table,
 * if it exists, is consulted before unicode_lookup_property_name.
 *
 * Returns the ctype of the entry found by unicode_lookup_property_name, or
 * ONIGERR_INVALID_CHAR_PROPERTY_NAME. */
836 onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
841 const struct PoolPropertyNameCtype* pc;
842 char buf[PROPERTY_NAME_MAX_SIZE];
847 code = ONIGENC_MBC_TO_CODE(enc, p, end);
849 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
/* Separators are ignored when matching names. */
851 if (code != ' ' && code != '-' && code != '_') {
852 buf[len++] = (char )code;
/* The normalized name must stay shorter than the buffer. */
853 if (len >= PROPERTY_NAME_MAX_SIZE)
854 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
/* User-defined properties are only looked up once one has been registered. */
862 if (UserDefinedPropertyTable != 0) {
863 UserDefinedPropertyValue* e;
864 e = (UserDefinedPropertyValue* )NULL;
865 onig_st_lookup_strend(UserDefinedPropertyTable,
866 (const UChar* )buf, (const UChar* )buf + len,
867 (hash_data_type* )((void* )(&e)));
873 pc = unicode_lookup_property_name(buf, len);
875 /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */
/* Without Unicode property support only the standard ctypes are accepted. */
876 #ifndef USE_UNICODE_PROPERTIES
877 if (pc->ctype > ONIGENC_MAX_STD_CTYPE)
878 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
881 return (int )pc->ctype;
884 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;