From: Nikita Popov Date: Tue, 25 Jul 2017 16:25:52 +0000 (+0200) Subject: Port ucgendat to PHP X-Git-Tag: php-7.3.0alpha1~1788^2~1 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0c0e35fedc0e06598cbd1cc91a53990d83523ed3;p=php Port ucgendat to PHP Implemented such that the output is identical, including some quirks that should be fixed subsequently. --- diff --git a/ext/mbstring/ucgendat/README b/ext/mbstring/ucgendat/README index 7717bf89f9..b5af1b8319 100644 --- a/ext/mbstring/ucgendat/README +++ b/ext/mbstring/ucgendat/README @@ -3,7 +3,7 @@ This file is not necessary to build PHP. It's only necessary to rebuild unicode_data.h from Unicode ucd files. Example usage: -./ucgendat UnicodeData-6.0.0d7.txt -x CompositionExclusions-6.0.0d2.txt +php ucgendat.php UnicodeData.txt diff --git a/ext/mbstring/ucgendat/ucgendat.c b/ext/mbstring/ucgendat/ucgendat.c deleted file mode 100644 index 7d97681f0d..0000000000 --- a/ext/mbstring/ucgendat/ucgendat.c +++ /dev/null @@ -1,1931 +0,0 @@ -/* Further modified for PHP */ -/* $Id$ */ - -/* $OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucgendat.c,v 1.36.2.4 2007/01/02 21:43:51 kurt Exp $ */ -/* This work is part of OpenLDAP Software <http://www.openldap.org/>. - * - * Copyright 1998-2007 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available at - * <http://www.OpenLDAP.org/license.html>. 
- */ - -/* Copyright 2001 Computing Research Labs, New Mexico State University - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT - * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ -/* orig Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */ - -#include -#include -#include -#include -#include - -#define ac_uint2 unsigned short -#define ac_uint4 unsigned int -#define LDAP_DIRSEP "/" -#define AC_MEMCPY memcpy - -#ifndef HARDCODE_DATA -#define HARDCODE_DATA 1 -#endif - -#undef ishdigit -#define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\ - ((cc) >= 'A' && (cc) <= 'F') ||\ - ((cc) >= 'a' && (cc) <= 'f')) - -/* - * A header written to the output file with the byte-order-mark and the number - * of property nodes. 
- */ -static ac_uint2 hdr[2] = {0xfeff, 0}; - -#define NUMPROPS 50 -#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3))) - -typedef struct { - char *name; - int len; -} _prop_t; - -/* - * List of properties expected to be found in the Unicode Character Database - * including some implementation specific properties. - * - * The implementation specific properties are: - * Cm = Composed (can be decomposed) - * Nb = Non-breaking - * Sy = Symmetric (has left and right forms) - * Hd = Hex digit - * Qm = Quote marks - * Mr = Mirroring - * Ss = Space, other - * Cp = Defined character - */ -static _prop_t props[NUMPROPS] = { - {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2}, - {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2}, - {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2}, - {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2}, - {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L", 1}, {"R", 1}, - {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B", 1}, - {"S", 1}, {"WS", 2}, {"ON", 2}, - {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2}, - {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2} -}; - -typedef struct { - ac_uint4 *ranges; - ac_uint2 used; - ac_uint2 size; -} _ranges_t; - -static _ranges_t proptbl[NUMPROPS]; - -/* - * Make sure this array is sized to be on a 4-byte boundary at compile time. - */ -static ac_uint2 propcnt[NEEDPROPS]; - -/* - * Array used to collect a decomposition before adding it to the decomposition - * table. - */ -static ac_uint4 dectmp[64]; -static ac_uint4 dectmp_size; - -typedef struct { - ac_uint4 code; - ac_uint2 size; - ac_uint2 used; - ac_uint4 *decomp; -} _decomp_t; - -/* - * List of decomposition. Created and expanded in order as the characters are - * encountered. First list contains canonical mappings, second also includes - * compatibility mappings. 
- */ -static _decomp_t *decomps; -static ac_uint4 decomps_used; -static ac_uint4 decomps_size; - -static _decomp_t *kdecomps; -static ac_uint4 kdecomps_used; -static ac_uint4 kdecomps_size; - -/* - * Composition exclusion table stuff. - */ -#define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31))) -#define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31))) -static ac_uint4 compexs[8192]; - -/* - * Struct for holding a composition pair, and array of composition pairs - */ -typedef struct { - ac_uint4 comp; - ac_uint4 count; - ac_uint4 code1; - ac_uint4 code2; -} _comp_t; - -#if 0 -static _comp_t *comps; -#endif -static ac_uint4 comps_used; - -/* - * Types and lists for handling lists of case mappings. - */ -typedef struct { - ac_uint4 key; - ac_uint4 other; -} _case_t; - -static _case_t *upper; -static _case_t *lower; -static _case_t *title; -static ac_uint4 upper_used; -static ac_uint4 upper_size; -static ac_uint4 lower_used; -static ac_uint4 lower_size; -static ac_uint4 title_used; -static ac_uint4 title_size; - -/* - * Array used to collect case mappings before adding them to a list. - */ -static ac_uint4 cases[3]; - -/* - * An array to hold ranges for combining classes. - */ -static ac_uint4 *ccl; -static ac_uint4 ccl_used; -static ac_uint4 ccl_size; - -/* - * Structures for handling numbers. - */ -typedef struct { - ac_uint4 code; - ac_uint4 idx; -} _codeidx_t; - -typedef struct { - short numerator; - short denominator; -} _num_t; - -/* - * Arrays to hold the mapping of codes to numbers. - */ -static _codeidx_t *ncodes; -static ac_uint4 ncodes_used; -static ac_uint4 ncodes_size; - -static _num_t *nums; -static ac_uint4 nums_used; -static ac_uint4 nums_size; - -/* - * Array for holding numbers. 
- */ -static _num_t *nums; -static ac_uint4 nums_used; -static ac_uint4 nums_size; - -static void -add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2) -{ - int i, j, k, len; - _ranges_t *rlp; - char *name; - - for (k = 0; k < 2; k++) { - if (k == 0) { - name = p1; - len = 2; - } else { - if (p2 == 0) - break; - - name = p2; - len = 1; - } - - for (i = 0; i < NUMPROPS; i++) { - if (props[i].len == len && memcmp(props[i].name, name, len) == 0) - break; - } - - if (i == NUMPROPS) - continue; - - rlp = &proptbl[i]; - - /* - * Resize the range list if necessary. - */ - if (rlp->used == rlp->size) { - if (rlp->size == 0) - rlp->ranges = (ac_uint4 *) - malloc(sizeof(ac_uint4) << 3); - else - rlp->ranges = (ac_uint4 *) - realloc((char *) rlp->ranges, - sizeof(ac_uint4) * (rlp->size + 8)); - rlp->size += 8; - } - - /* - * If this is the first code for this property list, just add it - * and return. - */ - if (rlp->used == 0) { - rlp->ranges[0] = start; - rlp->ranges[1] = end; - rlp->used += 2; - continue; - } - - /* - * Optimize the case of adding the range to the end. - */ - j = rlp->used - 1; - if (start > rlp->ranges[j]) { - j = rlp->used; - rlp->ranges[j++] = start; - rlp->ranges[j++] = end; - rlp->used = j; - continue; - } - - /* - * Need to locate the insertion point. - */ - for (i = 0; - i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ; - - /* - * If the start value lies in the current range, then simply set the - * new end point of the range to the end value passed as a parameter. - */ - if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) { - rlp->ranges[i + 1] = end; - return; - } - - /* - * Shift following values up by two. - */ - for (j = rlp->used; j > i; j -= 2) { - rlp->ranges[j] = rlp->ranges[j - 2]; - rlp->ranges[j + 1] = rlp->ranges[j - 1]; - } - - /* - * Add the new range at the insertion point. 
- */ - rlp->ranges[i] = start; - rlp->ranges[i + 1] = end; - rlp->used += 2; - } -} - -static void -ordered_range_insert(ac_uint4 c, char *name, int len) -{ - int i, j; - ac_uint4 s, e; - _ranges_t *rlp; - - if (len == 0) - return; - - /* - * Deal with directionality codes introduced in Unicode 3.0. - */ - if ((len == 2 && memcmp(name, "BN", 2) == 0) || - (len == 3 && - (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 || - memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 || - memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0 || - memcmp(name, "LRI", 3) == 0 || memcmp(name, "RLI", 3) == 0 || - memcmp(name, "FSI", 3) == 0 || memcmp(name, "PDI", 3) == 0))) { - /* - * Mark all of these as Other Neutral to preserve compatibility with - * older versions. - */ - len = 2; - name = "ON"; - } - - for (i = 0; i < NUMPROPS; i++) { - if (props[i].len == len && memcmp(props[i].name, name, len) == 0) - break; - } - - if (i == NUMPROPS) { - printf("Unknown property %s\n", name); - return; - } - - /* - * Have a match, so insert the code in order. - */ - rlp = &proptbl[i]; - - /* - * Resize the range list if necessary. - */ - if (rlp->used == rlp->size) { - if (rlp->size == 0) - rlp->ranges = (ac_uint4 *) - malloc(sizeof(ac_uint4) << 3); - else - rlp->ranges = (ac_uint4 *) - realloc((char *) rlp->ranges, - sizeof(ac_uint4) * (rlp->size + 8)); - rlp->size += 8; - } - - /* - * If this is the first code for this property list, just add it - * and return. - */ - if (rlp->used == 0) { - rlp->ranges[0] = rlp->ranges[1] = c; - rlp->used += 2; - return; - } - - /* - * Optimize the cases of extending the last range and adding new ranges to - * the end. - */ - j = rlp->used - 1; - e = rlp->ranges[j]; - s = rlp->ranges[j - 1]; - - if (c == e + 1) { - /* - * Extend the last range. - */ - rlp->ranges[j] = c; - return; - } - - if (c > e + 1) { - /* - * Start another range on the end. 
- */ - j = rlp->used; - rlp->ranges[j] = rlp->ranges[j + 1] = c; - rlp->used += 2; - return; - } - - if (c >= s) - /* - * The code is a duplicate of a code in the last range, so just return. - */ - return; - - /* - * The code should be inserted somewhere before the last range in the - * list. Locate the insertion point. - */ - for (i = 0; - i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ; - - s = rlp->ranges[i]; - e = rlp->ranges[i + 1]; - - if (c == e + 1) - /* - * Simply extend the current range. - */ - rlp->ranges[i + 1] = c; - else if (c < s) { - /* - * Add a new entry before the current location. Shift all entries - * before the current one up by one to make room. - */ - for (j = rlp->used; j > i; j -= 2) { - rlp->ranges[j] = rlp->ranges[j - 2]; - rlp->ranges[j + 1] = rlp->ranges[j - 1]; - } - rlp->ranges[i] = rlp->ranges[i + 1] = c; - - rlp->used += 2; - } -} - -static void -add_decomp(ac_uint4 code, short compat) -{ - ac_uint4 i, j, size; - _decomp_t **pdecomps; - ac_uint4 *pdecomps_used; - ac_uint4 *pdecomps_size; - - if (compat) { - pdecomps = &kdecomps; - pdecomps_used = &kdecomps_used; - pdecomps_size = &kdecomps_size; - } else { - pdecomps = &decomps; - pdecomps_used = &decomps_used; - pdecomps_size = &decomps_size; - } - - /* - * Add the code to the composite property. - */ - if (!compat) { - ordered_range_insert(code, "Cm", 2); - } - - /* - * Locate the insertion point for the code. - */ - for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ; - - /* - * Allocate space for a new decomposition. 
- */ - if (*pdecomps_used == *pdecomps_size) { - if (*pdecomps_size == 0) - *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3); - else - *pdecomps = (_decomp_t *) - realloc((char *) *pdecomps, - sizeof(_decomp_t) * (*pdecomps_size + 8)); - (void) memset((char *) (*pdecomps + *pdecomps_size), '\0', - sizeof(_decomp_t) << 3); - *pdecomps_size += 8; - } - - if (i < *pdecomps_used && code != (*pdecomps)[i].code) { - /* - * Shift the decomps up by one if the codes don't match. - */ - for (j = *pdecomps_used; j > i; j--) - (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1], - sizeof(_decomp_t)); - } - - /* - * Insert or replace a decomposition. - */ - size = dectmp_size + (4 - (dectmp_size & 3)); - if ((*pdecomps)[i].size < size) { - if ((*pdecomps)[i].size == 0) - (*pdecomps)[i].decomp = (ac_uint4 *) - malloc(sizeof(ac_uint4) * size); - else - (*pdecomps)[i].decomp = (ac_uint4 *) - realloc((char *) (*pdecomps)[i].decomp, - sizeof(ac_uint4) * size); - (*pdecomps)[i].size = size; - } - - if ((*pdecomps)[i].code != code) - (*pdecomps_used)++; - - (*pdecomps)[i].code = code; - (*pdecomps)[i].used = dectmp_size; - (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp, - sizeof(ac_uint4) * dectmp_size); - - /* - * NOTICE: This needs changing later so it is more general than simply - * pairs. This calculation is done here to simplify allocation elsewhere. - */ - if (!compat && dectmp_size == 2) - comps_used++; -} - -static void -add_to_title(ac_uint4 code) -{ - ac_uint4 i, j; - - if (title_used == title_size) { - if (title_size == 0) - title = (_case_t *) malloc(sizeof(_case_t) << 3); - else - title = (_case_t *) realloc((char *) title, - sizeof(_case_t) * (title_size + 8)); - title_size += 8; - } - - /* - * Locate the insertion point. - */ - for (i = 0; i < title_used && code > title[i].key; i++) ; - - if (i < title_used) { - /* - * Shift the array up by one. 
- */ - for (j = title_used; j > i; j--) - (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1], - sizeof(_case_t)); - } - - title[i].key = code; - title[i].other = cases[2]; /* Title */ - - title_used++; -} - -static void -add_to_upper(ac_uint4 code) -{ - ac_uint4 i, j; - - if (upper_used == upper_size) { - if (upper_size == 0) - upper = (_case_t *) malloc(sizeof(_case_t) << 3); - else - upper = (_case_t *) realloc((char *) upper, - sizeof(_case_t) * (upper_size + 8)); - upper_size += 8; - } - - /* - * Locate the insertion point. - */ - for (i = 0; i < upper_used && code > upper[i].key; i++) ; - - if (i < upper_used) { - /* - * Shift the array up by one. - */ - for (j = upper_used; j > i; j--) - (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1], - sizeof(_case_t)); - } - - upper[i].key = code; - upper[i].other = cases[0]; /* Upper */ - - upper_used++; -} - -static void -add_to_lower(ac_uint4 code) -{ - ac_uint4 i, j; - - if (lower_used == lower_size) { - if (lower_size == 0) - lower = (_case_t *) malloc(sizeof(_case_t) << 3); - else - lower = (_case_t *) realloc((char *) lower, - sizeof(_case_t) * (lower_size + 8)); - lower_size += 8; - } - - /* - * Locate the insertion point. - */ - for (i = 0; i < lower_used && code > lower[i].key; i++) ; - - if (i < lower_used) { - /* - * Shift the array up by one. - */ - for (j = lower_used; j > i; j--) - (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1], - sizeof(_case_t)); - } - - lower[i].key = code; - lower[i].other = cases[1]; /* Lower */ - - lower_used++; -} - -static void -ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code) -{ - ac_uint4 i, j; - - if (ccl_used == ccl_size) { - if (ccl_size == 0) - ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24); - else - ccl = (ac_uint4 *) - realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24)); - ccl_size += 24; - } - - /* - * Optimize adding the first item. 
- */ - if (ccl_used == 0) { - ccl[0] = ccl[1] = c; - ccl[2] = ccl_code; - ccl_used += 3; - return; - } - - /* - * Handle the special case of extending the range on the end. This - * requires that the combining class codes are the same. - */ - if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) { - ccl[ccl_used - 2] = c; - return; - } - - /* - * Handle the special case of adding another range on the end. - */ - if (c > ccl[ccl_used - 2] + 1 || - (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) { - ccl[ccl_used++] = c; - ccl[ccl_used++] = c; - ccl[ccl_used++] = ccl_code; - return; - } - - /* - * Locate either the insertion point or range for the code. - */ - for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ; - - if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) { - /* - * Extend an existing range. - */ - ccl[i + 1] = c; - return; - } else if (c < ccl[i]) { - /* - * Start a new range before the current location. - */ - for (j = ccl_used; j > i; j -= 3) { - ccl[j] = ccl[j - 3]; - ccl[j - 1] = ccl[j - 4]; - ccl[j - 2] = ccl[j - 5]; - } - ccl[i] = ccl[i + 1] = c; - ccl[i + 2] = ccl_code; - } -} - -/* - * Adds a number if it does not already exist and returns an index value - * multiplied by 2. - */ -static ac_uint4 -make_number(short num, short denom) -{ - ac_uint4 n; - - /* - * Determine if the number already exists. - */ - for (n = 0; n < nums_used; n++) { - if (nums[n].numerator == num && nums[n].denominator == denom) - return n << 1; - } - - if (nums_used == nums_size) { - if (nums_size == 0) - nums = (_num_t *) malloc(sizeof(_num_t) << 3); - else - nums = (_num_t *) realloc((char *) nums, - sizeof(_num_t) * (nums_size + 8)); - nums_size += 8; - } - - n = nums_used++; - nums[n].numerator = num; - nums[n].denominator = denom; - - return n << 1; -} - -static void -add_number(ac_uint4 code, short num, short denom) -{ - ac_uint4 i, j; - - /* - * Insert the code in order. 
- */ - for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ; - - /* - * Handle the case of the codes matching and simply replace the number - * that was there before. - */ - if (i < ncodes_used && code == ncodes[i].code) { - ncodes[i].idx = make_number(num, denom); - return; - } - - /* - * Resize the array if necessary. - */ - if (ncodes_used == ncodes_size) { - if (ncodes_size == 0) - ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3); - else - ncodes = (_codeidx_t *) - realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8)); - - ncodes_size += 8; - } - - /* - * Shift things around to insert the code if necessary. - */ - if (i < ncodes_used) { - for (j = ncodes_used; j > i; j--) { - ncodes[j].code = ncodes[j - 1].code; - ncodes[j].idx = ncodes[j - 1].idx; - } - } - ncodes[i].code = code; - ncodes[i].idx = make_number(num, denom); - - ncodes_used++; -} - -/* - * This routine assumes that the line is a valid Unicode Character Database - * entry. - */ -static void -read_cdata(FILE *in) -{ - ac_uint4 i, lineno, skip, code, ccl_code; - short wnum, neg, number[2], compat; - char line[512], *s, *e; - - lineno = skip = 0; - while (fgets(line, sizeof(line), in)) { - if( (s=strchr(line, '\n')) ) *s = '\0'; - lineno++; - - /* - * Skip blank lines and lines that start with a '#'. - */ - if (line[0] == 0 || line[0] == '#') - continue; - - /* - * If lines need to be skipped, do it here. - */ - if (skip) { - skip--; - continue; - } - - /* - * Collect the code. The code can be up to 6 hex digits in length to - * allow surrogates to be specified. - */ - for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) { - code <<= 4; - if (*s >= '0' && *s <= '9') - code += *s - '0'; - else if (*s >= 'A' && *s <= 'F') - code += (*s - 'A') + 10; - else if (*s >= 'a' && *s <= 'f') - code += (*s - 'a') + 10; - } - - /* - * Handle the following special cases: - * 1. 4E00-9FA5 CJK Ideographs. - * 2. AC00-D7A3 Hangul Syllables. - * 3. D800-DFFF Surrogates. - * 4. 
E000-F8FF Private Use Area. - * 5. F900-FA2D Han compatibility. - * ...Plus additional ranges in newer Unicode versions... - */ - switch (code) { - case 0x3400: - /* CJK Ideograph Extension A */ - add_range(0x3400, 0x4db5, "Lo", "L"); - - add_range(0x3400, 0x4db5, "Cp", 0); - - skip = 1; - break; - case 0x4e00: - /* - * The Han ideographs. - */ - add_range(0x4e00, 0x9fff, "Lo", "L"); - - /* - * Add the characters to the defined category. - */ - add_range(0x4e00, 0x9fea, "Cp", 0); - - skip = 1; - break; - case 0xac00: - /* - * The Hangul syllables. - */ - add_range(0xac00, 0xd7a3, "Lo", "L"); - - /* - * Add the characters to the defined category. - */ - add_range(0xac00, 0xd7a3, "Cp", 0); - - skip = 1; - break; - case 0xd800: - /* - * Make a range of all surrogates and assume some default - * properties. - */ - add_range(0xd800, 0xdfff, "Cs", "L"); - skip = 5; - break; - case 0xe000: - /* - * The Private Use area. Add with a default set of properties. - */ - add_range(0xe000, 0xf8ff, "Co", "L"); - skip = 1; - break; - case 0x20000: - /* CJK Ideograph Extension B */ - add_range(0x20000, 0x2a6d6, "Lo", "L"); - - add_range(0x20000, 0x2a6d6, "Cp", 0); - - skip = 1; - break; - case 0xf0000: - /* Plane 15 private use */ - add_range(0xf0000, 0xffffd, "Co", "L"); - skip = 1; - break; - - case 0x100000: - /* Plane 16 private use */ - add_range(0x100000, 0x10fffd, "Co", "L"); - skip = 1; - break; - } - - if (skip) - continue; - - /* - * Add the code to the defined category. - */ - ordered_range_insert(code, "Cp", 2); - - /* - * Locate the first character property field. - */ - for (i = 0; *s != 0 && i < 2; s++) { - if (*s == ';') - i++; - } - for (e = s; *e && *e != ';'; e++) ; - - ordered_range_insert(code, s, e - s); - - /* - * Locate the combining class code. - */ - for (s = e; *s != 0 && i < 3; s++) { - if (*s == ';') - i++; - } - - /* - * Convert the combining class code from decimal. 
- */ - for (ccl_code = 0, e = s; *e && *e != ';'; e++) - ccl_code = (ccl_code * 10) + (*e - '0'); - - /* - * Add the code if it not 0. - */ - if (ccl_code != 0) - ordered_ccl_insert(code, ccl_code); - - /* - * Locate the second character property field. - */ - for (s = e; *s != 0 && i < 4; s++) { - if (*s == ';') - i++; - } - for (e = s; *e && *e != ';'; e++) ; - - ordered_range_insert(code, s, e - s); - - /* - * Check for a decomposition. - */ - s = ++e; - if (*s != ';') { - compat = *s == '<'; - if (compat) { - /* - * Skip compatibility formatting tag. - */ - while (*s++ != '>'); - } - /* - * Collect the codes of the decomposition. - */ - for (dectmp_size = 0; *s != ';'; ) { - /* - * Skip all leading non-hex digits. - */ - while (!ishdigit(*s)) - s++; - - for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) { - dectmp[dectmp_size] <<= 4; - if (*s >= '0' && *s <= '9') - dectmp[dectmp_size] += *s - '0'; - else if (*s >= 'A' && *s <= 'F') - dectmp[dectmp_size] += (*s - 'A') + 10; - else if (*s >= 'a' && *s <= 'f') - dectmp[dectmp_size] += (*s - 'a') + 10; - } - dectmp_size++; - } - - /* - * If there are any codes in the temporary decomposition array, - * then add the character with its decomposition. - */ - if (dectmp_size > 0) { - if (!compat) { - add_decomp(code, 0); - } - add_decomp(code, 1); - } - } - - /* - * Skip to the number field. - */ - for (i = 0; i < 3 && *s; s++) { - if (*s == ';') - i++; - } - - /* - * Scan the number in. - */ - number[0] = number[1] = 0; - for (e = s, neg = wnum = 0; *e && *e != ';'; e++) { - if (*e == '-') { - neg = 1; - continue; - } - - if (*e == '/') { - /* - * Move the denominator of the fraction. - */ - if (neg) - number[wnum] *= -1; - neg = 0; - e++; - wnum++; - } - number[wnum] = (number[wnum] * 10) + (*e - '0'); - } - - if (e > s) { - /* - * Adjust the denominator in case of integers and add the number. 
- */ - if (wnum == 0) - number[1] = 1; - - add_number(code, number[0], number[1]); - } - - /* - * Skip to the start of the possible case mappings. - */ - for (s = e, i = 0; i < 4 && *s; s++) { - if (*s == ';') - i++; - } - - /* - * Collect the case mappings. - */ - cases[0] = cases[1] = cases[2] = 0; - for (i = 0; i < 3; i++) { - while (ishdigit(*s)) { - cases[i] <<= 4; - if (*s >= '0' && *s <= '9') - cases[i] += *s - '0'; - else if (*s >= 'A' && *s <= 'F') - cases[i] += (*s - 'A') + 10; - else if (*s >= 'a' && *s <= 'f') - cases[i] += (*s - 'a') + 10; - s++; - } - if (*s == ';') - s++; - } - if (cases[2]) - add_to_title(code); - if (cases[1]) - add_to_lower(code); - if (cases[0]) - add_to_upper(code); - } -} - -#if 0 - -static _decomp_t * -find_decomp(ac_uint4 code, short compat) -{ - long l, r, m; - _decomp_t *decs; - - l = 0; - r = (compat ? kdecomps_used : decomps_used) - 1; - decs = compat ? kdecomps : decomps; - while (l <= r) { - m = (l + r) >> 1; - if (code > decs[m].code) - l = m + 1; - else if (code < decs[m].code) - r = m - 1; - else - return &decs[m]; - } - return 0; -} - -static void -decomp_it(_decomp_t *d, short compat) -{ - ac_uint4 i; - _decomp_t *dp; - - for (i = 0; i < d->used; i++) { - if ((dp = find_decomp(d->decomp[i], compat)) != 0) - decomp_it(dp, compat); - else - dectmp[dectmp_size++] = d->decomp[i]; - } -} - - -/* - * Expand all decompositions by recursively decomposing each character - * in the decomposition. 
- */ -static void -expand_decomp(void) -{ - ac_uint4 i; - - for (i = 0; i < decomps_used; i++) { - dectmp_size = 0; - decomp_it(&decomps[i], 0); - if (dectmp_size > 0) - add_decomp(decomps[i].code, 0); - } - - for (i = 0; i < kdecomps_used; i++) { - dectmp_size = 0; - decomp_it(&kdecomps[i], 1); - if (dectmp_size > 0) - add_decomp(kdecomps[i].code, 1); - } -} - -static int -cmpcomps(const void *v_comp1, const void *v_comp2) -{ - const _comp_t *comp1 = v_comp1, *comp2 = v_comp2; - long diff = comp1->code1 - comp2->code1; - - if (!diff) - diff = comp1->code2 - comp2->code2; - return (int) diff; -} - -#endif - -/* - * Load composition exclusion data - */ -static void -read_compexdata(FILE *in) -{ - ac_uint2 i; - ac_uint4 code; - char line[512], *s; - - (void) memset((char *) compexs, 0, sizeof(compexs)); - - while (fgets(line, sizeof(line), in)) { - if( (s=strchr(line, '\n')) ) *s = '\0'; - /* - * Skip blank lines and lines that start with a '#'. - */ - if (line[0] == 0 || line[0] == '#') - continue; - - /* - * Collect the code. 
Assume max 6 digits - */ - - for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) { - if (isspace((unsigned char)*s)) break; - code <<= 4; - if (*s >= '0' && *s <= '9') - code += *s - '0'; - else if (*s >= 'A' && *s <= 'F') - code += (*s - 'A') + 10; - else if (*s >= 'a' && *s <= 'f') - code += (*s - 'a') + 10; - } - COMPEX_SET(code); - } -} - -#if 0 - -/* - * Creates array of compositions from decomposition array - */ -static void -create_comps(void) -{ - ac_uint4 i, cu; - - comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t)); - - for (i = cu = 0; i < decomps_used; i++) { - if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code)) - continue; - comps[cu].comp = decomps[i].code; - comps[cu].count = 2; - comps[cu].code1 = decomps[i].decomp[0]; - comps[cu].code2 = decomps[i].decomp[1]; - cu++; - } - comps_used = cu; - qsort(comps, comps_used, sizeof(_comp_t), cmpcomps); -} - -#endif - -#if HARDCODE_DATA -static void -write_case(FILE *out, _case_t *tab, int num, int first) -{ - int i; - - for (i=0; i 0) { - for (j=0; j 0) - fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4), - proptbl[i].used, out); - } - - fclose(out); -#endif - - /***************************************************************** - * - * Generate the case mapping data. - * - *****************************************************************/ - -#if HARDCODE_DATA - fprintf(out, PREF "unsigned int _uccase_size = %ld;\n\n", - (long) (upper_used + lower_used + title_used)); - - fprintf(out, - "/* Starting indexes of the case tables\n" - " * UpperIndex = 0\n" - " * LowerIndex = _uccase_len[0]\n" - " * TitleIndex = LowerIndex + _uccase_len[1] */\n\n"); - fprintf(out, PREF "unsigned short _uccase_len[2] = {%ld, %ld};\n\n", - (long) upper_used, (long) lower_used); - fprintf(out, PREF "unsigned int _uccase_map[] = {"); - - if (upper_used > 0) - /* - * Write the to-upper case table. - */ - write_case(out, upper, upper_used, 1); - - if (lower_used > 0) - /* - * Write the to-lower case table. 
- */ - write_case(out, lower, lower_used, !upper_used); - - if (title_used > 0) - /* - * Write the to-title case table. - */ - write_case(out, title, title_used, !(upper_used||lower_used)); - - if (!(upper_used || lower_used || title_used)) - fprintf(out, "\t0"); - - fprintf(out, "\n};\n\n"); -#else - /* - * Open the case.dat file. - */ - snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath); - if ((out = fopen(path, "wb")) == 0) - return; - - /* - * Write the case mapping tables. - */ - hdr[1] = upper_used + lower_used + title_used; - casecnt[0] = upper_used; - casecnt[1] = lower_used; - - /* - * Write the header. - */ - fwrite((char *) hdr, sizeof(ac_uint2), 2, out); - - /* - * Write the upper and lower case table sizes. - */ - fwrite((char *) casecnt, sizeof(ac_uint2), 2, out); - - if (upper_used > 0) - /* - * Write the upper case table. - */ - fwrite((char *) upper, sizeof(_case_t), upper_used, out); - - if (lower_used > 0) - /* - * Write the lower case table. - */ - fwrite((char *) lower, sizeof(_case_t), lower_used, out); - - if (title_used > 0) - /* - * Write the title case table. - */ - fwrite((char *) title, sizeof(_case_t), title_used, out); - - fclose(out); -#endif - -#if 0 - - /***************************************************************** - * - * Generate the composition data. - * - *****************************************************************/ - - /* - * Create compositions from decomposition data - */ - create_comps(); - -#if HARDCODE_DATA - fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n", - comps_used * 4L); - - fprintf(out, PREF "ac_uint4 _uccomp_data[] = {"); - - /* - * Now, if comps exist, write them out. - */ - if (comps_used > 0) { - for (i=0; i 0) - fwrite((char *) comps, sizeof(_comp_t), comps_used, out); - - fclose(out); -#endif - - /***************************************************************** - * - * Generate the decomposition data. 
- * - *****************************************************************/ - - /* - * Fully expand all decompositions before generating the output file. - */ - expand_decomp(); - -#if HARDCODE_DATA - fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n", - decomps_used * 2L); - - fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {"); - - if (decomps_used) { - /* - * Write the list of decomp nodes. - */ - for (i = idx = 0; i < decomps_used; i++) { - fprintf(out, "\n\t0x%08lx, 0x%08lx,", - (unsigned long) decomps[i].code, (unsigned long) idx); - idx += decomps[i].used; - } - - /* - * Write the sentinel index as the last decomp node. - */ - fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx); - - fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {"); - /* - * Write the decompositions themselves. - */ - k = 0; - for (i = 0; i < decomps_used; i++) - for (j=0; j 0) { - /* - * Write the combining class ranges out. - */ - for (i = 0; i 0) - /* - * Write the combining class ranges out. - */ - fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out); - - fclose(out); -#endif - - /***************************************************************** - * - * Generate the number data. - * - *****************************************************************/ - -#if HARDCODE_DATA - fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n", - (unsigned long)ncodes_used<<1); - - fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {"); - - /* - * Now, if number mappings exist, write them out. 
- */ - if (ncodes_used > 0) { - for (i = 0; i 0) { - fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out); - fwrite((char *) nums, sizeof(_num_t), nums_used, out); - } -#endif - -#endif - - fclose(out); -} - -static void -usage(char *prog) -{ - fprintf(stderr, - "Usage: %s [-o output-directory|-x composition-exclusions]", prog); - fprintf(stderr, " datafile1 datafile2 ...\n\n"); - fprintf(stderr, - "-o output-directory\n\t\tWrite the output files to a different"); - fprintf(stderr, " directory (default: .).\n"); - fprintf(stderr, - "-x composition-exclusion\n\t\tFile of composition codes"); - fprintf(stderr, " that should be excluded.\n"); - exit(1); -} - -int -main(int argc, char *argv[]) -{ - FILE *in; - char *prog, *opath; - - prog = argv[1]; - - opath = 0; - in = stdin; - - argc--; - argv++; - - while (argc > 0) { - if (argv[0][0] == '-') { - switch (argv[0][1]) { - case 'o': - argc--; - argv++; - opath = argv[0]; - break; - case 'x': - argc--; - argv++; - if ((in = fopen(argv[0], "r")) == 0) - fprintf(stderr, - "%s: unable to open composition exclusion file %s\n", - prog, argv[0]); - else { - read_compexdata(in); - fclose(in); - in = 0; - } - break; - default: - usage(prog); - } - } else { - if (in != stdin && in != NULL) - fclose(in); - if ((in = fopen(argv[0], "r")) == 0) - fprintf(stderr, "%s: unable to open ctype file %s\n", - prog, argv[0]); - else { - read_cdata(in); - fclose(in); - in = 0; - } - } - argc--; - argv++; - } - - if (opath == 0) - opath = "."; - write_cdata(opath); - - return 0; -} diff --git a/ext/mbstring/ucgendat/ucgendat.php b/ext/mbstring/ucgendat/ucgendat.php new file mode 100644 index 0000000000..e3b66646b5 --- /dev/null +++ b/ext/mbstring/ucgendat/ucgendat.php @@ -0,0 +1,439 @@ +. 
+ */ + +/* Copyright 2001 Computing Research Labs, New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +if ($argc != 2) { + echo "Usage: php ucgendata.php UnicodeData.txt\n"; + return; +} + +$inputFile = $argv[1]; +if (!file_exists($inputFile)) { + echo "File $inputFile does not exist.\n"; + return; +} + +$outputFile = __DIR__ . 
"/../unicode_data.h"; + +$data = parseUnicodeData(file_get_contents($inputFile)); +file_put_contents($outputFile, generateData($data)); + +class Range { + public $start; + public $end; + + public function __construct(int $start, int $end) { + $this->start = $start; + $this->end = $end; + } +} + +class UnicodeData { + const TO_UPPER = 0; + const TO_LOWER = 1; + const TO_TITLE = 2; + + public $propIndexes; + public $numProps; + public $propRanges; + public $caseMaps; + + public function __construct() { + /* + * List of properties expected to be found in the Unicode Character Database + * including some implementation specific properties. + * + * The implementation specific properties are: + * Cm = Composed (can be decomposed) + * Nb = Non-breaking + * Sy = Symmetric (has left and right forms) + * Hd = Hex digit + * Qm = Quote marks + * Mr = Mirroring + * Ss = Space, other + * Cp = Defined character + */ + $this->propIndexes = array_flip([ + "Mn", "Mc", "Me", "Nd", "Nl", "No", + "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", + "Co", "Cn", "Lu", "Ll", "Lt", "Lm", + "Lo", "Pc", "Pd", "Ps", "Pe", "Po", + "Sm", "Sc", "Sk", "So", "L", "R", + "EN", "ES", "ET", "AN", "CS", "B", + "S", "WS", "ON", + "Cm", "Nb", "Sy", "Hd", "Qm", "Mr", + "Ss", "Cp", "Pi", "Pf", "AL" + ]); + $this->numProps = count($this->propIndexes); + + $this->propRanges = array_fill(0, $this->numProps, []); + $this->caseMaps = [ + self::TO_UPPER => [], + self::TO_LOWER => [], + self::TO_TITLE => [], + ]; + } + + function propToIndex(string $prop) : int { + /* Deal with directionality codes introduced in Unicode 3.0. */ + if (in_array($prop, ["BN", "NSM", "PDF", "LRE", "LRO", "RLE", "RLO", "LRI", "RLI", "FSI", "PDI"])) { + /* + * Mark all of these as Other Neutral to preserve compatibility with + * older versions. 
+ */ + $prop = "ON"; + } + + if (!isset($this->propIndexes[$prop])) { + throw new Exception("Unknown property $prop"); + } + + return $this->propIndexes[$prop]; + } + + public function addProp(int $code, string $prop) { + $propIdx = self::propToIndex($prop); + + // Check if this extends the last range + $ranges = $this->propRanges[$propIdx]; + if (!empty($ranges)) { + $lastRange = $ranges[count($ranges) - 1]; + if ($code === $lastRange->end + 1) { + $lastRange->end++; + return; + } + } + + $this->propRanges[$propIdx][] = new Range($code, $code); + } + + public function addPropRange(int $startCode, int $endCode, string $prop) { + $propIdx = self::propToIndex($prop); + $this->propRanges[$propIdx][] = new Range($startCode, $endCode); + } + + public function addCaseMapping(int $case, int $origCode, int $mappedCode) { + $this->caseMaps[$case][$origCode] = $mappedCode; + } + + public function compactRangeArray(array $ranges) : array { + // Sort by start codepoint + usort($ranges, function (Range $r1, Range $r2) { + return $r1->start <=> $r2->start; + }); + + $lastRange = new Range(-1, -1); + $newRanges = []; + foreach ($ranges as $range) { + if ($lastRange->end == -1) { + $lastRange = $range; + } else if ($range->start == $lastRange->end + 1) { + // TODO: This check can be dropped. It only makes sure the output is + // the same as previously. 
+ if ($range->end != $range->start + 1) { + $newRanges[] = $lastRange; + $lastRange = $range; + continue; + } + + $lastRange->end = $range->end; + } else if ($range->start > $lastRange->end + 1) { + $newRanges[] = $lastRange; + $lastRange = $range; + } else { + throw new Exception(sprintf( + "Overlapping ranges [%x, %x] and [%x, %x]", + $lastRange->start, $lastRange->end, + $range->start, $range->end + )); + } + } + if ($lastRange->end != -1) { + $newRanges[] = $lastRange; + } + return $newRanges; + } + + public function compactPropRanges() { + foreach ($this->propRanges as &$ranges) { + $ranges = $this->compactRangeArray($ranges); + } + } +} + +function parseUnicodeData($input) { + $data = new UnicodeData; + $lines = array_map('trim', explode("\n", $input)); + + $skip = 0; + $i = 0; + foreach ($lines as $line) { + // Skip empty lines and comments + if ($line === '' || $line[0] === '#') { + continue; + } + + if ($skip) { + $skip--; + continue; + } + + $fields = explode(';', $line); + if (count($fields) != 15) { + throw new Exception("Line does not contain 15 fields"); + } + + $code = intval($fields[0], 16); + + /* + * Handle the following special cases: + * 1. 4E00-9FA5 CJK Ideographs. + * 2. AC00-D7A3 Hangul Syllables. + * 3. D800-DFFF Surrogates. + * 4. E000-F8FF Private Use Area. + * 5. F900-FA2D Han compatibility. + * ...Plus additional ranges in newer Unicode versions... + */ + switch ($code) { + case 0x3400: + /* CJK Ideograph Extension A */ + $data->addPropRange(0x3400, 0x4db5, "Lo"); + $data->addPropRange(0x3400, 0x4db5, "L"); + $data->addPropRange(0x3400, 0x4db5, "Cp"); + $skip = 1; + break; + case 0x4e00: + /* The Han ideographs. */ + $data->addPropRange(0x4e00, 0x9fff, "Lo"); + $data->addPropRange(0x4e00, 0x9fff, "L"); + $data->addPropRange(0x4e00, 0x9fea, "Cp"); + $skip = 1; + break; + case 0xac00: + /* The Hangul syllables. 
*/ + $data->addPropRange(0xac00, 0xd7a3, "Lo"); + $data->addPropRange(0xac00, 0xd7a3, "L"); + $data->addPropRange(0xac00, 0xd7a3, "Cp"); + $skip = 1; + break; + case 0xd800: + /* + * Make a range of all surrogates and assume some default + * properties. + */ + $data->addPropRange(0xd800, 0xdfff, "Cs"); + $data->addPropRange(0xd800, 0xdfff, "L"); + $skip = 5; + break; + case 0xe000: + /* The Private Use area. Add with a default set of properties. */ + $data->addPropRange(0xe000, 0xf8ff, "Co"); + $data->addPropRange(0xe000, 0xf8ff, "L"); + $skip = 1; + break; + case 0x20000: + /* CJK Ideograph Extension B */ + $data->addPropRange(0x20000, 0x2a6d6, "Lo"); + $data->addPropRange(0x20000, 0x2a6d6, "L"); + $data->addPropRange(0x20000, 0x2a6d6, "Cp"); + $skip = 1; + break; + case 0xf0000: + /* Plane 15 private use */ + $data->addPropRange(0xf0000, 0xffffd, "Co"); + $data->addPropRange(0xf0000, 0xffffd, "L"); + $skip = 1; + break; + case 0x100000: + /* Plane 16 private use */ + $data->addPropRange(0x100000, 0x10fffd, "Co"); + $data->addPropRange(0x100000, 0x10fffd, "L"); + $skip = 1; + break; + } + + if ($skip) { + continue; + } + + /* Add the code to the defined category. 
*/ + $data->addProp($code, "Cp"); + + $generalCategory = $fields[2]; + $data->addProp($code, $generalCategory); + + $bidiClass = $fields[4]; + $data->addProp($code, $bidiClass); + + $composition = $fields[5]; + if ($composition && $composition[0] != '<') { + $data->addProp($code, "Cm"); + } + + $upperCase = intval($fields[12], 16); + $lowerCase = intval($fields[13], 16); + $titleCase = intval($fields[14], 16); + if ($upperCase) { + $data->addCaseMapping(UnicodeData::TO_UPPER, $code, $upperCase); + } + if ($lowerCase) { + $data->addCaseMapping(UnicodeData::TO_LOWER, $code, $lowerCase); + } + if ($titleCase) { + $data->addCaseMapping(UnicodeData::TO_TITLE, $code, $titleCase); + } + } + + return $data; +} + +function formatArray(array $values, int $width, int $hexWidth) : string { + $result = ''; + $i = 0; + $c = count($values); + for ($i = 0; $i < $c; $i++) { + if ($i != 0) { + $result .= ','; + } + + $result .= $i % $width == 0 ? "\n\t" : " "; + $result .= sprintf("0x%0" . $hexWidth . "x", $values[$i]); + } + return $result; +} + +function formatShortARray(array $values, int $width) : string { + return formatArray($values, $width, 4); +} +function formatIntArray(array $values, int $width) : string { + return formatArray($values, $width, 8); +} + +function generatePropData(UnicodeData $data) { + $data->compactPropRanges(); + + $propOffsets = []; + $idx = 0; + foreach ($data->propRanges as $ranges) { + $num = count($ranges); + $propOffsets[] = $num ? $idx : 0xffff; + $idx += 2*$num; + } + + // Add sentinel for binary search + $propOffsets[] = $idx; + + // TODO ucgendat.c pads the prop offsets to the next multiple of 4 + // for rather dubious reasons of alignment. 
This should probably be + // dropped + while (count($propOffsets) % 4 != 0) { + $propOffsets[] = 0; + } + + $totalRanges = $idx; + + $result = ""; + $result .= "static const unsigned short _ucprop_size = $data->numProps;\n\n"; + $result .= "static const unsigned short _ucprop_offsets[] = {"; + $result .= formatShortArray($propOffsets, 8); + $result .= "\n};\n\n"; + + $values = []; + foreach ($data->propRanges as $ranges) { + foreach ($ranges as $range) { + $values[] = $range->start; + $values[] = $range->end; + } + } + + $result .= "static const unsigned int _ucprop_ranges[] = {"; + $result .= formatIntArray($values, 4); + $result .= "\n};\n\n"; + return $result; +} + +function generateCaseData(UnicodeData $data) { + $numUpper = count($data->caseMaps[UnicodeData::TO_UPPER]); + $numLower = count($data->caseMaps[UnicodeData::TO_LOWER]); + $numTitle = count($data->caseMaps[UnicodeData::TO_TITLE]); + + $result = ""; + $result .= sprintf("static const unsigned int _uccase_size = %d;\n\n", + $numUpper + $numLower + $numTitle); + + $result .= <<<'HEADER' +/* Starting indexes of the case tables + * UpperIndex = 0 + * LowerIndex = _uccase_len[0] + * TitleIndex = LowerIndex + _uccase_len[1] */ +HEADER; + $result .= "\n\n"; + $result .= sprintf("static const unsigned short _uccase_len[2] = {%d, %d};\n\n", + $numUpper, $numLower); + + $values = []; + foreach ($data->caseMaps as $map) { + foreach ($map as $orig => $mapped) { + $values[] = $orig; + $values[] = $mapped; + } + } + + $result .= "static const unsigned int _uccase_map[] = {"; + $result .= formatIntArray($values, 2); + $result .= "\n};\n\n"; + return $result; +} + +function generateData(UnicodeData $data) { + $result = <<<'HEADER' +/* This file was generated from a modified version UCData's ucgendat. + * + * DO NOT EDIT THIS FILE! 
+ * + * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download + * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt + * files from http://www.unicode.org/Public/ and run this program. + * + * More information can be found in the UCData package. Unfortunately, + * the project's page doesn't seem to be live anymore, so you can use + * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */ +HEADER; + $result .= "\n\n" . generatePropData($data); + $result .= generateCaseData($data); + + return $result; +}