From c25708b4c3e33d91656e51d6af39fa623c808d5b Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Wed, 14 Feb 2018 23:55:39 +0000 Subject: [PATCH] ICU-13569 refresh dev branch from trunk. X-SVN-Rev: 40917 --- .gitattributes | 1 - .gitignore | 2 + icu4c/LICENSE | 29 ++++ icu4c/packaging/distrelease.ps1 | 104 ++++++------ icu4c/source/common/ucnv_u8.cpp | 45 ++---- icu4c/source/common/unicode/platform.h | 2 +- icu4c/source/common/unicode/utf8.h | 50 +++++- icu4c/source/common/utf_impl.cpp | 87 +++++----- icu4c/source/config/dist.mk | 14 +- icu4c/source/i18n/islamcal.cpp | 2 +- icu4c/source/i18n/nfrs.cpp | 11 +- icu4c/source/i18n/nfrs.h | 2 +- icu4c/source/i18n/nfrule.cpp | 6 + icu4c/source/i18n/nfrule.h | 2 + icu4c/source/i18n/nfsubs.cpp | 21 ++- icu4c/source/i18n/nfsubs.h | 1 + icu4c/source/i18n/number_fluent.cpp | 8 +- icu4c/source/i18n/number_grouping.cpp | 2 +- icu4c/source/i18n/rbnf.cpp | 2 +- icu4c/source/i18n/unicode/numberformatter.h | 36 +++-- icu4c/source/test/cintltst/utf8tst.c | 60 +++++++ icu4c/source/test/intltest/calregts.cpp | 36 ++++- icu4c/source/test/intltest/calregts.h | 1 + icu4c/source/test/intltest/itrbnf.cpp | 20 +++ icu4c/source/test/intltest/itrbnf.h | 1 + .../test/intltest/numberformattesttuple.cpp | 1 + .../test/intltest/numberformattesttuple.h | 3 + icu4c/source/test/intltest/numbertest_api.cpp | 14 ++ icu4c/source/test/intltest/numfmtst.cpp | 3 + .../numberformattestspecification.txt | 151 +++++++++++------- 30 files changed, 480 insertions(+), 237 deletions(-) diff --git a/.gitattributes b/.gitattributes index d10d0ec8c26..fb59db4eca5 100644 --- a/.gitattributes +++ b/.gitattributes @@ -49,7 +49,6 @@ README text !eol *.tri2 -text icu4c/icu4c.css -text -icu4c/packaging/distrelease.ps1 -text icu4c/source/aclocal.m4 -text icu4c/source/config/m4/icu-conditional.m4 -text icu4c/source/data/curr/pool.res -text diff --git a/.gitignore b/.gitignore index 1d2af48c714..95cc42712f9 100644 --- a/.gitignore +++ b/.gitignore @@ -635,6 +635,8 @@ icu4c/source/tools/ctestfw/libsicutest* icu4c/source/tools/ctestfw/release icu4c/source/tools/ctestfw/x64 icu4c/source/tools/ctestfw/x86 +icu4c/source/tools/escapesrc/*.d +icu4c/source/tools/escapesrc/Makefile icu4c/source/tools/genbrk/*.d icu4c/source/tools/genbrk/*.o icu4c/source/tools/genbrk/*.pdb diff --git a/icu4c/LICENSE b/icu4c/LICENSE index c84076cd072..923219da903 100644 --- a/icu4c/LICENSE +++ b/icu4c/LICENSE @@ -383,3 +383,32 @@ Database section 7. # by ICANN or the IETF Trust on the database or the code. Any person # making a contribution to the database or code waives all rights to # future claims in that contribution or in the TZ Database. + +6. Google double-conversion + +Copyright 2006-2011, the V8 project authors. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/icu4c/packaging/distrelease.ps1 b/icu4c/packaging/distrelease.ps1 index 34e54bc28bc..41cd9ee4a28 100644 --- a/icu4c/packaging/distrelease.ps1 +++ b/icu4c/packaging/distrelease.ps1 @@ -1,53 +1,53 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html -#------------------------- -# Script: icu\packaging\distrelease.ps1 -# Author: Steven R. Loomis -# Date: 2017-04-14 -#------------------------- -# -# This builds a zipfile containing the *64 bit* Windows binary -# -# Usage: (after building ICU using MSVC) -# (bring up Powershell ISE) -# cd C:\icu\icu4c\ -# Set-ExecutionPolicy -Scope Process AllSigned -# .\packaging\distrelease.ps1 -# -# Will emit: c:\icu4c\icu\source\dist\icu-windows.zip -# -# -# You will get warnings from the execution policy and the script itself. -# see https://docs.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_execution_policies?view=powershell-5.1&viewFallbackFrom=powershell-Microsoft.PowerShell.Core -# for more about execution policies. - - -$icuDir = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent -$icuDir = Resolve-Path -Path '$icuDir\..' - -echo $icuDir - -# ok, create some work areas -New-Item -Path "$icuDir\source\dist" -ErrorAction SilentlyContinue -ItemType "directory" -$source = "$icuDir\source\dist\icu" -Get-ChildItem -Path $source -ErrorAction SilentlyContinue | Remove-Item -Recurse -New-Item -Path $source -ItemType "directory" -ErrorAction SilentlyContinue - -# copy required stuff -Copy-Item -Path "$icuDir\lib64" -Destination $source -Recurse -Copy-Item -Path "$icuDir\include" -Destination $source -Recurse -Copy-Item -Path "$icuDir\bin64" -Destination $source -Recurse -Copy-Item -Path "$icuDir\APIChangeReport.html" -Destination $source -Recurse -Copy-Item -Path "$icuDir\icu4c.css" -Destination $source -Recurse -Copy-Item -Path "$icuDir\LICENSE" -Destination $source -Recurse -Copy-Item -Path "$icuDir\readme.html" -Destination $source -Recurse - - -$destination = "$icuDir\source\dist\icu-windows.zip" -Remove-Item -Path $destination -ErrorAction Continue -Add-Type -assembly "system.io.compression.filesystem" -Echo $source -Echo $destination -[io.compression.zipfile]::CreateFromDirectory($source, $destination) - +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +#------------------------- +# Script: icu\packaging\distrelease.ps1 +# Author: Steven R. Loomis +# Date: 2017-04-14 +#------------------------- +# +# This builds a zipfile containing the *64 bit* Windows binary +# +# Usage: (after building ICU using MSVC) +# (bring up Powershell ISE) +# cd C:\icu\icu4c\ +# Set-ExecutionPolicy -Scope Process AllSigned +# .\packaging\distrelease.ps1 +# +# Will emit: c:\icu4c\icu\source\dist\icu-windows.zip +# +# +# You will get warnings from the execution policy and the script itself. +# see https://docs.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_execution_policies?view=powershell-5.1&viewFallbackFrom=powershell-Microsoft.PowerShell.Core +# for more about execution policies. + + +$icuDir = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent +$icuDir = Resolve-Path -Path '$icuDir\..' + +echo $icuDir + +# ok, create some work areas +New-Item -Path "$icuDir\source\dist" -ErrorAction SilentlyContinue -ItemType "directory" +$source = "$icuDir\source\dist\icu" +Get-ChildItem -Path $source -ErrorAction SilentlyContinue | Remove-Item -Recurse +New-Item -Path $source -ItemType "directory" -ErrorAction SilentlyContinue + +# copy required stuff +Copy-Item -Path "$icuDir\lib64" -Destination $source -Recurse +Copy-Item -Path "$icuDir\include" -Destination $source -Recurse +Copy-Item -Path "$icuDir\bin64" -Destination $source -Recurse +Copy-Item -Path "$icuDir\APIChangeReport.html" -Destination $source -Recurse +Copy-Item -Path "$icuDir\icu4c.css" -Destination $source -Recurse +Copy-Item -Path "$icuDir\LICENSE" -Destination $source -Recurse +Copy-Item -Path "$icuDir\readme.html" -Destination $source -Recurse + + +$destination = "$icuDir\source\dist\icu-windows.zip" +Remove-Item -Path $destination -ErrorAction Continue +Add-Type -assembly "system.io.compression.filesystem" +Echo $source +Echo $destination +[io.compression.zipfile]::CreateFromDirectory($source, $destination) + echo $destination \ No newline at end of file diff --git a/icu4c/source/common/ucnv_u8.cpp b/icu4c/source/common/ucnv_u8.cpp index 094e2dfb6f4..7089d9400c6 100644 --- a/icu4c/source/common/ucnv_u8.cpp +++ b/icu4c/source/common/ucnv_u8.cpp @@ -696,36 +696,20 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, // Use a single counter for source and target, counting the minimum of // the source length and the target capacity. // Let the standard converter handle edge cases. - const uint8_t *limit=sourceLimit; if(count>targetCapacity) { - limit-=(count-targetCapacity); count=targetCapacity; } - // The conversion loop checks count>0 only once per 1/2/3-byte character. - // If the buffer ends with a truncated 2- or 3-byte sequence, + // The conversion loop checks count>0 only once per character. + // If the buffer ends with a truncated sequence, // then we reduce the count to stop before that, // and collect the remaining bytes after the conversion loop. - { - // Do not go back into the bytes that will be read for finishing a partial - // sequence from the previous buffer. - int32_t length=count-toULimit; - if(length>0) { - uint8_t b1=*(limit-1); - if(U8_IS_SINGLE(b1)) { - // common ASCII character - } else if(U8_IS_TRAIL(b1) && length>=2) { - uint8_t b2=*(limit-2); - if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { - // truncated 3-byte sequence - count-=2; - } - } else if(0xc2<=b1 && b1<0xf0) { - // truncated 2- or 3-byte sequence - --count; - } - } - } + + // Do not go back into the bytes that will be read for finishing a partial + // sequence from the previous buffer. + int32_t length=count-toULimit; + U8_TRUNCATE_IF_INCOMPLETE(source, 0, length); + count=toULimit+length; } if(c!=0) { @@ -815,7 +799,7 @@ moreBytes: } /* copy the legal byte sequence to the target */ - if(count>=toULength) { + { int8_t i; for(i=0; isource=(char *)source; - pFromUArgs->target=(char *)target; - *pErrorCode=U_USING_DEFAULT_WARNING; - return; } } } @@ -857,8 +833,7 @@ moreBytes: utf8->toULength=toULength; utf8->mode=toULimit; break; - } else if(!U8_IS_TRAIL(b=*source)) { - /* lead byte in trail byte position */ + } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) { utf8->toULength=toULength; *pErrorCode=U_ILLEGAL_CHAR_FOUND; break; diff --git a/icu4c/source/common/unicode/platform.h b/icu4c/source/common/unicode/platform.h index f220b1fc34d..217de4a1801 100644 --- a/icu4c/source/common/unicode/platform.h +++ b/icu4c/source/common/unicode/platform.h @@ -631,7 +631,7 @@ namespace std { */ #ifdef U_CHARSET_IS_UTF8 /* Use the predefined value. */ -#elif U_PLATFORM == U_PF_ANDROID || U_PLATFORM_IS_DARWIN_BASED +#elif U_PLATFORM_IS_LINUX_BASED || U_PLATFORM_IS_DARWIN_BASED # define U_CHARSET_IS_UTF8 1 #else # define U_CHARSET_IS_UTF8 0 diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h index 021fdcf1f24..8ccc7dfebab 100644 --- a/icu4c/source/common/unicode/utf8.h +++ b/icu4c/source/common/unicode/utf8.h @@ -380,7 +380,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); #define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) { \ (c)=(uint8_t)(s)[(i)++]; \ if(!U8_IS_SINGLE(c)) { \ - uint8_t __t; \ + uint8_t __t = 0; \ if((i)!=(length) && \ /* fetch/validate/assemble all but last trail byte */ \ ((c)>=0xe0 ? \ @@ -592,12 +592,15 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * If the offset points to a UTF-8 trail byte, * then the offset is moved backward to the corresponding lead byte. * Otherwise, it is not modified. + * * "Safe" macro, checks for illegal sequences and for string boundaries. + * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i]. * * @param s const uint8_t * string * @param start int32_t starting string offset (usually 0) * @param i int32_t string offset, must be start<=i * @see U8_SET_CP_START_UNSAFE + * @see U8_TRUNCATE_IF_INCOMPLETE * @stable ICU 2.4 */ #define U8_SET_CP_START(s, start, i) { \ @@ -606,6 +609,51 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); } \ } +/** + * If the string ends with a UTF-8 byte sequence that is valid so far + * but incomplete, then reduce the length of the string to end before + * the lead byte of that incomplete sequence. + * For example, if the string ends with E1 80, the length is reduced by 2. + * + * Useful for processing text split across multiple buffers + * (save the incomplete sequence for later) + * and for optimizing iteration + * (check for string length only once per character). + * + * "Safe" macro, checks for illegal sequences and for string boundaries. + * Unlike U8_SET_CP_START(), this macro never reads s[length]. + * + * (In UTF-16, simply check for U16_IS_LEAD(last code unit).) + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param length int32_t string length, must be start<=length + * @see U8_SET_CP_START + * @draft ICU 61 + */ +#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) \ + if((length)>(start)) { \ + uint8_t __b1=s[(length)-1]; \ + if(U8_IS_SINGLE(__b1)) { \ + /* common ASCII character */ \ + } else if(U8_IS_LEAD(__b1)) { \ + --(length); \ + } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \ + uint8_t __b2=s[(length)-2]; \ + if(0xe0<=__b2 && __b2<=0xf4) { \ + if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \ + U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \ + (length)-=2; \ + } \ + } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \ + uint8_t __b3=s[(length)-3]; \ + if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \ + (length)-=3; \ + } \ + } \ + } \ + } + /* definitions with backward iteration -------------------------------------- */ /** diff --git a/icu4c/source/common/utf_impl.cpp b/icu4c/source/common/utf_impl.cpp index f78c566e098..9dd241a12bf 100644 --- a/icu4c/source/common/utf_impl.cpp +++ b/icu4c/source/common/utf_impl.cpp @@ -238,33 +238,45 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U int32_t i=*pi; if(U8_IS_TRAIL(c) && i>start) { uint8_t b1=s[--i]; - if(0xc2<=b1 && b1<0xe0) { - *pi=i; - return ((b1-0xc0)<<6)|(c&0x3f); + if(U8_IS_LEAD(b1)) { + if(b1<0xe0) { + *pi=i; + return ((b1-0xc0)<<6)|(c&0x3f); + } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) { + // Truncated 3- or 4-byte sequence. + *pi=i; + return errorValue(1, strict); + } } else if(U8_IS_TRAIL(b1) && i>start) { // Extract the value bits from the last trail byte. c&=0x3f; uint8_t b2=s[--i]; - if(0xe0<=b2 && b2<0xf0) { - b2&=0xf; - if(strict!=-2) { - if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { - *pi=i; - c=(b2<<12)|((b1&0x3f)<<6)|c; - if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { - return c; - } else { - // strict: forbid non-characters like U+fffe - return errorValue(2, strict); + if(0xe0<=b2 && b2<=0xf4) { + if(b2<0xf0) { + b2&=0xf; + if(strict!=-2) { + if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + *pi=i; + c=(b2<<12)|((b1&0x3f)<<6)|c; + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { + return c; + } else { + // strict: forbid non-characters like U+fffe + return errorValue(2, strict); + } + } + } else { + // strict=-2 -> lenient: allow surrogates + b1-=0x80; + if((b2>0 || b1>=0x20)) { + *pi=i; + return (b2<<12)|(b1<<6)|c; } } - } else { - // strict=-2 -> lenient: allow surrogates - b1-=0x80; - if((b2>0 || b1>=0x20)) { - *pi=i; - return (b2<<12)|(b1<<6)|c; - } + } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { + // Truncated 4-byte sequence. + *pi=i; + return errorValue(2, strict); } } else if(U8_IS_TRAIL(b2) && i>start) { uint8_t b3=s[--i]; @@ -281,16 +293,7 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U } } } - } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { - // Truncated 4-byte sequence. - *pi=i; - return errorValue(2, strict); } - } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) || - (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) { - // Truncated 3- or 4-byte sequence. - *pi=i; - return errorValue(1, strict); } } return errorValue(0, strict); @@ -303,29 +306,23 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) { uint8_t c=s[i]; if(U8_IS_TRAIL(c) && i>start) { uint8_t b1=s[--i]; - if(0xc2<=b1 && b1<0xe0) { - return i; + if(U8_IS_LEAD(b1)) { + if(b1<0xe0 || + (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) { + return i; + } } else if(U8_IS_TRAIL(b1) && i>start) { uint8_t b2=s[--i]; - if(0xe0<=b2 && b2<0xf0) { - if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + if(0xe0<=b2 && b2<=0xf4) { + if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { return i; } } else if(U8_IS_TRAIL(b2) && i>start) { uint8_t b3=s[--i]; - if(0xf0<=b3 && b3<=0xf4) { - if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { - return i; - } + if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { + return i; } - } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { - // Truncated 4-byte sequence. - return i; } - } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) || - (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) { - // Truncated 3- or 4-byte sequence. - return i; } } return orig_i; diff --git a/icu4c/source/config/dist.mk b/icu4c/source/config/dist.mk index ccc5837a76c..3e6e42e50dd 100644 --- a/icu4c/source/config/dist.mk +++ b/icu4c/source/config/dist.mk @@ -19,7 +19,7 @@ DISTY_DIR=dist DISTY_TMP=dist/tmp DISTY_ICU=$(DISTY_TMP)/icu DISTY_DATA=$(DISTY_ICU)/source/data -DISTY_RMV=brkitr coll curr lang locales mappings rbnf region translit xml zone misc unit +DISTY_RMV=brkitr coll curr lang locales mappings rbnf region translit xml zone misc/*.txt misc/*.mk unit DISTY_RMDIR=$(DISTY_RMV:%=$(DISTY_DATA)/%) DISTY_IN=$(DISTY_DATA)/in DOCZIP=icu-docs.zip @@ -49,7 +49,7 @@ $(DISTY_TMP): $(DISTY_DOC_ZIP): $(DOCZIP) $(DISTY_FILE_DIR) cp $(DOCZIP) $(DISTY_DOC_ZIP) - ln -sf $(DISTY_DOC_ZIP) $(DISTY_FILE_DIR)/icu4c-docs.zip + ln -sf $(shell basename $(DISTY_DOC_ZIP)) $(DISTY_FILE_DIR)/icu4c-docs.zip $(DISTY_DAT): echo Missing $@ @@ -74,14 +74,14 @@ $(DISTY_FILE_TGZ) $(DISTY_FILE_ZIP) $(DISTY_DATA_ZIP): $(DISTY_DAT) $(DISTY_TMP $(MKINSTALLDIRS) $(DISTY_IN) echo DISTY_DAT=$(DISTY_DAT) cp $(DISTY_DAT) $(DISTY_IN) - ( cd $(DISTY_TMP)/icu ; python as_is/bomlist.py > as_is/bomlist.txt || rm -f as_is/bomlist.txt ) - ( cd $(DISTY_TMP) ; zip -rlq $(DISTY_FILE_ZIP) icu ) $(RMV) $(DISTY_RMDIR) ( cd $(DISTY_TMP)/icu ; python as_is/bomlist.py > as_is/bomlist.txt || rm -f as_is/bomlist.txt ) ( cd $(DISTY_TMP) ; tar cfpz $(DISTY_FILE_TGZ) icu ) - ln -sf $(DISTY_FILE_ZIP) $(DISTY_FILE_DIR)/icu4c-src.zip - ln -sf $(DISTY_FILE_TGZ) $(DISTY_FILE_DIR)/icu4c-src.tgz - ln -sf $(DISTY_DATA_ZIP) $(DISTY_FILE_DIR)/icu4c-data.zip + ( cd $(DISTY_TMP) ; zip -rlq $(DISTY_FILE_ZIP) icu ) + $(RMV) $(DISTY_TMP) + ln -sf $(shell basename $(DISTY_FILE_ZIP)) $(DISTY_FILE_DIR)/icu4c-src.zip + ln -sf $(shell basename $(DISTY_FILE_TGZ)) $(DISTY_FILE_DIR)/icu4c-src.tgz + ln -sf $(shell basename $(DISTY_DATA_ZIP)) $(DISTY_FILE_DIR)/icu4c-data.zip ls -l $(DISTY_FILE_TGZ) $(DISTY_FILE_ZIP) $(DISTY_DATA_ZIP) diff --git a/icu4c/source/i18n/islamcal.cpp b/icu4c/source/i18n/islamcal.cpp index 4fd0e07d920..b84bedfa091 100644 --- a/icu4c/source/i18n/islamcal.cpp +++ b/icu4c/source/i18n/islamcal.cpp @@ -614,7 +614,7 @@ void IslamicCalendar::handleComputeFields(int32_t julianDay, UErrorCode &status) days = julianDay - ASTRONOMICAL_EPOC; } // Use the civil calendar approximation, which is just arithmetic - year = (int)ClockMath::floorDivide( (double)(30 * days + 10646) , 10631.0 ); + year = (int32_t)ClockMath::floorDivide(30 * (int64_t)days + 10646, (int64_t)10631); month = (int32_t)uprv_ceil((days - 29 - yearStart(year)) / 29.5 ); month = month<11?month:11; startDate = monthStart(year, month); diff --git a/icu4c/source/i18n/nfrs.cpp b/icu4c/source/i18n/nfrs.cpp index 769fad353fb..d5b368d4230 100644 --- a/icu4c/source/i18n/nfrs.cpp +++ b/icu4c/source/i18n/nfrs.cpp @@ -681,7 +681,7 @@ static void dumpUS(FILE* f, const UnicodeString& us) { #endif UBool -NFRuleSet::parse(const UnicodeString& text, ParsePosition& pos, double upperBound, Formattable& result) const +NFRuleSet::parse(const UnicodeString& text, ParsePosition& pos, double upperBound, int32_t nonNumericalExecutedRuleMask, Formattable& result) const { // try matching each rule in the rule set against the text being // parsed. Whichever one matches the most characters is the one @@ -707,9 +707,12 @@ NFRuleSet::parse(const UnicodeString& text, ParsePosition& pos, double upperBoun #endif // Try each of the negative rules, fraction rules, infinity rules and NaN rules for (int i = 0; i < NON_NUMERICAL_RULE_LENGTH; i++) { - if (nonNumericalRules[i]) { + if (nonNumericalRules[i] && ((nonNumericalExecutedRuleMask >> i) & 1) == 0) { + // Mark this rule as being executed so that we don't try to execute it again. + nonNumericalExecutedRuleMask |= 1 << i; + Formattable tempResult; - UBool success = nonNumericalRules[i]->doParse(text, workingPos, 0, upperBound, tempResult); + UBool success = nonNumericalRules[i]->doParse(text, workingPos, 0, upperBound, nonNumericalExecutedRuleMask, tempResult); if (success && (workingPos.getIndex() > highWaterMark.getIndex())) { result = tempResult; highWaterMark = workingPos; @@ -748,7 +751,7 @@ NFRuleSet::parse(const UnicodeString& text, ParsePosition& pos, double upperBoun continue; } Formattable tempResult; - UBool success = rules[i]->doParse(text, workingPos, fIsFractionRuleSet, upperBound, tempResult); + UBool success = rules[i]->doParse(text, workingPos, fIsFractionRuleSet, upperBound, nonNumericalExecutedRuleMask, tempResult); if (success && workingPos.getIndex() > highWaterMark.getIndex()) { result = tempResult; highWaterMark = workingPos; diff --git a/icu4c/source/i18n/nfrs.h b/icu4c/source/i18n/nfrs.h index 1e39b289b4d..d4797e7ff55 100644 --- a/icu4c/source/i18n/nfrs.h +++ b/icu4c/source/i18n/nfrs.h @@ -55,7 +55,7 @@ public: void format(int64_t number, UnicodeString& toAppendTo, int32_t pos, int32_t recursionCount, UErrorCode& status) const; void format(double number, UnicodeString& toAppendTo, int32_t pos, int32_t recursionCount, UErrorCode& status) const; - UBool parse(const UnicodeString& text, ParsePosition& pos, double upperBound, Formattable& result) const; + UBool parse(const UnicodeString& text, ParsePosition& pos, double upperBound, int32_t nonNumericalExecutedRuleMask, Formattable& result) const; void appendRules(UnicodeString& result) const; // toString diff --git a/icu4c/source/i18n/nfrule.cpp b/icu4c/source/i18n/nfrule.cpp index f24be11bcdc..f32ed5a747c 100644 --- a/icu4c/source/i18n/nfrule.cpp +++ b/icu4c/source/i18n/nfrule.cpp @@ -900,6 +900,7 @@ NFRule::doParse(const UnicodeString& text, ParsePosition& parsePosition, UBool isFractionRule, double upperBound, + int32_t nonNumericalExecutedRuleMask, Formattable& resVal) const { // internally we operate on a copy of the string being parsed @@ -1002,6 +1003,7 @@ NFRule::doParse(const UnicodeString& text, temp.setTo(ruleText, sub1Pos, sub2Pos - sub1Pos); double partialResult = matchToDelimiter(workText, start, tempBaseValue, temp, pp, sub1, + nonNumericalExecutedRuleMask, upperBound); // if we got a successful match (or were trying to match a @@ -1022,6 +1024,7 @@ NFRule::doParse(const UnicodeString& text, temp.setTo(ruleText, sub2Pos, ruleText.length() - sub2Pos); partialResult = matchToDelimiter(workText2, 0, partialResult, temp, pp2, sub2, + nonNumericalExecutedRuleMask, upperBound); // if we got a successful match on this second @@ -1158,6 +1161,7 @@ NFRule::matchToDelimiter(const UnicodeString& text, const UnicodeString& delimiter, ParsePosition& pp, const NFSubstitution* sub, + int32_t nonNumericalExecutedRuleMask, double upperBound) const { UErrorCode status = U_ZERO_ERROR; @@ -1191,6 +1195,7 @@ NFRule::matchToDelimiter(const UnicodeString& text, #else formatter->isLenient(), #endif + nonNumericalExecutedRuleMask, result); // if the substitution could match all the text up to @@ -1244,6 +1249,7 @@ NFRule::matchToDelimiter(const UnicodeString& text, #else formatter->isLenient(), #endif + nonNumericalExecutedRuleMask, result); if (success && (tempPP.getIndex() != 0)) { // if there's a successful match (or it's a null diff --git a/icu4c/source/i18n/nfrule.h b/icu4c/source/i18n/nfrule.h index 809119ca6c6..0fabe202373 100644 --- a/icu4c/source/i18n/nfrule.h +++ b/icu4c/source/i18n/nfrule.h @@ -74,6 +74,7 @@ public: ParsePosition& pos, UBool isFractional, double upperBound, + int32_t nonNumericalExecutedRuleMask, Formattable& result) const; UBool shouldRollBack(int64_t number) const; @@ -94,6 +95,7 @@ private: int32_t indexOfAnyRulePrefix() const; double matchToDelimiter(const UnicodeString& text, int32_t startPos, double baseValue, const UnicodeString& delimiter, ParsePosition& pp, const NFSubstitution* sub, + int32_t nonNumericalExecutedRuleMask, double upperBound) const; void stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const; diff --git a/icu4c/source/i18n/nfsubs.cpp b/icu4c/source/i18n/nfsubs.cpp index b5da9821d55..ec9e9b873cb 100644 --- a/icu4c/source/i18n/nfsubs.cpp +++ b/icu4c/source/i18n/nfsubs.cpp @@ -155,6 +155,7 @@ public: double baseValue, double upperBound, UBool lenientParse, + int32_t nonNumericalExecutedRuleMask, Formattable& result) const; virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const { @@ -221,6 +222,7 @@ public: double baseValue, double upperBound, UBool lenientParse, + int32_t nonNumericalExecutedRuleMask, Formattable& result) const; virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const { return newRuleValue + oldRuleValue; } @@ -292,6 +294,7 @@ public: double baseValue, double upperBound, UBool /*lenientParse*/, + int32_t nonNumericalExecutedRuleMask, Formattable& result) const; virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const { return newRuleValue / oldRuleValue; } @@ -689,6 +692,7 @@ NFSubstitution::doParse(const UnicodeString& text, double baseValue, double upperBound, UBool lenientParse, + int32_t nonNumericalExecutedRuleMask, Formattable& result) const { #ifdef RBNF_DEBUG @@ -709,7 +713,7 @@ NFSubstitution::doParse(const UnicodeString& text, // on), then also try parsing the text using a default- // constructed NumberFormat if (ruleSet != NULL) { - ruleSet->parse(text, parsePosition, upperBound, result); + ruleSet->parse(text, parsePosition, upperBound, nonNumericalExecutedRuleMask, result); if (lenientParse && !ruleSet->isFractionRuleSet() && parsePosition.getIndex() == 0) { UErrorCode status = U_ZERO_ERROR; NumberFormat* fmt = NumberFormat::createInstance(status); @@ -931,18 +935,19 @@ ModulusSubstitution::doParse(const UnicodeString& text, double baseValue, double upperBound, UBool lenientParse, + int32_t nonNumericalExecutedRuleMask, Formattable& result) const { // if this isn't a >>> substitution, we can just use the // inherited parse() routine to do the parsing if (ruleToUse == NULL) { - return NFSubstitution::doParse(text, parsePosition, baseValue, upperBound, lenientParse, result); + return NFSubstitution::doParse(text, parsePosition, baseValue, upperBound, lenientParse, nonNumericalExecutedRuleMask, result); // but if it IS a >>> substitution, we have to do it here: we // use the specific rule's doParse() method, and then we have to // do some of the other work of NFRuleSet.parse() } else { - ruleToUse->doParse(text, parsePosition, FALSE, upperBound, result); + ruleToUse->doParse(text, parsePosition, FALSE, upperBound, nonNumericalExecutedRuleMask, result); if (parsePosition.getIndex() != 0) { UErrorCode status = U_ZERO_ERROR; @@ -1118,12 +1123,13 @@ FractionalPartSubstitution::doParse(const UnicodeString& text, double baseValue, double /*upperBound*/, UBool lenientParse, + int32_t nonNumericalExecutedRuleMask, Formattable& resVal) const { // if we're not in byDigits mode, we can just use the inherited // doParse() if (!byDigits) { - return NFSubstitution::doParse(text, parsePosition, baseValue, 0, lenientParse, resVal); + return NFSubstitution::doParse(text, parsePosition, baseValue, 0, lenientParse, nonNumericalExecutedRuleMask, resVal); // if we ARE in byDigits mode, parse the text one digit at a time // using this substitution's owning rule set (we do this by setting @@ -1141,7 +1147,7 @@ FractionalPartSubstitution::doParse(const UnicodeString& text, while (workText.length() > 0 && workPos.getIndex() != 0) { workPos.setIndex(0); Formattable temp; - getRuleSet()->parse(workText, workPos, 10, temp); + getRuleSet()->parse(workText, workPos, 10, nonNumericalExecutedRuleMask, temp); UErrorCode status = U_ZERO_ERROR; digit = temp.getLong(status); // digit = temp.getType() == Formattable::kLong ? @@ -1249,6 +1255,7 @@ NumeratorSubstitution::doParse(const UnicodeString& text, double baseValue, double upperBound, UBool /*lenientParse*/, + int32_t nonNumericalExecutedRuleMask, Formattable& result) const { // we don't have to do anything special to do the parsing here, @@ -1267,7 +1274,7 @@ NumeratorSubstitution::doParse(const UnicodeString& text, while (workText.length() > 0 && workPos.getIndex() != 0) { workPos.setIndex(0); - getRuleSet()->parse(workText, workPos, 1, temp); // parse zero or nothing at all + getRuleSet()->parse(workText, workPos, 1, nonNumericalExecutedRuleMask, temp); // parse zero or nothing at all if (workPos.getIndex() == 0) { // we failed, either there were no more zeros, or the number was formatted with digits // either way, we're done @@ -1289,7 +1296,7 @@ NumeratorSubstitution::doParse(const UnicodeString& text, } // we've parsed off the zeros, now let's parse the rest from our current position - NFSubstitution::doParse(workText, parsePosition, withZeros ? 1 : baseValue, upperBound, FALSE, result); + NFSubstitution::doParse(workText, parsePosition, withZeros ? 1 : baseValue, upperBound, FALSE, nonNumericalExecutedRuleMask, result); if (withZeros) { // any base value will do in this case. is there a way to diff --git a/icu4c/source/i18n/nfsubs.h b/icu4c/source/i18n/nfsubs.h index e8b259137ed..b8a5fc66198 100644 --- a/icu4c/source/i18n/nfsubs.h +++ b/icu4c/source/i18n/nfsubs.h @@ -191,6 +191,7 @@ public: double baseValue, double upperBound, UBool lenientParse, + int32_t nonNumericalExecutedRuleMask, Formattable& result) const; /** diff --git a/icu4c/source/i18n/number_fluent.cpp b/icu4c/source/i18n/number_fluent.cpp index 3be3401ef3a..27113106c50 100644 --- a/icu4c/source/i18n/number_fluent.cpp +++ b/icu4c/source/i18n/number_fluent.cpp @@ -33,12 +33,13 @@ Derived NumberFormatterSettings::unit(const icu::MeasureUnit &unit) con } template -Derived NumberFormatterSettings::adoptUnit(const icu::MeasureUnit *unit) const { +Derived NumberFormatterSettings::adoptUnit(icu::MeasureUnit *unit) const { Derived copy(*this); // Just copy the unit into the MacroProps by value, and delete it since we have ownership. // NOTE: Slicing occurs here. However, CurrencyUnit can be restored from MeasureUnit. // TimeUnit may be affected, but TimeUnit is not as relevant to number formatting. if (unit != nullptr) { + // TODO: On nullptr, reset to default value? copy.fMacros.unit = *unit; delete unit; } @@ -54,10 +55,11 @@ Derived NumberFormatterSettings::perUnit(const icu::MeasureUnit &perUni } template -Derived NumberFormatterSettings::adoptPerUnit(const icu::MeasureUnit *perUnit) const { +Derived NumberFormatterSettings::adoptPerUnit(icu::MeasureUnit *perUnit) const { Derived copy(*this); // See comments above about slicing and ownership. if (perUnit != nullptr) { + // TODO: On nullptr, reset to default value? copy.fMacros.perUnit = *perUnit; delete perUnit; } @@ -96,7 +98,7 @@ Derived NumberFormatterSettings::symbols(const DecimalFormatSymbols &sy } template -Derived NumberFormatterSettings::adoptSymbols(const NumberingSystem *ns) const { +Derived NumberFormatterSettings::adoptSymbols(NumberingSystem *ns) const { Derived copy(*this); copy.fMacros.symbols.setTo(ns); return copy; diff --git a/icu4c/source/i18n/number_grouping.cpp b/icu4c/source/i18n/number_grouping.cpp index 67fd4c94317..a2b1bbd6b33 100644 --- a/icu4c/source/i18n/number_grouping.cpp +++ b/icu4c/source/i18n/number_grouping.cpp @@ -44,7 +44,7 @@ Grouper Grouper::forStrategy(UGroupingStrategy grouping) { return {-2, -2, -3}; case UNUM_GROUPING_ON_ALIGNED: return {-4, -4, 1}; - case UNUM_GROUPING_WESTERN: + case UNUM_GROUPING_THOUSANDS: return {3, 3, 1}; default: U_ASSERT(FALSE); diff --git a/icu4c/source/i18n/rbnf.cpp b/icu4c/source/i18n/rbnf.cpp index 66f532e79aa..1b75e5ee1b7 100644 --- a/icu4c/source/i18n/rbnf.cpp +++ b/icu4c/source/i18n/rbnf.cpp @@ -1371,7 +1371,7 @@ RuleBasedNumberFormat::parse(const UnicodeString& text, ParsePosition working_pp(0); Formattable working_result; - rp->parse(workingText, working_pp, kMaxDouble, working_result); + rp->parse(workingText, working_pp, kMaxDouble, 0, working_result); if (working_pp.getIndex() > high_pp.getIndex()) { high_pp = working_pp; high_result = working_result; diff --git a/icu4c/source/i18n/unicode/numberformatter.h b/icu4c/source/i18n/unicode/numberformatter.h index ac852f27e8e..1152f154ed4 100644 --- a/icu4c/source/i18n/unicode/numberformatter.h +++ b/icu4c/source/i18n/unicode/numberformatter.h @@ -172,7 +172,7 @@ typedef enum UNumberUnitWidth { *
  • MIN2: 1234 and 12,34,567 *
  • AUTO: 1,234 and 12,34,567 *
  • ON_ALIGNED: 1,234 and 12,34,567 - *
  • WESTERN: 1,234 and 1,234,567 + *
  • THOUSANDS: 1,234 and 1,234,567 * * *

    @@ -248,7 +248,7 @@ typedef enum UGroupingStrategy { * * @draft ICU 61 */ - UNUM_GROUPING_WESTERN + UNUM_GROUPING_THOUSANDS } UGroupingStrategy; @@ -1515,7 +1515,8 @@ class U_I18N_API NumberFormatterSettings { * All units will be properly localized with locale data, and all units are compatible with notation styles, * rounding strategies, and other number formatter settings. * - * Pass this method any instance of {@link MeasureUnit}. For units of measure: + * Pass this method any instance of {@link MeasureUnit}. For units of measure (which often involve the + * factory methods that return a pointer): * *

          * NumberFormatter::with().adoptUnit(MeasureUnit::createMeter(status))
    @@ -1550,7 +1551,11 @@ class U_I18N_API NumberFormatterSettings {
     
         /**
          * Like unit(), but takes ownership of a pointer.  Convenient for use with the MeasureFormat factory
    -     * methods, which return pointers that need ownership.
    +     * methods, which return pointers that need ownership.  Example:
    +     *
    +     * 
    +     * NumberFormatter::with().adoptUnit(MeasureUnit::createMeter(status))
    +     * 
    * * @param unit * The unit to render. @@ -1559,19 +1564,14 @@ class U_I18N_API NumberFormatterSettings { * @see MeasureUnit * @draft ICU 60 */ - Derived adoptUnit(const icu::MeasureUnit *unit) const; + Derived adoptUnit(icu::MeasureUnit *unit) const; /** * Sets a unit to be used in the denominator. For example, to format "3 m/s", pass METER to the unit and SECOND to * the perUnit. * - * Pass this method any instance of {@link MeasureUnit}. For example: - * - *
    -     * NumberFormatter::with()
    -     *      .adoptUnit(MeasureUnit::createMeter(status))
    -     *      .adoptPerUnit(MeasureUnit::createSecond(status))
    -     * 
    + * Pass this method any instance of {@link MeasureUnit}. Since MeasureUnit factory methods return pointers, the + * {@link #adoptPerUnit} version of this method is often more useful. * * The default is not to display any unit in the denominator. * @@ -1587,7 +1587,13 @@ class U_I18N_API NumberFormatterSettings { /** * Like perUnit(), but takes ownership of a pointer. Convenient for use with the MeasureFormat factory - * methods, which return pointers that need ownership. + * methods, which return pointers that need ownership. Example: + * + *
    +     * NumberFormatter::with()
    +     *      .adoptUnit(MeasureUnit::createMeter(status))
    +     *      .adoptPerUnit(MeasureUnit::createSecond(status))
    +     * 
    * * @param perUnit * The unit to render in the denominator. @@ -1596,7 +1602,7 @@ class U_I18N_API NumberFormatterSettings { * @see MeasureUnit * @draft ICU 61 */ - Derived adoptPerUnit(const icu::MeasureUnit *perUnit) const; + Derived adoptPerUnit(icu::MeasureUnit *perUnit) const; /** * Specifies the rounding strategy to use when formatting numbers. @@ -1761,7 +1767,7 @@ class U_I18N_API NumberFormatterSettings { * @see NumberingSystem * @draft ICU 60 */ - Derived adoptSymbols(const NumberingSystem *symbols) const; + Derived adoptSymbols(NumberingSystem *symbols) const; /** * Sets the width of the unit (measure unit or currency). Most common values: diff --git a/icu4c/source/test/cintltst/utf8tst.c b/icu4c/source/test/cintltst/utf8tst.c index 0bbb5e5413d..b7062e3b82f 100644 --- a/icu4c/source/test/cintltst/utf8tst.c +++ b/icu4c/source/test/cintltst/utf8tst.c @@ -94,6 +94,7 @@ static void TestFwdBack(void); static void TestFwdBackUnsafe(void); static void TestSetChar(void); static void TestSetCharUnsafe(void); +static void TestTruncateIfIncomplete(void); static void TestAppendChar(void); static void TestAppend(void); static void TestSurrogates(void); @@ -114,6 +115,7 @@ addUTF8Test(TestNode** root) addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe"); addTest(root, &TestSetChar, "utf8tst/TestSetChar"); addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe"); + addTest(root, &TestTruncateIfIncomplete, "utf8tst/TestTruncateIfIncomplete"); addTest(root, &TestAppendChar, "utf8tst/TestAppendChar"); addTest(root, &TestAppend, "utf8tst/TestAppend"); addTest(root, &TestSurrogates, "utf8tst/TestSurrogates"); @@ -927,6 +929,64 @@ static void TestSetCharUnsafe() { } } +static void TestTruncateIfIncomplete() { + // Difference from U8_SET_CP_START(): + // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length]. + // Therefore, if the last byte is a lead byte, then this macro truncates + // even if the byte at the input index cannot continue a valid sequence + // (including when that is not a trail byte). + // On the other hand, if the last byte is a trail byte, then the two macros behave the same. + static const struct { + const char *s; + int32_t expected; + } cases[] = { + { "", 0 }, + { "a", 1 }, + { "\x80", 1 }, + { "\xC1", 1 }, + { "\xC2", 0 }, + { "\xE0", 0 }, + { "\xF4", 0 }, + { "\xF5", 1 }, + { "\x80\x80", 2 }, + { "\xC2\xA0", 2 }, + { "\xE0\x9F", 2 }, + { "\xE0\xA0", 0 }, + { "\xED\x9F", 0 }, + { "\xED\xA0", 2 }, + { "\xF0\x8F", 2 }, + { "\xF0\x90", 0 }, + { "\xF4\x8F", 0 }, + { "\xF4\x90", 2 }, + { "\xF5\x80", 2 }, + { "\x80\x80\x80", 3 }, + { "\xC2\xA0\x80", 3 }, + { "\xE0\xA0\x80", 3 }, + { "\xF0\x8F\x80", 3 }, + { "\xF0\x90\x80", 0 }, + { "\xF4\x8F\x80", 0 }, + { "\xF4\x90\x80", 3 }, + { "\xF5\x80\x80", 3 }, + { "\x80\x80\x80\x80", 4 }, + { "\xC2\xA0\x80\x80", 4 }, + { "\xE0\xA0\x80\x80", 4 }, + { "\xF0\x90\x80\x80", 4 }, + { "\xF5\x80\x80\x80", 4 } + }; + int32_t i; + for (i = 0; i < UPRV_LENGTHOF(cases); ++i) { + const char *s = cases[i].s; + int32_t expected = cases[i].expected; + int32_t length = (int32_t)strlen(s); + int32_t adjusted = length; + U8_TRUNCATE_IF_INCOMPLETE(s, 0, adjusted); + if (adjusted != expected) { + log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n", + (int)i, (int)length, (int)expected, (int)adjusted); + } + } +} + static void TestAppendChar(){ #if !U_HIDE_OBSOLETE_UTF_OLD_H static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}; diff --git a/icu4c/source/test/intltest/calregts.cpp b/icu4c/source/test/intltest/calregts.cpp index f1eb17bbed3..24951e5b8aa 100644 --- a/icu4c/source/test/intltest/calregts.cpp +++ b/icu4c/source/test/intltest/calregts.cpp @@ -93,6 +93,7 @@ CalendarRegressionTest::runIndexedTest( int32_t index, UBool exec, const char* & CASE(50,TestT9452); CASE(51,TestT11632); CASE(52,TestPersianCalOverflow); + CASE(53,TestIslamicCalOverflow); default: name = ""; break; } } @@ -3009,9 +3010,9 @@ void CalendarRegressionTest::TestPersianCalOverflow(void) { month = cal->get(UCAL_MONTH, status); dayOfMonth = cal->get(UCAL_DATE, status); if ( U_FAILURE(status) ) { - errln("FAIL: Calendar->get MONTH/DATE for localeID %s, julianDay %d, status %s\n", localeID, jd, u_errorName(status)); + errln("FAIL: Calendar->get MONTH/DATE for localeID %s, julianDay %d, status %s", localeID, jd, u_errorName(status)); } else if (month > maxMonth || dayOfMonth > maxDayOfMonth) { - errln("FAIL: localeID %s, julianDay %d; maxMonth %d, got month %d; maxDayOfMonth %d, got dayOfMonth %d\n", + errln("FAIL: localeID %s, julianDay %d; maxMonth %d, got month %d; maxDayOfMonth %d, got dayOfMonth %d", localeID, jd, maxMonth, month, maxDayOfMonth, dayOfMonth); } } @@ -3019,4 +3020,35 @@ void CalendarRegressionTest::TestPersianCalOverflow(void) { } } +/** + * @bug tickets 12661, 13538 + */ +void CalendarRegressionTest::TestIslamicCalOverflow(void) { + const char* localeID = "ar@calendar=islamic-civil"; + UErrorCode status = U_ZERO_ERROR; + Calendar* cal = Calendar::createInstance(Locale(localeID), status); + if(U_FAILURE(status)) { + dataerrln("FAIL: Calendar::createInstance for localeID %s: %s", localeID, u_errorName(status)); + } else { + int32_t maxMonth = cal->getMaximum(UCAL_MONTH); + int32_t maxDayOfMonth = cal->getMaximum(UCAL_DATE); + int32_t jd, year, month, dayOfMonth; + for (jd = 73530872; jd <= 73530876; jd++) { // year 202002, int32_t overflow if jd >= 73530874 + status = U_ZERO_ERROR; + cal->clear(); + cal->set(UCAL_JULIAN_DAY, jd); + year = cal->get(UCAL_YEAR, status); + month = cal->get(UCAL_MONTH, status); + dayOfMonth = cal->get(UCAL_DATE, status); + if ( U_FAILURE(status) ) { + errln("FAIL: Calendar->get YEAR/MONTH/DATE for localeID %s, julianDay %d, status %s", localeID, jd, u_errorName(status)); + } else if (month > maxMonth || dayOfMonth > maxDayOfMonth) { + errln("FAIL: localeID %s, julianDay %d; got year %d; maxMonth %d, got month %d; maxDayOfMonth %d, got dayOfMonth %d", + localeID, jd, year, maxMonth, month, maxDayOfMonth, dayOfMonth); + } + } + delete cal; + } +} + #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/test/intltest/calregts.h b/icu4c/source/test/intltest/calregts.h index 15d55029093..7d36fab0b45 100644 --- a/icu4c/source/test/intltest/calregts.h +++ b/icu4c/source/test/intltest/calregts.h @@ -79,6 +79,7 @@ public: void TestT9452(void); void TestT11632(void); void TestPersianCalOverflow(void); + void TestIslamicCalOverflow(void); void printdate(GregorianCalendar *cal, const char *string); void dowTest(UBool lenient) ; diff --git a/icu4c/source/test/intltest/itrbnf.cpp b/icu4c/source/test/intltest/itrbnf.cpp index 97700251a38..719df6202aa 100644 --- a/icu4c/source/test/intltest/itrbnf.cpp +++ b/icu4c/source/test/intltest/itrbnf.cpp @@ -75,6 +75,7 @@ void IntlTestRBNF::runIndexedTest(int32_t index, UBool exec, const char* &name, TESTCASE(23, TestVariableDecimalPoint); TESTCASE(24, TestLargeNumbers); TESTCASE(25, TestCompactDecimalFormatStyle); + TESTCASE(26, TestParseFailure); #else TESTCASE(0, TestRBNFDisabled); #endif @@ -2283,6 +2284,25 @@ void IntlTestRBNF::TestCompactDecimalFormatStyle() { doTest(&rbnf, enTestFullData, false); } +void IntlTestRBNF::TestParseFailure() { + UErrorCode status = U_ZERO_ERROR; + RuleBasedNumberFormat rbnf(URBNF_SPELLOUT, Locale::getJapanese(), status); + static const char* testData[][1] = { + { "\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB\u30FB" }, + { NULL } + }; + for (int i = 0; testData[i][0]; ++i) { + const char* spelledNumber = testData[i][0]; // spelled-out number + + UnicodeString spelledNumberString = UnicodeString(spelledNumber).unescape(); + Formattable actualNumber; + rbnf.parse(spelledNumberString, actualNumber, status); + if (status != U_INVALID_FORMAT_ERROR) { // I would have expected U_PARSE_ERROR, but NumberFormat::parse gives U_INVALID_FORMAT_ERROR + errln("FAIL: string should be unparseable %s %s", spelledNumber, u_errorName(status)); + } + } +} + void IntlTestRBNF::doTest(RuleBasedNumberFormat* formatter, const char* const testData[][2], UBool testParsing) { diff --git a/icu4c/source/test/intltest/itrbnf.h b/icu4c/source/test/intltest/itrbnf.h index 540b8033342..e58d321362c 100644 --- a/icu4c/source/test/intltest/itrbnf.h +++ b/icu4c/source/test/intltest/itrbnf.h @@ -147,6 +147,7 @@ class IntlTestRBNF : public IntlTest { void TestRounding(); void TestLargeNumbers(); void TestCompactDecimalFormatStyle(); + void TestParseFailure(); protected: virtual void doTest(RuleBasedNumberFormat* formatter, const char* const testData[][2], UBool testParsing); diff --git a/icu4c/source/test/intltest/numberformattesttuple.cpp b/icu4c/source/test/intltest/numberformattesttuple.cpp index 01c2815d5ec..496aaeccde2 100644 --- a/icu4c/source/test/intltest/numberformattesttuple.cpp +++ b/icu4c/source/test/intltest/numberformattesttuple.cpp @@ -325,6 +325,7 @@ const NumberFormatTestTupleFieldData gFieldData[] = { FIELD_INIT(positiveSuffix, &gStrOps), FIELD_INIT(negativePrefix, &gStrOps), FIELD_INIT(negativeSuffix, &gStrOps), + FIELD_INIT(signAlwaysShown, &gIntOps), FIELD_INIT(localizedPattern, &gStrOps), FIELD_INIT(toPattern, &gStrOps), FIELD_INIT(toLocalizedPattern, &gStrOps), diff --git a/icu4c/source/test/intltest/numberformattesttuple.h b/icu4c/source/test/intltest/numberformattesttuple.h index f417b3ef995..685c3d698e1 100644 --- a/icu4c/source/test/intltest/numberformattesttuple.h +++ b/icu4c/source/test/intltest/numberformattesttuple.h @@ -55,6 +55,7 @@ enum ENumberFormatTestTupleField { kPositiveSuffix, kNegativePrefix, kNegativeSuffix, + kSignAlwaysShown, kLocalizedPattern, kToPattern, kToLocalizedPattern, @@ -118,6 +119,7 @@ public: UnicodeString positiveSuffix; UnicodeString negativePrefix; UnicodeString negativeSuffix; + int32_t signAlwaysShown; UnicodeString localizedPattern; UnicodeString toPattern; UnicodeString toLocalizedPattern; @@ -164,6 +166,7 @@ public: UBool positiveSuffixFlag; UBool negativePrefixFlag; UBool negativeSuffixFlag; + UBool signAlwaysShownFlag; UBool localizedPatternFlag; UBool toPatternFlag; UBool toLocalizedPatternFlag; diff --git a/icu4c/source/test/intltest/numbertest_api.cpp b/icu4c/source/test/intltest/numbertest_api.cpp index 2d625877f30..783bb00e2f6 100644 --- a/icu4c/source/test/intltest/numbertest_api.cpp +++ b/icu4c/source/test/intltest/numbertest_api.cpp @@ -1097,6 +1097,20 @@ void NumberFormatterApiTest::grouping() { u"8.765", u"0"); + assertFormatDescendingBig( + u"Indic locale with THOUSANDS grouping", + NumberFormatter::with().grouping(UNUM_GROUPING_THOUSANDS), + Locale("en-IN"), + u"87,650,000", + u"8,765,000", + u"876,500", + u"87,650", + u"8,765", + u"876.5", + u"87.65", + u"8.765", + u"0"); + // NOTE: Hungarian is interesting because it has minimumGroupingDigits=4 in locale data // If this test breaks due to data changes, find another locale that has minimumGroupingDigits. assertFormatDescendingBig( diff --git a/icu4c/source/test/intltest/numfmtst.cpp b/icu4c/source/test/intltest/numfmtst.cpp index 78b1029cf28..eac5cf82808 100644 --- a/icu4c/source/test/intltest/numfmtst.cpp +++ b/icu4c/source/test/intltest/numfmtst.cpp @@ -229,6 +229,9 @@ static void adjustDecimalFormat( if (tuple.negativeSuffixFlag) { fmt.setNegativeSuffix(tuple.negativeSuffix); } + if (tuple.signAlwaysShownFlag) { + // Not currently supported + } if (tuple.localizedPatternFlag) { UErrorCode status = U_ZERO_ERROR; fmt.applyLocalizedPattern(tuple.localizedPattern, status); diff --git a/icu4c/source/test/testdata/numberformattestspecification.txt b/icu4c/source/test/testdata/numberformattestspecification.txt index 113473a2a57..afddf315698 100644 --- a/icu4c/source/test/testdata/numberformattestspecification.txt +++ b/icu4c/source/test/testdata/numberformattestspecification.txt @@ -441,11 +441,10 @@ en_US 1 123,456 123456 en_US 0 123,456 123 en_US 1 123.456 123.456 en_US 0 123.456 123.456 -fr_FR 1 123,456 123.456 -fr_FR 0 123,456 123.456 -// JDK returns 123 here; not sure why. -fr_FR 1 123.456 123456 K -fr_FR 0 123.456 123 +it_IT 1 123,456 123.456 +it_IT 0 123,456 123.456 +it_IT 1 123.456 123456 +it_IT 0 123.456 123 test no grouping in pattern with parsing set pattern 0 @@ -466,9 +465,8 @@ output grouping breaks grouping2 minGroupingDigits 1,2345,6789 4 1,23,45,6789 4 K 2 1,23,45,6789 4 K 2 2 -// Q only supports minGrouping<=2 123,456789 6 6 3 -123456789 6 JKQ 6 4 +123456789 6 JK 6 4 test multiplier setters set locale en_US @@ -754,6 +752,7 @@ parse output breaks +3.52EE4 3.52 +1,234,567.8901 1234567.8901 +1,23,4567.8901 1234567.8901 +// Fraction grouping is disabled by default +1,23,4567.89,01 1234567.89 +1,23,456.78.9 123456.78 +12.34,56 12.34 @@ -831,15 +830,14 @@ parse output breaks // JDK does allow separators in the wrong place and parses as -5347.25 (53,47.25) fail K // strict requires prefix or suffix, except in C -65,347.25 fail +65,347.25 fail +3.52E4 35200 (34.8E-3) -0.0348 (3425E-1) -342.5 // Strict doesn't allow separators in sci notation. (63,425) -63425 -// JDK and S allow separators in sci notation and parses as -342.5 -// C passes -(63,425E-1) fail CKS +// J does not allow grouping separators in scientific notation. +(63,425E-1) -6342.5 J // Both prefix and suffix needed for strict. // JDK accepts this and parses as -342.5 (3425E-1 fail K @@ -954,12 +952,12 @@ set negativeSuffix 9N begin parse output breaks // S is the only implementation that passes these cases. -// C consumes the '9' as a digit and assumes number is negative +// C and P consume the '9' as a digit and assumes number is negative // J and JDK bail -6549K 654 CJK -// C consumes the '9' as a digit and assumes number is negative +6549K 654 CJKP +// C and P consume the '9' as a digit and assumes number is negative // J and JDK bail -6549N -654 CJK +6549N -654 CJKP test really strange prefix set locale en @@ -974,7 +972,7 @@ test parse pattern with quotes set locale en set pattern '-'#y begin -parse output +parse output breaks -45y 45 test parse with locale symbols @@ -1187,17 +1185,17 @@ $53.45 fail USD J USD 53.45 53.45 USD J 53.45USD 53.45 USD CJ USD53.45 53.45 USD -// S fails these because '(' is an incomplete prefix. -(7.92) USD -7.92 USD CJS -(7.92) GBP -7.92 GBP CJS -(7.926) USD -7.926 USD CJS -(7.926 USD) -7.926 USD CJS +// P fails these because '(' is an incomplete prefix. +(7.92) USD -7.92 USD CJP +(7.92) GBP -7.92 GBP CJP +(7.926) USD -7.926 USD CJP +(7.926 USD) -7.926 USD CJP (USD 7.926) -7.926 USD J -USD (7.926) -7.926 USD CJS -USD (7.92) -7.92 USD CJS -(7.92)USD -7.92 USD CJS -USD(7.92) -7.92 USD CJS -(8) USD -8 USD CJS +USD (7.926) -7.926 USD CJP +USD (7.92) -7.92 USD CJP +(7.92)USD -7.92 USD CJP +USD(7.92) -7.92 USD CJP +(8) USD -8 USD CJP -8 USD -8 USD C 67 USD 67 USD C 53.45$ fail USD @@ -1223,37 +1221,38 @@ test parse foreign currency symbol set pattern \u00a4 0.00;\u00a4 -# set locale fa_IR begin -parse output outputCurrency +parse output outputCurrency breaks \u0631\u06cc\u0627\u0644 \u06F1\u06F2\u06F3\u06F5 1235 IRR IRR \u06F1\u06F2\u06F3\u06F5 1235 IRR -\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 \u06F1\u06F2\u06F3\u06F5 1235 IRR +// P fails here because this currency name is in the Trie only, but it has the same prefix as the non-Trie currency +\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 \u06F1\u06F2\u06F3\u06F5 1235 IRR P IRR 1235 1235 IRR \u0631\u06cc\u0627\u0644 1235 1235 IRR -\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 1235 1235 IRR +\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 1235 1235 IRR P test parse foreign currency ISO set pattern \u00a4\u00a4 0.00;\u00a4\u00a4 -# set locale fa_IR begin -parse output outputCurrency +parse output outputCurrency breaks \u0631\u06cc\u0627\u0644 \u06F1\u06F2\u06F3\u06F5 1235 IRR IRR \u06F1\u06F2\u06F3\u06F5 1235 IRR -\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 \u06F1\u06F2\u06F3\u06F5 1235 IRR +\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 \u06F1\u06F2\u06F3\u06F5 1235 IRR P IRR 1235 1235 IRR \u0631\u06cc\u0627\u0644 1235 1235 IRR -\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 1235 1235 IRR +\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 1235 1235 IRR P test parse foreign currency full set pattern \u00a4\u00a4\u00a4 0.00;\u00a4\u00a4\u00a4 -# set locale fa_IR begin -parse output outputCurrency +parse output outputCurrency breaks \u0631\u06cc\u0627\u0644 \u06F1\u06F2\u06F3\u06F5 1235 IRR IRR \u06F1\u06F2\u06F3\u06F5 1235 IRR -\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 \u06F1\u06F2\u06F3\u06F5 1235 IRR +\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 \u06F1\u06F2\u06F3\u06F5 1235 IRR P IRR 1235 1235 IRR \u0631\u06cc\u0627\u0644 1235 1235 IRR -\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 1235 1235 IRR +\u0631\u06cc\u0627\u0644 \u0627\u06cc\u0631\u0627\u0646 1235 1235 IRR P test parse currency with foreign symbols symbol english set pattern \u00a4 0.00;\u00a4 (#) @@ -1288,16 +1287,17 @@ Euros 7.82 7.82 EUR test parse currency without currency mode // Should accept a symbol associated with the currency specified by the API, // but should not traverse the full currency data. +// P always traverses full currency data. set locale en_US set pattern \u00a4#,##0.00 begin parse currency output breaks $52.41 USD 52.41 USD52.41 USD 52.41 K -\u20ac52.41 USD fail -EUR52.41 USD fail -$52.41 EUR fail -USD52.41 EUR fail +\u20ac52.41 USD fail P +EUR52.41 USD fail P +$52.41 EUR fail P +USD52.41 EUR fail P \u20ac52.41 EUR 52.41 K EUR52.41 EUR 52.41 @@ -1307,11 +1307,11 @@ set locale en_US set lenient 0 begin parse output outputCurrency breaks -$53.45 53.45 USD +$53.45 53.45 USD P 53.45 USD 53.45 USD USD 53.45 fail USD 53.45USD fail USD -USD53.45 53.45 USD +USD53.45 53.45 USD P (7.92) USD -7.92 USD (7.92) EUR -7.92 EUR (7.926) USD -7.926 USD @@ -1329,9 +1329,9 @@ US Dollars 53.45 fail USD 53.45 US Dollars 53.45 USD US Dollar 53.45 fail USD 53.45 US Dollar 53.45 USD -US Dollars53.45 53.45 USD +US Dollars53.45 53.45 USD P 53.45US Dollars fail USD -US Dollar53.45 53.45 USD +US Dollar53.45 53.45 USD P US Dollat53.45 fail USD 53.45US Dollar fail USD US Dollars (53.45) fail USD @@ -1376,13 +1376,15 @@ test parse minus sign set locale en set pattern # begin -parse output breaks --123 -123 -- 123 -123 JK - -123 -123 JK - - 123 -123 JK -123- -123 CJKS -123 - -123 CJKS +pattern parse output breaks +# -123 -123 +# - 123 -123 JK +# -123 -123 JK +# - 123 -123 JK +# 123- 123 +# 123 - 123 +#;#- 123- -123 +#;#- 123 - -123 JK test parse case sensitive set locale en @@ -1423,8 +1425,8 @@ NaN NaN K 1E2147483646 1E2147483646 1E-2147483649 0 1E-2147483648 0 -// S returns zero here -1E-2147483647 1E-2147483647 S +// P returns zero here +1E-2147483647 1E-2147483647 P 1E-2147483646 1E-2147483646 test format push limits @@ -1439,7 +1441,7 @@ maxFractionDigits format output breaks 100 9999999999999.9950000000001 9999999999999.9950000000001 C 2 9999999999999.9950000000001 10000000000000.00 C 2 9999999.99499999 9999999.99 -// K doesn't support halfDowm rounding mode? +// K doesn't support halfDown rounding mode? 2 9999999.995 9999999.99 K 2 9999999.99500001 10000000.00 100 56565656565656565656565656565656565656565656565656565656565656 56565656565656565656565656565656565656565656565656565656565656.00 C @@ -1453,8 +1455,8 @@ set locale en set pattern #,##0 begin parse output breaks -// K and J return null; S and C return 99 - 9 9 9 CJKS +// K and J return null; S, C, and P return 99 + 9 9 9 CJKP // K returns null 9 999 9999 K @@ -1497,7 +1499,7 @@ y g h56 -56 JK 56i j‎k -56 CJK 56‎i jk -56 CJK // S and C get 56 (accepts ' ' gs grouping); J and K get null -5 6 fail CS +5 6 fail CP 5‎6 5 JK test parse spaces in grouping @@ -1507,9 +1509,9 @@ set locale en set pattern #,##0 begin parse output breaks -// C, J and S get "12" here -1 2 1 CJS -1 23 1 CJS +// C, J, S, and P get "12" here +1 2 1 CJP +1 23 1 CJP // K gets 1 here; doesn't pick up the grouping separator 1 234 1234 K @@ -1543,7 +1545,8 @@ begin parse output breaks 55% 0.55 // J and K get null -55 0.55 JK +// P requires the symbol to be present and gets 55 +55 0.55 JKP test trailing grouping separators in pattern // This test is for #13115 @@ -1573,6 +1576,34 @@ begin parse output breaks 9223372036854775807% 92233720368547758.07 +test sign always shown +set locale en +set pattern 0 +set signAlwaysShown 1 +begin +format output breaks +// C, J and K do not support this feature +42 +42 CJK +0 +0 CJK +-42 -42 + +test parse strict with plus sign +set locale en +set pattern 0 +set signAlwaysShown 1 +begin +lenient parse output breaks +1 42 42 +1 -42 -42 +1 +42 42 CJK +1 0 0 +1 +0 0 CJK +0 42 fail CJK +0 -42 -42 +0 +42 42 CJK +0 0 fail CJK +0 +0 0 CJK + -- 2.50.1