From edac6e7206ee832a05d54b62da2a5e09ead3f907 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Wed, 13 Sep 2017 22:29:43 +0000 Subject: [PATCH] ICU-13244 add U16_GET_OR_FFFD(), U16_NEXT_OR_FFFD(), U16_PREV_OR_FFFD() X-SVN-Rev: 40404 --- icu4c/source/common/unicode/utf16.h | 132 ++++++++++++++++++++++++-- icu4c/source/test/cintltst/utf16tst.c | 82 ++++++++++++---- 2 files changed, 190 insertions(+), 24 deletions(-) diff --git a/icu4c/source/common/unicode/utf16.h b/icu4c/source/common/unicode/utf16.h index 9cc73f7bfbd..35fd0986114 100644 --- a/icu4c/source/common/unicode/utf16.h +++ b/icu4c/source/common/unicode/utf16.h @@ -185,8 +185,8 @@ * * The length can be negative for a NUL-terminated string. * - * If the offset points to a single, unpaired surrogate, then that itself - * will be returned as the code point. + * If the offset points to a single, unpaired surrogate, then + * c is set to that unpaired surrogate. * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. * * @param s const UChar * string @@ -213,6 +213,53 @@ } \ } +#ifndef U_HIDE_DRAFT_API + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The offset may point to either the lead or trail surrogate unit + * for a supplementary code point, in which case the macro will read + * the adjacent matching surrogate as well. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to a single, unpaired surrogate, then + * c is set to U+FFFD. + * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<=i(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } else { \ + (c)=0xfffd; \ + } \ + } \ + } \ +} + +#endif // U_HIDE_DRAFT_API + /* definitions with forward iteration --------------------------------------- */ /** @@ -253,8 +300,7 @@ * for a supplementary code point, in which case the macro will read * the following trail surrogate as well. * If the offset points to a trail surrogate or - * to a single, unpaired lead surrogate, then that itself - * will be returned as the code point. + * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate. * * @param s const UChar * string * @param i string offset, must be i(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + --(i); \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } else { \ + (c)=0xfffd; \ + } \ + } \ +} + +#endif // U_HIDE_DRAFT_API + /** * Move the string offset from one code point boundary to the previous one. * (Pre-decrementing backward iteration.) diff --git a/icu4c/source/test/cintltst/utf16tst.c b/icu4c/source/test/cintltst/utf16tst.c index 8ce403a59af..b73c6083efd 100644 --- a/icu4c/source/test/cintltst/utf16tst.c +++ b/icu4c/source/test/cintltst/utf16tst.c @@ -147,7 +147,7 @@ static void TestGetChar() 0x11734, 0xd800, UTF_ERROR_VALUE }; uint16_t i=0; - UChar32 c; + UChar32 c, expected; uint16_t offset=0; for(offset=0; offset