From 0321204894b6be4c9c5b87c0fc1a5b2d9ba7c093 Mon Sep 17 00:00:00 2001 From: Charles Kerr Date: Sat, 10 Jan 2009 17:59:49 +0000 Subject: [PATCH] (trunk libT) upgrade our copy of Unicode Inc's freeware ConvertUTF.[ch] --- libtransmission/ConvertUTF.c | 985 ++++++++++++++--------------------- libtransmission/ConvertUTF.h | 130 ++--- 2 files changed, 437 insertions(+), 678 deletions(-) diff --git a/libtransmission/ConvertUTF.c b/libtransmission/ConvertUTF.c index 9001b1590..67ab49fe8 100644 --- a/libtransmission/ConvertUTF.c +++ b/libtransmission/ConvertUTF.c @@ -1,8 +1,8 @@ /* * Copyright 2001-2004 Unicode, Inc. - * + * * Disclaimer - * + * * This source code is provided as is by Unicode, Inc. No claims are * made as to fitness for any particular purpose. No warranties of any * kind are expressed or implied. The recipient agrees to determine @@ -10,9 +10,9 @@ * purchased on magnetic or optical media from Unicode, Inc., the * sole remedy for any claim will be exchange of defective media * within 90 days of receipt. - * + * * Limitations on Rights to Redistribute This Code - * + * * Unicode, Inc. hereby grants the right to freely use the information * supplied in this file in the creation of products supporting the * Unicode Standard, and to make copies of this file in any form @@ -26,25 +26,26 @@ Author: Mark E. Davis, 1994. Rev History: Rick McGowan, fixes & updates May 2001. Sept 2001: fixed const & error conditions per - mods suggested by S. Parent & A. Lillich. + mods suggested by S. Parent & A. Lillich. June 2002: Tim Dodd added detection and handling of incomplete - source sequences, enhanced error detection, added casts - to eliminate compiler warnings. + source sequences, enhanced error detection, added casts + to eliminate compiler warnings. July 2003: slight mods to back out aggressive FFFE detection. Jan 2004: updated switches in from-UTF8 conversions. Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. + May 2006: updated isLegalUTF8Sequence. See the header file "ConvertUTF.h" for complete documentation. - ------------------------------------------------------------------------ */ +------------------------------------------------------------------------ */ #include "ConvertUTF.h" #ifdef CVTUTF_DEBUG - #include +#include #endif -static const int halfShift = 10; /* used for shifting by 10 bits */ +static const int halfShift = 10; /* used for shifting by 10 bits */ static const UTF32 halfBase = 0x0010000UL; static const UTF32 halfMask = 0x3FFUL; @@ -53,76 +54,53 @@ static const UTF32 halfMask = 0x3FFUL; #define UNI_SUR_HIGH_END (UTF32)0xDBFF #define UNI_SUR_LOW_START (UTF32)0xDC00 #define UNI_SUR_LOW_END (UTF32)0xDFFF -#define false 0 -#define true 1 +#define false 0 +#define true 1 /* --------------------------------------------------------------------- */ -ConversionResult -ConvertUTF32toUTF16( const UTF32** sourceStart, - const UTF32* sourceEnd, - UTF16** targetStart, - UTF16* targetEnd, - ConversionFlags flags ) -{ +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF16* target = *targetStart; - - while( source < sourceEnd ) - { - UTF32 ch; - if( target >= targetEnd ) - { - result = targetExhausted; break; - } - ch = *source++; - if( ch <= UNI_MAX_BMP ) /* Target is a character <= 0xFFFF */ - { /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are - both reserved values */ - if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END ) - { - if( flags == strictConversion ) - { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - else - { - *target++ = UNI_REPLACEMENT_CHAR; - } - } - else - { - *target++ = (UTF16)ch; /* normal case */ - } - } - else if( ch > UNI_MAX_LEGAL_UTF32 ) - { - if( flags == strictConversion ) - { - result = sourceIllegal; - } - else - { - *target++ = UNI_REPLACEMENT_CHAR; - } - } - else - { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if( target + 1 >= targetEnd ) - { - --source; /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)( ( ch >> halfShift ) + UNI_SUR_HIGH_START ); - *target++ = (UTF16)( ( ch & halfMask ) + UNI_SUR_LOW_START ); - } + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } } - *sourceStart = source; *targetStart = target; return result; @@ -130,81 +108,57 @@ ConvertUTF32toUTF16( const UTF32** sourceStart, /* --------------------------------------------------------------------- */ -ConversionResult -ConvertUTF16toUTF32( const UTF16** sourceStart, - const UTF16* sourceEnd, - UTF32** targetStart, - UTF32* targetEnd, - ConversionFlags flags ) -{ +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF32* target = *targetStart; - UTF32 ch, ch2; - - while( source < sourceEnd ) - { - const UTF16* oldSource = source; /* In case we have to back up because - of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END ) - { - /* If the 16 bits following the high surrogate are in the source - buffer... */ - if( source < sourceEnd ) - { - ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if( ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END ) - { - ch = ( ( ch - UNI_SUR_HIGH_START ) << halfShift ) - + ( ch2 - UNI_SUR_LOW_START ) + halfBase; - ++source; - } - else if( flags == strictConversion ) /* it's an unpaired high - surrogate */ - { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - else /* We don't have the 16 bits following the high surrogate. */ - { - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } - else if( flags == strictConversion ) - { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if( ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END ) - { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - if( target >= targetEnd ) - { - source = oldSource; /* Back up source pointer! */ - result = targetExhausted; break; - } - *target++ = ch; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ + result = targetExhausted; break; + } + *target++ = ch; } - *sourceStart = source; *targetStart = target; #ifdef CVTUTF_DEBUG - if( result == sourceIllegal ) - { - fprintf( stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", - ch, - ch2 ); - fflush( stderr ); - } +if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); +} #endif return result; } @@ -218,23 +172,15 @@ ConvertUTF16toUTF32( const UTF16** sourceStart, * left as-is for anyone who may want to do such conversion, which was * allowed in earlier algorithms. */ -static const char trailingBytesForUTF8[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 5, 5, 5, 5 +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; /* @@ -242,11 +188,8 @@ static const char trailingBytesForUTF8[256] = { * This table contains as many values as there might be trailing bytes * in a UTF-8 sequence. */ -static const UTF32 offsetsFromUTF8[6] = -{ 0x00000000UL, 0x00003080UL, - 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, - 0x82082080UL }; +static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; /* * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed @@ -255,8 +198,7 @@ static const UTF32 offsetsFromUTF8[6] = * (I.e., one byte sequence, two byte... etc.). Remember that sequencs * for *legal* UTF-8 will be 4 or fewer bytes total. */ -static const UTF8 firstByteMark[7] = -{ 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; +static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; /* --------------------------------------------------------------------- */ @@ -270,115 +212,69 @@ static const UTF8 firstByteMark[7] = /* --------------------------------------------------------------------- */ -ConversionResult -ConvertUTF16toUTF8( const UTF16** sourceStart, - const UTF16* sourceEnd, - UTF8** targetStart, - UTF8* targetEnd, - ConversionFlags flags ) -{ +ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF8* target = *targetStart; - - while( source < sourceEnd ) - { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - const UTF16* oldSource = source; /* In case we have to back up because - of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END ) - { - /* If the 16 bits following the high surrogate are in the source - buffer... */ - if( source < sourceEnd ) - { - UTF32 ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if( ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END ) - { - ch = ( ( ch - UNI_SUR_HIGH_START ) << halfShift ) - + ( ch2 - UNI_SUR_LOW_START ) + halfBase; - ++source; - } - else if( flags == strictConversion ) /* it's an unpaired high - surrogate */ - { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - else /* We don't have the 16 bits following the high surrogate. */ - { - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } - else if( flags == strictConversion ) - { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if( ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END ) - { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* Figure out how many bytes the result will require */ - if( ch < (UTF32)0x80 ) - { - bytesToWrite = 1; - } - else if( ch < (UTF32)0x800 ) - { - bytesToWrite = 2; - } - else if( ch < (UTF32)0x10000 ) - { - bytesToWrite = 3; - } - else if( ch < (UTF32)0x110000 ) - { - bytesToWrite = 4; - } - else - { - bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } - - target += bytesToWrite; - if( target > targetEnd ) - { - source = oldSource; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch( bytesToWrite ) /* note: everything falls through. */ - { - case 4: - *--target = - (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; - - case 3: - *--target = - (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; - - case 2: - *--target = - (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; - - case 1: - *--target = (UTF8)( ch | firstByteMark[bytesToWrite] ); - } - target += bytesToWrite; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; } - *sourceStart = source; *targetStart = target; return result; @@ -397,51 +293,28 @@ ConvertUTF16toUTF8( const UTF16** sourceStart, * definition of UTF-8 goes up to 4-byte sequences. */ -static Boolean -isLegalUTF8( const UTF8 *source, - int length ) -{ - UTF8 a; - const UTF8 *srcptr = source + length; - - switch( length ) - { - default: - return false; - - /* Everything else falls through when "true"... */ - case 4: - if( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return false; - - case 3: - if( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return false; - - case 2: - if( ( a = ( *--srcptr ) ) > 0xBF ) return false; - - switch( *source ) - { - /* no fall-through in this inner switch */ - case 0xE0: - if( a < 0xA0 ) return false;break; - - case 0xED: - if( a > 0x9F ) return false;break; - - case 0xF0: - if( a < 0x90 ) return false;break; - - case 0xF4: - if( a > 0x8F ) return false;break; - - default: - if( a < 0x80 ) return false; - } - - case 1: - if( *source >= 0x80 && *source < 0xC2 ) return false; +static Boolean isLegalUTF8(const UTF8 *source, int length) { + UTF8 a; + const UTF8 *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if ((a < 0x80) || (a > 0x9F)) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; } - if( *source > 0xF4 ) return false; + if (*source > 0xF4) return false; return true; } @@ -451,125 +324,95 @@ isLegalUTF8( const UTF8 *source, * Exported function to return whether a UTF-8 sequence is legal or not. * This is not used here; it's just exported. */ -Boolean -isLegalUTF8Sequence( const UTF8 *source, - const UTF8 *sourceEnd ) -{ - int length = trailingBytesForUTF8[*source] + 1; - - if( source + length > sourceEnd ) - { - return false; + +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { + int length; + if (source == sourceEnd) { + return true; + } + while (true) { + length = trailingBytesForUTF8[*source]+1; + if (source+length > sourceEnd) { + return false; + } + if (!isLegalUTF8(source, length)) { + return false; + } + source += length; + if (source >= sourceEnd) { + return true; + } } - return isLegalUTF8( source, length ); } /* --------------------------------------------------------------------- */ -ConversionResult -ConvertUTF8toUTF16( const UTF8** sourceStart, - const UTF8* sourceEnd, - UTF16** targetStart, - UTF16* targetEnd, - ConversionFlags flags ) -{ +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF16* target = *targetStart; - - while( source < sourceEnd ) - { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if( source + extraBytesToRead >= sourceEnd ) - { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if( !isLegalUTF8( source, extraBytesToRead + 1 ) ) - { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch( extraBytesToRead ) - { - case 5: - ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - - case 4: - ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - - case 3: - ch += *source++; ch <<= 6; - - case 2: - ch += *source++; ch <<= 6; - - case 1: - ch += *source++; ch <<= 6; - - case 0: - ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if( target >= targetEnd ) - { - source -= ( extraBytesToRead + 1 ); /* Back up source pointer! */ - result = targetExhausted; break; - } - if( ch <= UNI_MAX_BMP ) /* Target is a character <= 0xFFFF */ - { /* UTF-16 surrogate values are illegal in UTF-32 */ - if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END ) - { - if( flags == strictConversion ) - { - source -= ( extraBytesToRead + 1 ); /* return to the illegal - value itself */ - result = sourceIllegal; - break; - } - else - { - *target++ = UNI_REPLACEMENT_CHAR; - } - } - else - { - *target++ = (UTF16)ch; /* normal case */ - } - } - else if( ch > UNI_MAX_UTF16 ) - { - if( flags == strictConversion ) - { - result = sourceIllegal; - source -= ( extraBytesToRead + 1 ); /* return to the start */ - break; /* Bail out; shouldn't continue */ - } - else - { - *target++ = UNI_REPLACEMENT_CHAR; - } - } - else - { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if( target + 1 >= targetEnd ) - { - source -= ( extraBytesToRead + 1 ); /* Back up source pointer! - */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)( ( ch >> halfShift ) + UNI_SUR_HIGH_START ); - *target++ = (UTF16)( ( ch & halfMask ) + UNI_SUR_LOW_START ); - } + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } } - *sourceStart = source; *targetStart = target; return result; @@ -577,87 +420,52 @@ ConvertUTF8toUTF16( const UTF8** sourceStart, /* --------------------------------------------------------------------- */ -ConversionResult -ConvertUTF32toUTF8( const UTF32** sourceStart, - const UTF32* sourceEnd, - UTF8** targetStart, - UTF8* targetEnd, - ConversionFlags flags ) -{ +ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - - while( source < sourceEnd ) - { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - if( flags == strictConversion ) - { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END ) - { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* - * Figure out how many bytes the result will require. Turn any - * illegally large UTF32 things (> Plane 17) into replacement chars. - */ - if( ch < (UTF32)0x80 ) - { - bytesToWrite = 1; - } - else if( ch < (UTF32)0x800 ) - { - bytesToWrite = 2; - } - else if( ch < (UTF32)0x10000 ) - { - bytesToWrite = 3; - } - else if( ch <= UNI_MAX_LEGAL_UTF32 ) - { - bytesToWrite = 4; - } - else - { - bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - result = sourceIllegal; - } - - target += bytesToWrite; - if( target > targetEnd ) - { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch( bytesToWrite ) /* note: everything falls through. */ - { - case 4: - *--target = - (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; - - case 3: - *--target = - (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; - - case 2: - *--target = - (UTF8)( ( ch | byteMark ) & byteMask ); ch >>= 6; - - case 1: - *--target = (UTF8) ( ch | firstByteMark[bytesToWrite] ); - } - target += bytesToWrite; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; } - *sourceStart = source; *targetStart = target; return result; @@ -665,94 +473,61 @@ ConvertUTF32toUTF8( const UTF32** sourceStart, /* --------------------------------------------------------------------- */ -ConversionResult -ConvertUTF8toUTF32( const UTF8** sourceStart, - const UTF8* sourceEnd, - UTF32** targetStart, - UTF32* targetEnd, - ConversionFlags flags ) -{ +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - - while( source < sourceEnd ) - { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if( source + extraBytesToRead >= sourceEnd ) - { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if( !isLegalUTF8( source, extraBytesToRead + 1 ) ) - { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch( extraBytesToRead ) - { - case 5: - ch += *source++; ch <<= 6; - - case 4: - ch += *source++; ch <<= 6; - - case 3: - ch += *source++; ch <<= 6; - - case 2: - ch += *source++; ch <<= 6; - - case 1: - ch += *source++; ch <<= 6; - - case 0: - ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if( target >= targetEnd ) - { - source -= ( extraBytesToRead + 1 ); /* Back up the source pointer! - */ - result = targetExhausted; break; - } - if( ch <= UNI_MAX_LEGAL_UTF32 ) - { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END ) - { - if( flags == strictConversion ) - { - source -= ( extraBytesToRead + 1 ); /* return to the illegal - value itself */ - result = sourceIllegal; - break; - } - else - { - *target++ = UNI_REPLACEMENT_CHAR; - } - } - else - { - *target++ = ch; - } - } - else /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - { - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; - } + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; + } } - *sourceStart = source; *targetStart = target; return result; @@ -764,14 +539,14 @@ ConvertUTF8toUTF32( const UTF8** sourceStart, The fall-through switches in UTF-8 reading code save a temp variable, some decrements & conditionals. The switches are equivalent to the following loop: - { - int tmpBytesToRead = extraBytesToRead+1; - do { - ch += *source++; - --tmpBytesToRead; - if (tmpBytesToRead) ch <<= 6; - } while (tmpBytesToRead > 0); - } + { + int tmpBytesToRead = extraBytesToRead+1; + do { + ch += *source++; + --tmpBytesToRead; + if (tmpBytesToRead) ch <<= 6; + } while (tmpBytesToRead > 0); + } In UTF-8 writing code, the switches on "bytesToWrite" are similarly unrolled loops. diff --git a/libtransmission/ConvertUTF.h b/libtransmission/ConvertUTF.h index 73086bd72..b64e45b58 100644 --- a/libtransmission/ConvertUTF.h +++ b/libtransmission/ConvertUTF.h @@ -7,9 +7,9 @@ /* * Copyright 2001-2004 Unicode, Inc. - * + * * Disclaimer - * + * * This source code is provided as is by Unicode, Inc. No claims are * made as to fitness for any particular purpose. No warranties of any * kind are expressed or implied. The recipient agrees to determine @@ -17,9 +17,9 @@ * purchased on magnetic or optical media from Unicode, Inc., the * sole remedy for any claim will be exchange of defective media * within 90 days of receipt. - * + * * Limitations on Rights to Redistribute This Code - * + * * Unicode, Inc. hereby grants the right to freely use the information * supplied in this file in the creation of products supporting the * Unicode Standard, and to make copies of this file in any form @@ -40,8 +40,8 @@ Each routine converts the text between *sourceStart and sourceEnd, putting the result into the buffer between *targetStart and - targetEnd. Note: the end pointers are *after* the last item: e.g. - *(sourceEnd - 1) is the last item. + targetEnd. Note: the end pointers are *after* the last item: e.g. + *(sourceEnd - 1) is the last item. The return result indicates whether the conversion was successful, and if not, whether the problem was in the source or target buffers. @@ -52,12 +52,12 @@ the respective buffers. Input parameters: - sourceStart - pointer to a pointer to the source buffer. - The contents of this are modified on return so that - it points at the next thing to be converted. - targetStart - similarly, pointer to pointer to the target buffer. - sourceEnd, targetEnd - respectively pointers to the ends of the - two buffers, for overflow checking only. + sourceStart - pointer to a pointer to the source buffer. + The contents of this are modified on return so that + it points at the next thing to be converted. + targetStart - similarly, pointer to pointer to the target buffer. + sourceEnd, targetEnd - respectively pointers to the ends of the + two buffers, for overflow checking only. These conversion functions take a ConversionFlags argument. When this flag is set to strict, both irregular sequences and isolated surrogates @@ -74,17 +74,17 @@ they constitute an error. Output parameters: - The value "sourceIllegal" is returned from some routines if the input - sequence is malformed. When "sourceIllegal" is returned, the source - value will point to the illegal value that caused the problem. E.g., - in UTF-8 when a sequence is malformed, it points to the start of the - malformed sequence. + The value "sourceIllegal" is returned from some routines if the input + sequence is malformed. When "sourceIllegal" is returned, the source + value will point to the illegal value that caused the problem. E.g., + in UTF-8 when a sequence is malformed, it points to the start of the + malformed sequence. Author: Mark E. Davis, 1994. Rev History: Rick McGowan, fixes & updates May 2001. - Fixes & updates, Sept 2001. + Fixes & updates, Sept 2001. - ------------------------------------------------------------------------ */ +------------------------------------------------------------------------ */ /* --------------------------------------------------------------------- The following 4 definitions are compiler-specific. @@ -92,13 +92,12 @@ 16 bits, so wchar_t is no less portable than unsigned short! All should be unsigned values to avoid sign extension during bit mask & shift operations. - ------------------------------------------------------------------------ */ +------------------------------------------------------------------------ */ - -typedef unsigned int UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ +typedef unsigned long UTF32; /* at least 32 bits */ +typedef unsigned short UTF16; /* at least 16 bits */ +typedef unsigned char UTF8; /* typically 8 bits */ +typedef unsigned char Boolean; /* 0 or 1 */ /* Some fundamental constants */ #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD @@ -107,18 +106,16 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF -typedef enum -{ - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ +typedef enum { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ } ConversionResult; -typedef enum -{ - strictConversion = 0, - lenientConversion +typedef enum { + strictConversion = 0, + lenientConversion } ConversionFlags; /* This is for C++ and does no harm in C */ @@ -126,44 +123,31 @@ typedef enum extern "C" { #endif -ConversionResult ConvertUTF8toUTF16( const UTF8** sourceStart, - const UTF8* sourceEnd, - UTF16** targetStart, - UTF16* targetEnd, - ConversionFlags flags ); - -ConversionResult ConvertUTF16toUTF8( const UTF16** sourceStart, - const UTF16* sourceEnd, - UTF8** targetStart, - UTF8* targetEnd, - ConversionFlags flags ); - -ConversionResult ConvertUTF8toUTF32( const UTF8** sourceStart, - const UTF8* sourceEnd, - UTF32** targetStart, - UTF32* targetEnd, - ConversionFlags flags ); - -ConversionResult ConvertUTF32toUTF8( const UTF32** sourceStart, - const UTF32* sourceEnd, - UTF8** targetStart, - UTF8* targetEnd, - ConversionFlags flags ); - -ConversionResult ConvertUTF16toUTF32( const UTF16** sourceStart, - const UTF16* sourceEnd, - UTF32** targetStart, - UTF32* targetEnd, - ConversionFlags flags ); - -ConversionResult ConvertUTF32toUTF16( const UTF32** sourceStart, - const UTF32* sourceEnd, - UTF16** targetStart, - UTF16* targetEnd, - ConversionFlags flags ); - -Boolean isLegalUTF8Sequence( const UTF8 *source, - const UTF8 *sourceEnd ); +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); #ifdef __cplusplus } -- 2.40.0