# include "unicode.h"
#endif
-namespace
+namespace
{
inline unsigned short combine(unsigned char c1, unsigned char c2)
debug("String::copyFromUTF8() - Unicode conversion error.");
}
-#endif
+#endif
}
}
class String::StringPrivate : public RefCounter
{
public:
- StringPrivate()
- : RefCounter()
+ StringPrivate()
+ : RefCounter()
{
}
- StringPrivate(const wstring &s)
+ StringPrivate(const wstring &s)
: RefCounter()
- , data(s)
+ , data(s)
{
}
-
- StringPrivate(uint n, wchar_t c)
+
+ StringPrivate(uint n, wchar_t c)
: RefCounter()
- , data(static_cast<size_t>(n), c)
+ , data(static_cast<size_t>(n), c)
{
}
/*!
- * Stores string in UTF-16. The byte order depends on the CPU endian.
+ * Stores string in UTF-16. The byte order depends on the CPU endianness.
*/
TagLib::wstring data;
////////////////////////////////////////////////////////////////////////////////
-String::String()
+String::String()  // Default constructor: d points at a freshly default-constructed StringPrivate.
: d(new StringPrivate())
{
}
-String::String(const String &s)
+String::String(const String &s)
: d(s.d)
{
d->ref();
: d(new StringPrivate())
{
if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
- // This looks ugly but needed for the compatibility with TagLib1.8.
+ // This looks ugly but needed for the compatibility with TagLib1.8.
// Should be removed in TagLib2.0.
if (t == UTF16BE)
t = WCharByteOrder;
: d(new StringPrivate())
{
if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
- // This looks ugly but needed for the compatibility with TagLib1.8.
+ // This looks ugly but needed for the compatibility with TagLib1.8.
// Should be removed in TagLib2.0.
if (t == UTF16BE)
t = WCharByteOrder;
if(v.isEmpty())
return;
- if(t == Latin1)
+ if(t == Latin1)
copyFromLatin1(v.data(), v.size());
- else if(t == UTF8)
+ else if(t == UTF8)
copyFromUTF8(v.data(), v.size());
- else
+ else
copyFromUTF16(v.data(), v.size(), t);
// If we hit a null in the ByteVector, shrink the string again.
std::string String::to8Bit(bool unicode) const
{
- std::string s;
-
- if(!unicode) {
- s.resize(d->data.size());
-
- std::string::iterator targetIt = s.begin();
- for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
- *targetIt = static_cast<char>(*it);
- ++targetIt;
- }
- }
- else {
- s.resize(d->data.size() * 4 + 1);
-
- UTF16toUTF8(&d->data[0], d->data.size(), &s[0], s.size());
- s.resize(::strlen(s.c_str()));
- }
-
- return s;
+ const ByteVector v = data(unicode ? UTF8 : Latin1);  // Reuse data(): UTF8 when unicode is true, Latin1 otherwise.
+ return std::string(v.data(), v.size());  // Pass the length explicitly; data() is documented as not null terminated.
}
TagLib::wstring String::toWString() const
ByteVector String::data(Type t) const
{
- switch(t)
+ switch(t)
{
case Latin1:
{
return v;
}
case UTF8:
+ if(!d->data.empty())
{
ByteVector v(size() * 4 + 1, 0);
- UTF16toUTF8(&d->data[0], d->data.size(), v.data(), v.size());
+ UTF16toUTF8(d->data.c_str(), d->data.size(), v.data(), v.size());
v.resize(::strlen(v.data()));
return v;
}
+ else {
+ return ByteVector::null;
+ }
case UTF16:
{
ByteVector v(2 + size() * 2, 0);
default:
{
debug("String::data() - Invalid Type value.");
- return ByteVector();
+ return ByteVector::null;
}
}
}
{
bool swap;
if(t == UTF16) {
- if(length >= 1 && s[0] == 0xfeff)
+ if(length >= 1 && s[0] == 0xfeff)
swap = false; // Same as CPU endian. No need to swap bytes.
- else if(length >= 1 && s[0] == 0xfffe)
+ else if(length >= 1 && s[0] == 0xfffe)
swap = true; // Not same as CPU endian. Need to swap bytes.
else {
debug("String::copyFromUTF16() - Invalid UTF16 string.");
s++;
length--;
}
- else
+ else
swap = (t != WCharByteOrder);
d->data.resize(length);
ushort bom;
::memcpy(&bom, s, 2);
- if(bom == 0xfeff)
+ if(bom == 0xfeff)
swap = false; // Same as CPU endian. No need to swap bytes.
- else if(bom == 0xfffe)
+ else if(bom == 0xfffe)
swap = true; // Not same as CPU endian. Need to swap bytes.
else {
debug("String::copyFromUTF16() - Invalid UTF16 string.");
s += 2;
length -= 2;
}
- else
+ else
swap = (t != WCharByteOrder);
d->data.resize(length / 2);
}
}
-const String::Type String::WCharByteOrder
+const String::Type String::WCharByteOrder  // UTF16BE when Utils::SystemByteOrder is BigEndian, UTF16LE otherwise.
= (Utils::SystemByteOrder == Utils::BigEndian) ? String::UTF16BE : String::UTF16LE;
}
/*!
* Makes a deep copy of the data in \a s.
*
- * /note If \a t is UTF16LE, the byte order of \a s will be swapped regardless
+ * \note If \a t is UTF16LE, the byte order of \a s will be swapped regardless
* of the CPU byte order. If UTF16BE, it will not be swapped. This behavior
* will be changed in TagLib2.0.
*/
/*!
* Makes a deep copy of the data in \a s.
*
- * /note If \a t is UTF16LE, the byte order of \a s will be swapped regardless
+ * \note If \a t is UTF16LE, the byte order of \a s will be swapped regardless
* of the CPU byte order. If UTF16BE, it will not be swapped. This behavior
* will be changed in TagLib2.0.
*/
virtual ~String();
/*!
- * Returns a deep copy of this String as an std::string. The returned string
+ * Returns a deep copy of this String as an std::string. The returned string
* is encoded in UTF8 if \a unicode is true, otherwise Latin1.
*
* \see toCString()
std::string to8Bit(bool unicode = false) const;
/*!
- * Returns a deep copy of this String as a wstring. The returned string is
+ * Returns a deep copy of this String as a wstring. The returned string is
* encoded in UTF-16 (without BOM/CPU byte order).
*
* \see toCWString()
wstring toWString() const;
/*!
- * Creates and returns a standard C-style (null-terminated) version of this
- * String. The returned string is encoded in UTF8 if \a unicode is true,
+ * Creates and returns a standard C-style (null-terminated) version of this
+ * String. The returned string is encoded in UTF8 if \a unicode is true,
* otherwise Latin1.
- *
- * The returned string is still owned by this String and should not be deleted
+ *
+ * The returned string is still owned by this String and should not be deleted
* by the user.
*
- * The returned pointer remains valid until this String instance is destroyed
+ * The returned pointer remains valid until this String instance is destroyed
* or toCString() is called again.
*
* \warning This however has the side effect that the returned string will remain
- * in memory <b>in addition to</b> other memory that is consumed by this
+ * in memory <b>in addition to</b> other memory that is consumed by this
* String instance. So, this method should not be used on large strings or
* where memory is critical. Consider using to8Bit() instead to avoid it.
*
* \see to8Bit()
*/
const char *toCString(bool unicode = false) const;
-
+
/*!
- * Returns a standard C-style (null-terminated) wide character version of
- * this String. The returned string is encoded in UTF-16 (without BOM/CPU byte
+ * Returns a standard C-style (null-terminated) wide character version of
+ * this String. The returned string is encoded in UTF-16 (without BOM/CPU byte
* order).
- *
- * The returned string is still owned by this String and should not be deleted
+ *
+ * The returned string is still owned by this String and should not be deleted
* by the user.
*
- * The returned pointer remains valid until this String instance is destroyed
+ * The returned pointer remains valid until this String instance is destroyed
* or any other method of this String is called.
*
- * \note This returns a pointer to the String's internal data without any
+ * \note This returns a pointer to the String's internal data without any
* conversions.
*
* \see toWString()
*/
const wchar_t *toCWString() const;
-
+
/*!
* Returns an iterator pointing to the beginning of the string.
*/
* Returns a ByteVector containing the string's data. If \a t is Latin1 or
* UTF8, this will return a vector of 8 bit characters, otherwise it will use
* 16 bit characters.
+ *
+ * \note The returned data is not null terminated.
*/
ByteVector data(Type t) const;
private:
/*!
- * Converts a \e Latin-1 string into \e UTF-16(without BOM/CPU byte order)
+ * Converts a \e Latin-1 string into \e UTF-16(without BOM/CPU byte order)
* and copies it to the internal buffer.
*/
void copyFromLatin1(const char *s, size_t length);
/*!
- * Converts a \e UTF-8 string into \e UTF-16(without BOM/CPU byte order)
+ * Converts a \e UTF-8 string into \e UTF-16(without BOM/CPU byte order)
* and copies it to the internal buffer.
*/
void copyFromUTF8(const char *s, size_t length);
/*!
- * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into
+ * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF-16BE string into
* \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer.
*/
void copyFromUTF16(const wchar_t *s, size_t length, Type t);
/*!
- * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into
+ * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF-16BE string into
* \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer.
*/
void copyFromUTF16(const char *s, size_t length, Type t);
-
+
/*!
- * Indicates which byte order of UTF-16 is used to store strings internally.
+ * Indicates which byte order of UTF-16 is used to store strings internally.
*
* \note \e String::UTF16BE or \e String::UTF16LE
*/
CPPUNIT_TEST(testToInt);
CPPUNIT_TEST(testSubstr);
CPPUNIT_TEST(testNewline);
+ CPPUNIT_TEST(testEncode);
CPPUNIT_TEST_SUITE_END();
public:
CPPUNIT_ASSERT_EQUAL(L'\x0a', String(crlf)[4]);
}
+ void testEncode()
+ {  // Checks data() and to8Bit() for every String::Type, first on non-ASCII text, then on an empty String.
+ String jpn(L"\u65E5\u672C\u8A9E");  // Three code points: U+65E5, U+672C, U+8A9E.
+ ByteVector jpn1 = jpn.data(String::Latin1);
+ ByteVector jpn2 = jpn.data(String::UTF8);
+ ByteVector jpn3 = jpn.data(String::UTF16);
+ ByteVector jpn4 = jpn.data(String::UTF16LE);
+ ByteVector jpn5 = jpn.data(String::UTF16BE);
+ std::string jpn6 = jpn.to8Bit(false);
+ std::string jpn7 = jpn.to8Bit(true);
+
+ CPPUNIT_ASSERT_EQUAL(ByteVector("\xE5\x2C\x9E"), jpn1);  // Expected: only the low byte of each code unit.
+ CPPUNIT_ASSERT_EQUAL(ByteVector("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), jpn2);  // Expected: three bytes per character.
+ CPPUNIT_ASSERT_EQUAL(ByteVector("\xFF\xFE\xE5\x65\x2C\x67\x9E\x8A"), jpn3);  // Expected: BOM FF FE, then code units low byte first.
+ CPPUNIT_ASSERT_EQUAL(ByteVector("\xE5\x65\x2C\x67\x9E\x8A"), jpn4);  // Expected: no BOM, low byte first.
+ CPPUNIT_ASSERT_EQUAL(ByteVector("\x65\xE5\x67\x2C\x8A\x9E"), jpn5);  // Expected: no BOM, high byte first.
+ CPPUNIT_ASSERT_EQUAL(std::string("\xE5\x2C\x9E"), jpn6);  // to8Bit(false) must match the Latin1 bytes above.
+ CPPUNIT_ASSERT_EQUAL(std::string("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), jpn7);  // to8Bit(true) must match the UTF8 bytes above.
+
+ String empty;  // Empty input: every conversion must come back empty, except UTF16 (checked below).
+ ByteVector empty1 = empty.data(String::Latin1);
+ ByteVector empty2 = empty.data(String::UTF8);
+ ByteVector empty3 = empty.data(String::UTF16);
+ ByteVector empty4 = empty.data(String::UTF16LE);
+ ByteVector empty5 = empty.data(String::UTF16BE);
+ std::string empty6 = empty.to8Bit(false);
+ std::string empty7 = empty.to8Bit(true);
+
+ CPPUNIT_ASSERT(empty1.isEmpty());
+ CPPUNIT_ASSERT(empty2.isEmpty());
+ CPPUNIT_ASSERT_EQUAL(ByteVector("\xFF\xFE"), empty3);  // UTF16 of an empty String is expected to hold just the BOM.
+ CPPUNIT_ASSERT(empty4.isEmpty());
+ CPPUNIT_ASSERT(empty5.isEmpty());
+ CPPUNIT_ASSERT(empty6.empty());
+ CPPUNIT_ASSERT(empty7.empty());
+ }
+
};
CPPUNIT_TEST_SUITE_REGISTRATION(TestString);