* http://www.mozilla.org/MPL/ *
***************************************************************************/
+// This class assumes that std::basic_string<T> has a contiguous and null-terminated buffer.
+//
+
#include "tstring.h"
-#include "unicode.h"
#include "tdebug.h"
#include "tstringlist.h"
#include <string.h>
-namespace TagLib {
+// Determine if the compiler supports codecvt.
+
+#if (defined(_MSC_VER) && _MSC_VER >= 1600)
+# define TAGLIB_USE_CODECVT
+#endif
+
+#ifdef TAGLIB_USE_CODECVT
+# include <codecvt>
+typedef std::codecvt_utf8_utf16<wchar_t> utf8_utf16_t;
+#else
+# include "unicode.h"
+#endif
+
+namespace {
// Returns x with its two bytes exchanged. Compilers reporting
// _MSC_VER >= 1400 use the _byteswap_ushort() intrinsic; everything else
// uses the shift-and-mask expression.
inline unsigned short byteSwap(unsigned short x)
{
+#if defined(_MSC_VER) && (_MSC_VER >= 1400)
+
+ return _byteswap_ushort(x);
+
+#else
+
return (((x) >> 8) & 0xff) | (((x) & 0xff) << 8);
+
+#endif
}
// Builds a 16-bit value from two bytes: c1 supplies the high byte and c2 the
// low byte.
inline unsigned short combine(unsigned char c1, unsigned char c2)
{
  const int highPart = c1 << 8;
  const int merged = highPart | c2;
  return merged;
}
+
+#if !defined(TAGLIB_LITTLE_ENDIAN) && !defined(TAGLIB_BIG_ENDIAN)
+
+ // Reports the byte order of the running CPU as a String::Type
+ // (UTF16LE or UTF16BE). Only compiled when neither TAGLIB_LITTLE_ENDIAN
+ // nor TAGLIB_BIG_ENDIAN is defined.
+ TagLib::String::Type wcharByteOrder()
+ {
+ // Detect CPU endian at run time: store a known 16-bit value and look at
+ // the byte that shares its first address.
+ union {
+ TagLib::ushort w;
+ char c;
+ } x = { 0x1234 };
+
+ // String has to be qualified: this helper sits in an unnamed namespace
+ // at global scope, outside namespace TagLib.
+ if(x.c == 0x34)
+ return TagLib::String::UTF16LE;
+ else
+ return TagLib::String::UTF16BE;
+ }
+
+#endif
}
-using namespace TagLib;
+namespace TagLib {
// Reference-counted payload of a String: the UTF-16 text plus the buffer
// that backs the pointer returned by toCString().
class String::StringPrivate : public RefCounter
{
public:
- StringPrivate(const wstring &s) :
- RefCounter(),
- data(s),
- CString(0) {}
-
- StringPrivate() :
- RefCounter(),
- CString(0) {}
+ StringPrivate(const wstring &s) : RefCounter(), data(s) {}
+ StringPrivate() : RefCounter() {}
- ~StringPrivate() {
- delete [] CString;
- }
-
- wstring data;
+ /*!
+ * Stores the string in UTF-16. The byte order follows the CPU endianness.
+ */
+ TagLib::wstring data;
/*!
- * This is only used to hold the a pointer to the most recent value of
- * toCString.
+ * This is only used to hold the most recent value of toCString().
*/
- char *CString;
+ std::string cstring;
};
// Definition of the static String::null member; it is default-constructed.
String String::null;
////////////////////////////////////////////////////////////////////////////////
-String::String()
// Default constructor: starts with a fresh, empty StringPrivate.
+String::String()
+ : d(new StringPrivate())
{
- d = new StringPrivate;
}
-String::String(const String &s) : d(s.d)
// Copy constructor: adopts s's StringPrivate pointer and calls ref() on it
// rather than copying the text.
+String::String(const String &s)
+ : d(s.d)
{
d->ref();
}
// Builds the string from s, read as Latin1 or UTF-8 according to t.
// Any other Type only logs a debug message and leaves the string empty.
String::String(const std::string &s, Type t)
+ : d(new StringPrivate())
{
- d = new StringPrivate;
-
- if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
+ if(t == Latin1)
+ copyFromLatin1(&s[0], s.length());
+ else if(t == String::UTF8)
+ copyFromUTF8(&s[0], s.length());
+ else {
debug("String::String() -- A std::string should not contain UTF16.");
- return;
}
-
- int length = s.length();
- d->data.resize(length);
- wstring::iterator targetIt = d->data.begin();
-
- for(std::string::const_iterator it = s.begin(); it != s.end(); it++) {
- *targetIt = uchar(*it);
- ++targetIt;
- }
-
- prepare(t);
}
// Builds the string from the UTF-16 text in s. t must be UTF16, UTF16BE or
// UTF16LE; any other Type only logs a debug message and leaves the string
// empty.
String::String(const wstring &s, Type t)
+ : d(new StringPrivate())
{
- d = new StringPrivate(s);
- prepare(t);
+ if(t == UTF16 || t == UTF16BE || t == UTF16LE)
+ copyFromUTF16(s.c_str(), s.length(), t);
+ else {
+ debug("String::String() -- A TagLib::wstring should not contain Latin1 or UTF-8.");
+ }
}
// Builds the string from a wide C string. s has to be null-terminated, since
// its length is taken with wcslen(). t must be UTF16, UTF16BE or UTF16LE;
// any other Type only logs a debug message and leaves the string empty.
String::String(const wchar_t *s, Type t)
+ : d(new StringPrivate())
{
- d = new StringPrivate(s);
- prepare(t);
+ if(t == UTF16 || t == UTF16BE || t == UTF16LE)
+ copyFromUTF16(s, ::wcslen(s), t);
+ else {
+ debug("String::String() -- A const wchar_t * should not contain Latin1 or UTF-8.");
+ }
}
// Builds the string from a C string read as Latin1 or UTF-8 according to t.
// s has to be null-terminated, since its length is taken with strlen().
// Any other Type only logs a debug message and leaves the string empty.
String::String(const char *s, Type t)
+ : d(new StringPrivate())
{
- d = new StringPrivate;
-
- if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
+ if(t == Latin1)
+ copyFromLatin1(s, ::strlen(s));
+ else if(t == String::UTF8)
+ copyFromUTF8(s, ::strlen(s));
+ else {
debug("String::String() -- A const char * should not contain UTF16.");
- return;
}
-
- int length = ::strlen(s);
- d->data.resize(length);
-
- wstring::iterator targetIt = d->data.begin();
-
- for(int i = 0; i < length; i++) {
- *targetIt = uchar(s[i]);
- ++targetIt;
- }
-
- prepare(t);
}
// Builds a string from the single wide character c, passed to
// copyFromUTF16() as a one-element buffer. t must be UTF16, UTF16BE or
// UTF16LE; any other Type only logs a debug message and leaves the string
// empty.
String::String(wchar_t c, Type t)
+ : d(new StringPrivate())
{
- d = new StringPrivate;
- d->data += c;
- prepare(t);
+ if(t == UTF16 || t == UTF16BE || t == UTF16LE)
+ copyFromUTF16(&c, 1, t);
+ else {
+ debug("String::String() -- A const wchar_t should not contain Latin1 or UTF-8.");
+ }
}
// Builds a one-character string from c. For Latin1 and UTF8 the character is
// stored as its unsigned value, with no UTF-8 decoding. Any other Type only
// logs a debug message and leaves the string empty.
String::String(char c, Type t)
+ : d(new StringPrivate())
{
- d = new StringPrivate;
-
- if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
- debug("String::String() -- A std::string should not contain UTF16.");
- return;
+ if(t == Latin1 || t == UTF8) {
+ d->data.resize(1);
+ d->data[0] = static_cast<uchar>(c);
+ }
+ else {
+ debug("String::String() -- A char should not contain UTF16.");
}
-
- d->data += uchar(c);
- prepare(t);
}
// Builds the string from the raw bytes in v. An empty v yields an empty
// string. Otherwise the bytes are decoded as Latin1 or UTF-8 when t says so,
// and every other Type is handed to copyFromUTF16().
String::String(const ByteVector &v, Type t)
+ : d(new StringPrivate())
{
- d = new StringPrivate;
-
if(v.isEmpty())
return;
- if(t == Latin1 || t == UTF8) {
-
- int length = 0;
- d->data.resize(v.size());
- wstring::iterator targetIt = d->data.begin();
- for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) {
- *targetIt = uchar(*it);
- ++targetIt;
- ++length;
- }
- d->data.resize(length);
- }
- else {
- d->data.resize(v.size() / 2);
- wstring::iterator targetIt = d->data.begin();
-
- for(ByteVector::ConstIterator it = v.begin();
- it != v.end() && it + 1 != v.end() && combine(*it, *(it + 1));
- it += 2)
- {
- *targetIt = combine(*it, *(it + 1));
- ++targetIt;
- }
- }
- prepare(t);
+ if(t == Latin1)
+ copyFromLatin1(v.data(), v.size());
+ else if(t == UTF8)
+ copyFromUTF8(v.data(), v.size());
+ else
+ copyFromUTF16(v.data(), v.size(), t);
}
////////////////////////////////////////////////////////////////////////////////
// Converts the string to an 8-bit std::string.
//
// unicode == false: every UTF-16 code unit is narrowed to a single char.
// unicode == true: the text is encoded as UTF-8 into a zero-filled scratch
// buffer (four bytes per code unit plus one), which is then trimmed at the
// first null byte.
// A failed conversion is only reported through debug().
std::string String::to8Bit(bool unicode) const
{
std::string s;
- s.resize(d->data.size());
if(!unicode) {
+ s.resize(d->data.size());
+
std::string::iterator targetIt = s.begin();
for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
- *targetIt = char(*it);
+ *targetIt = static_cast<char>(*it);
++targetIt;
}
- return s;
}
+ else {
+ s.resize(d->data.size() * 4 + 1);
- const int outputBufferSize = d->data.size() * 3 + 1;
+#ifdef TAGLIB_USE_CODECVT
- Unicode::UTF16 *sourceBuffer = new Unicode::UTF16[d->data.size() + 1];
- Unicode::UTF8 *targetBuffer = new Unicode::UTF8[outputBufferSize];
+ std::mbstate_t st = 0;
+ const wchar_t *source;
+ char *target;
+ std::codecvt_base::result result = utf8_utf16_t().out(
+ st, &d->data[0], &d->data[d->data.size()], source, &s[0], &s[s.size()], target);
- for(unsigned int i = 0; i < d->data.size(); i++)
- sourceBuffer[i] = Unicode::UTF16(d->data[i]);
+ if(result != utf8_utf16_t::ok) {
+ debug("String::to8Bit() - Unicode conversion error.");
+ }
- const Unicode::UTF16 *source = sourceBuffer;
- Unicode::UTF8 *target = targetBuffer;
+#else
- Unicode::ConversionResult result =
- Unicode::ConvertUTF16toUTF8(&source, sourceBuffer + d->data.size(),
- &target, targetBuffer + outputBufferSize,
- Unicode::lenientConversion);
+ const Unicode::UTF16 *source = &d->data[0];
+ Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(&s[0]);
- if(result != Unicode::conversionOK) {
- debug("String::to8Bit() - Unicode conversion error.");
- }
+ Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8(
+ &source, source + d->data.size(),
+ &target, target + s.size(),
+ Unicode::lenientConversion);
- int newSize = target - targetBuffer;
- s.resize(newSize);
- targetBuffer[newSize] = 0;
+ if(result != Unicode::conversionOK) {
+ debug("String::to8Bit() - Unicode conversion error.");
+ }
- s = (char *) targetBuffer;
+#endif
- delete [] sourceBuffer;
- delete [] targetBuffer;
+ s.resize(::strlen(s.c_str()));
+ }
return s;
}
// Returns a null-terminated 8-bit rendering of the string (see to8Bit()).
// The bytes are stored in d->cstring, which is overwritten on every call, so
// the returned pointer refers to the result of the most recent call only.
const char *String::toCString(bool unicode) const
{
- delete [] d->CString;
-
- std::string buffer = to8Bit(unicode);
- d->CString = new char[buffer.size() + 1];
-
-#if defined(_MSC_VER) && (_MSC_VER >= 1400) // VC++2005 or later
-
- strcpy_s(d->CString, buffer.size() + 1, buffer.c_str());
-
-#else
-
- strcpy(d->CString, buffer.c_str());
-
-#endif
-
- return d->CString;
+ d->cstring = to8Bit(unicode);
+ return d->cstring.c_str();
}
String::Iterator String::begin()
// Returns the index of the first occurrence of s at or after offset.
// NOTE(review): the explicit npos -> -1 mapping was removed; the "not found"
// result now relies on wstring::npos becoming -1 when narrowed to int.
int String::find(const String &s, int offset) const
{
- wstring::size_type position = d->data.find(s.d->data, offset);
-
- if(position != wstring::npos)
- return position;
- else
- return -1;
+ return d->data.find(s.d->data, offset);
}
// Returns the index of the last occurrence of s that starts at or before
// offset.
// NOTE(review): offset is now passed straight through, so a negative offset
// is converted to a very large size_type, and the "not found" result relies
// on wstring::npos becoming -1 when narrowed to int.
int String::rfind(const String &s, int offset) const
{
- wstring::size_type position =
- d->data.rfind(s.d->data, offset == -1 ? wstring::npos : offset);
-
- if(position != wstring::npos)
- return position;
- else
- return -1;
+ return d->data.rfind(s.d->data, offset);
}
StringList String::split(const String &separator) const
// Returns a new String holding up to n code units starting at position.
// NOTE(review): the result now goes through the String(const wstring &, Type)
// constructor with its default Type, which is declared in the header and not
// visible here; if that default is not the CPU byte order the constructor
// will byte-swap the data - confirm.
String String::substr(uint position, uint n) const
{
- String s;
- s.d->data = d->data.substr(position, n);
- return s;
+ return String(d->data.substr(position, n));
}
String &String::append(const String &s)
// Serializes the string into a ByteVector in the requested encoding.
//   Latin1  - one byte per code unit (each unit narrowed to char).
//   UTF8    - UTF-8 bytes; the zero-filled scratch buffer is trimmed at the
//             first null byte.
//   UTF16   - the bytes FF FE (byte order mark) followed by little-endian
//             code units.
//   UTF16BE - big-endian code units, no byte order mark.
//   UTF16LE - little-endian code units, no byte order mark.
// Any other Type logs a debug message and returns an empty ByteVector.
ByteVector String::data(Type t) const
{
- ByteVector v;
+ switch(t)
+ {
+ case Latin1:
+ {
+ ByteVector v(size(), 0);
+ char *p = v.data();
- switch(t) {
+ for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++)
+ *p++ = static_cast<char>(*it);
- case Latin1:
- {
- for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++)
- v.append(char(*it));
- break;
- }
+ return v;
+ }
case UTF8:
- {
- std::string s = to8Bit(true);
- v.setData(s.c_str(), s.length());
- break;
- }
+ {
+ ByteVector v(size() * 4 + 1, 0);
+
+#ifdef TAGLIB_USE_CODECVT
+
+ std::mbstate_t st = 0;
+ const wchar_t *source;
+ char *target;
+ std::codecvt_base::result result = utf8_utf16_t().out(
+ st, &d->data[0], &d->data[d->data.size()], source, v.data(), v.data() + v.size(), target);
+
+ if(result != utf8_utf16_t::ok) {
+ debug("String::data() - Unicode conversion error.");
+ }
+
+#else
+
+ const Unicode::UTF16 *source = &d->data[0];
+ Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(v.data());
+
+ Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8(
+ &source, source + d->data.size(),
+ &target, target + v.size(),
+ Unicode::lenientConversion);
+
+ if(result != Unicode::conversionOK) {
+ debug("String::data() - Unicode conversion error.");
+ }
+
+#endif
+
+ v.resize(::strlen(v.data()));
+
+ return v;
+ }
case UTF16:
- {
- // Assume that if we're doing UTF16 and not UTF16BE that we want little
- // endian encoding. (Byte Order Mark)
+ {
+ ByteVector v(2 + size() * 2, 0);
+ char *p = v.data();
- v.append(char(0xff));
- v.append(char(0xfe));
+ // Assume that if we're doing UTF16 and not UTF16BE that we want little
+ // endian encoding. (Byte Order Mark)
- for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
+ *p++ = '\xff';
+ *p++ = '\xfe';
- char c1 = *it & 0xff;
- char c2 = *it >> 8;
+ for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
+ *p++ = static_cast<char>(*it & 0xff);
+ *p++ = static_cast<char>(*it >> 8);
+ }
- v.append(c1);
- v.append(c2);
+ return v;
}
- break;
- }
case UTF16BE:
- {
- for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
+ {
+ ByteVector v(size() * 2, 0);
+ char *p = v.data();
- char c1 = *it >> 8;
- char c2 = *it & 0xff;
+ for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
+ *p++ = static_cast<char>(*it >> 8);
+ *p++ = static_cast<char>(*it & 0xff);
+ }
- v.append(c1);
- v.append(c2);
+ return v;
}
- break;
- }
case UTF16LE:
- {
- for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
+ {
+ ByteVector v(size() * 2, 0);
+ char *p = v.data();
- char c1 = *it & 0xff;
- char c2 = *it >> 8;
+ for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
+ *p++ = static_cast<char>(*it & 0xff);
+ *p++ = static_cast<char>(*it >> 8);
+ }
- v.append(c1);
- v.append(c2);
+ return v;
+ }
+ default:
+ {
+ debug("String::data() - Invalid Type value.");
+ return ByteVector();
}
- break;
- }
}
-
- return v;
}
int String::toInt() const
// Returns a mutable reference to the code unit at index i. detach() runs
// first (presumably to stop sharing the data before it can be written to;
// its definition is not shown here). No bounds check is made here.
TagLib::wchar &String::operator[](int i)
{
detach();
-
return d->data[i];
}
delete d;
d = new StringPrivate;
-
- d->data.resize(s.size());
-
- wstring::iterator targetIt = d->data.begin();
- for(std::string::const_iterator it = s.begin(); it != s.end(); it++) {
- *targetIt = uchar(*it);
- ++targetIt;
- }
+ copyFromLatin1(s.c_str(), s.length());
return *this;
}
delete d;
d = new StringPrivate;
-
- int length = ::strlen(s);
- d->data.resize(length);
-
- wstring::iterator targetIt = d->data.begin();
- for(int i = 0; i < length; i++) {
- *targetIt = uchar(s[i]);
- ++targetIt;
- }
+ copyFromLatin1(s, ::strlen(s));
return *this;
}
delete d;
d = new StringPrivate;
- d->data.resize(v.size());
- wstring::iterator targetIt = d->data.begin();
-
- uint i = 0;
-
- for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) {
- *targetIt = uchar(*it);
- ++targetIt;
- ++i;
- }
+ copyFromLatin1(v.data(), v.size());
// If we hit a null in the ByteVector, shrink the string again.
-
- d->data.resize(i);
+ d->data.resize(::wcslen(d->data.c_str()));
return *this;
}
// private members
////////////////////////////////////////////////////////////////////////////////
-void String::prepare(Type t)
// Replaces d->data with `length` characters taken from s, widening each byte
// through uchar so the values 0-255 map to the same code units.
+void String::copyFromLatin1(const char *s, size_t length)
{
- switch(t) {
- case UTF16:
- {
- if(d->data.size() >= 1 && (d->data[0] == 0xfeff || d->data[0] == 0xfffe)) {
- bool swap = d->data[0] != 0xfeff;
- d->data.erase(d->data.begin(), d->data.begin() + 1);
- if(swap) {
- for(uint i = 0; i < d->data.size(); i++)
- d->data[i] = byteSwap((unsigned short)d->data[i]);
- }
- }
- else {
- debug("String::prepare() - Invalid UTF16 string.");
- d->data.erase(d->data.begin(), d->data.end());
- }
- break;
+ d->data.resize(length);
+
+ for(size_t i = 0; i < length; ++i)
+ d->data[i] = static_cast<uchar>(s[i]);
+}
+
// Decodes `length` bytes of UTF-8 from s into d->data. The target is first
// resized to `length` code units, converted into in place, and finally
// trimmed at the first null code unit. A failed conversion is only reported
// through debug().
+void String::copyFromUTF8(const char *s, size_t length)
+{
+ d->data.resize(length);
+
+#ifdef TAGLIB_USE_CODECVT
+
+ std::mbstate_t st = 0;
+ const char *source;
+ wchar_t *target;
+ std::codecvt_base::result result = utf8_utf16_t().in(
+ st, s, s + length, source, &d->data[0], &d->data[d->data.size()], target);
+
+ if(result != utf8_utf16_t::ok) {
+ debug("String::copyFromUTF8() - Unicode conversion error.");
}
- case UTF8:
- {
- int bufferSize = d->data.size() + 1;
- Unicode::UTF8 *sourceBuffer = new Unicode::UTF8[bufferSize];
- Unicode::UTF16 *targetBuffer = new Unicode::UTF16[bufferSize];
- unsigned int i = 0;
- for(; i < d->data.size(); i++)
- sourceBuffer[i] = Unicode::UTF8(d->data[i]);
- sourceBuffer[i] = 0;
+#else
- const Unicode::UTF8 *source = sourceBuffer;
- Unicode::UTF16 *target = targetBuffer;
+ const Unicode::UTF8 *source = reinterpret_cast<const Unicode::UTF8 *>(s);
+ Unicode::UTF16 *target = &d->data[0];
- Unicode::ConversionResult result =
- Unicode::ConvertUTF8toUTF16(&source, sourceBuffer + bufferSize,
- &target, targetBuffer + bufferSize,
- Unicode::lenientConversion);
+ Unicode::ConversionResult result = Unicode::ConvertUTF8toUTF16(
+ &source, source + length,
+ &target, target + length,
+ Unicode::lenientConversion);
- if(result != Unicode::conversionOK) {
- debug("String::prepare() - Unicode conversion error.");
- }
+ if(result != Unicode::conversionOK) {
+ debug("String::copyFromUTF8() - Unicode conversion error.");
+ }
- int newSize = target != targetBuffer ? target - targetBuffer - 1 : 0;
- d->data.resize(newSize);
+#endif
- for(int i = 0; i < newSize; i++)
- d->data[i] = targetBuffer[i];
+ d->data.resize(::wcslen(d->data.c_str()));
+}
- delete [] sourceBuffer;
- delete [] targetBuffer;
// Copies `length` UTF-16 code units from s into d->data.
// For Type UTF16 the first unit must be a byte order mark: 0xfeff means the
// data is already in CPU order, 0xfffe means every unit has to be
// byte-swapped, and anything else logs a debug message and returns without
// touching d->data. The mark itself is not stored.
// For every other Type the units are swapped when t differs from
// WCharByteOrder.
// NOTE(review): the swap path casts each unit to unsigned short, so anything
// above 16 bits would be dropped where wchar_t is wider - confirm that is
// intended.
+void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)
+{
+ bool swap;
+ if(t == UTF16) {
+ if(length >= 1 && s[0] == 0xfeff)
+ swap = false; // Same as CPU endian. No need to swap bytes.
+ else if(length >= 1 && s[0] == 0xfffe)
+ swap = true; // Not same as CPU endian. Need to swap bytes.
+ else {
+ debug("String::copyFromUTF16() - Invalid UTF16 string.");
+ return;
+ }
- break;
+ s++;
+ length--;
}
- case UTF16LE:
- {
- for(uint i = 0; i < d->data.size(); i++)
- d->data[i] = byteSwap((unsigned short)d->data[i]);
- break;
+ else
+ swap = (t != WCharByteOrder);
+
+ d->data.resize(length);
+ memcpy(&d->data[0], s, length * sizeof(wchar_t));
+
+ if(swap) {
+ for(size_t i = 0; i < length; ++i)
+ d->data[i] = byteSwap(static_cast<unsigned short>(s[i]));
}
- default:
- break;
+}
+
+template <size_t sizeOfWcharT>
+void String::internalCopyFromUTF16(const char *s, size_t length, Type t)
+{
+ // Non specialized version. Used where sizeof(wchar_t) != 2.
+ //
+ // Decodes `length` bytes of UTF-16 from s into d->data. For Type UTF16 the
+ // input must begin with a byte order mark, which selects the byte order and
+ // is not stored; without one a debug message is logged and d->data is left
+ // untouched. For every other Type, UTF16LE is read as little endian and
+ // anything else as big endian. A trailing odd byte is ignored.
+ //
+ // The byte order of the input is taken from the bytes themselves instead of
+ // being compared against the CPU byte order, so the result is the same on
+ // little and big endian hosts and no unaligned 16-bit read of s is needed.
+
+ bool littleEndian;
+ if(t == UTF16) {
+ const unsigned char bom0 = (length >= 2) ? static_cast<unsigned char>(s[0]) : 0;
+ const unsigned char bom1 = (length >= 2) ? static_cast<unsigned char>(s[1]) : 0;
+
+ if(bom0 == 0xff && bom1 == 0xfe)
+ littleEndian = true; // Byte order mark stored as FF FE.
+ else if(bom0 == 0xfe && bom1 == 0xff)
+ littleEndian = false; // Byte order mark stored as FE FF.
+ else {
+ debug("String::copyFromUTF16() - Invalid UTF16 string.");
+ return;
+ }
+
+ s += 2;
+ length -= 2;
}
+ else
+ littleEndian = (t == UTF16LE);
+
+ d->data.resize(length / 2);
+ for(size_t i = 0; i < length / 2; ++i) {
+ // combine() places its first argument in the high byte.
+ d->data[i] = littleEndian ? combine(s[1], s[0]) : combine(s[0], s[1]);
+ s += 2;
+ }
+}
+
// Specialization for 2-byte wchar_t: the byte buffer is reinterpreted as an
// array of length / 2 wide characters and handed to the wchar_t overload of
// copyFromUTF16().
// NOTE(review): this assumes s is suitably aligned for wchar_t access -
// confirm against the callers.
+template <>
+void String::internalCopyFromUTF16<2>(const char *s, size_t length, Type t)
+{
+ // Specialized version for where sizeof(wchar_t) == 2.
+
+ copyFromUTF16(reinterpret_cast<const wchar_t*>(s), length / 2, t);
+}
+
// Byte-buffer entry point for UTF-16 input: dispatches at compile time on
// sizeof(wchar_t) to the matching internalCopyFromUTF16<> implementation.
+void String::copyFromUTF16(const char *s, size_t length, Type t)
+{
+ internalCopyFromUTF16<sizeof(wchar_t)>(s, length, t);
+}
+
// Definition of String::WCharByteOrder. It is fixed at compile time when
// TAGLIB_LITTLE_ENDIAN or TAGLIB_BIG_ENDIAN is defined, and otherwise
// initialized from the wcharByteOrder() run-time check.
+#if defined(TAGLIB_LITTLE_ENDIAN)
+
+const String::Type String::WCharByteOrder = String::UTF16LE;
+
+#elif defined(TAGLIB_BIG_ENDIAN)
+
+const String::Type String::WCharByteOrder = String::UTF16BE;
+
+#else
+
+const String::Type String::WCharByteOrder = wcharByteOrder();
+
+#endif
}
////////////////////////////////////////////////////////////////////////////////
// Returns a copy of s1 with s2 appended through String::append().
const TagLib::String operator+(const TagLib::String &s1, const TagLib::String &s2)
{
- String s(s1);
+ TagLib::String s(s1);
s.append(s2);
return s;
}
// Builds a String from the C string s1 (using the constructor's default
// Type) and appends s2 to it through String::append().
const TagLib::String operator+(const char *s1, const TagLib::String &s2)
{
- String s(s1);
+ TagLib::String s(s1);
s.append(s2);
return s;
}
// Returns a copy of s1 with the C string s2 appended through
// String::append(); s2 is converted to a String implicitly.
const TagLib::String operator+(const TagLib::String &s1, const char *s2)
{
- String s(s1);
+ TagLib::String s(s1);
s.append(s2);
return s;
}
-std::ostream &operator<<(std::ostream &s, const String &str)
// Writes str to the stream as the 8-bit string produced by to8Bit() with its
// default argument, and returns the stream.
+std::ostream &operator<<(std::ostream &s, const TagLib::String &str)
{
s << str.to8Bit();
return s;
}
+