return s;
}
+/*
+ * Copy a UTF8 string, replacing all badly encoded points with U+FFFD.
+ *
+ * data/length: the input bytes. They need not be valid UTF8; only
+ * data[0..length) is read, so no NUL terminator is required.
+ *
+ * The copy is written to a freshly allocated jvp_string, NUL-terminated,
+ * and its byte count is recorded in length_hashed (shifted left by one).
+ */
+static jv_nontrivial jvp_string_copy_replace_bad(const char* data, uint32_t length) {
+  const char* end = data + length;
+  const char* i = data;
+
+  /* length * 3 + 1 must not wrap around in uint32_t, or the buffer would be undersized */
+  assert(length <= (UINT32_MAX - 1) / 3);
+  uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD
+  jvp_string* s = jvp_string_alloc(maxlength);
+  char* out = s->data;
+  int c = 0;
+
+  /* jvp_utf8_next reports a bad sequence as -1 and still advances past it */
+  while ((i = jvp_utf8_next(i, end, &c))) {
+    if (c == -1) {
+      c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
+    }
+    out += jvp_utf8_encode(c, out);
+    assert(out < s->data + maxlength);
+  }
+  length = out - s->data;
+  s->data[length] = 0;
+  s->length_hashed = length << 1;
+  jv_nontrivial r = {&s->refcnt, {0,0}};
+  return r;
+}
+
+/* Assumes valid UTF8 */
static jv_nontrivial jvp_string_new(const char* data, uint32_t length) {
jvp_string* s = jvp_string_alloc(length);
s->length_hashed = length << 1;
jv jv_string_sized(const char* str, int len) {
jv j;
j.kind = JV_KIND_STRING;
- j.val.nontrivial = jvp_string_new(str, len);
+ j.val.nontrivial = jvp_utf8_is_valid(str, str+len) ? /* valid UTF8 can be copied as-is; anything else goes through the U+FFFD-replacing copy */
+ jvp_string_new(str, len) : /* valid input: this constructor assumes valid UTF8 */
+ jvp_string_copy_replace_bad(str, len); /* invalid input: badly encoded points become U+FFFD */
return j;
}
}
jv jv_string_append_buf(jv a, const char* buf, int len) {
- jvp_string_append(&a.val.nontrivial, buf, len);
+ if (jvp_utf8_is_valid(buf, buf+len)) { /* valid UTF8: append the bytes directly */
+ jvp_string_append(&a.val.nontrivial, buf, len);
+ } else { /* invalid UTF8: build a sanitised temporary string and concatenate that instead */
+ jv b;
+ b.kind = JV_KIND_STRING;
+ b.val.nontrivial = jvp_string_copy_replace_bad(buf, len); /* badly encoded points become U+FFFD */
+ a = jv_string_concat(a, b); /* NOTE(review): presumably takes ownership of b - confirm */
+ }
return a;
}
+/* Append a NUL-terminated C string: its length is taken with strlen() and the work is done by jv_string_append_buf(). */
jv jv_string_append_str(jv a, const char* str) {
return jv_string_append_buf(a, str, strlen(str));
}
-
+
jv jv_string_fmt(const char* fmt, ...) {
int size = 1024;
while (1) {
#include "jv_unicode.h"
#include "jv_utf8_tables.h"
+/*
+ * Decode one UTF8 sequence from [in, end).
+ *
+ * Returns 0 when in == end; *codepoint_ret is not written in that case.
+ * Otherwise consumes at least one byte, returns a pointer just past the
+ * consumed bytes, and stores in *codepoint_ret either the decoded codepoint
+ * or -1 if those bytes were not a well-formed encoding (bad lead byte, stray
+ * continuation byte, sequence cut short by end or by a non-continuation
+ * byte, overlong form, surrogate, or value above U+10FFFF).
+ */
-const char* jvp_utf8_next(const char* in, const char* end, int* codepoint) {
+const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
+ assert(in <= end);
if (in == end) {
- codepoint = 0;
return 0;
}
+ int codepoint = -1;
unsigned char first = (unsigned char)in[0];
int length = utf8_coding_length[first];
- if (length == 0 || length == UTF8_CONTINUATION_BYTE || in + length > end) {
- *codepoint = -1;
- return 0;
- }
- *codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
- for (int i=1; i<length; i++) {
- int ch = (unsigned char)in[i];
- if (utf8_coding_length[(unsigned char)in[i]] != UTF8_CONTINUATION_BYTE){
- *codepoint = -1;
- return 0;
+ if ((first & 0x80) == 0) {
+ /* Fast-path for ASCII */
+ codepoint = first;
+ length = 1;
+ } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
+ /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
+ length = 1;
+ } else if (in + length > end) {
+ /* String ends before UTF8 sequence ends */
+ length = end - in;
+ } else {
+ codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
+ for (int i=1; i<length; i++) {
+ unsigned ch = (unsigned char)in[i];
+ if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
+ /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
+ codepoint = -1;
+ length = i;
+ break;
+ }
+ codepoint = (codepoint << 6) | (ch & 0x3f);
+ }
+ /* Smallest codepoint that genuinely needs a sequence of this length */
+ int min_codepoint = length == 2 ? 0x80 : length == 3 ? 0x800 : length == 4 ? 0x10000 : 0;
+ if (codepoint != -1 && codepoint < min_codepoint) {
+ /* Overlong encoding - the codepoint fits in a shorter sequence */
+ codepoint = -1;
+ }
+ if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
+ /* Surrogate codepoints can't be encoded in UTF8 */
+ codepoint = -1;
+ }
+ if (codepoint > 0x10FFFF) {
+ /* Outside Unicode range */
+ codepoint = -1;
}
- *codepoint = (*codepoint << 6) | (ch & 0x3f);
}
+ assert(length > 0);
+ *codepoint_ret = codepoint;
return in + length;
}
-int jvp_utf8_verify(const char* in, const char* end) {
- int codepoint = 0;
+int jvp_utf8_is_valid(const char* in, const char* end) { /* returns 1 if every sequence in [in, end) decodes cleanly, else 0 */
+ int codepoint; /* written by jvp_utf8_next whenever it returns non-null, so it is never read uninitialised */
while ((in = jvp_utf8_next(in, end, &codepoint))) {
if (codepoint == -1) return 0;
}
- return codepoint != -1;
+ return 1; /* reached the end of the input without seeing a bad sequence (an empty range is valid) */
}
int jvp_utf8_encode_length(int codepoint) {