From: Stephen Dolan Date: Tue, 14 May 2013 23:37:38 +0000 (+0100) Subject: 'length' function now measures string length in codepoints, not bytes. X-Git-Tag: jq-1.3~9 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e83e51eb56a1de6e627d346f027d3ceb09ae3807;p=jq 'length' function now measures string length in codepoints, not bytes. --- diff --git a/builtin.c b/builtin.c index 09ef9fc..0b79714 100644 --- a/builtin.c +++ b/builtin.c @@ -187,7 +187,7 @@ static jv f_length(jv input) { } else if (jv_get_kind(input) == JV_KIND_OBJECT) { return jv_number(jv_object_length(input)); } else if (jv_get_kind(input) == JV_KIND_STRING) { - return jv_number(jv_string_length(input)); + return jv_number(jv_string_length_codepoints(input)); } else if (jv_get_kind(input) == JV_KIND_NULL) { jv_free(input); return jv_number(0); @@ -220,7 +220,7 @@ static jv escape_string(jv input, const char* escapings) { jv ret = jv_string(""); const char* i = jv_string_value(input); - const char* end = i + jv_string_length(jv_copy(input)); + const char* end = i + jv_string_length_bytes(jv_copy(input)); const char* cstart; int c = 0; while ((i = jvp_utf8_next((cstart = i), end, &c))) { @@ -299,7 +299,7 @@ static jv f_format(jv input, jv fmt) { jv line = jv_string(""); const char* s = jv_string_value(input); - for (int i=0; i= 3 ? 3 : len-i; diff --git a/jq_test.c b/jq_test.c index c71fdc6..25cea1c 100644 --- a/jq_test.c +++ b/jq_test.c @@ -63,7 +63,7 @@ static void run_jq_tests(FILE *testdata) { pass = 0; } jv as_string = jv_dump_string(jv_copy(expected), rand() & ~JV_PRINT_COLOUR); - jv reparsed = jv_parse_sized(jv_string_value(as_string), jv_string_length(jv_copy(as_string))); + jv reparsed = jv_parse_sized(jv_string_value(as_string), jv_string_length_bytes(jv_copy(as_string))); assert(jv_equal(jv_copy(expected), jv_copy(reparsed))); jv_free(as_string); jv_free(reparsed); @@ -191,8 +191,8 @@ static void jv_test() { assert(jv_equal(jv_string("foo"), jv_string_sized("foo", 3))); char nasty[] = "foo\0"; jv shortstr = jv_string(nasty), longstr = jv_string_sized(nasty, sizeof(nasty)); - assert(jv_string_length(shortstr) == (int)strlen(nasty)); - assert(jv_string_length(longstr) == (int)sizeof(nasty)); + assert(jv_string_length_bytes(shortstr) == (int)strlen(nasty)); + assert(jv_string_length_bytes(longstr) == (int)sizeof(nasty)); char a1s[] = "hello", a2s[] = "hello", bs[] = "goodbye"; @@ -213,7 +213,7 @@ static void jv_test() { for (int i=0; i<(int)sizeof(big); i++) big[i] = 'a'; big[sizeof(big)-1] = 0; jv str = jv_string_fmt("%s", big); - assert(jv_string_length(jv_copy(str)) == sizeof(big) - 1); + assert(jv_string_length_bytes(jv_copy(str)) == sizeof(big) - 1); assert(!strcmp(big, jv_string_value(str))); jv_free(str); } diff --git a/jv.c b/jv.c index b03c024..9316aec 100644 --- a/jv.c +++ b/jv.c @@ -8,6 +8,7 @@ #include "jv_alloc.h" #include "jv.h" +#include "jv_unicode.h" /* * Internal refcounting helpers @@ -530,13 +531,23 @@ jv jv_string(const char* str) { return jv_string_sized(str, strlen(str)); } -int jv_string_length(jv j) { +int jv_string_length_bytes(jv j) { assert(jv_get_kind(j) == JV_KIND_STRING); int r = jvp_string_length(jvp_string_ptr(&j.val.nontrivial)); jv_free(j); return r; } +int jv_string_length_codepoints(jv j) { + assert(jv_get_kind(j) == JV_KIND_STRING); + const char* i = jv_string_value(j); + const char* end = i + jv_string_length_bytes(jv_copy(j)); + int c = 0, len = 0; + while ((i = jvp_utf8_next(i, end, &c))) len++; + jv_free(j); + return len; +} + uint32_t jv_string_hash(jv j) { assert(jv_get_kind(j) == JV_KIND_STRING); uint32_t hash = jvp_string_hash(jvp_string_ptr(&j.val.nontrivial)); diff --git a/jv.h b/jv.h index a64f3ac..d14e5d7 100644 --- a/jv.h +++ b/jv.h @@ -82,7 +82,8 @@ jv jv_array_slice(jv, int, int); jv jv_string(const char*); jv jv_string_sized(const char*, int); -int jv_string_length(jv); +int jv_string_length_bytes(jv); +int jv_string_length_codepoints(jv); uint32_t jv_string_hash(jv); const char* jv_string_value(jv); jv jv_string_concat(jv, jv); diff --git a/jv_aux.c b/jv_aux.c index 68811cd..0c8cd8b 100644 --- a/jv_aux.c +++ b/jv_aux.c @@ -380,8 +380,8 @@ jv jv_delpaths(jv object, jv paths) { static int string_cmp(const void* pa, const void* pb){ const jv* a = pa; const jv* b = pb; - int lena = jv_string_length(jv_copy(*a)); - int lenb = jv_string_length(jv_copy(*b)); + int lena = jv_string_length_bytes(jv_copy(*a)); + int lenb = jv_string_length_bytes(jv_copy(*b)); int minlen = lena < lenb ? lena : lenb; int r = memcmp(jv_string_value(*a), jv_string_value(*b), minlen); if (r == 0) r = lena - lenb; diff --git a/jv_print.c b/jv_print.c index fc1370e..5784337 100644 --- a/jv_print.c +++ b/jv_print.c @@ -45,7 +45,7 @@ static void put_space(int n, FILE* fout, jv* strout) { static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S) { assert(jv_get_kind(str) == JV_KIND_STRING); const char* i = jv_string_value(str); - const char* end = i + jv_string_length(jv_copy(str)); + const char* end = i + jv_string_length_bytes(jv_copy(str)); const char* cstart; int c = 0; char buf[32]; diff --git a/lexer.l b/lexer.l index 633b863..59e527d 100644 --- a/lexer.l +++ b/lexer.l @@ -93,7 +93,7 @@ struct lexer_param; (\\[^u(]|\\u[a-zA-Z0-9]{0,4})+ { /* pass escapes to the json parser */ jv escapes = jv_string_fmt("\"%.*s\"", yyleng, yytext); - yylval->literal = jv_parse_sized(jv_string_value(escapes), jv_string_length(jv_copy(escapes))); + yylval->literal = jv_parse_sized(jv_string_value(escapes), jv_string_length_bytes(jv_copy(escapes))); jv_free(escapes); return QQSTRING_TEXT; } diff --git a/main.c b/main.c index 77c7654..73359ff 100644 --- a/main.c +++ b/main.c @@ -69,7 +69,7 @@ static void process(jv value, int flags) { jv result; while (jv_is_valid(result = jq_next(jq))) { if ((options & RAW_OUTPUT) && jv_get_kind(result) == JV_KIND_STRING) { - fwrite(jv_string_value(result), 1, jv_string_length(jv_copy(result)), stdout); + fwrite(jv_string_value(result), 1, jv_string_length_bytes(jv_copy(result)), stdout); jv_free(result); } else { int dumpopts; diff --git a/tests/all.test b/tests/all.test index 0119dad..90ab7db 100644 --- a/tests/all.test +++ b/tests/all.test @@ -302,8 +302,8 @@ null [false, false, false, false, false, false, false, false, true ] [.[] | length] -[[], {}, [1,2], {"a":42}, "asdf"] -[0, 0, 2, 1, 4] +[[], {}, [1,2], {"a":42}, "asdf", "\u03bc"] +[0, 0, 2, 1, 4, 1] map(keys) [{}, {"abcd":1,"abc":2,"abcde":3}, {"x":1, "z": 3, "y":2}]