From: Stephen Dolan Date: Tue, 4 Sep 2012 19:34:43 +0000 (+0100) Subject: Move some unicode handling stuff to a separate file. X-Git-Tag: jq-1.1~76 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c5ab3b2336a3ad14844bd91beae5b4dbca2c2926;p=jq Move some unicode handling stuff to a separate file. --- diff --git a/c/Makefile b/c/Makefile index d437b14..6496da8 100644 --- a/c/Makefile +++ b/c/Makefile @@ -17,7 +17,7 @@ parser.tab.c: parser.y lexer.yy.h bison -W -d parser.y -v --report-file=parser.info parser.tab.h: parser.tab.c -parsertest: parser.tab.c lexer.yy.c main.c opcode.c bytecode.c compile.c execute.c builtin.c jv.c jv_parse.c jv_print.c jv_dtoa.c +parsertest: parser.tab.c lexer.yy.c main.c opcode.c bytecode.c compile.c execute.c builtin.c jv.c jv_parse.c jv_print.c jv_dtoa.c jv_unicode.c $(CC) -o $@ $^ jv_test: jv_test.c jv.c jv_print.c jv_dtoa.c diff --git a/c/jv_parse.c b/c/jv_parse.c index 987bb9e..7a8c811 100644 --- a/c/jv_parse.c +++ b/c/jv_parse.c @@ -4,6 +4,7 @@ #include "jv.h" #include "jv_dtoa.h" #include "jv_parse.h" +#include "jv_unicode.h" typedef const char* presult; @@ -157,27 +158,6 @@ static int unhex4(char* hex) { return r; } -static int utf8_encode(int codepoint, char* out) { - assert(codepoint >= 0 && codepoint <= 0x10FFFF); - char* start = out; - if (codepoint <= 0x7F) { - *out++ = codepoint; - } else if (codepoint <= 0x7FF) { - *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6); - *out++ = 0x80 + ((codepoint & 0x03F)); - } else if(codepoint <= 0xFFFF) { - *out++ = 0xE0 + ((codepoint & 0xF000) >> 12); - *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6); - *out++ = 0x80 + ((codepoint & 0x003F)); - } else { - *out++ = 0xF0 + ((codepoint & 0x1C0000) >> 18); - *out++ = 0x80 + ((codepoint & 0x03F000) >> 12); - *out++ = 0x80 + ((codepoint & 0x000FC0) >> 6); - *out++ = 0x80 + ((codepoint & 0x00003F)); - } - return out - start; -} - static pfunc found_string(struct jv_parser* p) { char* in = p->tokenbuf; char* out = p->tokenbuf; @@ -217,7 
+197,7 @@ static pfunc found_string(struct jv_parser* p) { |(surrogate - 0xDC00)); } // FIXME assert valid codepoint - out += utf8_encode(codepoint, out); + out += jvp_utf8_encode(codepoint, out); break; default: diff --git a/c/jv_unicode.c b/c/jv_unicode.c new file mode 100644 index 0000000..b1475ff --- /dev/null +++ b/c/jv_unicode.c @@ -0,0 +1,30 @@ +#include <assert.h> +#include "jv_unicode.h" +int jvp_utf8_encode_length(int codepoint) { + if (codepoint <= 0x7F) return 1; + else if (codepoint <= 0x7FF) return 2; + else if (codepoint <= 0xFFFF) return 3; + else return 4; +} + +int jvp_utf8_encode(int codepoint, char* out) { + assert(codepoint >= 0 && codepoint <= 0x10FFFF); + char* start = out; + if (codepoint <= 0x7F) { + *out++ = codepoint; + } else if (codepoint <= 0x7FF) { + *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6); + *out++ = 0x80 + ((codepoint & 0x03F)); + } else if(codepoint <= 0xFFFF) { + *out++ = 0xE0 + ((codepoint & 0xF000) >> 12); + *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6); + *out++ = 0x80 + ((codepoint & 0x003F)); + } else { + *out++ = 0xF0 + ((codepoint & 0x1C0000) >> 18); + *out++ = 0x80 + ((codepoint & 0x03F000) >> 12); + *out++ = 0x80 + ((codepoint & 0x000FC0) >> 6); + *out++ = 0x80 + ((codepoint & 0x00003F)); + } + assert(out - start == jvp_utf8_encode_length(codepoint)); + return out - start; +} diff --git a/c/jv_unicode.h b/c/jv_unicode.h new file mode 100644 index 0000000..041c29a --- /dev/null +++ b/c/jv_unicode.h @@ -0,0 +1,7 @@ +#ifndef JV_UNICODE_H +#define JV_UNICODE_H +int jvp_utf8_decode_length(char startchar); + +int jvp_utf8_encode_length(int codepoint); +int jvp_utf8_encode(int codepoint, char* out); +#endif