Beyond the Basic Multilingual Plane, dead Cthulhu waits dreaming.
sed 's/.*`\(.*\)'\''.*/\1/' | grep -v '^all$$' | \
xargs rm
+jv_utf8_tables.h: gen_utf8_tables.py
+ python $^ > $@
lexer.yy.c: lexer.l
flex -o lexer.yy.c --header-file=lexer.yy.h lexer.l
bison -W -d parser.y -v --report-file=parser.info
parser.tab.h: parser.tab.c
+jv_unicode.c: jv_utf8_tables.h
+
parsertest: parser.tab.c lexer.yy.c main.c opcode.c bytecode.c compile.c execute.c builtin.c jv.c jv_parse.c jv_print.c jv_dtoa.c jv_unicode.c
$(CC) -o $@ $^
--- /dev/null
+#!/usr/bin/python
+
+def mask(n):
+    """Return an integer whose low n bits are all set (mask(3) == 0b111)."""
+    return (1 << n) - 1
+
+
+def print_table(type, name, t):
+    """Print a 256-entry C array definition to stdout.
+
+    type -- C element type of the array, e.g. "unsigned char"
+    name -- C identifier given to the array
+    t    -- sequence of exactly 256 integers; written as two-digit hex,
+            16 values per output line
+    """
+    assert len(t) == 256
+    # "static" comes first: C treats a storage-class specifier placed after
+    # other declaration specifiers as an obsolescent form, and compilers
+    # warn about it.
+    print "static const", type, name + "[]", "="
+    for i in range(0, len(t), 16):
+        row = ", ".join("0x%02x" % n for n in t[i:i + 16])
+        opening = " {" if i == 0 else " "
+        # Every row ends in a comma except the last, which closes the array.
+        closing = "," if i + 16 < len(t) else "};"
+        print opening + row + closing
+
+
+def utf8info(c):
+    """Classify a byte value (0..255) by the role it plays in UTF-8.
+
+    Returns a pair (length, bits):
+      length -- total byte length of the sequence this byte starts,
+                255 for a continuation byte, or 0 for a byte that can
+                never start a valid sequence
+      bits   -- mask selecting the payload bits carried by this byte
+
+    Raises ValueError if c is above 255.
+    """
+    if c < 0x80: return 1, mask(7)
+    if 0x80 <= c <= 0xBF: return 255, mask(6)
+    # 0xC0 and 0xC1 could only begin overlong two-byte encodings.
+    if 0xC0 <= c <= 0xC1: return 0, 0
+    if 0xC2 <= c <= 0xDF: return 2, mask(5)
+    if 0xE0 <= c <= 0xEF: return 3, mask(4)
+    if 0xF0 <= c <= 0xF4: return 4, mask(3)
+    # 0xF5 and above would encode values past U+10FFFF.
+    if 0xF5 <= c <= 0xFF: return 0, 0
+    raise ValueError("not a byte value: %r" % (c,))
+
+def table(i):
+    """Collect field i of utf8info() for every byte value 0..255."""
+    column = []
+    for byte in range(256):
+        column.append(utf8info(byte)[i])
+    return column
+
+
+print "#define UTF8_CONTINUATION_BYTE ((unsigned char)255)"
+
+# One C table per utf8info() field, emitted in field order.
+for field, array_name in enumerate(("utf8_coding_length", "utf8_coding_bits")):
+    print_table("unsigned char", array_name, table(field))
#include "jv.h"
#include <stdio.h>
+#include <float.h>
+#include <math.h>
#include "jv_dtoa.h"
+#include "jv_unicode.h"
static void jv_dump_string(jv str, int ascii_only) {
assert(jv_get_kind(str) == JV_KIND_STRING);
const char* i = jv_string_value(str);
- const char* end = i + jv_string_length(str);
- while (i < end) {
- int unicode_escape;
- int c = (unsigned char)*i++;
+ const char* end = i + jv_string_length(jv_copy(str));
+ int c;
+ while ((i = jvp_utf8_next(i, end, &c))) {
+ assert(c != -1);
+ int unicode_escape = 0;
if (0x20 <= c && c <= 0x7E) {
// printable ASCII
if (c == '"' || c == '\\') {
putchar('\\');
}
putchar(c);
- unicode_escape = 0;
} else if (c < 0x20 || c == 0x7F) {
// ASCII control character
switch (c) {
unicode_escape = 1;
break;
}
+ } else {
+ unicode_escape = 1;
+ }
+ if (unicode_escape) {
+ if (c <= 0xffff) {
+ printf("\\u%04x", c);
+ } else {
+ c -= 0x10000;
+ printf("\\u%04x\\u%04x",
+ 0xD800 | ((c & 0xffc00) >> 10),
+ 0xDC00 | (c & 0x003ff));
+ }
}
}
+ assert(c != -1);
}
static void jv_dump_term(struct dtoa_context* C, jv x) {
break;
case JV_KIND_STRING:
// FIXME: all sorts of broken
- printf("\"%s\"", jv_string_value(x));
+ putchar('"');
+ jv_dump_string(x, 0);
+ putchar('"');
break;
case JV_KIND_ARRAY: {
printf("[");
+#include <stdio.h>
#include <assert.h>
#include "jv_unicode.h"
+#include "jv_utf8_tables.h"
+
+/*
+ * Decode one UTF-8 encoded codepoint from the byte range [in, end).
+ *
+ * On success, stores the decoded value in *codepoint and returns a pointer
+ * to the byte following the sequence.
+ * At end of input (in == end), stores 0 in *codepoint and returns 0.
+ * On malformed input, stores -1 in *codepoint and returns 0. Malformed means
+ * a byte that cannot start a sequence, a stray continuation byte, a sequence
+ * truncated by end, a missing continuation byte, an overlong encoding, a
+ * surrogate (U+D800..U+DFFF) or a value above U+10FFFF.
+ *
+ * *codepoint is written on every path, so callers can tell "end of input"
+ * from "invalid" after a 0 return.
+ */
+const char* jvp_utf8_next(const char* in, const char* end, int* codepoint) {
+  if (in == end) {
+    // Write through the pointer; assigning to the parameter itself would
+    // leave the caller's variable untouched.
+    *codepoint = 0;
+    return 0;
+  }
+  unsigned char first = (unsigned char)in[0];
+  int length = utf8_coding_length[first];
+  // Compare against the remaining byte count rather than forming in + length,
+  // which could point further than one past the end of the buffer.
+  if (length == 0 || length == UTF8_CONTINUATION_BYTE || length > end - in) {
+    *codepoint = -1;
+    return 0;
+  }
+  int cp = first & utf8_coding_bits[first];
+  for (int i = 1; i < length; i++) {
+    unsigned char ch = (unsigned char)in[i];
+    if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE) {
+      *codepoint = -1;
+      return 0;
+    }
+    cp = (cp << 6) | (ch & 0x3f);
+  }
+  // Smallest value that legitimately needs each encoded length (index is the
+  // length, 1..4); anything smaller is an overlong encoding.
+  static const int min_for_length[] = {0, 0x0, 0x80, 0x800, 0x10000};
+  if (cp < min_for_length[length] ||
+      (0xD800 <= cp && cp <= 0xDFFF) ||
+      cp > 0x10FFFF) {
+    *codepoint = -1;
+    return 0;
+  }
+  *codepoint = cp;
+  return in + length;
+}
+
+/*
+ * Check that [in, end) decodes cleanly with jvp_utf8_next.
+ * Returns 1 when no codepoint of -1 was reported, 0 otherwise.
+ */
+int jvp_utf8_verify(const char* in, const char* end) {
+  int cp = 0;
+  for (;;) {
+    in = jvp_utf8_next(in, end, &cp);
+    if (!in) break;
+    if (cp == -1) return 0;
+  }
+  // The decoder stopped; -1 here means it stopped on bad input.
+  return cp == -1 ? 0 : 1;
+}
+
int jvp_utf8_encode_length(int codepoint) {
if (codepoint <= 0x7F) return 1;
else if (codepoint <= 0x7FF) return 2;
#ifndef JV_UNICODE_H
#define JV_UNICODE_H
+
+const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
+
+
int jvp_utf8_decode_length(char startchar);
int jvp_utf8_encode_length(int codepoint);