kQUOT = 0x27,
kDBLQ = 0x22;
+# include "cptbl.h"
+
+// Look up the cp1047_8859_1 table entry for byte c. The argument is forced to
+// an unsigned 0..255 index so that a plain 'char' holding a byte >= 0x80 can
+// never become a negative subscript where 'char' is signed.
+# define cp1047_to_8859(c) cp1047_8859_1[(uint8_t)(c)]
+
std::string prog;
void usage() {
bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
size_t pos = origpos + 3;
std::string outstr;
- outstr += (kDBLQ);
+ outstr += '\"'; // local encoding
for(;pos<endpos;pos++) {
char c = linestr[pos];
if(c == kBKSLASH) {
appendByte(outstr, c);
}
}
- outstr += (kDBLQ);
+ outstr += ('\"');
linestr.replace(origpos, (endpos-origpos+1), outstr);
if(linestr[pos] == '\\') continue;
// some other escape… ignore
} else {
+#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
+ // mogrify 1-4 bytes from 1047 'back' to utf-8
+ char old_byte = linestr[pos];
+ linestr[pos] = cp1047_to_8859(linestr[pos]);
+ // how many more?
+ int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]);
+ for(size_t pos2 = pos+1; trail>0; pos++,trail--) {
+ linestr[pos2] = cp1047_to_8859(linestr[pos2]);
+ }
+#endif
+
// Proceed to decode utf-8
const uint8_t *s = (const uint8_t*) (linestr.c_str());
int32_t i = pos;
int32_t length = linestr.size();
UChar32 c;
+ if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) {
+#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
+ linestr[pos] = old_byte; // put it back
+#endif
+ continue; // single code point not previously legal for \u escaping
+ }
- if(U8_IS_SINGLE((uint8_t)s[i])) continue; // single code point
-
+ // otherwise, convert it to \u / \U
{
U8_NEXT(s, i, length, c);
}
if(c<0) {
fprintf(stderr, "Illegal utf-8 sequence\n");
+ fprintf(stderr, "Line: >>%s<<\n", linestr.c_str());
return true;
}
--- /dev/null
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+#include "unicode/ucnv.h"
+#include "unicode/uniset.h"
+#include <stdio.h>
+
+static const char *kConverter = "ibm-1047";
+
+// Writes two lookup tables to stdout as C++ source:
+//   cp1047_8859_1[256] - for every byte 0x00..0xFF, the UTF-16 code unit that
+//                        the "ibm-1047" converter produces for it
+//   oldIllegal[256]    - for every code point U+0000..U+00FF, whether it is a
+//                        member of the UnicodeSet built below
+// (Presumably the output is captured as cptbl.h - confirm against the build rules.)
+//
+// Exit status: 0 on success, 1 if the converter cannot be opened, 2 if a byte
+// fails to convert, 3 if the UnicodeSet pattern cannot be parsed.
+int main(int argc, const char *argv[]) {
+  printf("// %s\n", U_COPYRIGHT_STRING);
+  printf("// generated by tblgen. You weren't going to edit it by hand, were you?\n");
+  printf("\n");
+
+  UErrorCode status = U_ZERO_ERROR;
+  LocalUConverterPointer cnv(ucnv_open(kConverter, &status));
+
+  if(U_FAILURE(status)) {
+    fprintf(stderr, "Failed to open %s: %s\n", kConverter, u_errorName(status));
+    return 1;
+  }
+
+  printf("static const char cp1047_8859_1[256] = { \n");
+  for(int i=0x00; i<0x100; i++) {
+    // Convert exactly one source byte into (at most) one UChar.
+    char cp1047[1];
+    cp1047[0] = static_cast<char>(i);
+    UChar u[1];
+    UChar *target = u;
+    const char *source = cp1047;
+    ucnv_toUnicode(cnv.getAlias(), &target, u+1, &source, cp1047+1, nullptr, true, &status);
+    if(U_FAILURE(status)) {
+      fprintf(stderr, "Conversion failure at #%X: %s\n", i, u_errorName(status));
+      return 2;
+    }
+    printf("  (char)0x%02X, /* %02X */\n", u[0], i);
+  }
+  printf("};\n\n");
+
+  //
+  // UnicodeSet oldIllegal("[:print:]", status); // [a-zA-Z0-9_}{#)(><%:;.?*+-/^&|~!=,\\u005b\\u005d\\u005c]", status);
+  // NOTE(review): near the end of the pattern below, a backslash is followed
+  // by a space. UnicodeSet appears to read that as an escaped literal space
+  // rather than as a literal backslash - confirm which was intended.
+  UnicodeSet oldIllegal("[0-9 a-z A-Z "
+                        "_ \\{ \\} \\[ \\] # \\( \\) < > % \\: ; . "
+                        "? * + \\- / \\^ \\& | ~ ! = , \\ \" ' ]", status);
+  // Without this check a pattern error would go unnoticed and the table below
+  // would be emitted from a set that was never populated.
+  if(U_FAILURE(status)) {
+    fprintf(stderr, "Failed to build the oldIllegal set: %s\n", u_errorName(status));
+    return 3;
+  }
+
+  /*
+
+http://www.lirmm.fr/~ducour/Doc-objets/ISO+IEC+14882-1998.pdf ( note: 1998 ) page 10, section 2.2 says:
+
+1 The basic source character set consists of 96 characters: the space character, the control characters repre- 15)
+senting horizontal tab, vertical tab, form feed, and new-line, plus the following 91 graphical characters:
+a b c d e f g h i j k l m n opqrstuvwxyz
+A B C D E F G H I J K L M N OPQRSTUVWXYZ
+0 12 3 4 5 6 7 8 9
+ _ { } [ ] # ( ) < > % : ; . ?*+-/^&|~!=,\"
+2 The universal-character-name construct provides a way to name other characters. hex-quad:
+hexadecimal-digit hexadecimal-digit hexadecimal-digit hexadecimal-digit
+universal-character-name: \u hex-quad
+\U hex-quad hex-quad
+The character designated by the universal-character-name \UNNNNNNNN is that character whose character short name in ISO/IEC 10646 is NNNNNNNN; the character designated by the universal-character-name \uNNNN is that character whose character short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value for a universal character name is less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the uni- versal character name designates a character in the basic source character set, then the program is ill- formed.
+
+
+So basically: printable ASCII plus 0x00-0x1F, 0x7F-0x9F, was all illegal.
+
+Some discussion at http://unicode.org/mail-arch/unicode-ml/y2003-m10/0471.html
+
+  */
+
+
+
+  printf("static const bool oldIllegal[256] = { \n");
+  for(UChar i=0x00; i<0x100;i++) {
+    // One row per code point; both spellings are padded to the same width.
+    printf("  %s, /* U+%04X */\n",
+           (oldIllegal.contains(i))?" true":"false",
+           i);
+  }
+  printf("};\n\n");
+
+  return 0;
+}