#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.15 2018/10/15 16:29:16 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.16 2019/02/19 20:30:35 christos Exp $")
#endif /* lint */
#include "magic.h"
size_t *);
private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
*code = "UTF-8 Unicode";
*code_mime = "utf-8";
+ } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
+ if (ucs_type == 1) {
+ *code = "Little-endian UTF-32 Unicode";
+ *code_mime = "utf-32le";
+ } else {
+ *code = "Big-endian UTF-32 Unicode";
+ *code_mime = "utf-32be";
+ }
+ DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
if (ucs_type == 1) {
*code = "Little-endian UTF-16 Unicode";
}
private int
-looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
size_t *ulen)
{
int bigend;
if (nbytes < 2)
return 0;
- if (buf[0] == 0xff && buf[1] == 0xfe)
+ if (bf[0] == 0xff && bf[1] == 0xfe)
bigend = 0;
- else if (buf[0] == 0xfe && buf[1] == 0xff)
+ else if (bf[0] == 0xfe && bf[1] == 0xff)
bigend = 1;
else
return 0;
/* XXX fix to properly handle chars > 65536 */
if (bigend)
- ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
+ ubf[(*ulen)++] = bf[i + 1] + 256 * bf[i];
else
- ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
+ ubf[(*ulen)++] = bf[i] + 256 * bf[i + 1];
- if (ubuf[*ulen - 1] == 0xfffe)
+ if (ubf[*ulen - 1] == 0xfffe)
return 0;
- if (ubuf[*ulen - 1] < 128 &&
- text_chars[(size_t)ubuf[*ulen - 1]] != T)
+ if (ubf[*ulen - 1] < 128 &&
+ text_chars[(size_t)ubf[*ulen - 1]] != T)
return 0;
}
return 1 + bigend;
}
+private int
+looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
+ size_t *ulen)
+{
+ int bigend;
+ size_t i;
+
+ if (nbytes < 4)
+ return 0;
+
+ if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
+ bigend = 0;
+ else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
+ bigend = 1;
+ else
+ return 0;
+
+ *ulen = 0;
+
+ for (i = 4; i + 1 < nbytes; i += 4) {
+ /* XXX fix to properly handle chars > 65536 */
+
+ if (bigend)
+ ubf[(*ulen)++] = bf[i + 3] | (bf[i + 2] << 8)
+ | (bf[i + 1] << 16) | bf[i] << 24;
+ else
+ ubf[(*ulen)++] = bf[i] | (bf[i + 1] << 8)
+ | (bf[i + 2] << 16) | (bf[i + 3] << 24);
+
+ if (ubf[*ulen - 1] == 0xfffe)
+ return 0;
+ if (ubf[*ulen - 1] < 128 &&
+ text_chars[(size_t)ubf[*ulen - 1]] != T)
+ return 0;
+ }
+
+ return 1 + bigend;
+}
#undef F
#undef T
#undef I