]> granicus.if.org Git - file/commitdiff
PR/61: tmc: Add UCS-32 built-in detection.
authorChristos Zoulas <christos@zoulas.com>
Tue, 19 Feb 2019 20:30:35 +0000 (20:30 +0000)
committerChristos Zoulas <christos@zoulas.com>
Tue, 19 Feb 2019 20:30:35 +0000 (20:30 +0000)
src/encoding.c

index ef8493d52eab323dfc7d6447619c5251ec32bb01..b9727fd8f9a681b591239189facbf38e6a2264ef 100644 (file)
@@ -35,7 +35,7 @@
 #include "file.h"
 
 #ifndef        lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.15 2018/10/15 16:29:16 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.16 2019/02/19 20:30:35 christos Exp $")
 #endif /* lint */
 
 #include "magic.h"
@@ -49,6 +49,7 @@ private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
     size_t *);
 private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
 private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
@@ -116,6 +117,15 @@ file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
                DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
                *code = "UTF-8 Unicode";
                *code_mime = "utf-8";
+       } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
+               if (ucs_type == 1) {
+                       *code = "Little-endian UTF-32 Unicode";
+                       *code_mime = "utf-32le";
+               } else {
+                       *code = "Big-endian UTF-32 Unicode";
+                       *code_mime = "utf-32be";
+               }
+               DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
        } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
                if (ucs_type == 1) {
                        *code = "Little-endian UTF-16 Unicode";
@@ -410,7 +420,7 @@ looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
 }
 
 private int
-looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
     size_t *ulen)
 {
        int bigend;
@@ -419,9 +429,9 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
        if (nbytes < 2)
                return 0;
 
-       if (buf[0] == 0xff && buf[1] == 0xfe)
+       if (bf[0] == 0xff && bf[1] == 0xfe)
                bigend = 0;
-       else if (buf[0] == 0xfe && buf[1] == 0xff)
+       else if (bf[0] == 0xfe && bf[1] == 0xff)
                bigend = 1;
        else
                return 0;
@@ -432,20 +442,58 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
                /* XXX fix to properly handle chars > 65536 */
 
                if (bigend)
-                       ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
+                       ubf[(*ulen)++] = bf[i + 1] + 256 * bf[i];
                else
-                       ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
+                       ubf[(*ulen)++] = bf[i] + 256 * bf[i + 1];
 
-               if (ubuf[*ulen - 1] == 0xfffe)
+               if (ubf[*ulen - 1] == 0xfffe)
                        return 0;
-               if (ubuf[*ulen - 1] < 128 &&
-                   text_chars[(size_t)ubuf[*ulen - 1]] != T)
+               if (ubf[*ulen - 1] < 128 &&
+                   text_chars[(size_t)ubf[*ulen - 1]] != T)
                        return 0;
        }
 
        return 1 + bigend;
 }
 
+private int
+looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
+    size_t *ulen)
+{
+       int bigend;
+       size_t i;
+
+       if (nbytes < 4)
+               return 0;
+
+       if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
+               bigend = 0;
+       else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
+               bigend = 1;
+       else
+               return 0;
+
+       *ulen = 0;
+
+       for (i = 4; i + 1 < nbytes; i += 4) {
+               /* XXX fix to properly handle chars > 65536 */
+
+               if (bigend)
+                       ubf[(*ulen)++] = bf[i + 3] | (bf[i + 2] << 8)
+                           | (bf[i + 1] << 16) | bf[i] << 24;
+               else
+                       ubf[(*ulen)++] = bf[i] | (bf[i + 1] << 8) 
+                           | (bf[i + 2] << 16) | (bf[i + 3] << 24);
+
+               if (ubf[*ulen - 1] == 0xfffe)
+                       return 0;
+               if (ubf[*ulen - 1] < 128 &&
+                   text_chars[(size_t)ubf[*ulen - 1]] != T)
+                       return 0;
+       }
+
+       return 1 + bigend;
+}
 #undef F
 #undef T
 #undef I