From: Reuben Thomas Date: Sat, 1 Mar 2008 22:21:48 +0000 (+0000) Subject: Text/binary split. X-Git-Tag: FILE4_24~16 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d99312e44ac3abaae5ed9365e30fd0f6d88d511a;p=file Text/binary split. --- diff --git a/doc/magic.man b/doc/magic.man index 40817ea3..e3dfd96e 100644 --- a/doc/magic.man +++ b/doc/magic.man @@ -1,4 +1,4 @@ -.\" $File: magic.man,v 1.54 2008/02/28 22:22:19 rrt Exp $ +.\" $File: magic.man,v 1.55 2008/02/28 22:24:46 rrt Exp $ .Dd January 10, 2007 .Dt MAGIC __FSECTION__ .Os @@ -163,8 +163,7 @@ The regular expression is tested against line onwards, where .Dv N is the given offset. -Because it looks for newline characters, it is only useful for -(single-byte encoded) text. +Line endings are assumed to be in the machine's native format. .Dv ^ and .Dv $ @@ -187,6 +186,19 @@ This is intended to be used with the test no other matches. .El .Pp +Each top-level magic pattern (see below for an explanation of levels) +is classified as text or binary according to the types used. Types +.Dq regex +and +.Dq search +are classified as text tests, unless non-printable characters are used +in the pattern. All other tests are classified as binary. A top-level +pattern is considered to be a text test when all its patterns are text +patterns; otherwise, it is considered to be a binary pattern. When +matching a file, binary patterns are tried first; if no match is +found, and the file looks like text, then its encoding is determined +and the text patterns are tried. 
+.Pp The numeric types may optionally be followed by .Dv \*[Am] and a numeric value, diff --git a/src/apprentice.c b/src/apprentice.c index 1aefbf80..72e9b969 100644 --- a/src/apprentice.c +++ b/src/apprentice.c @@ -49,7 +49,7 @@ #include #ifndef lint -FILE_RCSID("@(#)$File: apprentice.c,v 1.129 2008/02/27 17:59:21 rrt Exp $") +FILE_RCSID("@(#)$File: apprentice.c,v 1.130 2008/02/27 18:04:53 rrt Exp $") #endif /* lint */ #define EATAB {while (isascii((unsigned char) *l) && \ @@ -505,6 +505,65 @@ apprentice_sort(const void *a, const void *b) return 1; } +private int +set_test_type(struct magic *mstart, struct magic *m) +{ + switch (m->type) { + case FILE_BYTE: + case FILE_SHORT: + case FILE_LONG: + case FILE_DATE: + case FILE_BESHORT: + case FILE_BELONG: + case FILE_BEDATE: + case FILE_LESHORT: + case FILE_LELONG: + case FILE_LEDATE: + case FILE_LDATE: + case FILE_BELDATE: + case FILE_LELDATE: + case FILE_MEDATE: + case FILE_MELDATE: + case FILE_MELONG: + case FILE_QUAD: + case FILE_LEQUAD: + case FILE_BEQUAD: + case FILE_QDATE: + case FILE_LEQDATE: + case FILE_BEQDATE: + case FILE_QLDATE: + case FILE_LEQLDATE: + case FILE_BEQLDATE: + case FILE_FLOAT: + case FILE_BEFLOAT: + case FILE_LEFLOAT: + case FILE_DOUBLE: + case FILE_BEDOUBLE: + case FILE_LEDOUBLE: + case FILE_STRING: + case FILE_PSTRING: + case FILE_BESTRING16: + case FILE_LESTRING16: + /* binary test, set flag */ + mstart->flag |= BINTEST; + break; + case FILE_REGEX: + case FILE_SEARCH: + /* binary test if pattern is not text */ + if (file_looks_utf8(m->value.s, m->vallen, NULL, NULL) == 0) + mstart->flag |= BINTEST; + break; + case FILE_DEFAULT: + /* can't deduce anything; we shouldn't see this at the + top level anyway */ + break; + case FILE_INVALID: + default: + /* invalid search type, but no need to complain here */ + break; + } +} + /* * Load and parse one file. 
*/ @@ -561,7 +620,7 @@ apprentice_load(struct magic_set *ms, struct magic **magicp, uint32_t *nmagicp, { int errs = 0; struct magic_entry *marray; - uint32_t marraycount, i, mentrycount = 0; + uint32_t marraycount, i, mentrycount = 0, starttest; char *subfn; struct stat st; DIR *dir; @@ -600,7 +659,41 @@ apprentice_load(struct magic_set *ms, struct magic **magicp, uint32_t *nmagicp, if (errs) goto out; + /* Set types of tests */ + for (i = 0; i < marraycount; ) { + if (marray[i].mp->cont_level != 0) { + i++; + continue; + } + + starttest = i; + do { + set_test_type(marray[starttest].mp, marray[i].mp); + if (ms->flags & MAGIC_DEBUG) { + (void)fprintf(stderr, "%s%s%s: %s\n", + marray[i].mp->mimetype, + marray[i].mp->mimetype[0] == '\0' ? "" : "; ", + marray[i].mp->desc[0] ? marray[i].mp->desc : "(no description)", + marray[i].mp->flag & BINTEST ? "binary" : "text"); + if (marray[i].mp->flag & BINTEST) { +#define SYMBOL "text" +#define SYMLEN sizeof(SYMBOL) + char *p = strstr(marray[i].mp->desc, "text"); + if (p && (p == marray[i].mp->desc || isspace(p[-1])) && + (p + SYMLEN - marray[i].mp->desc == MAXstring || + (p[SYMLEN] == '\0' || isspace(p[SYMLEN])))) { + (void)fprintf(stderr, + "*** Possible binary test for text type\n"); + } +#undef SYMBOL +#undef SYMLEN + } + } + } while (++i < marraycount && marray[i].mp->cont_level != 0); + } + qsort(marray, marraycount, sizeof(*marray), apprentice_sort); + /* * Make sure that any level 0 "default" line is last (if one exists). 
*/ diff --git a/src/ascmagic.c b/src/ascmagic.c index 191d2353..4576fcad 100644 --- a/src/ascmagic.c +++ b/src/ascmagic.c @@ -49,7 +49,7 @@ #include "names.h" #ifndef lint -FILE_RCSID("@(#)$File: ascmagic.c,v 1.60 2008/02/24 01:16:08 rrt Exp $") +FILE_RCSID("@(#)$File: ascmagic.c,v 1.61 2008/02/27 15:02:33 rrt Exp $") #endif /* lint */ #define MAXLINELEN 300 /* longest sane line length */ @@ -65,15 +65,16 @@ private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); private void from_ebcdic(const unsigned char *, size_t, unsigned char *); private int ascmatch(const unsigned char *, const unichar *, size_t); +private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t); protected int file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) { size_t i; - unsigned char *nbuf = NULL; + unsigned char *nbuf = NULL, *utf8_buf = NULL, *utf8_end; unichar *ubuf = NULL; - size_t ulen; + size_t ulen, mlen; const struct names *p; int rv = -1; int mime = ms->flags & MAGIC_MIME; @@ -164,6 +165,24 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) goto done; } + /* Convert ubuf to UTF-8 and try text soft magic */ + /* If original was ASCII or UTF-8, could use nbuf instead of + re-converting. */ + /* malloc size is a conservative overestimate; could be + improved, or at least realloced after + conversion. */ + mlen = ulen * 6; + if ((utf8_buf = malloc(mlen)) == NULL) { + file_oomem(ms, mlen); + goto done; + } + if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL) + goto done; + if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) { + rv = 1; + goto done; + } + /* look for tokens from names.h - this is expensive! 
*/ if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0) goto subtype_identified; @@ -332,6 +351,8 @@ done: free(nbuf); if (ubuf) free(ubuf); + if (utf8_buf) + free(utf8_buf); return rv; } @@ -490,6 +511,63 @@ looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, return 1; } +/* + * Encode Unicode string as UTF-8, returning pointer to character + * after end of string, or NULL if an invalid character is found. + */ +private unsigned char * +encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen) +{ + size_t i; + unsigned char *end = buf + len; + + for (i = 0; i < ulen; i++) { + if (ubuf[i] <= 0x7f) { + if (end - buf < 1) + return NULL; + *buf++ = (unsigned char)ubuf[i]; + } else if (ubuf[i] <= 0x7ff) { + if (end - buf < 2) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0xffff) { + if (end - buf < 3) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0x1fffff) { + if (end - buf < 4) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0); + *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0x3ffffff) { + if (end - buf < 5) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8); + *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0x7fffffff) { + if (end - buf < 6) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc); + *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 
18) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else /* Invalid character */ + return NULL; + } + + return buf; +} + /* * Decide whether some text looks like UTF-8. Returns: * diff --git a/src/file.h b/src/file.h index 2801d7b7..f00693f3 100644 --- a/src/file.h +++ b/src/file.h @@ -27,7 +27,7 @@ */ /* * file.h - definitions for file(1) program - * @(#)$File: file.h,v 1.101 2008/02/24 01:16:08 rrt Exp $ + * @(#)$File: file.h,v 1.102 2008/02/24 01:35:58 christos Exp $ */ #ifndef __file_h__ @@ -108,13 +108,15 @@ struct magic { /* Word 1 */ uint16_t cont_level; /* level of ">" */ uint8_t flag; -#define INDIR 1 /* if '(...)' appears */ -#define OFFADD 2 /* if '>&' or '>...(&' appears */ -#define INDIROFFADD 4 /* if '>&(' appears */ -#define UNSIGNED 8 /* comparison is unsigned */ -#define NOSPACE 16 /* suppress space character before output */ -#define TEXTTEST 32 /* test is for a text type (set only +#define INDIR 0x01 /* if '(...)' appears */ +#define OFFADD 0x02 /* if '>&' or '>...(&' appears */ +#define INDIROFFADD 0x04 /* if '>&(' appears */ +#define UNSIGNED 0x08 /* comparison is unsigned */ +#define NOSPACE 0x10 /* suppress space character before output */ +#define BINTEST 0x20 /* test is for a binary type (set only for top-level tests) */ +#define TEXTTEST 0 /* for passing to file_softmagic */ + uint8_t dummy1; /* Word 2 */ @@ -329,7 +331,7 @@ protected int file_zmagic(struct magic_set *, int, const char *, const unsigned char *, size_t); protected int file_ascmagic(struct magic_set *, const unsigned char *, size_t); protected int file_is_tar(struct magic_set *, const unsigned char *, size_t); -protected int file_softmagic(struct magic_set *, const unsigned char *, size_t); +protected int file_softmagic(struct magic_set *, const unsigned char *, size_t, int); protected struct mlist *file_apprentice(struct 
magic_set *, const char *, int); protected uint64_t file_signextend(struct magic_set *, struct magic *, uint64_t); diff --git a/src/funcs.c b/src/funcs.c index 67f01ceb..e804c3be 100644 --- a/src/funcs.c +++ b/src/funcs.c @@ -38,7 +38,7 @@ #endif #ifndef lint -FILE_RCSID("@(#)$File: funcs.c,v 1.37 2008/02/07 00:58:52 christos Exp $") +FILE_RCSID("@(#)$File: funcs.c,v 1.38 2008/02/19 00:58:59 rrt Exp $") #endif /* lint */ /* @@ -180,7 +180,7 @@ file_buffer(struct magic_set *ms, int fd, const char *inname, const void *buf, (m = file_is_tar(ms, buf, nb)) == 0) { /* try tests in /etc/magic (or surrogate magic file) */ if ((ms->flags & MAGIC_NO_CHECK_SOFT) != 0 || - (m = file_softmagic(ms, buf, nb)) == 0) { + (m = file_softmagic(ms, buf, nb, BINTEST)) == 0) { /* try known keywords, check whether it is ASCII */ if ((ms->flags & MAGIC_NO_CHECK_ASCII) != 0 || (m = file_ascmagic(ms, buf, nb)) == 0) { diff --git a/src/softmagic.c b/src/softmagic.c index b1846293..08c49e68 100644 --- a/src/softmagic.c +++ b/src/softmagic.c @@ -38,11 +38,11 @@ #ifndef lint -FILE_RCSID("@(#)$File: softmagic.c,v 1.115 2008/02/25 01:05:30 rrt Exp $") +FILE_RCSID("@(#)$File: softmagic.c,v 1.116 2008/02/25 02:54:08 rrt Exp $") #endif /* lint */ private int match(struct magic_set *, struct magic *, uint32_t, - const unsigned char *, size_t); + const unsigned char *, size_t, int); private int mget(struct magic_set *, const unsigned char *, struct magic *, size_t, unsigned int); private int magiccheck(struct magic_set *, struct magic *); @@ -69,12 +69,12 @@ private void cvt_64(union VALUETYPE *, const struct magic *); */ /*ARGSUSED1*/ /* nbytes passed for regularity, maybe need later */ protected int -file_softmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) +file_softmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes, int mode) { struct mlist *ml; int rv; for (ml = ms->mlist->next; ml != ms->mlist; ml = ml->next) - if ((rv = match(ms, ml->magic, ml->nmagic, buf, 
nbytes)) != 0) + if ((rv = match(ms, ml->magic, ml->nmagic, buf, nbytes, mode)) != 0) return rv; return 0; @@ -109,7 +109,7 @@ file_softmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) */ private int match(struct magic_set *ms, struct magic *magic, uint32_t nmagic, - const unsigned char *s, size_t nbytes) + const unsigned char *s, size_t nbytes, int mode) { uint32_t magindex = 0; unsigned int cont_level = 0; @@ -125,6 +125,14 @@ match(struct magic_set *ms, struct magic *magic, uint32_t nmagic, int flush; struct magic *m = &magic[magindex]; + if ((m->flag & BINTEST) != mode) { + /* Skip sub-tests */ + while (magic[magindex + 1].cont_level != 0 && + ++magindex < nmagic) + continue; + continue; /* Skip to next top-level test*/ + } + ms->offset = m->offset; ms->line = m->lineno;