-.\" $File: magic.man,v 1.54 2008/02/28 22:22:19 rrt Exp $
+.\" $File: magic.man,v 1.55 2008/02/28 22:24:46 rrt Exp $
.Dd January 10, 2007
.Dt MAGIC __FSECTION__
.Os
onwards, where
.Dv N
is the given offset.
-Because it looks for newline characters, it is only useful for
-(single-byte encoded) text.
+Line endings are assumed to be in the machine's native format.
.Dv ^
and
.Dv $
no other matches.
.El
.Pp
+Each top-level magic pattern (see below for an explanation of levels)
+is classified as text or binary according to the types used. Types
+.Dq regex
+and
+.Dq search
+are classified as text tests, unless non-printable characters are used
+in the pattern. All other tests are classified as binary. A top-level
+pattern is considered to be a text pattern when all its patterns are text
+patterns; otherwise, it is considered to be a binary pattern. When
+matching a file, binary patterns are tried first; if no match is
+found, and the file looks like text, then its encoding is determined
+and the text patterns are tried.
+.Pp
The numeric types may optionally be followed by
.Dv \*[Am]
and a numeric value,
#include <dirent.h>
#ifndef lint
-FILE_RCSID("@(#)$File: apprentice.c,v 1.129 2008/02/27 17:59:21 rrt Exp $")
+FILE_RCSID("@(#)$File: apprentice.c,v 1.130 2008/02/27 18:04:53 rrt Exp $")
#endif /* lint */
#define EATAB {while (isascii((unsigned char) *l) && \
return 1;
}
+/*
+ * Classify one magic entry as a binary or text test.
+ *
+ * mstart is the entry whose flag word records the classification; m is
+ * the entry being examined (it may be mstart itself).  BINTEST is only
+ * ever set here, never cleared, so one binary entry is enough to mark
+ * mstart as binary.
+ *
+ * Always returns 0; the value carries no information.
+ */
+private int
+set_test_type(struct magic *mstart, struct magic *m)
+{
+	switch (m->type) {
+	case FILE_BYTE:
+	case FILE_SHORT:
+	case FILE_LONG:
+	case FILE_DATE:
+	case FILE_BESHORT:
+	case FILE_BELONG:
+	case FILE_BEDATE:
+	case FILE_LESHORT:
+	case FILE_LELONG:
+	case FILE_LEDATE:
+	case FILE_LDATE:
+	case FILE_BELDATE:
+	case FILE_LELDATE:
+	case FILE_MEDATE:
+	case FILE_MELDATE:
+	case FILE_MELONG:
+	case FILE_QUAD:
+	case FILE_LEQUAD:
+	case FILE_BEQUAD:
+	case FILE_QDATE:
+	case FILE_LEQDATE:
+	case FILE_BEQDATE:
+	case FILE_QLDATE:
+	case FILE_LEQLDATE:
+	case FILE_BEQLDATE:
+	case FILE_FLOAT:
+	case FILE_BEFLOAT:
+	case FILE_LEFLOAT:
+	case FILE_DOUBLE:
+	case FILE_BEDOUBLE:
+	case FILE_LEDOUBLE:
+	case FILE_STRING:
+	case FILE_PSTRING:
+	case FILE_BESTRING16:
+	case FILE_LESTRING16:
+		/* binary test, set flag */
+		mstart->flag |= BINTEST;
+		break;
+	case FILE_REGEX:
+	case FILE_SEARCH:
+		/*
+		 * binary test if pattern is not text
+		 *
+		 * NOTE(review): only a result of exactly 0 marks the pattern
+		 * as binary.  Confirm whether file_looks_utf8() can also
+		 * return a negative value for invalid input; such a pattern
+		 * would currently be left classified as text.
+		 */
+		if (file_looks_utf8(m->value.s, m->vallen, NULL, NULL) == 0)
+			mstart->flag |= BINTEST;
+		break;
+	case FILE_DEFAULT:
+		/* can't deduce anything; we shouldn't see this at the
+		   top level anyway */
+		break;
+	case FILE_INVALID:
+	default:
+		/* invalid search type, but no need to complain here */
+		break;
+	}
+
+	/* Declared int, so every path must return a value. */
+	return 0;
+}
+
/*
* Load and parse one file.
*/
{
int errs = 0;
struct magic_entry *marray;
- uint32_t marraycount, i, mentrycount = 0;
+ uint32_t marraycount, i, mentrycount = 0, starttest;
char *subfn;
struct stat st;
DIR *dir;
if (errs)
goto out;
+	/*
+	 * Set types of tests.  Each entry with cont_level 0 starts a test;
+	 * it and the entries that follow with a non-zero cont_level are
+	 * passed to set_test_type() together with the starting entry, which
+	 * is where the classification is recorded.
+	 */
+	for (i = 0; i < marraycount; ) {
+		if (marray[i].mp->cont_level != 0) {
+			i++;
+			continue;
+		}
+
+		starttest = i;
+		do {
+			set_test_type(marray[starttest].mp, marray[i].mp);
+			if (ms->flags & MAGIC_DEBUG) {
+				(void)fprintf(stderr, "%s%s%s: %s\n",
+				    marray[i].mp->mimetype,
+				    marray[i].mp->mimetype[0] == '\0' ? "" : "; ",
+				    marray[i].mp->desc[0] ? marray[i].mp->desc : "(no description)",
+				    marray[i].mp->flag & BINTEST ? "binary" : "text");
+				if (marray[i].mp->flag & BINTEST) {
+#define SYMBOL "text"
+/* Length of SYMBOL without its terminating NUL. */
+#define SYMLEN (sizeof(SYMBOL) - 1)
+					/*
+					 * Warn when a test classified as
+					 * binary has the stand-alone word
+					 * "text" in its description: preceded
+					 * by the start of desc or whitespace,
+					 * and followed by the MAXstring
+					 * offset, a NUL, or whitespace.
+					 */
+					char *p = strstr(marray[i].mp->desc, SYMBOL);
+					if (p && (p == marray[i].mp->desc ||
+					    isspace((unsigned char)p[-1])) &&
+					    (p + SYMLEN - marray[i].mp->desc == MAXstring ||
+					    (p[SYMLEN] == '\0' ||
+					    isspace((unsigned char)p[SYMLEN])))) {
+						(void)fprintf(stderr,
+						    "*** Possible binary test for text type\n");
+					}
+#undef SYMBOL
+#undef SYMLEN
+				}
+			}
+		} while (++i < marraycount && marray[i].mp->cont_level != 0);
+	}
+
qsort(marray, marraycount, sizeof(*marray), apprentice_sort);
+
/*
* Make sure that any level 0 "default" line is last (if one exists).
*/
#include "names.h"
#ifndef lint
-FILE_RCSID("@(#)$File: ascmagic.c,v 1.60 2008/02/24 01:16:08 rrt Exp $")
+FILE_RCSID("@(#)$File: ascmagic.c,v 1.61 2008/02/27 15:02:33 rrt Exp $")
#endif /* lint */
#define MAXLINELEN 300 /* longest sane line length */
private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
private int ascmatch(const unsigned char *, const unichar *, size_t);
+private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
protected int
file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
{
size_t i;
- unsigned char *nbuf = NULL;
+ unsigned char *nbuf = NULL, *utf8_buf = NULL, *utf8_end;
unichar *ubuf = NULL;
- size_t ulen;
+ size_t ulen, mlen;
const struct names *p;
int rv = -1;
int mime = ms->flags & MAGIC_MIME;
goto done;
}
+ /* Convert ubuf to UTF-8 and try text soft magic */
+ /* If original was ASCII or UTF-8, could use nbuf instead of
+ re-converting. */
+ /* malloc size is a conservative overestimate; could be
+ improved, or at least realloced after conversion. */
+ mlen = ulen * 6;
+ if ((utf8_buf = malloc(mlen)) == NULL) {
+ file_oomem(ms, mlen);
+ goto done;
+ }
+ if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
+ goto done;
+ if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) {
+ rv = 1;
+ goto done;
+ }
+
/* look for tokens from names.h - this is expensive! */
if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
goto subtype_identified;
free(nbuf);
if (ubuf)
free(ubuf);
+ if (utf8_buf)
+ free(utf8_buf);
return rv;
}
return 1;
}
+/*
+ * Write the UTF-8 encoding of the ulen code points in ubuf into buf,
+ * which has room for len bytes.  Sequences of one to six bytes are
+ * produced, covering values up to 0x7fffffff.
+ *
+ * Returns a pointer just past the last byte written, or NULL if a code
+ * point is larger than 0x7fffffff or its encoding does not fit in the
+ * space that remains.
+ */
+private unsigned char *
+encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
+{
+	unsigned char *limit = buf + len;
+	size_t idx;
+
+	for (idx = 0; idx < ulen; idx++) {
+		unichar c = ubuf[idx];
+		unsigned char lead;	/* marker bits of the first byte */
+		int trail;		/* number of continuation bytes */
+
+		if (c <= 0x7f) {
+			lead = 0x00;
+			trail = 0;
+		} else if (c <= 0x7ff) {
+			lead = 0xc0;
+			trail = 1;
+		} else if (c <= 0xffff) {
+			lead = 0xe0;
+			trail = 2;
+		} else if (c <= 0x1fffff) {
+			lead = 0xf0;
+			trail = 3;
+		} else if (c <= 0x3ffffff) {
+			lead = 0xf8;
+			trail = 4;
+		} else if (c <= 0x7fffffff) {
+			lead = 0xfc;
+			trail = 5;
+		} else {
+			/* Invalid character */
+			return NULL;
+		}
+
+		/* The whole sequence must fit before any byte is written. */
+		if (limit - buf < trail + 1)
+			return NULL;
+
+		/* First byte: the top payload bits plus the length marker. */
+		*buf++ = (unsigned char)((c >> (6 * trail)) + lead);
+
+		/* Continuation bytes: six payload bits each, high to low. */
+		while (trail-- > 0)
+			*buf++ = (unsigned char)(((c >> (6 * trail)) & 0x3f) + 0x80);
+	}
+
+	return buf;
+}
+
/*
* Decide whether some text looks like UTF-8. Returns:
*
*/
/*
* file.h - definitions for file(1) program
- * @(#)$File: file.h,v 1.101 2008/02/24 01:16:08 rrt Exp $
+ * @(#)$File: file.h,v 1.102 2008/02/24 01:35:58 christos Exp $
*/
#ifndef __file_h__
/* Word 1 */
uint16_t cont_level; /* level of ">" */
uint8_t flag;
-#define INDIR 1 /* if '(...)' appears */
-#define OFFADD 2 /* if '>&' or '>...(&' appears */
-#define INDIROFFADD 4 /* if '>&(' appears */
-#define UNSIGNED 8 /* comparison is unsigned */
-#define NOSPACE 16 /* suppress space character before output */
-#define TEXTTEST 32 /* test is for a text type (set only
+#define INDIR 0x01 /* if '(...)' appears */
+#define OFFADD 0x02 /* if '>&' or '>...(&' appears */
+#define INDIROFFADD 0x04 /* if '>&(' appears */
+#define UNSIGNED 0x08 /* comparison is unsigned */
+#define NOSPACE 0x10 /* suppress space character before output */
+#define BINTEST 0x20 /* test is for a binary type (set only
for top-level tests) */
+#define TEXTTEST 0 /* for passing to file_softmagic */
+
uint8_t dummy1;
/* Word 2 */
const unsigned char *, size_t);
protected int file_ascmagic(struct magic_set *, const unsigned char *, size_t);
protected int file_is_tar(struct magic_set *, const unsigned char *, size_t);
-protected int file_softmagic(struct magic_set *, const unsigned char *, size_t);
+protected int file_softmagic(struct magic_set *, const unsigned char *, size_t, int);
protected struct mlist *file_apprentice(struct magic_set *, const char *, int);
protected uint64_t file_signextend(struct magic_set *, struct magic *,
uint64_t);
#endif
#ifndef lint
-FILE_RCSID("@(#)$File: funcs.c,v 1.37 2008/02/07 00:58:52 christos Exp $")
+FILE_RCSID("@(#)$File: funcs.c,v 1.38 2008/02/19 00:58:59 rrt Exp $")
#endif /* lint */
/*
(m = file_is_tar(ms, buf, nb)) == 0) {
/* try tests in /etc/magic (or surrogate magic file) */
if ((ms->flags & MAGIC_NO_CHECK_SOFT) != 0 ||
- (m = file_softmagic(ms, buf, nb)) == 0) {
+ (m = file_softmagic(ms, buf, nb, BINTEST)) == 0) {
/* try known keywords, check whether it is ASCII */
if ((ms->flags & MAGIC_NO_CHECK_ASCII) != 0 ||
(m = file_ascmagic(ms, buf, nb)) == 0) {
#ifndef lint
-FILE_RCSID("@(#)$File: softmagic.c,v 1.115 2008/02/25 01:05:30 rrt Exp $")
+FILE_RCSID("@(#)$File: softmagic.c,v 1.116 2008/02/25 02:54:08 rrt Exp $")
#endif /* lint */
private int match(struct magic_set *, struct magic *, uint32_t,
- const unsigned char *, size_t);
+ const unsigned char *, size_t, int);
private int mget(struct magic_set *, const unsigned char *,
struct magic *, size_t, unsigned int);
private int magiccheck(struct magic_set *, struct magic *);
*/
/*ARGSUSED1*/ /* nbytes passed for regularity, maybe need later */
protected int
-file_softmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
+file_softmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes, int mode)
{
struct mlist *ml;
int rv;
for (ml = ms->mlist->next; ml != ms->mlist; ml = ml->next)
- if ((rv = match(ms, ml->magic, ml->nmagic, buf, nbytes)) != 0)
+ if ((rv = match(ms, ml->magic, ml->nmagic, buf, nbytes, mode)) != 0)
return rv;
return 0;
*/
private int
match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
- const unsigned char *s, size_t nbytes)
+ const unsigned char *s, size_t nbytes, int mode)
{
uint32_t magindex = 0;
unsigned int cont_level = 0;
int flush;
struct magic *m = &magic[magindex];
+		/*
+		 * Only run tests whose BINTEST flag matches the requested
+		 * mode.
+		 */
+		if ((m->flag & BINTEST) != mode) {
+			/*
+			 * Skip sub-tests.  The bound is checked before
+			 * magic[magindex + 1] is read, so the scan never
+			 * looks past the last entry of the array.
+			 */
+			while (magindex + 1 < nmagic &&
+			    magic[magindex + 1].cont_level != 0)
+				magindex++;
+			continue; /* Skip to next top-level test */
+		}
+
ms->offset = m->offset;
ms->line = m->lineno;