From caaf816f93f1eb8e0af373b45714431889caacd1 Mon Sep 17 00:00:00 2001 From: Reuben Thomas Date: Thu, 6 Nov 2008 22:49:08 +0000 Subject: [PATCH] Fixes to MIME detection. --- doc/file.man | 46 +++++++++++++++++++++++----------------------- src/encoding.c | 19 +++++++------------ src/funcs.c | 9 +++++---- src/softmagic.c | 16 +++++++++------- 4 files changed, 44 insertions(+), 46 deletions(-) diff --git a/doc/file.man b/doc/file.man index db0c4498..8f7a95ac 100644 --- a/doc/file.man +++ b/doc/file.man @@ -1,4 +1,4 @@ -.\" $File: file.man,v 1.77 2008/10/30 10:50:24 rrt Exp $ +.\" $File: file.man,v 1.78 2008/11/06 21:17:45 rrt Exp $ .Dd October 9, 2008 .Dt FILE __CSECTION__ .Os @@ -55,12 +55,12 @@ When modifying magic files or the program itself, make sure to .Em "preserve these keywords" . Users depend on knowing that all the readable files in a directory have the word -.Dq text +.Sq text printed. Don't do as Berkeley did and change -.Dq shell commands text +.Sq shell commands text to -.Dq shell script . +.Sq shell script . .Pp The filesystem tests are based on examining the return from a .Xr stat 2 @@ -117,10 +117,10 @@ in each set. If a file passes any of these tests, its character set is reported. ASCII, ISO-8859-x, UTF-8, and extended-ASCII files are identified as -.Dq text +.Sq text because they will be mostly readable on nearly any terminal; UTF-16 and EBCDIC are only -.Dq character data +.Sq character data because, while they contain text, it is text that will require translation before it can be read. @@ -156,7 +156,7 @@ archives). .Pp Any file that cannot be identified as having been written in any of the character sets listed above is simply said to be -.Dq data . +.Sq data . .Sh OPTIONS .Bl -tag -width indent .It Fl b , -brief @@ -180,9 +180,11 @@ are: .Dv EMX application type (only on EMX). .It text -Various types of text files. +Various types of text files (this test will try to guess the text encoding, irrespective of the setting of the +.Sq encoding +option). .It encoding -Different text encodings. +Different text encodings for soft magic tests. .It tokens Looks for known tokens inside text files. .It cdf @@ -220,17 +222,15 @@ is not defined. .It Fl i , -mime Causes the file command to output mime type strings rather than the more traditional human readable ones. Thus it may say -.Dq text/plain; charset=us-ascii +.Sq text/plain; charset=us-ascii rather than -.Dq ASCII text . +.Sq ASCII text . In order for this option to work, file changes the way it handles files recognized by the command itself (such as many of the text file types, directories etc), and makes use of an alternative -.Dq magic +.Sq magic file. -(See -.Dq FILES -section, below). +(See the FILES section, below). .It Fl -mime-type , -mime-encoding Like .Fl i , @@ -238,10 +238,10 @@ but print only the specified element(s). .It Fl k , -keep-going Don't stop at the first match, keep going. Subsequent matches will be have the string -.Dq "\[rs]012\- " +.Sq "\[rs]012\- " prepended. (If you want a newline, see the -.Dq "\-r" +.Sq "\-r" option.) .It Fl L , -dereference option causes symlinks to be followed, as the like-named option in @@ -323,7 +323,7 @@ will not attempt to open .Pa $HOME/.magic . .Nm adds -.Dq .mgc +.Sq .mgc to the value of this variable as appropriate. The environment variable .Dv POSIXLY_CORRECT @@ -479,7 +479,7 @@ support and merge MIME and non-MIME magic, support directories as well as files of magic, apply many bug fixes and improve the build system. .Pp The list of contributors to the -.Dq magic +.Sq magic directory (magic files) is too long to include here. You know who you are; thank you. @@ -524,15 +524,15 @@ than position within the magic file? .Pp The program should provide a way to give an estimate of -.Dq how good +.Sq how good a guess is. We end up removing guesses (e.g. -.Dq From\ +.Sq From\ as first 5 chars of file) because they are not as good as other guesses (e.g. -.Dq Newsgroups: +.Sq Newsgroups: versus -.Dq Return-Path: +.Sq Return-Path: ). Still, if the others don't pan out, it should be possible to use the first guess. diff --git a/src/encoding.c b/src/encoding.c index fd368353..223f6cbe 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -39,7 +39,7 @@ #include #ifndef lint -FILE_RCSID("@(#)$File: ascmagic.c,v 1.68 2008/10/30 10:50:24 rrt Exp $") +FILE_RCSID("@(#)$File: encoding.c,v 1.1 2008/11/06 21:17:45 rrt Exp $") #endif /* lint */ private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); @@ -60,7 +60,7 @@ protected int file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type) { size_t mlen; - int rv = 0, ucs_type; + int rv = 1, ucs_type; unsigned char *nbuf = NULL; mlen = (nbytes + 1) * sizeof(nbuf[0]); @@ -74,18 +74,16 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni goto done; } + *type = "text"; if (looks_ascii(buf, nbytes, *ubuf, ulen)) { *code = "ASCII"; *code_mime = "us-ascii"; - *type = "text"; } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { *code = "UTF-8 Unicode (with BOM)"; *code_mime = "utf-8"; - *type = "text"; } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { *code = "UTF-8 Unicode"; *code_mime = "utf-8"; - *type = "text"; } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { if (ucs_type == 1) { *code = "Little-endian UTF-16 Unicode"; @@ -94,28 +92,25 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni *code = "Big-endian UTF-16 Unicode"; *code_mime = "utf-16be"; } - *type = "character data"; } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { *code = "ISO-8859"; - *type = "text"; *code_mime = "iso-8859-1"; } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { *code = "Non-ISO extended-ASCII"; - *type = "text"; *code_mime = "unknown-8bit"; } else { from_ebcdic(buf, nbytes, nbuf); if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { *code = "EBCDIC"; - *type = "character data"; *code_mime = "ebcdic"; } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { *code = "International EBCDIC"; - *type = "character data"; *code_mime = "ebcdic"; - } else /* Doesn't look like text at all */ - rv = -1; + } else { /* Doesn't look like text at all */ + rv = 0; + *type = "binary"; + } } done: diff --git a/src/funcs.c b/src/funcs.c index 29f16b40..e754d4be 100644 --- a/src/funcs.c +++ b/src/funcs.c @@ -27,7 +27,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: funcs.c,v 1.47 2008/11/04 16:38:28 christos Exp $") +FILE_RCSID("@(#)$File: funcs.c,v 1.48 2008/11/06 21:17:45 rrt Exp $") #endif /* lint */ #include "magic.h" @@ -158,7 +158,7 @@ protected int file_buffer(struct magic_set *ms, int fd, const char *inname, const void *buf, size_t nb) { - int m = 0, rv = 0; + int m = 0, rv = 0, looks_text = 0; int mime = ms->flags & MAGIC_MIME; const unsigned char *ubuf = CAST(const unsigned char *, buf); unichar *u8buf = NULL; @@ -206,13 +206,14 @@ file_buffer(struct magic_set *ms, int fd, const char *inname, const void *buf, (m = file_trycdf(ms, fd, ubuf, nb)) == 0) { /* try to discover text encoding */ if ((ms->flags & MAGIC_NO_CHECK_ENCODING) == 0) - file_encoding(ms, ubuf, nb, &u8buf, &ulen, &code, &code_mime, &type); + looks_text = file_encoding(ms, ubuf, nb, &u8buf, &ulen, &code, &code_mime, &type); /* try soft magic tests */ if ((ms->flags & MAGIC_NO_CHECK_SOFT) != 0 || (m = file_softmagic(ms, ubuf, nb, BINTEST)) == 0) { /* try text properties (and possibly text tokens) */ if ((ms->flags & MAGIC_NO_CHECK_TEXT) != 0 || - (m = file_ascmagic_with_encoding(ms, ubuf, nb, u8buf, ulen, code, code_mime, type)) == 0) { + ((ms->flags & MAGIC_NO_CHECK_ENCODING) != 0 && (m = file_ascmagic(ms, ubuf, nb)) == 0) || + looks_text == 0 || (m = file_ascmagic_with_encoding(ms, ubuf, nb, u8buf, ulen, code, code_mime, type)) == 0) { /* give up */ if ((!mime || (mime & MAGIC_MIME_TYPE)) && file_printf(ms, mime ? diff --git a/src/softmagic.c b/src/softmagic.c index 969233f0..597c27b0 100644 --- a/src/softmagic.c +++ b/src/softmagic.c @@ -32,7 +32,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: softmagic.c,v 1.127 2008/11/06 15:38:28 christos Exp $") +FILE_RCSID("@(#)$File: softmagic.c,v 1.128 2008/11/06 21:17:45 rrt Exp $") #endif /* lint */ #include "magic.h" @@ -1828,15 +1828,17 @@ handle_annotation(struct magic_set *ms, struct magic *m) return 1; } if (ms->flags & MAGIC_MIME) { - if (ms->flags & MAGIC_MIME_TYPE) { + if ((ms->flags & MAGIC_MIME_TYPE) && m->mimetype[0]) { + ms->event_flags |= EVENT_WROTE_MIME_TYPE; if (file_printf(ms, "%s", m->mimetype) == -1) return -1; } - if (ms->flags & MAGIC_MIME_ENCODING) { - ms->event_flags |= EVENT_WROTE_MIME_TYPE; - return 0; /* Let ascmagic find the encoding */ - } - return 1; + /* If we want an encoding, let ascmagic find it. */ + if ((ms->flags & MAGIC_MIME_ENCODING)) + return 0; + /* If we didn't write a MIME type, and we want one, + allow ascmagic to run.*/ + return m->mimetype[0] != '\0'; } return -2; } -- 2.40.0