From: Reuben Thomas Date: Thu, 6 Nov 2008 22:49:08 +0000 (+0000) Subject: Fixes to MIME detection. X-Git-Tag: FILE5_05~284 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e3a3c6c21e376e0e108341ec4e08026fa1c89dbd;p=file Fixes to MIME detection. --- diff --git a/doc/file.man b/doc/file.man index bd3be256..1215e69e 100644 --- a/doc/file.man +++ b/doc/file.man @@ -1,4 +1,4 @@ -.\" $File: file.man,v 1.78 2008/11/06 21:17:45 rrt Exp $ +.\" $File: file.man,v 1.79 2008/11/06 22:49:08 rrt Exp $ .Dd October 9, 2008 .Dt FILE __CSECTION__ .Os @@ -55,12 +55,12 @@ When modifying magic files or the program itself, make sure to .Em "preserve these keywords" . Users depend on knowing that all the readable files in a directory have the word -.Dq text +.Sq text printed. Don't do as Berkeley did and change -.Dq shell commands text +.Sq shell commands text to -.Dq shell script . +.Sq shell script . .Pp The filesystem tests are based on examining the return from a .Xr stat 2 @@ -117,10 +117,10 @@ in each set. If a file passes any of these tests, its character set is reported. ASCII, ISO-8859-x, UTF-8, and extended-ASCII files are identified as -.Dq text +.Sq text because they will be mostly readable on nearly any terminal; UTF-16 and EBCDIC are only -.Dq character data +.Sq character data because, while they contain text, it is text that will require translation before it can be read. @@ -156,7 +156,7 @@ archives). .Pp Any file that cannot be identified as having been written in any of the character sets listed above is simply said to be -.Dq data . +.Sq data . .Sh OPTIONS .Bl -tag -width indent .It Fl b , -brief @@ -180,9 +180,11 @@ are: .Dv EMX application type (only on EMX). .It text -Various types of text files. +Various types of text files (this test will try to guess the text encoding, irrespective of the setting of the +.Sq encoding +option). .It encoding -Different text encodings. +Different text encodings for soft magic tests. .It tokens Looks for known tokens inside text files. .It cdf @@ -220,17 +222,15 @@ is not defined. .It Fl i , -mime Causes the file command to output mime type strings rather than the more traditional human readable ones. Thus it may say -.Dq text/plain; charset=us-ascii +.Sq text/plain; charset=us-ascii rather than -.Dq ASCII text . +.Sq ASCII text . In order for this option to work, file changes the way it handles files recognized by the command itself (such as many of the text file types, directories etc), and makes use of an alternative -.Dq magic +.Sq magic file. -(See -.Dq FILES -section, below). +(See the FILES section, below). .It Fl -mime-type , -mime-encoding Like .Fl i , @@ -238,10 +238,10 @@ but print only the specified element(s). .It Fl k , -keep-going Don't stop at the first match, keep going. Subsequent matches will be have the string -.Dq "\[rs]012\- " +.Sq "\[rs]012\- " prepended. (If you want a newline, see the -.Dq "\-r" +.Sq "\-r" option.) .It Fl L , -dereference option causes symlinks to be followed, as the like-named option in @@ -323,7 +323,7 @@ will not attempt to open .Pa $HOME/.magic . .Nm adds -.Dq .mgc +.Sq .mgc to the value of this variable as appropriate. The environment variable .Dv POSIXLY_CORRECT @@ -479,7 +479,7 @@ support and merge MIME and non-MIME magic, support directories as well as files of magic, apply many bug fixes and improve the build system. .Pp The list of contributors to the -.Dq magic +.Sq magic directory (magic files) is too long to include here. You know who you are; thank you. @@ -524,15 +524,15 @@ than position within the magic file? .Pp The program should provide a way to give an estimate of -.Dq how good +.Sq how good a guess is. We end up removing guesses (e.g. -.Dq From\ +.Sq From\ as first 5 chars of file) because they are not as good as other guesses (e.g. -.Dq Newsgroups: +.Sq Newsgroups: versus -.Dq Return-Path: +.Sq Return-Path: ). Still, if the others don't pan out, it should be possible to use the first guess. diff --git a/src/encoding.c b/src/encoding.c index 44320452..8a1a634a 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -39,7 +39,7 @@ #include #ifndef lint -FILE_RCSID("@(#)$File: encoding.c,v 1.1 2008/11/06 21:17:45 rrt Exp $") +FILE_RCSID("@(#)$File: encoding.c,v 1.2 2008/11/06 22:49:08 rrt Exp $") #endif /* lint */ private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); @@ -60,7 +60,7 @@ protected int file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type) { size_t mlen; - int rv = 0, ucs_type; + int rv = 1, ucs_type; unsigned char *nbuf = NULL; mlen = (nbytes + 1) * sizeof(nbuf[0]); @@ -74,18 +74,16 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni goto done; } + *type = "text"; if (looks_ascii(buf, nbytes, *ubuf, ulen)) { *code = "ASCII"; *code_mime = "us-ascii"; - *type = "text"; } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { *code = "UTF-8 Unicode (with BOM)"; *code_mime = "utf-8"; - *type = "text"; } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { *code = "UTF-8 Unicode"; *code_mime = "utf-8"; - *type = "text"; } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { if (ucs_type == 1) { *code = "Little-endian UTF-16 Unicode"; @@ -94,28 +92,25 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni *code = "Big-endian UTF-16 Unicode"; *code_mime = "utf-16be"; } - *type = "character data"; } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { *code = "ISO-8859"; - *type = "text"; *code_mime = "iso-8859-1"; } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { *code = "Non-ISO extended-ASCII"; - *type = "text"; *code_mime = "unknown-8bit"; } else { from_ebcdic(buf, nbytes, nbuf); if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { *code = "EBCDIC"; - *type = "character data"; *code_mime = "ebcdic"; } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { *code = "International EBCDIC"; - *type = "character data"; *code_mime = "ebcdic"; - } else /* Doesn't look like text at all */ - rv = -1; + } else { /* Doesn't look like text at all */ + rv = 0; + *type = "binary"; + } } done: diff --git a/src/funcs.c b/src/funcs.c index 95fe34c7..f45ffacd 100644 --- a/src/funcs.c +++ b/src/funcs.c @@ -27,7 +27,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: funcs.c,v 1.48 2008/11/06 21:17:45 rrt Exp $") +FILE_RCSID("@(#)$File: funcs.c,v 1.49 2008/11/06 22:49:08 rrt Exp $") #endif /* lint */ #include "magic.h" @@ -158,7 +158,7 @@ protected int file_buffer(struct magic_set *ms, int fd, const char *inname, const void *buf, size_t nb) { - int m = 0, rv = 0; + int m = 0, rv = 0, looks_text = 0; int mime = ms->flags & MAGIC_MIME; const unsigned char *ubuf = CAST(const unsigned char *, buf); unichar *u8buf = NULL; @@ -206,13 +206,14 @@ file_buffer(struct magic_set *ms, int fd, const char *inname, const void *buf, (m = file_trycdf(ms, fd, ubuf, nb)) == 0) { /* try to discover text encoding */ if ((ms->flags & MAGIC_NO_CHECK_ENCODING) == 0) - file_encoding(ms, ubuf, nb, &u8buf, &ulen, &code, &code_mime, &type); + looks_text = file_encoding(ms, ubuf, nb, &u8buf, &ulen, &code, &code_mime, &type); /* try soft magic tests */ if ((ms->flags & MAGIC_NO_CHECK_SOFT) != 0 || (m = file_softmagic(ms, ubuf, nb, BINTEST)) == 0) { /* try text properties (and possibly text tokens) */ if ((ms->flags & MAGIC_NO_CHECK_TEXT) != 0 || - (m = file_ascmagic_with_encoding(ms, ubuf, nb, u8buf, ulen, code, code_mime, type)) == 0) { + ((ms->flags & MAGIC_NO_CHECK_ENCODING) != 0 && (m = file_ascmagic(ms, ubuf, nb)) == 0) || + looks_text == 0 || (m = file_ascmagic_with_encoding(ms, ubuf, nb, u8buf, ulen, code, code_mime, type)) == 0) { /* give up */ if ((!mime || (mime & MAGIC_MIME_TYPE)) && file_printf(ms, mime ? diff --git a/src/softmagic.c b/src/softmagic.c index 0cebe2db..03d9fd1c 100644 --- a/src/softmagic.c +++ b/src/softmagic.c @@ -32,7 +32,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: softmagic.c,v 1.128 2008/11/06 21:17:45 rrt Exp $") +FILE_RCSID("@(#)$File: softmagic.c,v 1.129 2008/11/06 22:49:08 rrt Exp $") #endif /* lint */ #include "magic.h" @@ -1828,15 +1828,17 @@ handle_annotation(struct magic_set *ms, struct magic *m) return 1; } if (ms->flags & MAGIC_MIME) { - if (ms->flags & MAGIC_MIME_TYPE) { + if ((ms->flags & MAGIC_MIME_TYPE) && m->mimetype[0]) { + ms->event_flags |= EVENT_WROTE_MIME_TYPE; if (file_printf(ms, "%s", m->mimetype) == -1) return -1; } - if (ms->flags & MAGIC_MIME_ENCODING) { - ms->event_flags |= EVENT_WROTE_MIME_TYPE; - return 0; /* Let ascmagic find the encoding */ - } - return 1; + /* If we want an encoding, let ascmagic find it. */ + if ((ms->flags & MAGIC_MIME_ENCODING)) + return 0; + /* If we didn't write a MIME type, and we want one, + allow ascmagic to run.*/ + return m->mimetype[0] != '\0'; } return -2; }