-.\" $File: file.man,v 1.77 2008/10/30 10:50:24 rrt Exp $
+.\" $File: file.man,v 1.78 2008/11/06 21:17:45 rrt Exp $
.Dd October 9, 2008
.Dt FILE __CSECTION__
.Os
.Em "preserve these keywords" .
Users depend on knowing that all the readable files in a directory
have the word
-.Dq text
+.Sq text
printed.
Don't do as Berkeley did and change
-.Dq shell commands text
+.Sq shell commands text
to
-.Dq shell script .
+.Sq shell script .
.Pp
The filesystem tests are based on examining the return from a
.Xr stat 2
If a file passes any of these tests, its character set is reported.
ASCII, ISO-8859-x, UTF-8, and extended-ASCII files are identified
as
-.Dq text
+.Sq text
because they will be mostly readable on nearly any terminal;
UTF-16 and EBCDIC are only
-.Dq character data
+.Sq character data
because, while
they contain text, it is text that will require translation
before it can be read.
.Pp
Any file that cannot be identified as having been written
in any of the character sets listed above is simply said to be
-.Dq data .
+.Sq data .
.Sh OPTIONS
.Bl -tag -width indent
.It Fl b , -brief
.Dv EMX
application type (only on EMX).
.It text
-Various types of text files.
+Various types of text files (this test will try to guess the text encoding, irrespective of the setting of the
+.Sq encoding
+option).
.It encoding
-Different text encodings.
+Different text encodings for soft magic tests.
.It tokens
Looks for known tokens inside text files.
.It cdf
.It Fl i , -mime
Causes the file command to output mime type strings rather than the more
traditional human readable ones. Thus it may say
-.Dq text/plain; charset=us-ascii
+.Sq text/plain; charset=us-ascii
rather than
-.Dq ASCII text .
+.Sq ASCII text .
In order for this option to work, file changes the way
it handles files recognized by the command itself (such as many of the
text file types, directories etc), and makes use of an alternative
-.Dq magic
+.Sq magic
file.
-(See
-.Dq FILES
-section, below).
+(See the FILES section, below).
.It Fl -mime-type , -mime-encoding
Like
.Fl i ,
.It Fl k , -keep-going
Don't stop at the first match, keep going. Subsequent matches will
have the string
-.Dq "\[rs]012\- "
+.Sq "\[rs]012\- "
prepended.
(If you want a newline, see the
-.Dq "\-r"
+.Sq "\-r"
option.)
.It Fl L , -dereference
option causes symlinks to be followed, as the like-named option in
.Pa $HOME/.magic .
.Nm
adds
-.Dq .mgc
+.Sq .mgc
to the value of this variable as appropriate.
The environment variable
.Dv POSIXLY_CORRECT
as files of magic, apply many bug fixes and improve the build system.
.Pp
The list of contributors to the
-.Dq magic
+.Sq magic
directory (magic files)
is too long to include here.
You know who you are; thank you.
.Pp
The program should provide a way to give an estimate
of
-.Dq how good
+.Sq how good
a guess is.
We end up removing guesses (e.g.
-.Dq From\
+.Sq From\
as first 5 chars of file) because
they are not as good as other guesses (e.g.
-.Dq Newsgroups:
+.Sq Newsgroups:
versus
-.Dq Return-Path:
+.Sq Return-Path:
).
Still, if the others don't pan out, it should be possible to use the
first guess.
#include <stdlib.h>
#ifndef lint
-FILE_RCSID("@(#)$File: ascmagic.c,v 1.68 2008/10/30 10:50:24 rrt Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.1 2008/11/06 21:17:45 rrt Exp $")
#endif /* lint */
private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)
{
size_t mlen;
- int rv = 0, ucs_type;
+ int rv = 1, ucs_type;
unsigned char *nbuf = NULL;
mlen = (nbytes + 1) * sizeof(nbuf[0]);
goto done;
}
+ *type = "text";
if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
*code = "ASCII";
*code_mime = "us-ascii";
- *type = "text";
} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
*code = "UTF-8 Unicode (with BOM)";
*code_mime = "utf-8";
- *type = "text";
} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
*code = "UTF-8 Unicode";
*code_mime = "utf-8";
- *type = "text";
} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
if (ucs_type == 1) {
*code = "Little-endian UTF-16 Unicode";
*code = "Big-endian UTF-16 Unicode";
*code_mime = "utf-16be";
}
- *type = "character data";
} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
*code = "ISO-8859";
- *type = "text";
*code_mime = "iso-8859-1";
} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
*code = "Non-ISO extended-ASCII";
- *type = "text";
*code_mime = "unknown-8bit";
} else {
from_ebcdic(buf, nbytes, nbuf);
if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
*code = "EBCDIC";
- *type = "character data";
*code_mime = "ebcdic";
} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
*code = "International EBCDIC";
- *type = "character data";
*code_mime = "ebcdic";
- } else /* Doesn't look like text at all */
- rv = -1;
+ } else { /* Doesn't look like text at all */
+ rv = 0;
+ *type = "binary";
+ }
}
done:
#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: funcs.c,v 1.47 2008/11/04 16:38:28 christos Exp $")
+FILE_RCSID("@(#)$File: funcs.c,v 1.48 2008/11/06 21:17:45 rrt Exp $")
#endif /* lint */
#include "magic.h"
file_buffer(struct magic_set *ms, int fd, const char *inname, const void *buf,
size_t nb)
{
- int m = 0, rv = 0;
+ int m = 0, rv = 0, looks_text = 0;
int mime = ms->flags & MAGIC_MIME;
const unsigned char *ubuf = CAST(const unsigned char *, buf);
unichar *u8buf = NULL;
(m = file_trycdf(ms, fd, ubuf, nb)) == 0) {
/* try to discover text encoding */
if ((ms->flags & MAGIC_NO_CHECK_ENCODING) == 0)
- file_encoding(ms, ubuf, nb, &u8buf, &ulen, &code, &code_mime, &type);
+ looks_text = file_encoding(ms, ubuf, nb, &u8buf, &ulen, &code, &code_mime, &type);
/* try soft magic tests */
if ((ms->flags & MAGIC_NO_CHECK_SOFT) != 0 ||
(m = file_softmagic(ms, ubuf, nb, BINTEST)) == 0) {
/* try text properties (and possibly text tokens) */
if ((ms->flags & MAGIC_NO_CHECK_TEXT) != 0 ||
- (m = file_ascmagic_with_encoding(ms, ubuf, nb, u8buf, ulen, code, code_mime, type)) == 0) {
+ ((ms->flags & MAGIC_NO_CHECK_ENCODING) != 0 && (m = file_ascmagic(ms, ubuf, nb)) == 0) ||
+ looks_text == 0 || (m = file_ascmagic_with_encoding(ms, ubuf, nb, u8buf, ulen, code, code_mime, type)) == 0) {
/* give up */
if ((!mime || (mime & MAGIC_MIME_TYPE)) &&
file_printf(ms, mime ?
#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: softmagic.c,v 1.127 2008/11/06 15:38:28 christos Exp $")
+FILE_RCSID("@(#)$File: softmagic.c,v 1.128 2008/11/06 21:17:45 rrt Exp $")
#endif /* lint */
#include "magic.h"
return 1;
}
if (ms->flags & MAGIC_MIME) {
- if (ms->flags & MAGIC_MIME_TYPE) {
+ if ((ms->flags & MAGIC_MIME_TYPE) && m->mimetype[0]) {
+ ms->event_flags |= EVENT_WROTE_MIME_TYPE;
if (file_printf(ms, "%s", m->mimetype) == -1)
return -1;
}
- if (ms->flags & MAGIC_MIME_ENCODING) {
- ms->event_flags |= EVENT_WROTE_MIME_TYPE;
- return 0; /* Let ascmagic find the encoding */
- }
- return 1;
+ /* If we want an encoding, let ascmagic find it. */
+ if ((ms->flags & MAGIC_MIME_ENCODING))
+ return 0;
+ /* If we didn't write a MIME type, and we want one,
+ allow ascmagic to run.*/
+ return m->mimetype[0] != '\0';
}
return -2;
}