This check-in mostly fixes a bunch of problems with MIME checks. It's

author Reuben Thomas <rrt@sc3d.org>

Thu, 6 Nov 2008 21:17:45 +0000 (21:17 +0000)

committer Reuben Thomas <rrt@sc3d.org>

Thu, 6 Nov 2008 21:17:45 +0000 (21:17 +0000)
author Reuben Thomas <rrt@sc3d.org>
Thu, 6 Nov 2008 21:17:45 +0000 (21:17 +0000)
committer Reuben Thomas <rrt@sc3d.org>
Thu, 6 Nov 2008 21:17:45 +0000 (21:17 +0000)
diff --git a/ChangeLog b/ChangeLog

index c2e2213ac9793eb17a54bb5d72ee83721cdee10e..97e1d1f2073666a24bd5c0b5a913ab8be8b4d3da 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,8 +1,26 @@
+2008-11-06 23:00  Reuben Thomas <rrt@sc3d.org>
+
+       * Fix --mime, --mime-type and --mime-encoding under new scheme.
+
+        * Rename "ascii" to "text" and add "encoding" test.
+
+       * Return a precise ("utf-16le" or "utf-16be") MIME charset for
+         UTF-16.
+
+       * Fix error in comment caused by automatic indentation adding
+         words!
+
  2008-11-06 10:35  Christos Zoulas <christos@astron.com>
  
         * use memchr instead of strchr because the string
           might not be NUL terminated (Scott MacVicar)
  
+2008-11-03 07:31  Reuben Thomas <rrt@sc3d.org>
+
+       * Fix a printf with a non-literal format string.
+
+       * Fix formatting and punctuation of help for "--apple".
+
  2008-10-30 11:00  Reuben Thomas <rrt@sc3d.org>
  
         * Correct word counts in comments of struct magic.
@@ -109,22 +127,22 @@
  
  2008-05-06 00:13  Robert Byrnes  <byrnes@wildpumpkin.net>
  
-        * src/Makefile.am:
+       * src/Makefile.am:
           Ensure that getopt_long and [v]asprintf are included in libmagic,
           as needed.
  
           Remove unnecessary EXTRA_DIST.
  
-        * src/Makefile.in:
+       * src/Makefile.in:
           Rerun automake.
  
-        * src/vasprintf.c (dispatch):
+       * src/vasprintf.c (dispatch):
           Fix variable precision bug: be sure to step past '*'.
  
-        * src/vasprintf.c (core):
+       * src/vasprintf.c (core):
           Remove unreachable code.
  
-        * src/apprentice.c (set_test_type):
+       * src/apprentice.c (set_test_type):
           Add cast to avoid compiler warning.
  
  2008-04-22 23:45  Christos Zoulas  <christos@astron.com>
@@ -135,12 +153,12 @@
  
  2008-04-04 11:00  Christos Zoulas  <christos@astron.com>
  
-        * >= <= is not supported, so fix the magic and warn about it.
+       * >= <= is not supported, so fix the magic and warn about it.
           reported by: Thien-Thi Nguyen <ttn@gnuvola.org>
  
  2008-03-27 16:16  Robert Byrnes  <byrnes@wildpumpkin.net>
  
-        * src/readelf.c (donote):
+       * src/readelf.c (donote):
           ELF core file command name/line bug fixes and enhancements:
  
           Try larger offsets first to avoid false matches
@@ -166,7 +184,7 @@
         * Clarify UTF-8 BOM message (Reuben Thomas)
  
         * Add HTML comment to token list in names.h
-       
+
  2007-02-04 15:50 Christos Zoulas <christos@astron.com>
  
         * Debian fixes (Reuben Thomas)
@@ -206,7 +224,7 @@
  
  2007-10-28 20:48 Christos Zoulas <christos@astron.com>
  
-       * float and double magic support (Behan Webster) 
+       * float and double magic support (Behan Webster)
  
  2007-10-28 20:48 Christos Zoulas <christos@astron.com>
  
@@ -253,7 +271,7 @@
           be easily parsed:
               mimetype [charset=character-set] [encoding=encoding-mime-type]
  
-         Remove spurious extra text from some MIME type printouts 
+         Remove spurious extra text from some MIME type printouts
           (mostly in is_tar).
  
           Fix one case where -i produced nothing at all (for a 1-byte file,
@@ -283,7 +301,7 @@
  2007-03-15 10:51 Christos Zoulas <christos@astron.com>
  
         * fix fortran and nroff reversed tests (Dmitry V. Levin)
-       
+
         * fix exclude option (Dmitry V. Levin)
  
  2007-02-08 17:30 Christos Zoulas <christos@astron.com>
@@ -302,7 +320,7 @@
         * Add exclude flag.
  
  2007-01-18 05:29 Anon Ymous <do@not.spam.me>
-       
+
         * Move the "type" detection code from parse() into its own table
           driven routine.  This avoids maintaining multiple lists in
           file.h.
@@ -310,7 +328,7 @@
         * Add an optional conditional field (just before the type field).
           This code is wrapped in "#ifdef ENABLE_CONDITIONALS" as it is
           likely to go away.
-       
+
  2007-01-16 23:24 Anon Ymous <do@not.spam.me>
  
         * Fix an initialization bug in check_mem().
@@ -381,7 +399,7 @@
  2006-12-08 16:32 Christos Zoulas <christos@astron.com>
  
         * store and print the line number of the magic
-         entry for debugging.         
+         entry for debugging.
  
         * if the magic entry did not print anything,
           don't treat it as a match
@@ -396,7 +414,7 @@
           file_softmagic.
  
  2006-11-25 13:35 Christos Zoulas <christos@astron.com>
-       
+
         * Don't store the current offset in the magic
           struct, because it needs to be restored and
           it was not done properly all the time. Bug
@@ -486,7 +504,7 @@
         * Look for note sections in non executables.
  
  2005-09-20 13:33 Christos Zoulas <christos@astron.com>
-       
+
         * Don't print SVR4 Style in core files multiple times
             (Radek Vokál)
  
@@ -497,9 +515,9 @@
  2005-08-18 09:53 Christos Zoulas <christos@astron.com>
  
         * Remove erroneous mention of /etc/magic in the file man page
-         This is gentoo bug 101639. (Mike Frysinger) 
+         This is gentoo bug 101639. (Mike Frysinger)
  
-       * Cross-compile support and detection (Mike Frysinger) 
+       * Cross-compile support and detection (Mike Frysinger)
  
  2005-08-12 10:17 Christos Zoulas <christos@astron.com>
  
@@ -531,20 +549,20 @@
         * Avoid NULL pointer dereference in time conversion.
  
  2005-03-06 00:00  Joerg Walter <jwalt@mail.garni.ch>
-       
+
         * Add indirect magic offset support, and search mode.
  
  2005-01-12 00:00  Stepan Kasal  <kasal@ucw.cz>
  
-        * src/ascmagic.c (file_ascmagic): Fix three bugs about text files:
-          If a CRLF text file happens to have CR at offset HOWMANY - 1
-          (currently 0xffff), it should not be counted as CR line
-          terminator.
-          If a line has length exactly MAXLINELEN, it should not yet be
-          treated as a ``very long line'', as MAXLINELEN is ``longest sane
-          line length''.
-          With CRLF, the line length was not computed correctly, and even
-          lines of length MAXLINELEN - 1 were treated as ``very long''.
+       * src/ascmagic.c (file_ascmagic): Fix three bugs about text files:
+         If a CRLF text file happens to have CR at offset HOWMANY - 1
+         (currently 0xffff), it should not be counted as CR line
+         terminator.
+         If a line has length exactly MAXLINELEN, it should not yet be
+         treated as a ``very long line'', as MAXLINELEN is ``longest sane
+         line length''.
+         With CRLF, the line length was not computed correctly, and even
+         lines of length MAXLINELEN - 1 were treated as ``very long''.
  
  2004-12-07 14:15  Christos Zoulas  <christos@astron.com>
  
@@ -579,12 +597,12 @@
  
         * Remove 3rd and 4th copyright clause; approved by Ian Darwin.
  
-       * Fix small memory leaks; caught by: Tamas Sarlos 
+       * Fix small memory leaks; caught by: Tamas Sarlos
             <stamas@csillag.ilab.sztaki.hu>
  
  2004-07-24 16:33  Christos Zoulas  <christos@astron.com>
  
-       * magic.mime update Danny Milosavljevic <danny.milo@gmx.net> 
+       * magic.mime update Danny Milosavljevic <danny.milo@gmx.net>
  
         * FreeBSD version update Oliver Eikemeier <eikemeier@fillmore-labs.com>
  
diff --git a/doc/file.man b/doc/file.man

index 77522410981dd2c5e98b6884130d403a6b70c2b9..db0c4498f0bae12c51dd390e89b7d24bd30416d7 100644 (file)
--- a/doc/file.man
+++ b/doc/file.man
@@ -1,4 +1,4 @@
-.\" $File: file.man,v 1.76 2008/10/18 20:47:47 christos Exp $
+.\" $File: file.man,v 1.77 2008/10/30 10:50:24 rrt Exp $
  .Dd October 9, 2008
  .Dt FILE __CSECTION__
  .Os
@@ -41,12 +41,12 @@ characters and is probably safe to read on an
  terminal),
  .Em executable
  (the file contains the result of compiling a program
-in a form understandable to some 
+in a form understandable to some
  .Dv UNIX
  kernel or another),
  or
  .Em data
-meaning anything else (data is usually 
+meaning anything else (data is usually
  .Sq binary
  or non-printable).
  Exceptions are well-known file formats (core files, tar archives)
@@ -54,12 +54,12 @@ that are known to contain binary data.
  When modifying magic files or the program itself, make sure to
  .Em "preserve these keywords" .
  Users depend on knowing that all the readable files in a directory
-have the word 
+have the word
  .Dq text
  printed.
-Don't do as Berkeley did and change 
+Don't do as Berkeley did and change
  .Dq shell commands text
-to 
+to
  .Dq shell script .
  .Pp
  The filesystem tests are based on examining the return from a
@@ -78,16 +78,16 @@ The magic tests are used to check for files with data in
  particular fixed formats.
  The canonical example of this is a binary executable (compiled program)
  .Dv a.out
-file, whose format is defined in 
+file, whose format is defined in
  .In elf.h ,
  .In a.out.h
  and possibly
  .In exec.h
  in the standard include directory.
-These files have a 
+These files have a
  .Sq "magic number"
  stored in a particular place
-near the beginning of the file that tells the 
+near the beginning of the file that tells the
  .Dv UNIX operating system
  that the file is a binary executable, and which of several types thereof.
  The concept of a
@@ -116,10 +116,10 @@ ranges and sequences of bytes that constitute printable text
  in each set.
  If a file passes any of these tests, its character set is reported.
  ASCII, ISO-8859-x, UTF-8, and extended-ASCII files are identified
-as 
+as
  .Dq text
  because they will be mostly readable on nearly any terminal;
-UTF-16 and EBCDIC are only 
+UTF-16 and EBCDIC are only
  .Dq character data
  because, while
  they contain text, it is text that will require translation
@@ -144,13 +144,13 @@ For example, the keyword
  .Em .br
  indicates that the file is most likely a
  .Xr troff 1
-input file, just as the keyword 
+input file, just as the keyword
  .Em struct
  indicates a C program.
  These tests are less reliable than the previous
  two groups, so they are performed last.
  The language test routines also test for some miscellany
-(such as 
+(such as
  .Xr tar 1
  archives).
  .Pp
@@ -177,38 +177,39 @@ from the list of tests made to determine the file type. Valid test names
  are:
  .Bl -tag -width
  .It apptype
-Check for
  .Dv EMX
  application type (only on EMX).
-.It ascii
-Check for various types of ascii files.
+.It text
+Various types of text files.
+.It encoding
+Different text encodings.
+.It tokens
+Looks for known tokens inside text files.
  .It cdf
-Don't look for Compound Document Files.
+Prints details of Compound Document Files.
  .It compress
-Don't look for, or inside compressed files.
+Checks for, and looks inside, compressed files.
  .It elf
-Don't print elf details.
+Prints ELF file details.
  .It soft
-Don't consult magic files.
+Consults magic files.
  .It tar
-Don't examine tar files.
-.It tokens
-Don't look for known tokens inside ascii files.
+Examines tar files.
  .El
  .It Fl f , -files-from Ar namefile
-Read the names of the files to be examined from 
+Read the names of the files to be examined from
  .Ar namefile
-(one per line) 
+(one per line)
  before the argument list.
-Either 
+Either
  .Ar namefile
  or at least one filename argument must be present;
-to test the standard input, use 
+to test the standard input, use
  .Sq -
  as a filename argument.
  .It Fl F , -separator Ar separator
  Use the specified string as the separator between the filename and the
-file result returned. Defaults to 
+file result returned. Defaults to
  .Sq \&: .
  .It Fl h , -no-dereference
  option causes symlinks not to be followed
@@ -345,47 +346,47 @@ options.
  .Sh STANDARDS CONFORMANCE
  This program is believed to exceed the System V Interface Definition
  of FILE(CMD), as near as one can determine from the vague language
-contained therein. 
+contained therein.
  Its behavior is mostly compatible with the System V program of the same name.
  This version knows more magic, however, so it will produce
-different (albeit more accurate) output in many cases. 
+different (albeit more accurate) output in many cases.
  .\" URL: http://www.opengroup.org/onlinepubs/009695399/utilities/file.html
  .Pp
-The one significant difference 
+The one significant difference
  between this version and System V
  is that this version treats any white space
  as a delimiter, so that spaces in pattern strings must be escaped.
  For example,
-.Bd -literal -offset indent 
+.Bd -literal -offset indent
  >10    string  language impress\       (imPRESS data)
  .Ed
  .Pp
  in an existing magic file would have to be changed to
-.Bd -literal -offset indent 
+.Bd -literal -offset indent
  >10    string  language\e impress      (imPRESS data)
  .Ed
  .Pp
  In addition, in this version, if a pattern string contains a backslash,
  it must be escaped.
  For example
-.Bd -literal -offset indent 
+.Bd -literal -offset indent
  0      string          \ebegindata     Andrew Toolkit document
  .Ed
  .Pp
  in an existing magic file would have to be changed to
-.Bd -literal -offset indent 
+.Bd -literal -offset indent
  0      string          \e\ebegindata   Andrew Toolkit document
  .Ed
  .Pp
  SunOS releases 3.2 and later from Sun Microsystems include a
-.Nm 
+.Nm
  command derived from the System V one, but with some extensions.
  My version differs from Sun's only in minor ways.
-It includes the extension of the 
+It includes the extension of the
  .Sq &
  operator, used as,
  for example,
-.Bd -literal -offset indent 
+.Bd -literal -offset indent
  >16    long&0x7fffffff >0              not stripped
  .Ed
  .Sh MAGIC DIRECTORY
@@ -393,7 +394,7 @@ The magic file entries have been collected from various sources,
  mainly USENET, and contributed by various authors.
  Christos Zoulas (address below) will collect additional
  or corrected magic file entries.
-A consolidation of magic file entries 
+A consolidation of magic file entries
  will be distributed periodically.
  .Pp
  The order of entries in the magic file is significant.
@@ -403,14 +404,14 @@ If your old
  .Nm
  command uses a magic file,
  keep the old magic file around for comparison purposes
-(rename it to 
+(rename it to
  .Pa __MAGIC__.orig ).
  .Sh EXAMPLES
-.Bd -literal -offset indent 
+.Bd -literal -offset indent
  $ file file.c file /dev/{wd0a,hda}
  file.c:   C program text
  file:     ELF 32-bit LSB executable, Intel 80386, version 1 (SYSV),
-          dynamically linked (uses shared libs), stripped
+         dynamically linked (uses shared libs), stripped
  /dev/wd0a: block special (0/0)
  /dev/hda: block special (3/0)
  
@@ -439,9 +440,9 @@ file:        application/x-executable
  
  .Ed
  .Sh HISTORY
-There has been a 
-.Nm 
-command in every 
+There has been a
+.Nm
+command in every
  .Dv UNIX since at least Research Version 4
  (man page dated November, 1973).
  The System V version introduced one significant major change:
@@ -464,7 +465,7 @@ Primary development and maintenance from 1990 to the present by
  Christos Zoulas (christos@astron.com).
  .Pp
  Altered by Chris Lowth, chris@lowth.com, 2000:
-Handle the 
+Handle the
  .Fl i
  option to output mime type strings, using an alternative
  magic file and internal logic.
@@ -510,10 +511,10 @@ files.
  The support for text files (primarily for programming languages)
  is simplistic, inefficient and requires recompilation to update.
  .Pp
-The list of keywords in 
+The list of keywords in
  .Dv ascmagic
  probably belongs in the Magic file.
-This could be done by using some keyword like 
+This could be done by using some keyword like
  .Sq *
  for the offset value.
  .Pp
@@ -521,20 +522,20 @@ Complain about conflicts in the magic file entries.
  Make a rule that the magic entries sort based on file offset rather
  than position within the magic file?
  .Pp
-The program should provide a way to give an estimate 
-of 
+The program should provide a way to give an estimate
+of
  .Dq how good
  a guess is.
-We end up removing guesses (e.g. 
-.Dq From\ 
+We end up removing guesses (e.g.
+.Dq From\
  as first 5 chars of file) because
-they are not as good as other guesses (e.g. 
+they are not as good as other guesses (e.g.
  .Dq Newsgroups:
  versus
  .Dq Return-Path:
  ).
  Still, if the others don't pan out, it should be possible to use the
-first guess.  
+first guess.
  .Pp
  This manual page, and particularly this section, is too long.
  .Sh RETURN CODE
diff --git a/src/Makefile.am b/src/Makefile.am

index 33a2d4063bbec4e389ef7af8bc9531f5f167ff0c..db78d96e49d4514c34bd8d82c371b1e6445c1b1c 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -8,7 +8,7 @@ AM_CPPFLAGS = -DMAGIC='"$(MAGIC)"'
  AM_CFLAGS = @WARNINGS@
  
  libmagic_la_SOURCES = magic.c apprentice.c softmagic.c ascmagic.c \
-       compress.c is_tar.c readelf.c print.c fsmagic.c \
+       encoding.c compress.c is_tar.c readelf.c print.c fsmagic.c \
         funcs.c file.h names.h patchlevel.h readelf.h tar.h apptype.c \
         file_opts.h elfclass.h mygetopt.h cdf.c cdf_time.c readcdf.c cdf.h
  libmagic_la_LDFLAGS = -no-undefined -version-info 1:0:0
diff --git a/src/ascmagic.c b/src/ascmagic.c

index 5c5574f506331da3f4b76da6eb2b48ade032a0e4..cc63243e27bc66089321d3a4de7599d767d75d29 100644 (file)
--- a/src/ascmagic.c
+++ b/src/ascmagic.c
@@ -2,7 +2,7 @@
   * Copyright (c) Ian F. Darwin 1986-1995.
   * Software written by Ian F. Darwin and others;
   * maintained 1995-present by Christos Zoulas and others.
- * 
+ *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@ -12,7 +12,7 @@
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in the
   *    documentation and/or other materials provided with the distribution.
- *  
+ *
   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -31,15 +31,12 @@
   *
   * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
   * to handle character codes other than ASCII on a unified basis.
- *
- * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
- * international characters, now subsumed into this file.
   */
  
  #include "file.h"
  
  #ifndef        lint
-FILE_RCSID("@(#)$File: ascmagic.c,v 1.68 2008/10/30 10:50:24 rrt Exp $")
+FILE_RCSID("@(#)$File: ascmagic.c,v 1.69 2008/11/04 16:38:28 christos Exp $")
  #endif /* lint */
  
  #include "magic.h"
@@ -57,32 +54,63 @@ FILE_RCSID("@(#)$File: ascmagic.c,v 1.68 2008/10/30 10:50:24 rrt Exp $")
  #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
                   || (x) == 0x85 || (x) == '\f')
  
-private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
-private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
-    size_t *);
-private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
-private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
-private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
-private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
  private int ascmatch(const unsigned char *, const unichar *, size_t);
  private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
+private size_t trim_nuls(const unsigned char *, size_t);
  
+/*
+ * Undo the NUL-termination kindly provided by process(): return nbytes less
+ * any trailing '\0' bytes of buf, but never shrink it below one byte.
+ */
+private size_t
+trim_nuls(const unsigned char *buf, size_t nbytes)
+{
+       while (nbytes > 1 && buf[nbytes - 1] == '\0')
+               nbytes--;
+
+       return nbytes;
+}
  
  protected int
  file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
  {
-       size_t i;
-       unsigned char *nbuf = NULL, *utf8_buf = NULL, *utf8_end;
-       unichar *ubuf = NULL;   
-       size_t ulen, mlen;
-       const struct names *p;
-       const char *encoding = "binary";
+       unichar *ubuf = NULL;
+       size_t ulen;
         int rv = -1;
-       int mime = ms->flags & MAGIC_MIME;
  
         const char *code = NULL;
         const char *code_mime = NULL;
         const char *type = NULL;
+
+       if (ms->flags & MAGIC_APPLE)
+               return 0;
+
+       nbytes = trim_nuls(buf, nbytes);
+
+       /* If file doesn't look like any sort of text, give up. */
+       if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime, &type) == 0) {
+               rv = 0;
+               goto done;
+       }
+
+       rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code, code_mime, type);
+
+ done:
+       if (ubuf)
+               free(ubuf);
+
+       return rv;
+}
+
+protected int
+file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t ulen, const char *code, const char *code_mime, const char *type)
+{
+       unsigned char *utf8_buf = NULL, *utf8_end;
+       size_t mlen, i;
+       const struct names *p;
+       int rv = -1;
+       int mime = ms->flags & MAGIC_MIME;
+
         const char *subtype = NULL;
         const char *subtype_mime = NULL;
  
@@ -101,90 +129,17 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
         if (ms->flags & MAGIC_APPLE)
                 return 0;
  
-       /*
-        * Undo the NUL-termination kindly provided by process()
-        * but leave at least one byte to look at
-        */
-       while (nbytes > 1 && buf[nbytes - 1] == '\0')
-               nbytes--;
-
-       mlen = (nbytes + 1) * sizeof(nbuf[0]);
-       if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) {
-               file_oomem(ms, mlen);
-               goto done;
-       }
-       mlen = (nbytes + 1) * sizeof(ubuf[0]);
-       if ((ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) {
-               file_oomem(ms, mlen);
-               goto done;
-       }
-
-       /*
-        * Then try to determine whether it's any character code we can
-        * identify.  Each of these tests, if it succeeds, will leave
-        * the text converted into one-unichar-per-character Unicode in
-        * ubuf, and the number of characters converted in ulen.
-        */
-       if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
-               code = "ASCII";
-               code_mime = "us-ascii";
-               type = "text";
-               encoding = "7bit";
-       } else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) {
-               code = "UTF-8 Unicode (with BOM)";
-               code_mime = "utf-8";
-               type = "text";
-       } else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) {
-               code = "UTF-8 Unicode";
-               code_mime = "utf-8";
-               type = "text";
-       } else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) {
-               if (i == 1)
-                       code = "Little-endian UTF-16 Unicode";
-               else
-                       code = "Big-endian UTF-16 Unicode";
-
-               type = "character data";
-               code_mime = "utf-16";    /* is this defined? */
-       } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
-               code = "ISO-8859";
-               type = "text";
-               code_mime = "iso-8859-1"; 
-       } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
-               code = "Non-ISO extended-ASCII";
-               type = "text";
-               code_mime = "unknown";
-       } else {
-               from_ebcdic(buf, nbytes, nbuf);
-
-               if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
-                       code = "EBCDIC";
-                       type = "character data";
-                       code_mime = "ebcdic";
-               } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
-                       code = "International EBCDIC";
-                       type = "character data";
-                       code_mime = "ebcdic";
-               } else {
-                       if (mime == MAGIC_MIME_ENCODING)
-                               if (file_printf(ms, "%s", encoding) == -1)
-                                       goto done;
-                       rv = 0;
-                       goto done;  /* doesn't look like text at all */
-               }
-       }
+       nbytes = trim_nuls(buf, nbytes);
  
+       /* If we have fewer than 2 bytes, give up. */
         if (nbytes <= 1) {
                 rv = 0;
                 goto done;
         }
  
         /* Convert ubuf to UTF-8 and try text soft magic */
-       /* If original was ASCII or UTF-8, could use nbuf instead of
-          re-converting. */
         /* malloc size is a conservative overestimate; could be
-          re-converting improved, or at least realloced after
-          re-converting conversion. */
+          improved, or at least realloced after conversion. */
         mlen = ulen * 6;
         if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
                 file_oomem(ms, mlen);
@@ -270,7 +225,8 @@ subtype_identified:
                 n_cr++;
  
         if (mime) {
-               if (mime & MAGIC_MIME_TYPE) {
+               if ((mime & MAGIC_MIME_TYPE) &&
+                   !(ms->event_flags && EVENT_WROTE_MIME_TYPE)) {
                         if (subtype_mime) {
                                 if (file_printf(ms, "%s", subtype_mime) == -1)
                                         goto done;
@@ -289,22 +245,18 @@ subtype_identified:
                 }
  
                 if (mime == MAGIC_MIME_ENCODING)
-                       if (file_printf(ms, "%s", encoding) == -1)
+                       if (file_printf(ms, "%s", code_mime) == -1)
                                 goto done;
         } else {
                 if (file_printf(ms, "%s", code) == -1)
                         goto done;
  
                 if (subtype) {
-                       if (file_printf(ms, " ") == -1)
-                               goto done;
-                       if (file_printf(ms, "%s", subtype) == -1)
+                       if (file_printf(ms, " %s", subtype) == -1)
                                 goto done;
                 }
  
-               if (file_printf(ms, " ") == -1)
-                       goto done;
-               if (file_printf(ms, "%s", type) == -1)
+               if (file_printf(ms, " %s", type) == -1)
                         goto done;
  
                 if (has_long_lines)
@@ -320,7 +272,7 @@ subtype_identified:
                         if (file_printf(ms, ", with") == -1)
                                 goto done;
  
-                       if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)                        {
+                       if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
                                 if (file_printf(ms, " no") == -1)
                                         goto done;
                         } else {
@@ -363,10 +315,6 @@ subtype_identified:
         }
         rv = 1;
  done:
-       if (nbuf)
-               free(nbuf);
-       if (ubuf)
-               free(ubuf);
         if (utf8_buf)
                 free(utf8_buf);
  
@@ -389,144 +337,6 @@ ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
                 return 1;
  }
  
-/*
- * This table reflects a particular philosophy about what constitutes
- * "text," and there is room for disagreement about it.
- *
- * Version 3.31 of the file command considered a file to be ASCII if
- * each of its characters was approved by either the isascii() or
- * isalpha() function.  On most systems, this would mean that any
- * file consisting only of characters in the range 0x00 ... 0x7F
- * would be called ASCII text, but many systems might reasonably
- * consider some characters outside this range to be alphabetic,
- * so the file command would call such characters ASCII.  It might
- * have been more accurate to call this "considered textual on the
- * local system" than "ASCII."
- *
- * It considered a file to be "International language text" if each
- * of its characters was either an ASCII printing character (according
- * to the real ASCII standard, not the above test), a character in
- * the range 0x80 ... 0xFF, or one of the following control characters:
- * backspace, tab, line feed, vertical tab, form feed, carriage return,
- * escape.  No attempt was made to determine the language in which files
- * of this type were written.
- *
- *
- * The table below considers a file to be ASCII if all of its characters
- * are either ASCII printing characters (again, according to the X3.4
- * standard, not isascii()) or any of the following controls: bell,
- * backspace, tab, line feed, form feed, carriage return, esc, nextline.
- *
- * I include bell because some programs (particularly shell scripts)
- * use it literally, even though it is rare in normal text.  I exclude
- * vertical tab because it never seems to be used in real text.  I also
- * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
- * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
- * character to.  It might be more appropriate to include it in the 8859
- * set instead of the ASCII set, but it's got to be included in *something*
- * we recognize or EBCDIC files aren't going to be considered textual.
- * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
- * and Latin characters, so these should possibly be allowed.  But they
- * make a real mess on VT100-style displays if they're not paired properly,
- * so we are probably better off not calling them text.
- *
- * A file is considered to be ISO-8859 text if its characters are all
- * either ASCII, according to the above definition, or printing characters
- * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
- *
- * Finally, a file is considered to be international text from some other
- * character code if its characters are all either ISO-8859 (according to
- * the above definition) or characters in the range 0x80 ... 0x9F, which
- * ISO-8859 considers to be control characters but the IBM PC and Macintosh
- * consider to be printing characters.
- */
-
-#define F 0   /* character never appears in text */
-#define T 1   /* character appears in plain ASCII text */
-#define I 2   /* character appears in ISO-8859 text */
-#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
-
-private char text_chars[256] = {
-       /*                  BEL BS HT LF    FF CR    */
-       F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
-        /*                              ESC          */
-       F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
-       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
-       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
-       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
-       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
-       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
-       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
-       /*            NEL                            */
-       X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
-       X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
-       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
-       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
-       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
-       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
-       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
-       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
-};
-
-private int
-looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
-    size_t *ulen)
-{
-       size_t i;
-
-       *ulen = 0;
-
-       for (i = 0; i < nbytes; i++) {
-               int t = text_chars[buf[i]];
-
-               if (t != T)
-                       return 0;
-
-               ubuf[(*ulen)++] = buf[i];
-       }
-
-       return 1;
-}
-
-private int
-looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
-{
-       size_t i;
-
-       *ulen = 0;
-
-       for (i = 0; i < nbytes; i++) {
-               int t = text_chars[buf[i]];
-
-               if (t != T && t != I)
-                       return 0;
-
-               ubuf[(*ulen)++] = buf[i];
-       }
-
-       return 1;
-}
-
-private int
-looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
-    size_t *ulen)
-{
-       size_t i;
-
-       *ulen = 0;
-
-       for (i = 0; i < nbytes; i++) {
-               int t = text_chars[buf[i]];
-
-               if (t != T && t != I && t != X)
-                       return 0;
-
-               ubuf[(*ulen)++] = buf[i];
-       }
-
-       return 1;
-}
-
  /*
   * Encode Unicode string as UTF-8, returning pointer to character
   * after end of string, or NULL if an invalid character is found.
@@ -583,226 +393,3 @@ encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
  
         return buf;
  }
-
-/*
- * Decide whether some text looks like UTF-8. Returns:
- *
- *     -1: invalid UTF-8
- *      0: uses odd control characters, so doesn't look like text
- *      1: 7-bit text
- *      2: definitely UTF-8 text (valid high-bit set bytes)
- *
- * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
- * ubuf must be big enough!
- */
-protected int
-file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
-{
-       size_t i;
-       int n;
-       unichar c;
-       int gotone = 0, ctrl = 0;
-
-       if (ubuf)
-               *ulen = 0;
-
-       for (i = 0; i < nbytes; i++) {
-               if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
-                       /*
-                        * Even if the whole file is valid UTF-8 sequences,
-                        * still reject it if it uses weird control characters.
-                        */
-
-                       if (text_chars[buf[i]] != T)
-                               ctrl = 1;
-
-                       if (ubuf)
-                               ubuf[(*ulen)++] = buf[i];
-               } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
-                       return -1;
-               } else {                           /* 11xxxxxx begins UTF-8 */
-                       int following;
-
-                       if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
-                               c = buf[i] & 0x1f;
-                               following = 1;
-                       } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
-                               c = buf[i] & 0x0f;
-                               following = 2;
-                       } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
-                               c = buf[i] & 0x07;
-                               following = 3;
-                       } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
-                               c = buf[i] & 0x03;
-                               following = 4;
-                       } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
-                               c = buf[i] & 0x01;
-                               following = 5;
-                       } else
-                               return -1;
-
-                       for (n = 0; n < following; n++) {
-                               i++;
-                               if (i >= nbytes)
-                                       goto done;
-
-                               if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
-                                       return -1;
-
-                               c = (c << 6) + (buf[i] & 0x3f);
-                       }
-
-                       if (ubuf)
-                               ubuf[(*ulen)++] = c;
-                       gotone = 1;
-               }
-       }
-done:
-       return ctrl ? 0 : (gotone ? 2 : 1);
-}
-
-/*
- * Decide whether some text looks like UTF-8 with BOM. If there is no
- * BOM, return -1; otherwise return the result of looks_utf8 on the
- * rest of the text.
- */
-private int
-looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
-    size_t *ulen)
-{
-       if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
-               return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
-       else
-               return -1;
-}
-
-private int
-looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
-    size_t *ulen)
-{
-       int bigend;
-       size_t i;
-
-       if (nbytes < 2)
-               return 0;
-
-       if (buf[0] == 0xff && buf[1] == 0xfe)
-               bigend = 0;
-       else if (buf[0] == 0xfe && buf[1] == 0xff)
-               bigend = 1;
-       else
-               return 0;
-
-       *ulen = 0;
-
-       for (i = 2; i + 1 < nbytes; i += 2) {
-               /* XXX fix to properly handle chars > 65536 */
-
-               if (bigend)
-                       ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
-               else
-                       ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
-
-               if (ubuf[*ulen - 1] == 0xfffe)
-                       return 0;
-               if (ubuf[*ulen - 1] < 128 &&
-                   text_chars[(size_t)ubuf[*ulen - 1]] != T)
-                       return 0;
-       }
-
-       return 1 + bigend;
-}
-
-#undef F
-#undef T
-#undef I
-#undef X
-
-/*
- * This table maps each EBCDIC character to an (8-bit extended) ASCII
- * character, as specified in the rationale for the dd(1) command in
- * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
- *
- * Unfortunately it does not seem to correspond exactly to any of the
- * five variants of EBCDIC documented in IBM's _Enterprise Systems
- * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
- * Edition, July, 1999, pp. I-1 - I-4.
- *
- * Fortunately, though, all versions of EBCDIC, including this one, agree
- * on most of the printing characters that also appear in (7-bit) ASCII.
- * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
- *
- * Fortunately too, there is general agreement that codes 0x00 through
- * 0x3F represent control characters, 0x41 a nonbreaking space, and the
- * remainder printing characters.
- *
- * This is sufficient to allow us to identify EBCDIC text and to distinguish
- * between old-style and internationalized examples of text.
- */
-
-private unsigned char ebcdic_to_ascii[] = {
-  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
- 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
-128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
-144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
-' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
-'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
-'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
-186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
-195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
-202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
-209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
-216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
-'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
-'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
-'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
-'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
-};
-
-#ifdef notdef
-/*
- * The following EBCDIC-to-ASCII table may relate more closely to reality,
- * or at least to modern reality.  It comes from
- *
- *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
- *
- * and maps the characters of EBCDIC code page 1047 (the code used for
- * Unix-derived software on IBM's 390 systems) to the corresponding
- * characters from ISO 8859-1.
- *
- * If this table is used instead of the above one, some of the special
- * cases for the NEL character can be taken out of the code.
- */
-
-private unsigned char ebcdic_1047_to_8859[] = {
-0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
-0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
-0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
-0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
-0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
-0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
-0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
-0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
-0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
-0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
-0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
-0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
-0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
-0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
-0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
-0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
-};
-#endif
-
-/*
- * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
- */
-private void
-from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
-{
-       size_t i;
-
-       for (i = 0; i < nbytes; i++) {
-               out[i] = ebcdic_to_ascii[buf[i]];
-       }
-}
diff --git a/src/encoding.c b/src/encoding.c

new file mode 100644 (file)

index 0000000..fd36835
--- /dev/null
+++ b/src/encoding.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) Ian F. Darwin 1986-1995.
+ * Software written by Ian F. Darwin and others;
+ * maintained 1995-present by Christos Zoulas and others.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice immediately at the beginning of the file, without modification,
+ *    this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Encoding -- determine the character encoding of a text file.
+ *
+ * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
+ * international characters.
+ */
+
+#include "file.h"
+#include "magic.h"
+#include <string.h>
+#include <memory.h>
+#include <stdlib.h>
+
+#ifndef        lint
+FILE_RCSID("@(#)$File: ascmagic.c,v 1.68 2008/10/30 10:50:24 rrt Exp $")
+#endif /* lint */
+
+private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
+    size_t *);
+private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
+private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
+
+/*
+ * Try to determine whether text is in some character code we can
+ * identify.  The tests are tried in a fixed order: ASCII, UTF-8 with
+ * BOM, UTF-8, UTF-16, ISO-8859, non-ISO extended ASCII and finally
+ * EBCDIC (after translation through from_ebcdic).  The first test that
+ * succeeds leaves the text converted into one-unichar-per-character
+ * Unicode in *ubuf, and the number of characters converted in *ulen.
+ *
+ * On success *code, *code_mime and *type point at string literals
+ * describing the encoding.
+ *
+ * *ubuf is either NULL or a calloc()ed buffer on return; it is never
+ * freed here, whatever the return value.
+ *
+ * Returns 0 if an encoding was identified, and -1 if the data doesn't
+ * look like text at all or if memory could not be allocated (in which
+ * case file_oomem() has been called and the output strings are not
+ * set).
+ */
+protected int
+file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)
+{
+       size_t mlen;
+       int rv = 0, ucs_type;
+       unsigned char *nbuf = NULL;
+
+       /* Never leave the caller's pointer indeterminate on early exit. */
+       *ubuf = NULL;
+
+       /* Scratch buffer for the EBCDIC -> ASCII translation below. */
+       mlen = (nbytes + 1) * sizeof(nbuf[0]);
+       if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) {
+               file_oomem(ms, mlen);
+               /* Must not report success: no output has been set. */
+               rv = -1;
+               goto done;
+       }
+       mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
+       if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) {
+               file_oomem(ms, mlen);
+               rv = -1;
+               goto done;
+       }
+
+       if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
+               *code = "ASCII";
+               *code_mime = "us-ascii";
+               *type = "text";
+       } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
+               *code = "UTF-8 Unicode (with BOM)";
+               *code_mime = "utf-8";
+               *type = "text";
+       } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
+               /* > 1: plain 7-bit text was already caught by looks_ascii */
+               *code = "UTF-8 Unicode";
+               *code_mime = "utf-8";
+               *type = "text";
+       } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
+               /* looks_ucs16: 1 is little-endian, 2 is big-endian */
+               if (ucs_type == 1) {
+                       *code = "Little-endian UTF-16 Unicode";
+                       *code_mime = "utf-16le";
+               } else {
+                       *code = "Big-endian UTF-16 Unicode";
+                       *code_mime = "utf-16be";
+               }
+               *type = "character data";
+       } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
+               *code = "ISO-8859";
+               *type = "text";
+               *code_mime = "iso-8859-1";
+       } else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
+               *code = "Non-ISO extended-ASCII";
+               *type = "text";
+               *code_mime = "unknown-8bit";
+       } else {
+               from_ebcdic(buf, nbytes, nbuf);
+
+               if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
+                       *code = "EBCDIC";
+                       *type = "character data";
+                       *code_mime = "ebcdic";
+               } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
+                       *code = "International EBCDIC";
+                       *type = "character data";
+                       *code_mime = "ebcdic";
+               } else /* Doesn't look like text at all */
+                       rv = -1;
+       }
+
+ done:
+       /* free(NULL) is a no-op, so no guard is needed. */
+       free(nbuf);
+
+       return rv;
+}
+
+/*
+ * This table reflects a particular philosophy about what constitutes
+ * "text," and there is room for disagreement about it.
+ *
+ * Version 3.31 of the file command considered a file to be ASCII if
+ * each of its characters was approved by either the isascii() or
+ * isalpha() function.  On most systems, this would mean that any
+ * file consisting only of characters in the range 0x00 ... 0x7F
+ * would be called ASCII text, but many systems might reasonably
+ * consider some characters outside this range to be alphabetic,
+ * so the file command would call such characters ASCII.  It might
+ * have been more accurate to call this "considered textual on the
+ * local system" than "ASCII."
+ *
+ * It considered a file to be "International language text" if each
+ * of its characters was either an ASCII printing character (according
+ * to the real ASCII standard, not the above test), a character in
+ * the range 0x80 ... 0xFF, or one of the following control characters:
+ * backspace, tab, line feed, vertical tab, form feed, carriage return,
+ * escape.  No attempt was made to determine the language in which files
+ * of this type were written.
+ *
+ *
+ * The table below considers a file to be ASCII if all of its characters
+ * are either ASCII printing characters (again, according to the X3.4
+ * standard, not isascii()) or any of the following controls: bell,
+ * backspace, tab, line feed, form feed, carriage return, esc, nextline.
+ *
+ * I include bell because some programs (particularly shell scripts)
+ * use it literally, even though it is rare in normal text.  I exclude
+ * vertical tab because it never seems to be used in real text.  I also
+ * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
+ * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
+ * character to.  It might be more appropriate to include it in the 8859
+ * set instead of the ASCII set, but it's got to be included in *something*
+ * we recognize or EBCDIC files aren't going to be considered textual.
+ * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
+ * and Latin characters, so these should possibly be allowed.  But they
+ * make a real mess on VT100-style displays if they're not paired properly,
+ * so we are probably better off not calling them text.
+ *
+ * A file is considered to be ISO-8859 text if its characters are all
+ * either ASCII, according to the above definition, or printing characters
+ * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
+ *
+ * Finally, a file is considered to be international text from some other
+ * character code if its characters are all either ISO-8859 (according to
+ * the above definition) or characters in the range 0x80 ... 0x9F, which
+ * ISO-8859 considers to be control characters but the IBM PC and Macintosh
+ * consider to be printing characters.
+ */
+
+#define F 0   /* character never appears in text */
+#define T 1   /* character appears in plain ASCII text */
+#define I 2   /* character appears in ISO-8859 text */
+#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
+
+/*
+ * Classification of each of the 256 possible byte values, indexed by
+ * the byte itself.  Every entry is one of F, T, I or X as defined
+ * above; each row below covers 16 consecutive byte values.
+ */
+private char text_chars[256] = {
+       /*                  BEL BS HT LF    FF CR    */
+       F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
+       /*                              ESC          */
+       F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
+       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
+       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
+       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
+       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
+       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
+       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
+       /*            NEL                            */
+       X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
+       X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
+       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
+       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
+       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
+       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
+       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
+       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
+};
+
+/*
+ * Return 1 if every byte of buf[0 ... nbytes-1] is classified T (plain
+ * ASCII text) in text_chars, 0 otherwise.  Accepted bytes are copied
+ * into ubuf and counted in *ulen as the scan proceeds, so on failure
+ * ubuf/*ulen hold the prefix that was accepted before the bad byte.
+ */
+private int
+looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+    size_t *ulen)
+{
+       size_t pos;
+
+       *ulen = 0;
+
+       for (pos = 0; pos < nbytes; pos++) {
+               const unsigned char ch = buf[pos];
+
+               if (text_chars[ch] != T)
+                       return 0;
+
+               ubuf[*ulen] = ch;
+               ++*ulen;
+       }
+
+       return 1;
+}
+
+/*
+ * Return 1 if every byte of buf[0 ... nbytes-1] is classified T (plain
+ * ASCII text) or I (ISO-8859 text) in text_chars, 0 otherwise.
+ * Accepted bytes are copied into ubuf and counted in *ulen as the scan
+ * proceeds.
+ */
+private int
+looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
+{
+       size_t pos;
+
+       *ulen = 0;
+
+       for (pos = 0; pos < nbytes; pos++) {
+               const unsigned char ch = buf[pos];
+
+               switch (text_chars[ch]) {
+               case T:
+               case I:
+                       break;
+               default:
+                       return 0;
+               }
+
+               ubuf[*ulen] = ch;
+               ++*ulen;
+       }
+
+       return 1;
+}
+
+/*
+ * Return 1 if every byte of buf[0 ... nbytes-1] is classified T, I or
+ * X in text_chars (that is, anything but F), 0 otherwise.  Accepted
+ * bytes are copied into ubuf and counted in *ulen as the scan proceeds.
+ */
+private int
+looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+    size_t *ulen)
+{
+       size_t pos;
+
+       *ulen = 0;
+
+       for (pos = 0; pos < nbytes; pos++) {
+               const unsigned char ch = buf[pos];
+               const int cls = text_chars[ch];
+
+               if (!(cls == T || cls == I || cls == X))
+                       return 0;
+
+               ubuf[*ulen] = ch;
+               ++*ulen;
+       }
+
+       return 1;
+}
+
+/*
+ * Decide whether some text looks like UTF-8. Returns:
+ *
+ *     -1: invalid UTF-8
+ *      0: uses odd control characters, so doesn't look like text
+ *      1: 7-bit text
+ *      2: definitely UTF-8 text (valid high-bit set bytes)
+ *
+ * An invalid byte returns -1 immediately.  Otherwise the
+ * control-character verdict (0) takes precedence over 1 and 2.
+ *
+ * A multi-byte sequence cut off by the end of the buffer is not an
+ * error: scanning simply stops there, and the partial character is
+ * neither stored nor counted as a high-bit character.
+ *
+ * Lead bytes announcing up to five continuation bytes are accepted;
+ * no check is made for overlong forms or for surrogate values.
+ *
+ * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
+ * ubuf must be big enough!  ulen is only dereferenced when ubuf is
+ * non-NULL.
+ */
+protected int
+file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
+{
+       size_t i;
+       int n;
+       unichar c;
+       int gotone = 0, ctrl = 0;
+
+       if (ubuf)
+               *ulen = 0;
+
+       for (i = 0; i < nbytes; i++) {
+               if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
+                       /*
+                        * Even if the whole file is valid UTF-8 sequences,
+                        * still reject it if it uses weird control characters.
+                        */
+
+                       if (text_chars[buf[i]] != T)
+                               ctrl = 1;
+
+                       if (ubuf)
+                               ubuf[(*ulen)++] = buf[i];
+               } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
+                       return -1;
+               } else {                           /* 11xxxxxx begins UTF-8 */
+                       int following;
+
+                       /*
+                        * The lead byte gives the number of continuation
+                        * bytes and the top bits of the character.
+                        */
+                       if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
+                               c = buf[i] & 0x1f;
+                               following = 1;
+                       } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
+                               c = buf[i] & 0x0f;
+                               following = 2;
+                       } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
+                               c = buf[i] & 0x07;
+                               following = 3;
+                       } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
+                               c = buf[i] & 0x03;
+                               following = 4;
+                       } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
+                               c = buf[i] & 0x01;
+                               following = 5;
+                       } else                                  /* 0xfe, 0xff */
+                               return -1;
+
+                       /* This loop advances the outer index i as well. */
+                       for (n = 0; n < following; n++) {
+                               i++;
+                               /* Sequence truncated by end of buffer */
+                               if (i >= nbytes)
+                                       goto done;
+
+                               /* Continuation bytes must be 10xxxxxx */
+                               if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
+                                       return -1;
+
+                               c = (c << 6) + (buf[i] & 0x3f);
+                       }
+
+                       if (ubuf)
+                               ubuf[(*ulen)++] = c;
+                       gotone = 1;
+               }
+       }
+done:
+       return ctrl ? 0 : (gotone ? 2 : 1);
+}
+
+/*
+ * Decide whether some text looks like UTF-8 with BOM.  Return -1 if
+ * the buffer is no longer than three bytes or does not start with the
+ * byte sequence EF BB BF; otherwise return the result of
+ * file_looks_utf8 on the text that follows those three bytes.
+ */
+private int
+looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+    size_t *ulen)
+{
+       if (nbytes <= 3)
+               return -1;
+
+       if (buf[0] != 0xef || buf[1] != 0xbb || buf[2] != 0xbf)
+               return -1;
+
+       return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
+}
+
+/*
+ * Decide whether some text looks like 16-bit Unicode introduced by a
+ * byte-order mark.  Returns:
+ *
+ *      0: fewer than two bytes, no BOM, or a code unit that is either
+ *         0xfffe or a value below 128 not classified T in text_chars
+ *      1: little-endian (buffer starts FF FE)
+ *      2: big-endian (buffer starts FE FF)
+ *
+ * Once a BOM has been seen, each following pair of bytes is stored as
+ * one unichar in ubuf and counted in *ulen; a trailing odd byte is
+ * ignored.  *ulen is not touched when no BOM is found.
+ */
+private int
+looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+    size_t *ulen)
+{
+       int bigend;
+       size_t pos;
+
+       if (nbytes < 2)
+               return 0;
+
+       if (buf[0] == 0xfe && buf[1] == 0xff)
+               bigend = 1;
+       else if (buf[0] == 0xff && buf[1] == 0xfe)
+               bigend = 0;
+       else
+               return 0;
+
+       *ulen = 0;
+
+       for (pos = 2; pos + 1 < nbytes; pos += 2) {
+               /* XXX fix to properly handle chars > 65536 */
+               const unsigned char hi = bigend ? buf[pos] : buf[pos + 1];
+               const unsigned char lo = bigend ? buf[pos + 1] : buf[pos];
+               const unichar unit = lo + 256 * hi;
+
+               ubuf[*ulen] = unit;
+               ++*ulen;
+
+               if (unit == 0xfffe)
+                       return 0;
+               if (unit < 128 && text_chars[(size_t)unit] != T)
+                       return 0;
+       }
+
+       return bigend ? 2 : 1;
+}
+
+#undef F
+#undef T
+#undef I
+#undef X
+
+/*
+ * This table maps each EBCDIC character to an (8-bit extended) ASCII
+ * character, as specified in the rationale for the dd(1) command in
+ * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
+ *
+ * Unfortunately it does not seem to correspond exactly to any of the
+ * five variants of EBCDIC documented in IBM's _Enterprise Systems
+ * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
+ * Edition, July, 1999, pp. I-1 - I-4.
+ *
+ * Fortunately, though, all versions of EBCDIC, including this one, agree
+ * on most of the printing characters that also appear in (7-bit) ASCII.
+ * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
+ *
+ * Fortunately too, there is general agreement that codes 0x00 through
+ * 0x3F represent control characters, 0x41 a nonbreaking space, and the
+ * remainder printing characters.
+ *
+ * This is sufficient to allow us to identify EBCDIC text and to distinguish
+ * between old-style and internationalized examples of text.
+ *
+ * The table is indexed by the EBCDIC byte value and laid out as 16
+ * rows of 16 entries; from_ebcdic() is the lookup that uses it.
+ */
+
+private unsigned char ebcdic_to_ascii[] = {
+  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
+ 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
+128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
+144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
+' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
+'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
+'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
+186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
+195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
+202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
+209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
+216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
+'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
+'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
+'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
+'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
+};
+
+#ifdef notdef
+/*
+ * The following EBCDIC-to-ASCII table may relate more closely to reality,
+ * or at least to modern reality.  It comes from
+ *
+ *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
+ *
+ * and maps the characters of EBCDIC code page 1047 (the code used for
+ * Unix-derived software on IBM's 390 systems) to the corresponding
+ * characters from ISO 8859-1.
+ *
+ * If this table is used instead of the above one, some of the special
+ * cases for the NEL character can be taken out of the code.
+ *
+ * The table is only compiled when the preprocessor symbol `notdef' is
+ * defined; it is laid out as 16 rows of 16 entries.
+ */
+
+private unsigned char ebcdic_1047_to_8859[] = {
+0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
+0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
+0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
+0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
+0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
+0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
+0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
+0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
+0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
+0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
+0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
+0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
+0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
+0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
+0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
+0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
+};
+#endif
+
+/*
+ * Translate nbytes bytes of EBCDIC from buf[] into out[], one output
+ * byte per input byte, by looking each byte up in ebcdic_to_ascii.
+ * Nothing is written when nbytes is zero.
+ */
+private void
+from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
+{
+       size_t remaining = nbytes;
+
+       while (remaining > 0) {
+               *out++ = ebcdic_to_ascii[*buf++];
+               remaining--;
+       }
+}
diff --git a/src/file.c b/src/file.c

index 3e38839bf54a444e13b2a3a7fca45e9b56ec723c..7c0aa1464d24159bfabc73dc3b2fea67e27b384f 100644 (file)
--- a/src/file.c
+++ b/src/file.c
@@ -2,7 +2,7 @@
   * Copyright (c) Ian F. Darwin 1986-1995.
   * Software written by Ian F. Darwin and others;
   * maintained 1995-present by Christos Zoulas and others.
- * 
+ *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@ -12,7 +12,7 @@
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in the
   *    documentation and/or other materials provided with the distribution.
- *  
+ *
   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -32,7 +32,7 @@
  #include "file.h"
  
  #ifndef        lint
-FILE_RCSID("@(#)$File: file.c,v 1.126 2008/10/18 20:47:48 christos Exp $")
+FILE_RCSID("@(#)$File: file.c,v 1.127 2008/11/04 16:38:28 christos Exp $")
  #endif /* lint */
  
  #include "magic.h"
@@ -149,6 +149,7 @@ main(int argc, char *argv[])
                 { "cdf",        MAGIC_NO_CHECK_CDF },
                 { "compress",   MAGIC_NO_CHECK_COMPRESS },
                 { "elf",        MAGIC_NO_CHECK_ELF },
+               { "encoding",   MAGIC_NO_CHECK_ENCODING },
                 { "soft",       MAGIC_NO_CHECK_SOFT },
                 { "tar",        MAGIC_NO_CHECK_TAR },
                 { "tokens",     MAGIC_NO_CHECK_TOKENS },
@@ -228,7 +229,7 @@ main(int argc, char *argv[])
                         else
                                 flags |= nv[i].value;
                         break;
-                       
+
                 case 'f':
                         if(action)
                                 usage();
@@ -343,7 +344,7 @@ main(int argc, char *argv[])
                         process(argv[optind], wid);
         }
  
-       c = magic->haderr ? 1 : 0;
+       c = (magic->event_flags & EVENT_HAD_ERR) ? 1 : 0;
         magic_close(magic);
         return c;
  }
@@ -483,9 +484,9 @@ help(void)
  "Determine type of FILEs.\n"
  "\n", stderr);
  #define OPT(shortname, longname, opt, doc)      \
-        fprintf(stderr, "  -%c, --" longname doc, shortname);
+       fprintf(stderr, "  -%c, --" longname doc, shortname);
  #define OPT_LONGONLY(longname, opt, doc)        \
-        fprintf(stderr, "      --" longname doc);
+       fprintf(stderr, "      --" longname doc);
  #include "file_opts.h"
  #undef OPT
  #undef OPT_LONGONLY
diff --git a/src/file.h b/src/file.h

index d8bca618ef52ac5d9ff923102d1b42ce04aa0118..53e6923745a3c13a4758a42551de047aaf25c9f6 100644 (file)
--- a/src/file.h
+++ b/src/file.h
@@ -2,7 +2,7 @@
   * Copyright (c) Ian F. Darwin 1986-1995.
   * Software written by Ian F. Darwin and others;
   * maintained 1995-present by Christos Zoulas and others.
- * 
+ *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@ -12,7 +12,7 @@
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in the
   *    documentation and/or other materials provided with the distribution.
- *  
+ *
   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -27,7 +27,7 @@
   */
  /*
   * file.h - definitions for file(1) program
- * @(#)$File: file.h,v 1.112 2008/10/30 09:51:46 rrt Exp $
+ * @(#)$File: file.h,v 1.113 2008/11/04 16:48:42 christos Exp $
   */
  
  #ifndef __file_h__
@@ -122,7 +122,7 @@ union VALUETYPE {
         unsigned char us[MAXstring];
         float f;
         double d;
-}; 
+};
  
  struct magic {
         /* Word 1 */
@@ -134,7 +134,7 @@ struct magic {
  #define UNSIGNED       0x08    /* comparison is unsigned */
  #define NOSPACE                0x10    /* suppress space character before output */
  #define BINTEST                0x20    /* test is for a binary type (set only
-                                   for top-level tests) */
+                                  for top-level tests) */
  #define TEXTTEST       0       /* for passing to file_softmagic */
  
         uint8_t factor;
@@ -209,7 +209,7 @@ struct magic {
  #else
         uint8_t dummy;
  #endif
-       uint8_t factor_op;      
+       uint8_t factor_op;
  #define                FILE_FACTOR_OP_PLUS     '+'
  #define                FILE_FACTOR_OP_MINUS    '-'
  #define                FILE_FACTOR_OP_TIMES    '*'
@@ -317,8 +317,10 @@ struct magic_set {
         } o;
         uint32_t offset;
         int error;
-       int flags;
-       int haderr;
+       int flags;                      /* Control magic tests. */
+       int event_flags;                /* Note things that happened. */
+#define                EVENT_HAD_ERR           0x01
+#define                EVENT_WROTE_MIME_TYPE   0x02
         const char *file;
         size_t line;                    /* current magic line number */
  
@@ -355,6 +357,8 @@ protected int file_trycdf(struct magic_set *, int, const unsigned char *,
  protected int file_zmagic(struct magic_set *, int, const char *,
      const unsigned char *, size_t);
  protected int file_ascmagic(struct magic_set *, const unsigned char *, size_t);
+protected int file_ascmagic_with_encoding(struct magic_set *, const unsigned char *, size_t, unichar *, size_t, const char *, const char *, const char *);
+protected int file_encoding(struct magic_set *, const unsigned char *, size_t, unichar **, size_t *, const char **, const char **, const char **);
  protected int file_is_tar(struct magic_set *, const unsigned char *, size_t);
  protected int file_softmagic(struct magic_set *, const unsigned char *, size_t, int);
  protected struct mlist *file_apprentice(struct magic_set *, const char *, int);
diff --git a/src/funcs.c b/src/funcs.c

index 69485921f3d5b756c7a731e2e8cc8b2193010d72..29f16b403533a5054e5dfdd330f36f2e068d8ca9 100644 (file)
--- a/src/funcs.c
+++ b/src/funcs.c
@@ -1,7 +1,7 @@
  /*
   * Copyright (c) Christos Zoulas 2003.
   * All Rights Reserved.
- * 
+ *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@ -11,7 +11,7 @@
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in the
   *    documentation and/or other materials provided with the distribution.
- *  
+ *
   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -27,7 +27,7 @@
  #include "file.h"
  
  #ifndef        lint
-FILE_RCSID("@(#)$File: funcs.c,v 1.46 2008/10/16 16:30:34 christos Exp $")
+FILE_RCSID("@(#)$File: funcs.c,v 1.47 2008/11/04 16:38:28 christos Exp $")
  #endif /* lint */
  
  #include "magic.h"
@@ -98,17 +98,17 @@ file_error_core(struct magic_set *ms, int error, const char *f, va_list va,
      uint32_t lineno)
  {
         /* Only the first error is ok */
-       if (ms->haderr)
+       if (ms->event_flags & EVENT_HAD_ERR)
                 return;
         if (lineno != 0) {
                 free(ms->o.buf);
                 ms->o.buf = NULL;
                 file_printf(ms, "line %u: ", lineno);
         }
-        file_vprintf(ms, f, va);
+       file_vprintf(ms, f, va);
         if (error > 0)
                 file_printf(ms, " (%s)", strerror(error));
-       ms->haderr++;
+       ms->event_flags |= EVENT_HAD_ERR;
         ms->error = error;
  }
  
@@ -158,9 +158,15 @@ protected int
  file_buffer(struct magic_set *ms, int fd, const char *inname, const void *buf,
      size_t nb)
  {
-       int m;
+       int m = 0, rv = 0;
         int mime = ms->flags & MAGIC_MIME;
         const unsigned char *ubuf = CAST(const unsigned char *, buf);
+       unichar *u8buf = NULL;
+       size_t ulen;
+
+       const char *code = NULL;
+       const char *code_mime = NULL;
+       const char *type = NULL;
  
         if (nb == 0) {
                 if ((!mime || (mime & MAGIC_MIME_TYPE)) &&
@@ -198,24 +204,35 @@ file_buffer(struct magic_set *ms, int fd, const char *inname, const void *buf,
                 /* Check if we have a CDF file */
                 if ((ms->flags & MAGIC_NO_CHECK_CDF) != 0 ||
                     (m = file_trycdf(ms, fd, ubuf, nb)) == 0) {
-                   /* try tests in /etc/magic (or surrogate magic file) */
+                   /* try to discover text encoding */
+                   if ((ms->flags & MAGIC_NO_CHECK_ENCODING) == 0)
+                       file_encoding(ms, ubuf, nb, &u8buf, &ulen, &code, &code_mime, &type);
+                   /* try soft magic tests */
                     if ((ms->flags & MAGIC_NO_CHECK_SOFT) != 0 ||
                         (m = file_softmagic(ms, ubuf, nb, BINTEST)) == 0) {
-                       /* try known keywords, check whether it is ASCII */
-                       if ((ms->flags & MAGIC_NO_CHECK_ASCII) != 0 ||
-                           (m = file_ascmagic(ms, ubuf, nb)) == 0) {
-                           /* abandon hope, all ye who remain here */
+                       /* try text properties (and possibly text tokens) */
+                       if ((ms->flags & MAGIC_NO_CHECK_TEXT) != 0 ||
+                           (m = file_ascmagic_with_encoding(ms, ubuf, nb, u8buf, ulen, code, code_mime, type)) == 0) {
+                           /* give up */
                             if ((!mime || (mime & MAGIC_MIME_TYPE)) &&
                                 file_printf(ms, mime ?
-                                   "application/octet-stream" :
-                                   "data") == -1)
-                                   return -1;
-                           m = 1;
+                                           "application/octet-stream" :
+                                           "data") == -1) {
+                                   rv = -1;
+                                   goto done;
+                           }
                         }
+                       m = 1;
                     }
                 }
             }
         }
+ done:
+       if (u8buf)
+               free(u8buf);
+       if (rv)
+               return rv;
+
  #ifdef BUILTIN_ELF
         if ((ms->flags & MAGIC_NO_CHECK_ELF) == 0 && m == 1 &&
             nb > 5 && fd != -1) {
@@ -242,7 +259,7 @@ file_reset(struct magic_set *ms)
                 return -1;
         }
         ms->o.buf = NULL;
-       ms->haderr = 0;
+       ms->event_flags &= ~EVENT_HAD_ERR;
         ms->error = -1;
         return 0;
  }
@@ -261,7 +278,7 @@ file_getbuffer(struct magic_set *ms)
         char *pbuf, *op, *np;
         size_t psize, len;
  
-       if (ms->haderr)
+       if (ms->event_flags & EVENT_HAD_ERR)
                 return NULL;
  
         if (ms->flags & MAGIC_RAW)
@@ -324,7 +341,7 @@ file_getbuffer(struct magic_set *ms)
  
         for (np = ms->o.pbuf, op = ms->o.buf; *op; op++) {
                 if (isprint((unsigned char)*op)) {
-                       *np++ = *op;    
+                       *np++ = *op;
                 } else {
                         OCTALIFY(np, op);
                 }
diff --git a/src/magic.c b/src/magic.c

index 093f827e1e04bd266d004e142634f29820db1981..00f8289fd2a8240b4c94f36ee073435046eafdb6 100644 (file)
--- a/src/magic.c
+++ b/src/magic.c
@@ -28,7 +28,7 @@
  #include "file.h"
  
  #ifndef        lint
-FILE_RCSID("@(#)$File: magic.c,v 1.54 2008/07/25 23:30:32 rrt Exp $")
+FILE_RCSID("@(#)$File: magic.c,v 1.55 2008/11/04 16:38:28 christos Exp $")
  #endif /* lint */
  
  #include "magic.h"
@@ -117,7 +117,7 @@ magic_open(int flags)
         if ((ms->c.li = CAST(struct level_info *, malloc(len))) == NULL)
                 goto free;
  
-       ms->haderr = 0;
+       ms->event_flags = 0;
         ms->error = -1;
         ms->mlist = NULL;
         ms->file = "unknown";
@@ -386,13 +386,13 @@ magic_buffer(struct magic_set *ms, const void *buf, size_t nb)
  public const char *
  magic_error(struct magic_set *ms)
  {
-       return ms->haderr ? ms->o.buf : NULL;
+       return (ms->event_flags & EVENT_HAD_ERR) ? ms->o.buf : NULL;
  }
  
  public int
  magic_errno(struct magic_set *ms)
  {
-       return ms->haderr ? ms->error : 0;
+       return (ms->event_flags & EVENT_HAD_ERR) ? ms->error : 0;
  }
  
  public int
diff --git a/src/magic.h b/src/magic.h

index 5864f266b3b7240a194a143cb03abee549c1dee6..a664e9aa39ed5f93e20f07d451936cc5d748de93 100644 (file)
--- a/src/magic.h
+++ b/src/magic.h
@@ -1,7 +1,7 @@
  /*
   * Copyright (c) Christos Zoulas 2003.
   * All Rights Reserved.
- * 
+ *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@ -11,7 +11,7 @@
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in the
   *    documentation and/or other materials provided with the distribution.
- *  
+ *
   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -34,13 +34,13 @@
  #define        MAGIC_SYMLINK           0x000002 /* Follow symlinks */
  #define        MAGIC_COMPRESS          0x000004 /* Check inside compressed files */
  #define        MAGIC_DEVICES           0x000008 /* Look at the contents of devices */
-#define        MAGIC_MIME_TYPE         0x000010 /* Return only the MIME type */
+#define        MAGIC_MIME_TYPE         0x000010 /* Return the MIME type */
  #define        MAGIC_CONTINUE          0x000020 /* Return all matches */
  #define        MAGIC_CHECK             0x000040 /* Print warnings to stderr */
  #define        MAGIC_PRESERVE_ATIME    0x000080 /* Restore access time on exit */
-#define        MAGIC_RAW               0x000100 /* Don't translate unprint chars */
+#define        MAGIC_RAW               0x000100 /* Don't translate unprintable chars */
  #define        MAGIC_ERROR             0x000200 /* Handle ENOENT etc as real errors */
-#define        MAGIC_MIME_ENCODING     0x000400 /* Return only the MIME encoding */
+#define        MAGIC_MIME_ENCODING     0x000400 /* Return the MIME encoding */
  #define MAGIC_MIME             (MAGIC_MIME_TYPE|MAGIC_MIME_ENCODING)
  #define        MAGIC_APPLE             0x000800 /* Return the Apple creator and type */
  #define        MAGIC_NO_CHECK_COMPRESS 0x001000 /* Don't check for compressed files */
@@ -48,9 +48,13 @@
  #define        MAGIC_NO_CHECK_SOFT     0x004000 /* Don't check magic entries */
  #define        MAGIC_NO_CHECK_APPTYPE  0x008000 /* Don't check application type */
  #define        MAGIC_NO_CHECK_ELF      0x010000 /* Don't check for elf details */
-#define        MAGIC_NO_CHECK_ASCII    0x020000 /* Don't check for ascii files */
+#define        MAGIC_NO_CHECK_TEXT     0x020000 /* Don't check for text files */
  #define        MAGIC_NO_CHECK_CDF      0x040000 /* Don't check for cdf files */
-#define        MAGIC_NO_CHECK_TOKENS   0x100000 /* Don't check ascii/tokens */
+#define        MAGIC_NO_CHECK_TOKENS   0x100000 /* Don't check tokens */
+#define MAGIC_NO_CHECK_ENCODING 0x200000 /* Don't check text encodings */
+
+/* Defined for backwards compatibility (renamed) */
+#define        MAGIC_NO_CHECK_ASCII    MAGIC_NO_CHECK_TEXT
  
  /* Defined for backwards compatibility; do nothing */
  #define        MAGIC_NO_CHECK_FORTRAN  0x000000 /* Don't check ascii/fortran */
diff --git a/src/softmagic.c b/src/softmagic.c

index 0eec2fa3f97755b2360cd17aea0723e036fec484..969233f0b11b3b4fd4c8eccdd141fadf8dee3d36 100644 (file)
--- a/src/softmagic.c
+++ b/src/softmagic.c
@@ -2,7 +2,7 @@
   * Copyright (c) Ian F. Darwin 1986-1995.
   * Software written by Ian F. Darwin and others;
   * maintained 1995-present by Christos Zoulas and others.
- * 
+ *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@ -12,7 +12,7 @@
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in the
   *    documentation and/or other materials provided with the distribution.
- *  
+ *
   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -32,7 +32,7 @@
  #include "file.h"
  
  #ifndef        lint
-FILE_RCSID("@(#)$File: softmagic.c,v 1.126 2008/11/04 16:38:28 christos Exp $")
+FILE_RCSID("@(#)$File: softmagic.c,v 1.127 2008/11/06 15:38:28 christos Exp $")
  #endif /* lint */
  
  #include "magic.h"
@@ -69,9 +69,6 @@ file_softmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes, in
  {
         struct mlist *ml;
         int rv;
-       if (ms->flags & MAGIC_MIME_ENCODING)
-               /* Let ascmagic do the work */
-               return 0;
         for (ml = ms->mlist->next; ml != ms->mlist; ml = ml->next)
                 if ((rv = match(ms, ml->magic, ml->nmagic, buf, nbytes, mode)) != 0)
                         return rv;
@@ -140,7 +137,7 @@ match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
                 if (flush) {
                         if (m->reln == '!')
                                 flush = 0;
-               } else {        
+               } else {
                         switch (magiccheck(ms, m)) {
                         case -1:
                                 return -1;
@@ -152,7 +149,7 @@ match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
                         }
                 }
                 if (flush) {
-                       /* 
+                       /*
                          * main entry didn't match,
                          * flush its continuations
                          */
@@ -169,7 +166,7 @@ match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
                 if (*m->desc) {
                         need_separator = 1;
                         printed_something = 1;
-                       if ((e = handle_annotation(ms, m)) != 0)
+                       if ((e = handle_annotation(ms, m)) != -2)
                                 return e;
                         if (print_sep(ms, firstline) == -1)
                                 return -1;
@@ -212,7 +209,7 @@ match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
                         flush = !mget(ms, s, m, nbytes, cont_level);
                         if (flush && m->reln != '!')
                                 continue;
-                               
+
                         switch (flush ? 1 : magiccheck(ms, m)) {
                         case -1:
                                 return -1;
@@ -237,7 +234,7 @@ match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
                                  */
                                 if (*m->desc) {
                                         printed_something = 1;
-                                       if ((e = handle_annotation(ms, m)) != 0)
+                                       if ((e = handle_annotation(ms, m)) != -2)
                                                 return e;
                                         if (print_sep(ms, firstline) == -1)
                                                 return -1;
@@ -277,7 +274,7 @@ match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
                 }
                 if ((ms->flags & MAGIC_CONTINUE) == 0 && printed_something) {
                         return 1; /* don't keep searching */
-               }                       
+               }
         }
         return returnval;  /* This is hit if -k is set or there is no match */
  }
@@ -826,7 +823,7 @@ mcopy(struct magic_set *ms, union VALUETYPE *p, int type, int indir,
                         }
                         if (lines)
                                 last = (const char *)s + nbytes;
-                       
+
                         ms->search.s = buf;
                         ms->search.s_len = last - buf;
                         ms->search.offset = offset;
@@ -839,10 +836,10 @@ mcopy(struct magic_set *ms, union VALUETYPE *p, int type, int indir,
                         const unsigned char *esrc = s + nbytes;
                         char *dst = p->s;
                         char *edst = &p->s[sizeof(p->s) - 1];
-                       
+
                         if (type == FILE_BESTRING16)
                                 src++;
-                       
+
                         /* check for pointer overflow */
                         if (src < s) {
                                 file_magerror(ms, "invalid offset %u in mcopy()",
@@ -1389,14 +1386,14 @@ mget(struct magic_set *ms, const unsigned char *s,
                 if (nbytes < (offset + 1)) /* should alway be true */
                         return 0;
                 break;
-               
+
         case FILE_SHORT:
         case FILE_BESHORT:
         case FILE_LESHORT:
                 if (nbytes < (offset + 2))
                         return 0;
                 break;
-               
+
         case FILE_LONG:
         case FILE_BELONG:
         case FILE_LELONG:
@@ -1415,7 +1412,7 @@ mget(struct magic_set *ms, const unsigned char *s,
                 if (nbytes < (offset + 4))
                         return 0;
                 break;
-               
+
         case FILE_DOUBLE:
         case FILE_BEDOUBLE:
         case FILE_LEDOUBLE:
@@ -1465,7 +1462,7 @@ file_strncmp(const char *s1, const char *s2, size_t len, uint32_t flags)
         if (0L == flags) { /* normal string: do it fast */
                 while (len-- > 0)
                         if ((v = *b++ - *a++) != '\0')
-                               break; 
+                               break;
         }
         else { /* combine the others */
                 while (len-- > 0) {
@@ -1479,8 +1476,8 @@ file_strncmp(const char *s1, const char *s2, size_t len, uint32_t flags)
                                 if ((v = toupper(*b++) - *a++) != '\0')
                                         break;
                         }
-                       else if ((flags & STRING_COMPACT_BLANK) && 
-                           isspace(*a)) { 
+                       else if ((flags & STRING_COMPACT_BLANK) &&
+                           isspace(*a)) {
                                 a++;
                                 if (isspace(*b++)) {
                                         while (isspace(*b))
@@ -1575,23 +1572,23 @@ magiccheck(struct magic_set *ms, struct magic *m)
                 case 'x':
                         matched = 1;
                         break;
-       
+
                 case '!':
                         matched = fv != fl;
                         break;
-       
+
                 case '=':
                         matched = fv == fl;
                         break;
-       
+
                 case '>':
                         matched = fv > fl;
                         break;
-       
+
                 case '<':
                         matched = fv < fl;
                         break;
-       
+
                 default:
                         matched = 0;
                         file_magerror(ms, "cannot happen with float: invalid relation `%c'",
@@ -1609,23 +1606,23 @@ magiccheck(struct magic_set *ms, struct magic *m)
                 case 'x':
                         matched = 1;
                         break;
-       
+
                 case '!':
                         matched = dv != dl;
                         break;
-       
+
                 case '=':
                         matched = dv == dl;
                         break;
-       
+
                 case '>':
                         matched = dv > dl;
                         break;
-       
+
                 case '<':
                         matched = dv < dl;
                         break;
-       
+
                 default:
                         matched = 0;
                         file_magerror(ms, "cannot happen with double: invalid relation `%c'", m->reln);
@@ -1825,18 +1822,23 @@ magiccheck(struct magic_set *ms, struct magic *m)
  private int
  handle_annotation(struct magic_set *ms, struct magic *m)
  {
-       int ret = 0;
         if (ms->flags & MAGIC_APPLE) {
                 if (file_printf(ms, "%.8s", m->apple) == -1)
                         return -1;
-               ret = 1;
+               return 1;
         }
-       if (ms->flags & MAGIC_MIME_TYPE) {
-               if (file_printf(ms, "%s", m->mimetype) == -1)
-                        return -1;
-               ret = 1;
+       if (ms->flags & MAGIC_MIME) {
+               if (ms->flags & MAGIC_MIME_TYPE) {
+                       if (file_printf(ms, "%s", m->mimetype) == -1)
+                               return -1;
+               }
+               if (ms->flags & MAGIC_MIME_ENCODING) {
+                       ms->event_flags |= EVENT_WROTE_MIME_TYPE;
+                       return 0; /* Let ascmagic find the encoding */
+               }
+               return 1;
         }
-       return ret;
+       return -2;
  }
  
  private int
@@ -1845,7 +1847,7 @@ print_sep(struct magic_set *ms, int firstline)
         if (firstline)
                 return 0;
         /*
-        * we found another match 
+        * we found another match
          * put a newline and '-' to do some simple formatting
          */
         return file_printf(ms, "\n- ");
author	Reuben Thomas <rrt@sc3d.org>
	Thu, 6 Nov 2008 21:17:45 +0000 (21:17 +0000)
committer	Reuben Thomas <rrt@sc3d.org>
	Thu, 6 Nov 2008 21:17:45 +0000 (21:17 +0000)
ChangeLog		patch \| blob \| history
doc/file.man		patch \| blob \| history
src/Makefile.am		patch \| blob \| history
src/ascmagic.c		patch \| blob \| history
src/encoding.c	[new file with mode: 0644]	patch \| blob
src/file.c		patch \| blob \| history
src/file.h		patch \| blob \| history
src/funcs.c		patch \| blob \| history
src/magic.c		patch \| blob \| history
src/magic.h		patch \| blob \| history
src/softmagic.c		patch \| blob \| history