+2011-12-08 13:07 Reuben Thomas <rrt@sc3d.org>
+
+ * Remove hardwired token finding (names.h), turning it into soft
+ magic. Patterns are either anchored regexes or search/8192. English
+ language detection and PL/1 detection have been removed as they
+ were too fragile. -e tokens is still accepted for backwards
+ compatibility.
+ * Move 3ds patterns (which are commented out anyway) into autodesk
+ (they were, oddly, in c-lang).
+
2011-12-06 00:16 Reuben Thomas <rrt@sc3d.org>
* Tweak strength of generic hash-bang detectors to be less than
-.\" $File: file.man,v 1.96 2011/07/12 11:23:38 rrt Exp $
+.\" $File: file.man,v 1.97 2011/10/17 20:18:05 christos Exp $
.Dd October 17, 2011
.Dt FILE __CSECTION__
.Os
.It encoding
Different text encodings for soft magic tests.
.It tokens
-Looks for known tokens inside text files.
+Ignored for backwards compatibility.
.It cdf
Prints details of Compound Document Files.
.It compress
--- /dev/null
+#------------------------------------------------------------------------------
+# $File$
+# assembler: file(1) magic for assembler source
+#
+0 regex \^\.asciiz\? assembler source text
+!:mime text/x-asm
+0 regex \^\.byte assembler source text
+!:mime text/x-asm
+0 regex \^\.even assembler source text
+!:mime text/x-asm
+0 regex \^\.globl assembler source text
+!:mime text/x-asm
+0 regex \^\.text assembler source text
+!:mime text/x-asm
-
#------------------------------------------------------------------------------
-# $File$
-# c-lang: file(1) magic for C programs (or REXX)
+# $File: c-lang,v 1.14 2009/09/19 16:28:08 christos Exp $
+# c-lang: file(1) magic for C and related languages programs
#
-# XPM icons (Greg Roelofs, newt@uchicago.edu)
-# if you uncomment "/*" for C/REXX below, also uncomment this entry
-#0 string /*\ XPM\ */ X pixmap image data
-#!:mime image/x-xpmi
+# BCPL
+0 search/8192 "libhdr" BCPL source text
+!:mime text/x-bcpl
+0 search/8192 "LIBHDR" BCPL source text
+!:mime text/x-bcpl
-# 3DS (3d Studio files) Conflicts with diff output 0x3d '='
-#16 beshort 0x3d3d image/x-3ds
+# C
+0 regex \^#include C source text
+!:mime text/x-c
+0 regex \^char C source text
+!:mime text/x-c
+0 regex \^double C source text
+!:mime text/x-c
+0 regex \^extern C source text
+!:mime text/x-c
+0 regex \^float C source text
+!:mime text/x-c
+0 regex \^struct C source text
+!:mime text/x-c
+0 regex \^union C source text
+!:mime text/x-c
+0 search/8192 main( C source text
+!:mime text/x-c
-# this first will upset you if you're a PL/1 shop...
-# in which case rm it; ascmagic will catch real C programs
-#0 search/1 /* C or REXX program text
-#0 search/1 // C++ program text
+# C++
+# The strength of these rules is doubled so they beat the C rules above
+0 regex \^template C++ source text
+!:strength * 2
+!:mime text/x-c++
+0 regex \^virtual C++ source text
+!:strength * 2
+!:mime text/x-c++
+0 regex \^class C++ source text
+!:strength * 2
+!:mime text/x-c++
+0 regex \^public: C++ source text
+!:strength * 2
+!:mime text/x-c++
+0 regex \^private: C++ source text
+!:strength * 2
+!:mime text/x-c++
# From: Mikhail Teterin <mi@aldan.algebra.com>
0 string cscope cscope reference data
#------------------------------------------------------------------------------
-# $File: cad,v 1.9 2009/09/19 16:28:08 christos Exp $
+# $File: cad,v 1.10 2010/12/25 14:33:43 christos Exp $
# autocad: file(1) magic for cad files
#
0 string AC1012 AutoDesk AutoCAD R13
0 string AC1014 AutoDesk AutoCAD R14
0 string AC1015 AutoDesk AutoCAD R2000
+
+# 3DS (3d Studio files) Conflicts with diff output 0x3d '='
+#16 beshort 0x3d3d image/x-3ds
#------------------------------------------------------------------------------
-# $File$
+# $File: gnu,v 1.11 2009/09/19 16:28:09 christos Exp $
# gnu: file(1) magic for various GNU tools
#
# GNU nlsutils message catalog file format
# Files produced by GNU gettext
0 long 0xDE120495 GNU-format message catalog data
0 long 0x950412DE GNU-format message catalog data
+
+# gettext message catalogue
+0 regex \^msgid\ GNU gettext message catalogue text
+!:mime text/x-po
#------------------------------------------------------------------------------
-# $File: images,v 1.70 2010/11/25 15:00:12 christos Exp $
+# $File: images,v 1.71 2011/09/22 19:30:43 christos Exp $
# images: file(1) magic for image formats (see also "iff", and "c-lang" for
# XPM bitmaps)
#
#0 string BA PC bitmap array data
# XPM icons (Greg Roelofs, newt@uchicago.edu)
-# note possible collision with C/REXX entry in c-lang; currently commented out
0 search/1 /*\ XPM\ */ X pixmap image text
+!:mime image/x-xpmi
# Utah Raster Toolkit RLE images (janl@ifi.uio.no)
0 leshort 0xcc52 RLE image data,
#------------------------------------------------------------
-# $File$
+# $File: java,v 1.12 2009/09/19 16:28:10 christos Exp $
# Java ByteCode and Mach-O binaries (e.g., Mac OS X) use the
# same magic number, 0xcafebabe, so they are both handled
# in the entry called "cafebabe".
>0 regex dey\n[0-9][0-9][0-9]\0 Dalvik dex file (optimized for host)
>4 string >000 version %s
+# Java source
+0 regex ^import.*;$ Java source
+!:mime text/x-java
--- /dev/null
+#------------------------------------------------------------------------------
+# $File$
+# m4: file(1) magic for M4 scripts
+#
+0 regex \^dnl\ M4 macro processor script text
+!:mime text/x-m4
-
#------------------------------------------------------------------------------
-# $File: mail.news,v 1.18 2010/11/25 15:00:12 christos Exp $
+# $File: mail.news,v 1.19 2011/01/25 13:55:57 christos Exp $
# mail.news: file(1) magic for mail and news
#
# Unfortunately, saved netnews also has From line added in some news software.
#0 string From mail text
-# There are tests to ascmagic.c to cope with mail and news.
0 string/t Relay-Version: old news text
!:mime message/rfc822
0 string/t #!\ rnews batched news text
!:mime message/rfc822
0 string/t Pipe\ to mail piping text
!:mime message/rfc822
-0 string/t Return-Path: smtp mail text
+0 string/t Delivered-To: SMTP mail text
+!:mime message/rfc822
+0 string/t Return-Path: SMTP mail text
!:mime message/rfc822
0 string/t Path: news text
!:mime message/news
--- /dev/null
+#------------------------------------------------------------------------------
+# $File$
+# make: file(1) magic for makefiles
+#
+0 regex \^CFLAGS makefile script text
+!:mime text/x-makefile
+0 regex \^LDFLAGS makefile script text
+!:mime text/x-makefile
+0 regex \^all: makefile script text
+!:mime text/x-makefile
+0 regex \^.PRECIOUS makefile script text
+!:mime text/x-makefile
+
+0 regex \^SUBDIRS automake makefile script text
+!:mime text/x-makefile
--- /dev/null
+#------------------------------------------------------------------------------
+# $File$
+# pascal: file(1) magic for Pascal source
+#
+0 search/8192 (input, Pascal source text
+!:mime text/x-pascal
+0 regex \^program Pascal source text
+!:mime text/x-pascal
+0 regex \^record Pascal source text
+!:mime text/x-pascal
#
-# $File: Makefile.am,v 1.74 2011/11/10 18:59:54 christos Exp $
+# $File: Makefile.am,v 1.75 2011/11/25 03:28:17 christos Exp $
#
MAGIC_FRAGMENT_BASE = Magdir
MAGIC_DIR = $(top_srcdir)/magic
$(MAGIC_FRAGMENT_DIR)/apple \
$(MAGIC_FRAGMENT_DIR)/applix \
$(MAGIC_FRAGMENT_DIR)/archive \
+$(MAGIC_FRAGMENT_DIR)/assembler \
$(MAGIC_FRAGMENT_DIR)/asterix \
$(MAGIC_FRAGMENT_DIR)/att3b \
$(MAGIC_FRAGMENT_DIR)/audio \
$(MAGIC_FRAGMENT_DIR)/llvm \
$(MAGIC_FRAGMENT_DIR)/lua \
$(MAGIC_FRAGMENT_DIR)/luks \
+$(MAGIC_FRAGMENT_DIR)/m4 \
$(MAGIC_FRAGMENT_DIR)/mach \
$(MAGIC_FRAGMENT_DIR)/macintosh \
$(MAGIC_FRAGMENT_DIR)/magic \
$(MAGIC_FRAGMENT_DIR)/mail.news \
+$(MAGIC_FRAGMENT_DIR)/make \
$(MAGIC_FRAGMENT_DIR)/maple \
$(MAGIC_FRAGMENT_DIR)/marc21 \
$(MAGIC_FRAGMENT_DIR)/mathcad \
$(MAGIC_FRAGMENT_DIR)/palm \
$(MAGIC_FRAGMENT_DIR)/parix \
$(MAGIC_FRAGMENT_DIR)/parrot \
+$(MAGIC_FRAGMENT_DIR)/pascal \
$(MAGIC_FRAGMENT_DIR)/pbm \
$(MAGIC_FRAGMENT_DIR)/pdf \
$(MAGIC_FRAGMENT_DIR)/pdp \
#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: ascmagic.c,v 1.81 2011/03/15 22:16:29 christos Exp $")
+FILE_RCSID("@(#)$File: ascmagic.c,v 1.82 2011/09/20 15:30:14 christos Exp $")
#endif /* lint */
#include "magic.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
-#include "names.h"
#define MAXLINELEN 300 /* longest sane line length */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
|| (x) == 0x85 || (x) == '\f')
-private int ascmatch(const unsigned char *, const unichar *, size_t);
private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
private size_t trim_nuls(const unsigned char *, size_t);
/* If file doesn't look like any sort of text, give up. */
if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime,
- &type) == 0) {
+ &type) == 0)
rv = 0;
- goto done;
- }
-
- rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code,
- type, text);
+ else
+ rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code,
+ type, text);
- done:
if (ubuf)
free(ubuf);
{
unsigned char *utf8_buf = NULL, *utf8_end;
size_t mlen, i;
- const struct names *p;
int rv = -1;
int mime = ms->flags & MAGIC_MIME;
int n_lf = 0;
int n_cr = 0;
int n_nel = 0;
- int score, curtype, executable = 0;
+ int executable = 0;
size_t last_line_end = (size_t)-1;
int has_long_lines = 0;
== NULL)
goto done;
if ((rv = file_softmagic(ms, utf8_buf,
- (size_t)(utf8_end - utf8_buf), TEXTTEST, text)) != 0)
- goto subtype_identified;
- else
+ (size_t)(utf8_end - utf8_buf), TEXTTEST, text)) == 0)
rv = -1;
}
- /* look for tokens from names.h - this is expensive! */
- if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
- goto subtype_identified;
-
- i = 0;
- score = 0;
- curtype = -1;
- while (i < ulen) {
- size_t end;
-
- /* skip past any leading space */
- while (i < ulen && ISSPC(ubuf[i]))
- i++;
- if (i >= ulen)
- break;
-
- /* find the next whitespace */
- for (end = i + 1; end < nbytes; end++)
- if (ISSPC(ubuf[end]))
- break;
-
- /* compare the word thus isolated against the token list */
- for (p = names; p < names + NNAMES; p++) {
- if (ascmatch((const unsigned char *)p->name, ubuf + i,
- end - i)) {
- if (curtype == -1)
- curtype = p->type;
- else if (curtype != p->type) {
- score = p->score;
- curtype = p->type;
- } else
- score += p->score;
- if (score > 1) {
- subtype = types[p->type].human;
- subtype_mime = types[p->type].mime;
- goto subtype_identified;
- }
- }
- }
-
- i = end;
- }
-
-subtype_identified:
-
/* Now try to discover other details about the file. */
for (i = 0; i < ulen; i++) {
if (ubuf[i] == '\n') {
return rv;
}
-private int
-ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
-{
- size_t i;
-
- for (i = 0; i < ulen; i++) {
- if (s[i] != us[i])
- return 0;
- }
-
- if (s[i])
- return 0;
- else
- return 1;
-}
-
/*
* Encode Unicode string as UTF-8, returning pointer to character
* after end of string, or NULL if an invalid character is found.
#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: file.c,v 1.142 2011/02/03 01:57:33 christos Exp $")
+FILE_RCSID("@(#)$File: file.c,v 1.144 2011/05/10 17:08:14 christos Exp $")
#endif /* lint */
#include "magic.h"
#include "mygetopt.h"
#endif
-#include "patchlevel.h"
-
#ifdef S_IFLNK
#define FILE_FLAGS "-bchikLlNnprsvz0"
#else
{ "soft", MAGIC_NO_CHECK_SOFT },
{ "tar", MAGIC_NO_CHECK_TAR },
{ "text", MAGIC_NO_CHECK_TEXT }, /* synonym for ascii */
- { "tokens", MAGIC_NO_CHECK_TOKENS },
+ { "tokens", MAGIC_NO_CHECK_TOKENS }, /* OBSOLETE: ignored for backwards compatibility */
};
private char *progname; /* used throughout */
case 'v':
if (magicfile == NULL)
magicfile = magic_getpath(magicfile, action);
- (void)fprintf(stdout, "%s-%d.%.2d\n", progname,
- FILE_VERSION_MAJOR, patchlevel);
+ (void)fprintf(stdout, "%s-%s\n", progname, VERSION);
(void)fprintf(stdout, "magic file from %s\n",
magicfile);
return 1;
#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: funcs.c,v 1.57 2011/05/11 01:02:41 christos Exp $")
+FILE_RCSID("@(#)$File: funcs.c,v 1.58 2011/09/20 15:30:14 christos Exp $")
#endif /* lint */
#include "magic.h"
goto done;
}
- /* try text properties (and possibly text tokens) */
+ /* try text properties */
if ((ms->flags & MAGIC_NO_CHECK_TEXT) == 0) {
if ((m = file_ascmagic(ms, ubuf, nb, looks_text)) != 0) {
+++ /dev/null
-/*
- * Copyright (c) Ian F. Darwin 1986-1995.
- * Software written by Ian F. Darwin and others;
- * maintained 1995-present by Christos Zoulas and others.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice immediately at the beginning of the file, without modification,
- * this list of conditions, and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-/*
- * Names.h - names and types used by ascmagic in file(1).
- * These tokens are here because they can appear anywhere in
- * the first HOWMANY bytes, while tokens in MAGIC must
- * appear at fixed offsets into the file. Don't make HOWMANY
- * too high unless you have a very fast CPU.
- *
- * $File: names.h,v 1.32 2008/02/11 00:19:29 rrt Exp $
- */
-
-/*
- modified by Chris Lowth - 9 April 2000
- to add mime type strings to the types table.
-*/
-
-/* these types are used to index the table 'types': keep em in sync! */
-#define L_C 0 /* first and foremost on UNIX */
-#define L_CC 1 /* Bjarne's postincrement */
-#define L_MAKE 2 /* Makefiles */
-#define L_PLI 3 /* PL/1 */
-#define L_MACH 4 /* some kinda assembler */
-#define L_ENG 5 /* English */
-#define L_PAS 6 /* Pascal */
-#define L_MAIL 7 /* Electronic mail */
-#define L_NEWS 8 /* Usenet Netnews */
-#define L_JAVA 9 /* Java code */
-#define L_HTML 10 /* HTML */
-#define L_BCPL 11 /* BCPL */
-#define L_M4 12 /* M4 */
-#define L_PO 13 /* PO */
-
-static const struct {
- char human[48];
- char mime[16];
-} types[] = {
- { "C program", "text/x-c", },
- { "C++ program", "text/x-c++" },
- { "make commands", "text/x-makefile" },
- { "PL/1 program", "text/x-pl1" },
- { "assembler program", "text/x-asm" },
- { "English", "text/plain" },
- { "Pascal program", "text/x-pascal" },
- { "mail", "text/x-mail" },
- { "news", "text/x-news" },
- { "Java program", "text/x-java" },
- { "HTML document", "text/html", },
- { "BCPL program", "text/x-bcpl" },
- { "M4 macro language pre-processor", "text/x-m4" },
- { "PO (gettext message catalogue)", "text/x-po" },
- { "cannot happen error on names.h/types", "error/x-error" }
-};
-
-/*
- * XXX - how should we distinguish Java from C++?
- * The trick used in a Debian snapshot, of having "extends" or "implements"
- * as tags for Java, doesn't work very well, given that those keywords
- * are often preceded by "class", which flags it as C++.
- *
- * Perhaps we need to be able to say
- *
- * If "class" then
- *
- * if "extends" or "implements" then
- * Java
- * else
- * C++
- * endif
- *
- * Or should we use other keywords, such as "package" or "import"?
- * Unfortunately, Ada95 uses "package", and Modula-3 uses "import",
- * although I infer from the language spec at
- *
- * http://www.research.digital.com/SRC/m3defn/html/m3.html
- *
- * that Modula-3 uses "IMPORT" rather than "import", i.e. it must be
- * in all caps.
- *
- * So, for now, we go with "import". We must put it before the C++
- * stuff, so that we don't misidentify Java as C++. Not using "package"
- * means we won't identify stuff that defines a package but imports
- * nothing; hopefully, very little Java code imports nothing (one of the
- * reasons for doing OO programming is to import as much as possible
- * and write only what you need to, right?).
- *
- * Unfortunately, "import" may cause us to misidentify English text
- * as Java, as it comes after "the" and "The". Perhaps we need a fancier
- * heuristic to identify Java?
- */
-static const struct names {
- char name[14];
- unsigned char type;
- unsigned char score;
-
-} names[] = {
- /* These must be sorted by eye for optimal hit rate */
- /* Add to this list only after substantial meditation */
- {"msgid", L_PO, 1 },
- {"dnl", L_M4, 2 },
- {"import", L_JAVA, 2 },
- {"\"libhdr\"", L_BCPL, 2 },
- {"\"LIBHDR\"", L_BCPL, 2 },
- {"//", L_CC, 2 },
- {"template", L_CC, 1 },
- {"virtual", L_CC, 1 },
- {"class", L_CC, 2 },
- {"public:", L_CC, 2 },
- {"private:", L_CC, 2 },
- {"/*", L_C, 2 }, /* must precede "The", "the", etc. */
- {"#include", L_C, 2 },
- {"char", L_C, 2 },
- {"The", L_ENG, 2 },
- {"the", L_ENG, 2 },
- {"double", L_C, 1 },
- {"extern", L_C, 2 },
- {"float", L_C, 1 },
- {"struct", L_C, 1 },
- {"union", L_C, 1 },
- {"main(", L_C, 2 },
- {"CFLAGS", L_MAKE, 2 },
- {"LDFLAGS", L_MAKE, 2 },
- {"all:", L_MAKE, 2 },
- {".PRECIOUS", L_MAKE, 2 },
- {".ascii", L_MACH, 2 },
- {".asciiz", L_MACH, 2 },
- {".byte", L_MACH, 2 },
- {".even", L_MACH, 2 },
- {".globl", L_MACH, 2 },
- {".text", L_MACH, 2 },
- {"clr", L_MACH, 2 },
- {"(input,", L_PAS, 2 },
- {"program", L_PAS, 1 },
- {"record", L_PAS, 1 },
- {"dcl", L_PLI, 2 },
- {"Received:", L_MAIL, 2 },
- {">From", L_MAIL, 2 },
- {"Return-Path:",L_MAIL, 2 },
- {"Cc:", L_MAIL, 2 },
- {"Newsgroups:", L_NEWS, 2 },
- {"Path:", L_NEWS, 2 },
- {"Organization:",L_NEWS, 2 },
- {"href=", L_HTML, 2 },
- {"HREF=", L_HTML, 2 },
- {"<body", L_HTML, 2 },
- {"<BODY", L_HTML, 2 },
- {"<html", L_HTML, 2 },
- {"<HTML", L_HTML, 2 },
- {"<!--", L_HTML, 2 },
-};
-#define NNAMES (sizeof(names)/sizeof(struct names))