From: Reuben Thomas Date: Thu, 8 Dec 2011 12:12:46 +0000 (+0000) Subject: Turn hardwired token matching into soft magic. See ChangeLog for details. X-Git-Tag: FILE5_10~15 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8af6ef037a378f51faeb80171237ac7cd4b18c34;p=file Turn hardwired token matching into soft magic. See ChangeLog for details. --- diff --git a/ChangeLog b/ChangeLog index 0b8f7870..a30292b7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2011-12-08 13:07 Reuben Thomas + + * Remove hardwired token finding (names.h), turning it into soft + magic. Patterns are either anchored regexes or search/8192. English + language detection and PL/1 detection have been removed as they + were too fragile. -e tokens is still accepted for backwards + compatibility. + * Move 3ds patterns (which are commented out anyway) into autodesk + (they were, oddly, in c-lang). + 2011-12-06 00:16 Reuben Thomas * Tweak strength of generic hash-bang detectors to be less than diff --git a/doc/file.man b/doc/file.man index 1e33d9d3..60af973c 100644 --- a/doc/file.man +++ b/doc/file.man @@ -1,4 +1,4 @@ -.\" $File: file.man,v 1.96 2011/07/12 11:23:38 rrt Exp $ +.\" $File: file.man,v 1.97 2011/10/17 20:18:05 christos Exp $ .Dd October 17, 2011 .Dt FILE __CSECTION__ .Os @@ -192,7 +192,7 @@ option). .It encoding Different text encodings for soft magic tests. .It tokens -Looks for known tokens inside text files. +Ignored for backwards compatibility. .It cdf Prints details of Compound Document Files. .It compress diff --git a/magic/Magdir/assembler b/magic/Magdir/assembler new file mode 100644 index 00000000..486a6870 --- /dev/null +++ b/magic/Magdir/assembler @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------------ +# $File$ +# assembler: file(1) magic for assembler source +# +0 regex \^\.asciiz\? 
assembler source text +!:mime text/x-asm +0 regex \^\.byte assembler source text +!:mime text/x-asm +0 regex \^\.even assembler source text +!:mime text/x-asm +0 regex \^\.globl assembler source text +!:mime text/x-asm +0 regex \^\.text assembler source text +!:mime text/x-asm diff --git a/magic/Magdir/c-lang b/magic/Magdir/c-lang index 3966c263..f79b77f6 100644 --- a/magic/Magdir/c-lang +++ b/magic/Magdir/c-lang @@ -1,21 +1,49 @@ - #------------------------------------------------------------------------------ -# $File$ -# c-lang: file(1) magic for C programs (or REXX) +# $File: c-lang,v 1.14 2009/09/19 16:28:08 christos Exp $ +# c-lang: file(1) magic for C and related languages programs # -# XPM icons (Greg Roelofs, newt@uchicago.edu) -# if you uncomment "/*" for C/REXX below, also uncomment this entry -#0 string /*\ XPM\ */ X pixmap image data -#!:mime image/x-xpmi +# BCPL +0 search/8192 "libhdr" BCPL source text +!:mime text/x-bcpl +0 search/8192 "LIBHDR" BCPL source text +!:mime text/x-bcpl -# 3DS (3d Studio files) Conflicts with diff output 0x3d '=' -#16 beshort 0x3d3d image/x-3ds +# C +0 regex \^#include C source text +!:mime text/x-c +0 regex \^char C source text +!:mime text/x-c +0 regex \^double C source text +!:mime text/x-c +0 regex \^extern C source text +!:mime text/x-c +0 regex \^float C source text +!:mime text/x-c +0 regex \^struct C source text +!:mime text/x-c +0 regex \^union C source text +!:mime text/x-c +0 search/8192 main( C source text +!:mime text/x-c -# this first will upset you if you're a PL/1 shop... 
-# in which case rm it; ascmagic will catch real C programs -#0 search/1 /* C or REXX program text -#0 search/1 // C++ program text +# C++ +# The strength of these rules is doubled so they beat the C rules above +0 regex \^template C++ source text +!:strength * 2 +!:mime text/x-c++ +0 regex \^virtual C++ source text +!:strength * 2 +!:mime text/x-c++ +0 regex \^class C++ source text +!:strength * 2 +!:mime text/x-c++ +0 regex \^public: C++ source text +!:strength * 2 +!:mime text/x-c++ +0 regex \^private: C++ source text +!:strength * 2 +!:mime text/x-c++ # From: Mikhail Teterin 0 string cscope cscope reference data diff --git a/magic/Magdir/cad b/magic/Magdir/cad index ec340d7a..fcca65b6 100644 --- a/magic/Magdir/cad +++ b/magic/Magdir/cad @@ -1,6 +1,6 @@ #------------------------------------------------------------------------------ -# $File: cad,v 1.9 2009/09/19 16:28:08 christos Exp $ +# $File: cad,v 1.10 2010/12/25 14:33:43 christos Exp $ # autocad: file(1) magic for cad files # @@ -113,3 +113,6 @@ 0 string AC1012 AutoDesk AutoCAD R13 0 string AC1014 AutoDesk AutoCAD R14 0 string AC1015 AutoDesk AutoCAD R2000 + +# 3DS (3d Studio files) Conflicts with diff output 0x3d '=' +#16 beshort 0x3d3d image/x-3ds diff --git a/magic/Magdir/gnu b/magic/Magdir/gnu index 1bdf9b26..fddab68f 100644 --- a/magic/Magdir/gnu +++ b/magic/Magdir/gnu @@ -1,6 +1,6 @@ #------------------------------------------------------------------------------ -# $File$ +# $File: gnu,v 1.11 2009/09/19 16:28:09 christos Exp $ # gnu: file(1) magic for various GNU tools # # GNU nlsutils message catalog file format @@ -42,3 +42,7 @@ # Files produced by GNU gettext 0 long 0xDE120495 GNU-format message catalog data 0 long 0x950412DE GNU-format message catalog data + +# gettext message catalogue +0 regex \^msgid\ GNU gettext message catalogue text +!:mime text/x-po diff --git a/magic/Magdir/images b/magic/Magdir/images index 621d3ded..21674afe 100644 --- a/magic/Magdir/images +++ b/magic/Magdir/images @@ 
-1,6 +1,6 @@ #------------------------------------------------------------------------------ -# $File: images,v 1.70 2010/11/25 15:00:12 christos Exp $ +# $File: images,v 1.71 2011/09/22 19:30:43 christos Exp $ # images: file(1) magic for image formats (see also "iff", and "c-lang" for # XPM bitmaps) # @@ -233,8 +233,8 @@ #0 string BA PC bitmap array data # XPM icons (Greg Roelofs, newt@uchicago.edu) -# note possible collision with C/REXX entry in c-lang; currently commented out 0 search/1 /*\ XPM\ */ X pixmap image text +!:mime image/x-xpmi # Utah Raster Toolkit RLE images (janl@ifi.uio.no) 0 leshort 0xcc52 RLE image data, diff --git a/magic/Magdir/java b/magic/Magdir/java index b5b43603..9a4cc4c9 100644 --- a/magic/Magdir/java +++ b/magic/Magdir/java @@ -1,6 +1,6 @@ #------------------------------------------------------------ -# $File$ +# $File: java,v 1.12 2009/09/19 16:28:10 christos Exp $ # Java ByteCode and Mach-O binaries (e.g., Mac OS X) use the # same magic number, 0xcafebabe, so they are both handled # in the entry called "cafebabe". 
@@ -24,3 +24,6 @@ >0 regex dey\n[0-9][0-9][0-9]\0 Dalvik dex file (optimized for host) >4 string >000 version %s +# Java source +0 regex ^import.*;$ Java source +!:mime text/x-java diff --git a/magic/Magdir/m4 b/magic/Magdir/m4 new file mode 100644 index 00000000..508f32dd --- /dev/null +++ b/magic/Magdir/m4 @@ -0,0 +1,6 @@ +#------------------------------------------------------------------------------ +# $File$ +# m4: file(1) magic for M4 scripts +# +0 regex \^dnl\ M4 macro processor script text +!:mime text/x-m4 diff --git a/magic/Magdir/mail.news b/magic/Magdir/mail.news index 2c3d55af..96fa069e 100644 --- a/magic/Magdir/mail.news +++ b/magic/Magdir/mail.news @@ -1,11 +1,9 @@ - #------------------------------------------------------------------------------ -# $File: mail.news,v 1.18 2010/11/25 15:00:12 christos Exp $ +# $File: mail.news,v 1.19 2011/01/25 13:55:57 christos Exp $ # mail.news: file(1) magic for mail and news # # Unfortunately, saved netnews also has From line added in some news software. #0 string From mail text -# There are tests to ascmagic.c to cope with mail and news. 
0 string/t Relay-Version: old news text !:mime message/rfc822 0 string/t #!\ rnews batched news text @@ -16,7 +14,9 @@ !:mime message/rfc822 0 string/t Pipe\ to mail piping text !:mime message/rfc822 -0 string/t Return-Path: smtp mail text +0 string/t Delivered-To: SMTP mail text +!:mime message/rfc822 +0 string/t Return-Path: SMTP mail text !:mime message/rfc822 0 string/t Path: news text !:mime message/news diff --git a/magic/Magdir/make b/magic/Magdir/make new file mode 100644 index 00000000..4ee5c1e1 --- /dev/null +++ b/magic/Magdir/make @@ -0,0 +1,15 @@ +#------------------------------------------------------------------------------ +# $File$ +# make: file(1) magic for makefiles +# +0 regex \^CFLAGS makefile script text +!:mime text/x-makefile +0 regex \^LDFLAGS makefile script text +!:mime text/x-makefile +0 regex \^all: makefile script text +!:mime text/x-makefile +0 regex \^.PRECIOUS makefile script text +!:mime text/x-makefile + +0 regex \^SUBDIRS automake makefile script text +!:mime text/x-makefile diff --git a/magic/Magdir/pascal b/magic/Magdir/pascal new file mode 100644 index 00000000..a95939c0 --- /dev/null +++ b/magic/Magdir/pascal @@ -0,0 +1,10 @@ +#------------------------------------------------------------------------------ +# $File$ +# pascal: file(1) magic for Pascal source +# +0 search/8192 (input, Pascal source text +!:mime text/x-pascal +0 regex \^program Pascal source text +!:mime text/x-pascal +0 regex \^record Pascal source text +!:mime text/x-pascal diff --git a/magic/Makefile.am b/magic/Makefile.am index 9de0acd9..9464acf8 100644 --- a/magic/Makefile.am +++ b/magic/Makefile.am @@ -1,5 +1,5 @@ # -# $File: Makefile.am,v 1.74 2011/11/10 18:59:54 christos Exp $ +# $File: Makefile.am,v 1.75 2011/11/25 03:28:17 christos Exp $ # MAGIC_FRAGMENT_BASE = Magdir MAGIC_DIR = $(top_srcdir)/magic @@ -22,6 +22,7 @@ $(MAGIC_FRAGMENT_DIR)/apl \ $(MAGIC_FRAGMENT_DIR)/apple \ $(MAGIC_FRAGMENT_DIR)/applix \ $(MAGIC_FRAGMENT_DIR)/archive \ 
+$(MAGIC_FRAGMENT_DIR)/assembler \ $(MAGIC_FRAGMENT_DIR)/asterix \ $(MAGIC_FRAGMENT_DIR)/att3b \ $(MAGIC_FRAGMENT_DIR)/audio \ @@ -115,10 +116,12 @@ $(MAGIC_FRAGMENT_DIR)/lisp \ $(MAGIC_FRAGMENT_DIR)/llvm \ $(MAGIC_FRAGMENT_DIR)/lua \ $(MAGIC_FRAGMENT_DIR)/luks \ +$(MAGIC_FRAGMENT_DIR)/m4 \ $(MAGIC_FRAGMENT_DIR)/mach \ $(MAGIC_FRAGMENT_DIR)/macintosh \ $(MAGIC_FRAGMENT_DIR)/magic \ $(MAGIC_FRAGMENT_DIR)/mail.news \ +$(MAGIC_FRAGMENT_DIR)/make \ $(MAGIC_FRAGMENT_DIR)/maple \ $(MAGIC_FRAGMENT_DIR)/marc21 \ $(MAGIC_FRAGMENT_DIR)/mathcad \ @@ -161,6 +164,7 @@ $(MAGIC_FRAGMENT_DIR)/osf1 \ $(MAGIC_FRAGMENT_DIR)/palm \ $(MAGIC_FRAGMENT_DIR)/parix \ $(MAGIC_FRAGMENT_DIR)/parrot \ +$(MAGIC_FRAGMENT_DIR)/pascal \ $(MAGIC_FRAGMENT_DIR)/pbm \ $(MAGIC_FRAGMENT_DIR)/pdf \ $(MAGIC_FRAGMENT_DIR)/pdp \ diff --git a/src/ascmagic.c b/src/ascmagic.c index f6da80ac..53c1920c 100644 --- a/src/ascmagic.c +++ b/src/ascmagic.c @@ -35,7 +35,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: ascmagic.c,v 1.81 2011/03/15 22:16:29 christos Exp $") +FILE_RCSID("@(#)$File: ascmagic.c,v 1.82 2011/09/20 15:30:14 christos Exp $") #endif /* lint */ #include "magic.h" @@ -46,13 +46,11 @@ FILE_RCSID("@(#)$File: ascmagic.c,v 1.81 2011/03/15 22:16:29 christos Exp $") #ifdef HAVE_UNISTD_H #include #endif -#include "names.h" #define MAXLINELEN 300 /* longest sane line length */ #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ || (x) == 0x85 || (x) == '\f') -private int ascmatch(const unsigned char *, const unichar *, size_t); private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t); private size_t trim_nuls(const unsigned char *, size_t); @@ -88,15 +86,12 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes, /* If file doesn't look like any sort of text, give up. 
*/ if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime, - &type) == 0) { + &type) == 0) rv = 0; - goto done; - } - - rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code, - type, text); + else + rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code, + type, text); - done: if (ubuf) free(ubuf); @@ -110,7 +105,6 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf, { unsigned char *utf8_buf = NULL, *utf8_end; size_t mlen, i; - const struct names *p; int rv = -1; int mime = ms->flags & MAGIC_MIME; @@ -125,7 +119,7 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf, int n_lf = 0; int n_cr = 0; int n_nel = 0; - int score, curtype, executable = 0; + int executable = 0; size_t last_line_end = (size_t)-1; int has_long_lines = 0; @@ -154,57 +148,10 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf, == NULL) goto done; if ((rv = file_softmagic(ms, utf8_buf, - (size_t)(utf8_end - utf8_buf), TEXTTEST, text)) != 0) - goto subtype_identified; - else + (size_t)(utf8_end - utf8_buf), TEXTTEST, text)) == 0) rv = -1; } - /* look for tokens from names.h - this is expensive! 
*/ - if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0) - goto subtype_identified; - - i = 0; - score = 0; - curtype = -1; - while (i < ulen) { - size_t end; - - /* skip past any leading space */ - while (i < ulen && ISSPC(ubuf[i])) - i++; - if (i >= ulen) - break; - - /* find the next whitespace */ - for (end = i + 1; end < nbytes; end++) - if (ISSPC(ubuf[end])) - break; - - /* compare the word thus isolated against the token list */ - for (p = names; p < names + NNAMES; p++) { - if (ascmatch((const unsigned char *)p->name, ubuf + i, - end - i)) { - if (curtype == -1) - curtype = p->type; - else if (curtype != p->type) { - score = p->score; - curtype = p->type; - } else - score += p->score; - if (score > 1) { - subtype = types[p->type].human; - subtype_mime = types[p->type].mime; - goto subtype_identified; - } - } - } - - i = end; - } - -subtype_identified: - /* Now try to discover other details about the file. */ for (i = 0; i < ulen; i++) { if (ubuf[i] == '\n') { @@ -356,22 +303,6 @@ done: return rv; } -private int -ascmatch(const unsigned char *s, const unichar *us, size_t ulen) -{ - size_t i; - - for (i = 0; i < ulen; i++) { - if (s[i] != us[i]) - return 0; - } - - if (s[i]) - return 0; - else - return 1; -} - /* * Encode Unicode string as UTF-8, returning pointer to character * after end of string, or NULL if an invalid character is found. 
diff --git a/src/file.c b/src/file.c index dab6f780..08fbb42b 100644 --- a/src/file.c +++ b/src/file.c @@ -32,7 +32,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: file.c,v 1.142 2011/02/03 01:57:33 christos Exp $") +FILE_RCSID("@(#)$File: file.c,v 1.144 2011/05/10 17:08:14 christos Exp $") #endif /* lint */ #include "magic.h" @@ -70,8 +70,6 @@ int getopt_long(int argc, char * const *argv, const char *optstring, const struc #include "mygetopt.h" #endif -#include "patchlevel.h" - #ifdef S_IFLNK #define FILE_FLAGS "-bchikLlNnprsvz0" #else @@ -118,7 +116,7 @@ private const struct { { "soft", MAGIC_NO_CHECK_SOFT }, { "tar", MAGIC_NO_CHECK_TAR }, { "text", MAGIC_NO_CHECK_TEXT }, /* synonym for ascii */ - { "tokens", MAGIC_NO_CHECK_TOKENS }, + { "tokens", MAGIC_NO_CHECK_TOKENS }, /* OBSOLETE: ignored for backwards compatibility */ }; private char *progname; /* used throughout */ @@ -251,8 +249,7 @@ main(int argc, char *argv[]) case 'v': if (magicfile == NULL) magicfile = magic_getpath(magicfile, action); - (void)fprintf(stdout, "%s-%d.%.2d\n", progname, - FILE_VERSION_MAJOR, patchlevel); + (void)fprintf(stdout, "%s-%s\n", progname, VERSION); (void)fprintf(stdout, "magic file from %s\n", magicfile); return 1; diff --git a/src/funcs.c b/src/funcs.c index 026f472e..b14b78cb 100644 --- a/src/funcs.c +++ b/src/funcs.c @@ -27,7 +27,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: funcs.c,v 1.57 2011/05/11 01:02:41 christos Exp $") +FILE_RCSID("@(#)$File: funcs.c,v 1.58 2011/09/20 15:30:14 christos Exp $") #endif /* lint */ #include "magic.h" @@ -253,7 +253,7 @@ file_buffer(struct magic_set *ms, int fd, const char *inname __attribute__ ((unu goto done; } - /* try text properties (and possibly text tokens) */ + /* try text properties */ if ((ms->flags & MAGIC_NO_CHECK_TEXT) == 0) { if ((m = file_ascmagic(ms, ubuf, nb, looks_text)) != 0) { diff --git a/src/names.h b/src/names.h deleted file mode 100644 index 6df4ea28..00000000 --- a/src/names.h +++ 
/dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Ian F. Darwin 1986-1995. - * Software written by Ian F. Darwin and others; - * maintained 1995-present by Christos Zoulas and others. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice immediately at the beginning of the file, without modification, - * this list of conditions, and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -/* - * Names.h - names and types used by ascmagic in file(1). - * These tokens are here because they can appear anywhere in - * the first HOWMANY bytes, while tokens in MAGIC must - * appear at fixed offsets into the file. Don't make HOWMANY - * too high unless you have a very fast CPU. 
- * - * $File: names.h,v 1.32 2008/02/11 00:19:29 rrt Exp $ - */ - -/* - modified by Chris Lowth - 9 April 2000 - to add mime type strings to the types table. -*/ - -/* these types are used to index the table 'types': keep em in sync! */ -#define L_C 0 /* first and foremost on UNIX */ -#define L_CC 1 /* Bjarne's postincrement */ -#define L_MAKE 2 /* Makefiles */ -#define L_PLI 3 /* PL/1 */ -#define L_MACH 4 /* some kinda assembler */ -#define L_ENG 5 /* English */ -#define L_PAS 6 /* Pascal */ -#define L_MAIL 7 /* Electronic mail */ -#define L_NEWS 8 /* Usenet Netnews */ -#define L_JAVA 9 /* Java code */ -#define L_HTML 10 /* HTML */ -#define L_BCPL 11 /* BCPL */ -#define L_M4 12 /* M4 */ -#define L_PO 13 /* PO */ - -static const struct { - char human[48]; - char mime[16]; -} types[] = { - { "C program", "text/x-c", }, - { "C++ program", "text/x-c++" }, - { "make commands", "text/x-makefile" }, - { "PL/1 program", "text/x-pl1" }, - { "assembler program", "text/x-asm" }, - { "English", "text/plain" }, - { "Pascal program", "text/x-pascal" }, - { "mail", "text/x-mail" }, - { "news", "text/x-news" }, - { "Java program", "text/x-java" }, - { "HTML document", "text/html", }, - { "BCPL program", "text/x-bcpl" }, - { "M4 macro language pre-processor", "text/x-m4" }, - { "PO (gettext message catalogue)", "text/x-po" }, - { "cannot happen error on names.h/types", "error/x-error" } -}; - -/* - * XXX - how should we distinguish Java from C++? - * The trick used in a Debian snapshot, of having "extends" or "implements" - * as tags for Java, doesn't work very well, given that those keywords - * are often preceded by "class", which flags it as C++. - * - * Perhaps we need to be able to say - * - * If "class" then - * - * if "extends" or "implements" then - * Java - * else - * C++ - * endif - * - * Or should we use other keywords, such as "package" or "import"? 
- * Unfortunately, Ada95 uses "package", and Modula-3 uses "import", - * although I infer from the language spec at - * - * http://www.research.digital.com/SRC/m3defn/html/m3.html - * - * that Modula-3 uses "IMPORT" rather than "import", i.e. it must be - * in all caps. - * - * So, for now, we go with "import". We must put it before the C++ - * stuff, so that we don't misidentify Java as C++. Not using "package" - * means we won't identify stuff that defines a package but imports - * nothing; hopefully, very little Java code imports nothing (one of the - * reasons for doing OO programming is to import as much as possible - * and write only what you need to, right?). - * - * Unfortunately, "import" may cause us to misidentify English text - * as Java, as it comes after "the" and "The". Perhaps we need a fancier - * heuristic to identify Java? - */ -static const struct names { - char name[14]; - unsigned char type; - unsigned char score; - -} names[] = { - /* These must be sorted by eye for optimal hit rate */ - /* Add to this list only after substantial meditation */ - {"msgid", L_PO, 1 }, - {"dnl", L_M4, 2 }, - {"import", L_JAVA, 2 }, - {"\"libhdr\"", L_BCPL, 2 }, - {"\"LIBHDR\"", L_BCPL, 2 }, - {"//", L_CC, 2 }, - {"template", L_CC, 1 }, - {"virtual", L_CC, 1 }, - {"class", L_CC, 2 }, - {"public:", L_CC, 2 }, - {"private:", L_CC, 2 }, - {"/*", L_C, 2 }, /* must precede "The", "the", etc. 
*/ - {"#include", L_C, 2 }, - {"char", L_C, 2 }, - {"The", L_ENG, 2 }, - {"the", L_ENG, 2 }, - {"double", L_C, 1 }, - {"extern", L_C, 2 }, - {"float", L_C, 1 }, - {"struct", L_C, 1 }, - {"union", L_C, 1 }, - {"main(", L_C, 2 }, - {"CFLAGS", L_MAKE, 2 }, - {"LDFLAGS", L_MAKE, 2 }, - {"all:", L_MAKE, 2 }, - {".PRECIOUS", L_MAKE, 2 }, - {".ascii", L_MACH, 2 }, - {".asciiz", L_MACH, 2 }, - {".byte", L_MACH, 2 }, - {".even", L_MACH, 2 }, - {".globl", L_MACH, 2 }, - {".text", L_MACH, 2 }, - {"clr", L_MACH, 2 }, - {"(input,", L_PAS, 2 }, - {"program", L_PAS, 1 }, - {"record", L_PAS, 1 }, - {"dcl", L_PLI, 2 }, - {"Received:", L_MAIL, 2 }, - {">From", L_MAIL, 2 }, - {"Return-Path:",L_MAIL, 2 }, - {"Cc:", L_MAIL, 2 }, - {"Newsgroups:", L_NEWS, 2 }, - {"Path:", L_NEWS, 2 }, - {"Organization:",L_NEWS, 2 }, - {"href=", L_HTML, 2 }, - {"HREF=", L_HTML, 2 }, - {"