granicus.if.org Git - file/commitdiff
Add a simple scoring scheme to require more than one keyword match for
author Christos Zoulas <christos@zoulas.com>
Fri, 8 Oct 2010 21:58:44 +0000 (21:58 +0000)
committer Christos Zoulas <christos@zoulas.com>
Fri, 8 Oct 2010 21:58:44 +0000 (21:58 +0000)
English words.

src/ascmagic.c
src/names.h

index 6164603171ef2d3dae36155c7096063534bcf08d..c86d1feaafce4ade46ddf77cce738a6f37430b35 100644 (file)
@@ -36,7 +36,7 @@
 #include "file.h"
 
 #ifndef        lint
-FILE_RCSID("@(#)$File: ascmagic.c,v 1.74 2008/11/07 19:10:25 christos Exp $")
+FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $")
 #endif /* lint */
 
 #include "magic.h"
@@ -125,6 +125,7 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
        int n_lf = 0;
        int n_cr = 0;
        int n_nel = 0;
+       int score, curtype;
 
        size_t last_line_end = (size_t)-1;
        int has_long_lines = 0;
@@ -161,6 +162,8 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
                goto subtype_identified;
 
        i = 0;
+       score = 0;
+       curtype = -1;
        while (i < ulen) {
                size_t end;
 
@@ -179,9 +182,18 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
                for (p = names; p < names + NNAMES; p++) {
                        if (ascmatch((const unsigned char *)p->name, ubuf + i,
                            end - i)) {
-                               subtype = types[p->type].human;
-                               subtype_mime = types[p->type].mime;
-                               goto subtype_identified;
+                               if (curtype == -1)
+                                       curtype = p->type;
+                               else if (curtype != p->type) {
+                                       score = p->score;
+                                       curtype = p->type;
+                               } else
+                                       score += p->score;
+                               if (score > 1) {
+                                       subtype = types[p->type].human;
+                                       subtype_mime = types[p->type].mime;
+                                       goto subtype_identified;
+                               }
                        }
                }
 
index daf116818c912703de017ab1ff40912c8e88c7ab..6df4ea288b57bd927d632a7f61acb8a47b435a67 100644 (file)
@@ -32,7 +32,7 @@
  * appear at fixed offsets into the file. Don't make HOWMANY
  * too high unless you have a very fast CPU.
  *
- * $File: names.h,v 1.31 2008/02/07 00:58:52 christos Exp $
+ * $File: names.h,v 1.32 2008/02/11 00:19:29 rrt Exp $
  */
 
 /*
@@ -115,59 +115,62 @@ static const struct {
  */
 static const struct names {
        char name[14];
-       short type;
+       unsigned char type;
+       unsigned char score;
+
 } names[] = {
        /* These must be sorted by eye for optimal hit rate */
        /* Add to this list only after substantial meditation */
-       {"msgid",       L_PO},
-       {"dnl",         L_M4},
-       {"import",      L_JAVA},
-       {"\"libhdr\"",  L_BCPL},
-       {"\"LIBHDR\"",  L_BCPL},
-       {"//",          L_CC},
-       {"template",    L_CC},
-       {"virtual",     L_CC},
-       {"class",       L_CC},
-       {"public:",     L_CC},
-       {"private:",    L_CC},
-       {"/*",          L_C},   /* must precede "The", "the", etc. */
-       {"#include",    L_C},
-       {"char",        L_C},
-       {"The",         L_ENG},
-       {"the",         L_ENG},
-       {"double",      L_C},
-       {"extern",      L_C},
-       {"float",       L_C},
-       {"struct",      L_C},
-       {"union",       L_C},
-       {"CFLAGS",      L_MAKE},
-       {"LDFLAGS",     L_MAKE},
-       {"all:",        L_MAKE},
-       {".PRECIOUS",   L_MAKE},
-       {".ascii",      L_MACH},
-       {".asciiz",     L_MACH},
-       {".byte",       L_MACH},
-       {".even",       L_MACH},
-       {".globl",      L_MACH},
-       {".text",       L_MACH},
-       {"clr",         L_MACH},
-       {"(input,",     L_PAS},
-       {"program",     L_PAS},
-       {"record",      L_PAS},
-       {"dcl",         L_PLI},
-       {"Received:",   L_MAIL},
-       {">From",       L_MAIL},
-       {"Return-Path:",L_MAIL},
-       {"Cc:",         L_MAIL},
-       {"Newsgroups:", L_NEWS},
-       {"Path:",       L_NEWS},
-       {"Organization:",L_NEWS},
-       {"href=",       L_HTML},
-       {"HREF=",       L_HTML},
-       {"<body",       L_HTML},
-       {"<BODY",       L_HTML},
-       {"<html",       L_HTML},
-       {"<HTML",       L_HTML},
-       {"<!--",        L_HTML},
+       {"msgid",       L_PO, 1 },
+       {"dnl",         L_M4, 2 },
+       {"import",      L_JAVA, 2 },
+       {"\"libhdr\"",  L_BCPL, 2 },
+       {"\"LIBHDR\"",  L_BCPL, 2 },
+       {"//",          L_CC, 2 },
+       {"template",    L_CC, 1 },
+       {"virtual",     L_CC, 1 },
+       {"class",       L_CC, 2 },
+       {"public:",     L_CC, 2 },
+       {"private:",    L_CC, 2 },
+       {"/*",          L_C, 2 },       /* must precede "The", "the", etc. */
+       {"#include",    L_C, 2 },
+       {"char",        L_C, 2 },
+       {"The",         L_ENG, 2 },
+       {"the",         L_ENG, 2 },
+       {"double",      L_C, 1 },
+       {"extern",      L_C, 2 },
+       {"float",       L_C, 1 },
+       {"struct",      L_C, 1 },
+       {"union",       L_C, 1 },
+       {"main(",       L_C, 2 },
+       {"CFLAGS",      L_MAKE, 2 },
+       {"LDFLAGS",     L_MAKE, 2 },
+       {"all:",        L_MAKE, 2 },
+       {".PRECIOUS",   L_MAKE, 2 },
+       {".ascii",      L_MACH, 2 },
+       {".asciiz",     L_MACH, 2 },
+       {".byte",       L_MACH, 2 },
+       {".even",       L_MACH, 2 },
+       {".globl",      L_MACH, 2 },
+       {".text",       L_MACH, 2 },
+       {"clr",         L_MACH, 2 },
+       {"(input,",     L_PAS, 2 },
+       {"program",     L_PAS, 1 },
+       {"record",      L_PAS, 1 },
+       {"dcl",         L_PLI, 2 },
+       {"Received:",   L_MAIL, 2 },
+       {">From",       L_MAIL, 2 },
+       {"Return-Path:",L_MAIL, 2 },
+       {"Cc:",         L_MAIL, 2 },
+       {"Newsgroups:", L_NEWS, 2 },
+       {"Path:",       L_NEWS, 2 },
+       {"Organization:",L_NEWS, 2 },
+       {"href=",       L_HTML, 2 },
+       {"HREF=",       L_HTML, 2 },
+       {"<body",       L_HTML, 2 },
+       {"<BODY",       L_HTML, 2 },
+       {"<html",       L_HTML, 2 },
+       {"<HTML",       L_HTML, 2 },
+       {"<!--",        L_HTML, 2 },
 };
 #define NNAMES (sizeof(names)/sizeof(struct names))