From 90b2bd248bae54dbd2c2fec310fafb6a3ba07326 Mon Sep 17 00:00:00 2001 From: Christos Zoulas Date: Fri, 8 Oct 2010 21:58:44 +0000 Subject: [PATCH] add a simple scoring scheme to require more than one keyword matches for english words. --- src/ascmagic.c | 20 +++++++-- src/names.h | 107 +++++++++++++++++++++++++------------------------ 2 files changed, 71 insertions(+), 56 deletions(-) diff --git a/src/ascmagic.c b/src/ascmagic.c index 61646031..c86d1fea 100644 --- a/src/ascmagic.c +++ b/src/ascmagic.c @@ -36,7 +36,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: ascmagic.c,v 1.74 2008/11/07 19:10:25 christos Exp $") +FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $") #endif /* lint */ #include "magic.h" @@ -125,6 +125,7 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf, int n_lf = 0; int n_cr = 0; int n_nel = 0; + int score, curtype; size_t last_line_end = (size_t)-1; int has_long_lines = 0; @@ -161,6 +162,8 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf, goto subtype_identified; i = 0; + score = 0; + curtype = -1; while (i < ulen) { size_t end; @@ -179,9 +182,18 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf, for (p = names; p < names + NNAMES; p++) { if (ascmatch((const unsigned char *)p->name, ubuf + i, end - i)) { - subtype = types[p->type].human; - subtype_mime = types[p->type].mime; - goto subtype_identified; + if (curtype == -1) + curtype = p->type; + else if (curtype != p->type) { + score = p->score; + curtype = p->type; + } else + score += p->score; + if (score > 1) { + subtype = types[p->type].human; + subtype_mime = types[p->type].mime; + goto subtype_identified; + } } } diff --git a/src/names.h b/src/names.h index daf11681..6df4ea28 100644 --- a/src/names.h +++ b/src/names.h @@ -32,7 +32,7 @@ * appear at fixed offsets into the file. Don't make HOWMANY * too high unless you have a very fast CPU. * - * $File: names.h,v 1.31 2008/02/07 00:58:52 christos Exp $ + * $File: names.h,v 1.32 2008/02/11 00:19:29 rrt Exp $ */ /* @@ -115,59 +115,62 @@ static const struct { */ static const struct names { char name[14]; - short type; + unsigned char type; + unsigned char score; + } names[] = { /* These must be sorted by eye for optimal hit rate */ /* Add to this list only after substantial meditation */ - {"msgid", L_PO}, - {"dnl", L_M4}, - {"import", L_JAVA}, - {"\"libhdr\"", L_BCPL}, - {"\"LIBHDR\"", L_BCPL}, - {"//", L_CC}, - {"template", L_CC}, - {"virtual", L_CC}, - {"class", L_CC}, - {"public:", L_CC}, - {"private:", L_CC}, - {"/*", L_C}, /* must precede "The", "the", etc. */ - {"#include", L_C}, - {"char", L_C}, - {"The", L_ENG}, - {"the", L_ENG}, - {"double", L_C}, - {"extern", L_C}, - {"float", L_C}, - {"struct", L_C}, - {"union", L_C}, - {"CFLAGS", L_MAKE}, - {"LDFLAGS", L_MAKE}, - {"all:", L_MAKE}, - {".PRECIOUS", L_MAKE}, - {".ascii", L_MACH}, - {".asciiz", L_MACH}, - {".byte", L_MACH}, - {".even", L_MACH}, - {".globl", L_MACH}, - {".text", L_MACH}, - {"clr", L_MACH}, - {"(input,", L_PAS}, - {"program", L_PAS}, - {"record", L_PAS}, - {"dcl", L_PLI}, - {"Received:", L_MAIL}, - {">From", L_MAIL}, - {"Return-Path:",L_MAIL}, - {"Cc:", L_MAIL}, - {"Newsgroups:", L_NEWS}, - {"Path:", L_NEWS}, - {"Organization:",L_NEWS}, - {"href=", L_HTML}, - {"HREF=", L_HTML}, - {"From", L_MAIL, 2 }, + {"Return-Path:",L_MAIL, 2 }, + {"Cc:", L_MAIL, 2 }, + {"Newsgroups:", L_NEWS, 2 }, + {"Path:", L_NEWS, 2 }, + {"Organization:",L_NEWS, 2 }, + {"href=", L_HTML, 2 }, + {"HREF=", L_HTML, 2 }, + {"