]> granicus.if.org Git - php/commitdiff
levenshtein() fixed, regression tests added (bug id #6562 and #7368)
authorHartmut Holzgraefe <hholzgra@php.net>
Wed, 13 Dec 2000 23:26:19 +0000 (23:26 +0000)
committerHartmut Holzgraefe <hholzgra@php.net>
Wed, 13 Dec 2000 23:26:19 +0000 (23:26 +0000)
# fallback to unoptimized version for 4.0.4 release

ext/standard/levenshtein.c
ext/standard/tests/general_functions/003.phpt [new file with mode: 0644]

index d157cf7fab71e500dd030df8bb95ddf5fe4f9c81..080c86a3271c12138c4dbd7c8de6fb8fa2ae9b88 100644 (file)
 #include <ctype.h>
 #include "php_string.h"
 
-/* faster, but obfuscated, all operations have a cost of 1 */
-static int fastest_levdist(const char *s1, const char *s2) 
+/* reference implementation, only optimized for memory usage, not speed */
+static int reference_levdist(const char *s1, int l1, 
+                                                                                                                const char *s2, int l2, 
+                                                                                                                int cost_ins, int cost_rep, int cost_del )
 {
-       register char *p1,*p2; 
-       register int i,j,n;
-       int l1=0,l2=0;
-       char r[512];
-       const char *tmp;
-
-       /* skip equal start sequence, if any */
-       while(*s1==*s2) {
-               if(!*s1) break;
-               s1++; s2++;
-       }
-       
-       /* if we already used up one string, then
-      the result is the length of the other */
-       if(*s1=='\0') return strlen(s2);
-       if(*s2=='\0') return strlen(s1);
-
-       /* length count */
-       while(*s1++) l1++;
-       while(*s2++) l2++;
-       
-       /* cut of equal tail sequence, if any */
-       while(*--s1 == *--s2) {
-               l1--; l2--;             
-       }
-       
-
-       /* reset pointers, adjust length */
-       s1-=l1++;
-       s2-=l2++;
+       int *p1,*p2,*tmp;
+       int i1,i2,c0,c1,c2;
        
+       if(l1==0) return l2*cost_ins;
+       if(l2==0) return l1*cost_del;
+
+       p1=malloc(l2*sizeof(int));
+       p2=malloc(l2*sizeof(int));
+
+       p1[0]=(s1[0]==s2[0])?0:cost_rep;
+
+       for(i2=1;i2<l2;i2++)
+               p1[i2]=i2*cost_ins;
+
+       for(i1=1;i1<l1;i1++)
+               {
+                       p2[0]=i1*cost_del;
+                       for(i2=1;i2<l2;i2++)
+                               {
+                                       c0=p1[i2-1]+((s1[i1]==s2[i2])?0:cost_rep);
+                                       c1=p1[i2]+cost_del; if(c1<c0) c0=c1;
+                                       c2=p2[i2-1]+cost_ins; if(c2<c0) c0=c2;                          
+                                       p2[i2]=c0;
+                               }
+                       tmp=p1; p1=p2; p2=tmp;
+               }
 
-       /* possible dist to great? */
-       if(abs(l1-l2)>=255) return -1;
+       c0=p1[l2-1];
 
-       /* swap if l2 longer than l1 */
-       if(l1<l2) {
-               tmp=s1; s1=s2; s2=tmp;
-               l1 ^= l2; l2 ^= l1; l1 ^= l2;
-       }
+       free(p1);
+       free(p2);
 
-       
-       /* fill initial row */
-       n=1;
-       for(i=0,p1=r;i<l1;i++,*p1++=n++,p1++) {/*empty*/}
-       
-       /* calc. rowwise */
-       for(j=1;j<l2;j++) {
-               /* init pointers and col#0 */
-               p1 = r + !(j&1);
-               p2 = r + (j&1);
-               n=*p1+1;
-               *p2++=n;p2++;
-               s2++;
-               
-               /* foreach column */
-               for(i=1;i<l1;i++) {
-                       if(*p1<n) n=*p1+(*(s1+i)!=*(s2)); /* replace cheaper than delete? */
-                       p1++;
-                       if(*++p1<n) n=*p1+1; /* insert cheaper then replace ? */
-                       *p2++=n++; /* update field and cost for next col's delete */
-                       p2++;
-               }       
-       }
-
-       /* return result */
-       return n-1;
+       return c0;
 }
 
 
-static int weighted_levdist( const char *s1
-                                                                                                                , const char *s2
-                                                                                                                , const int cost_ins
-                                                                                                                , const int cost_rep
-                                                                                                                , const int cost_del
-                                                                                                                ) 
-{
-       register int *p1,*p2; 
-       register int i,j,n,c;
-       int l1=0,l2=0;
-       int r[512];
-       const char *tmp;
-
-       /* skip equal start sequence, if any */
-       while(*s1==*s2) {
-               if(!*s1) break;
-               s1++; s2++;
-       }
-       
-       /* if we already used up one string, then
-      the result is the length of the other */
-       if(*s1=='\0') return strlen(s2);
-       if(*s2=='\0') return strlen(s1);
-
-       /* length count */
-       while(*s1++) l1++;
-       while(*s2++) l2++;
-       
-       /* cut of equal tail sequence, if any */
-       while(*--s1 == *--s2) {
-               l1--; l2--;             
-       }
-       
-
-       /* reset pointers, adjust length */
-       s1-=l1++;
-       s2-=l2++;
-       
-
-       /* possible dist to great? */
-       if(abs(l1-l2)>=255) return -1;
-
-       /* swap if l2 longer than l1 */
-       if(l1<l2) {
-               tmp=s1; s1=s2; s2=tmp;
-               l1 ^= l2; l2 ^= l1; l1 ^= l2;
-       }
-
-       if((l1==1)&&(l2==1)) {
-               n= cost_del+cost_ins;
-               return n<cost_rep?n:cost_rep;
-       }
-
-       /* fill initial row */
-       n=cost_ins;
-       for(i=0,p1=r;i<l1;i++,*p1++=n,p1++) {n+=cost_ins;}
-       
-       /* calc. rowwise */
-       for(j=1;j<l2;j++) {
-               /* init pointers and col#0 */
-               p1 = r + !(j&1);
-               p2 = r + (j&1);
-               n=*p1+cost_del;
-               *p2++=n;p2++;
-               s2++;
-               
-               /* foreach column */
-               for(i=1;i<l1;i++) {
-                       c = *p1; if(*(s1+i)!=*(s2)) c+=cost_rep;
-                       if(c<n) n=c; /* replace cheaper than delete? */
-                       p1++;
-                       c= *++p1+cost_ins;
-                       if(c<n) n=c; /* insert cheaper then replace ? */
-                       *p2++=n; /* update field and cost for next col's delete */
-                       n+=cost_del; /* update field and cost for next col's delete */
-                       p2++;
-               }       
-       }
-
-       /* return result */
-       return n-=cost_del;
-}
-
 static int custom_levdist(char *str1,char *str2,char *callback_name) 
 {
                php_error(E_WARNING,"the general Levenshtein support is not there yet");
@@ -202,8 +87,11 @@ PHP_FUNCTION(levenshtein)
                }
                convert_to_string_ex(str1);
                convert_to_string_ex(str2);
-               
-               distance = fastest_levdist((*str1)->value.str.val, (*str2)->value.str.val);
+
+               distance = reference_levdist((*str1)->value.str.val,(*str1)->value.str.len, 
+                                                                                                                                (*str2)->value.str.val,(*str2)->value.str.len,
+                                                                                                                                1,1,1);
+
                break;
 
        case 5: /* more general version: calc cost by ins/rep/del weights */
@@ -216,11 +104,11 @@ PHP_FUNCTION(levenshtein)
                convert_to_long_ex(cost_rep);
                convert_to_long_ex(cost_del);
                
-               distance = weighted_levdist((*str1)->value.str.val
-                                                                                                                               , (*str2)->value.str.val
-                                                                                                                               , (*cost_ins)->value.lval
-                                                                                                                               , (*cost_rep)->value.lval
-                                                                                                                               , (*cost_del)->value.lval
+               distance = reference_levdist((*str1)->value.str.val,(*str1)->value.str.len, 
+                                                                                                                                (*str2)->value.str.val,(*str2)->value.str.len,
+                                                                                                                                (*cost_ins)->value.lval,
+                                                                                                                                (*cost_rep)->value.lval,
+                                                                                                                                (*cost_del)->value.lval
                                                                                                                                );
                
                break;
diff --git a/ext/standard/tests/general_functions/003.phpt b/ext/standard/tests/general_functions/003.phpt
new file mode 100644 (file)
index 0000000..22cbd4e
--- /dev/null
@@ -0,0 +1,54 @@
+--TEST--
+levenshtein() function test
+--POST--
+--GET--
+--FILE--
+<?php 
+
+function test_me($title,$expect,$text1,$text2,$cost1=0,$cost2=0,$cost3=0) {
+
+  if($cost1==0) 
+         $result=levenshtein($text1,$text2);
+       else
+    $result=levenshtein($text1,$text2,$cost1,$cost2,$cost3);
+
+       if($result==$expect) return 0;
+
+       echo "$title: result is $result instead of $expect ";
+       echo "for '$text1'/'$text2' ";
+       if($cost1) echo "($cost1:$cost2:$cost3)";
+       echo "\n";
+       
+       return 1;               
+}
+
+$n=0;
+
+$n += test_me("equal"      , 0, "12345", "12345");
+$n += test_me("1st empty"  , 3,      "",   "xzy");
+$n += test_me("2nd empty"  , 3,   "xzy",      "");
+$n += test_me("both empty" , 0,      "",      "");
+$n += test_me("1 char"     , 1,     "1",     "2");
+$n += test_me("2 char swap", 2,    "12",    "21");
+
+$n += test_me("inexpensive delete",  2, "2121",   "11", 2, 1, 1);
+$n += test_me("expensive delete"  , 10, "2121",   "11", 2, 1, 5);
+$n += test_me("inexpensive insert",  2,   "11", "2121", 1, 1, 1);
+$n += test_me("expensive insert"  , 10,   "11", "2121", 5, 1, 1);
+
+$n += test_me("expensive replace"     , 3, "111", "121", 2, 3, 2);
+$n += test_me("very expensive replace", 4, "111", "121", 2, 9, 2);
+
+$n += test_me("bug #7368", 2,     "13458",    "12345");
+$n += test_me("bug #7368", 2,      "1345",     "1234");
+
+$n += test_me("bug #6562", 1,    "debugg",    "debug");
+$n += test_me("bug #6562", 1,    "ddebug",    "debug");
+$n += test_me("bug #6562", 2,   "debbbug",    "debug");
+$n += test_me("bug #6562", 1, "debugging", "debuging");
+
+echo ($n==0)?"all passed\n":"$n failed\n";
+
+?>
+--EXPECT--
+all passed