]> granicus.if.org Git - php/commitdiff
Refactor levenshtein()
authorGeorge Peter Banyard <girgias@php.net>
Mon, 6 Jul 2020 17:04:52 +0000 (19:04 +0200)
committerGeorge Peter Banyard <girgias@php.net>
Mon, 6 Jul 2020 22:41:10 +0000 (00:41 +0200)
Closes GH-5816

ext/standard/basic_functions.stub.php
ext/standard/basic_functions_arginfo.h
ext/standard/levenshtein.c
ext/standard/tests/strings/levenshtein.phpt
ext/standard/tests/strings/levenshtein_bug_16473.phpt [new file with mode: 0644]
ext/standard/tests/strings/levenshtein_bug_6562.phpt [new file with mode: 0644]
ext/standard/tests/strings/levenshtein_bug_7368.phpt [new file with mode: 0644]
ext/standard/tests/strings/levenshtein_error_conditions.phpt [new file with mode: 0644]

index 2b6e7709e6d081d2785eb7b06f7c86d5210e1ebe..f871a7633a293726a97d150c84da13efea1e5452 100755 (executable)
@@ -985,7 +985,7 @@ function iptcparse(string $iptcblock): array|false {}
 
 /* levenshtein.c */
 
-function levenshtein(string $str1, string $str2, $cost_ins = UNKNOWN, int $cost_rep = UNKNOWN, int $cost_del = UNKNOWN): int {}
+function levenshtein(string $str1, string $str2, int $cost_ins = 1, int $cost_rep = 1, int $cost_del = 1): int {}
 
 /* link.c */
 
index e6f5b00771efc9a2f107617d74d87088a53de7ee..a4b70b23bb8238a167ea579ca555afde5b0fe941 100755 (executable)
@@ -1,5 +1,5 @@
 /* This is a generated file, edit the .stub.php file instead.
- * Stub hash: 9cf2c691c081d9aaefaa9d22337a9e00efb0af77 */
+ * Stub hash: 05b740207a70a9d272b4c327882fd0b52016a0af */
 
 ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_set_time_limit, 0, 1, _IS_BOOL, 0)
        ZEND_ARG_TYPE_INFO(0, seconds, IS_LONG, 0)
@@ -1568,9 +1568,9 @@ ZEND_END_ARG_INFO()
 ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_levenshtein, 0, 2, IS_LONG, 0)
        ZEND_ARG_TYPE_INFO(0, str1, IS_STRING, 0)
        ZEND_ARG_TYPE_INFO(0, str2, IS_STRING, 0)
-       ZEND_ARG_INFO(0, cost_ins)
-       ZEND_ARG_TYPE_INFO(0, cost_rep, IS_LONG, 0)
-       ZEND_ARG_TYPE_INFO(0, cost_del, IS_LONG, 0)
+       ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_ins, IS_LONG, 0, "1")
+       ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_rep, IS_LONG, 0, "1")
+       ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_del, IS_LONG, 0, "1")
 ZEND_END_ARG_INFO()
 
 #if defined(HAVE_SYMLINK) || defined(PHP_WIN32)
index 75f317fc61a60b66362e6cdd0cdabc54be60e12f..181674f4d287573e436d804ae55cccfa11cb3878 100644 (file)
  */
 
 #include "php.h"
-#include <stdlib.h>
-#include <errno.h>
-#include <ctype.h>
 #include "php_string.h"
 
 #define LEVENSHTEIN_MAX_LENGTH 255
 
 /* {{{ reference_levdist
  * reference implementation, only optimized for memory usage, not speed */
-static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, size_t l2, zend_long cost_ins, zend_long cost_rep, zend_long cost_del )
+static zend_long reference_levdist(const zend_string *string1, const zend_string *string2, zend_long cost_ins, zend_long cost_rep, zend_long cost_del )
 {
        zend_long *p1, *p2, *tmp;
        zend_long c0, c1, c2;
        size_t i1, i2;
 
-       if (l1 == 0) {
-               return l2 * cost_ins;
+       if (ZSTR_LEN(string1) == 0) {
+               return ZSTR_LEN(string2) * cost_ins;
        }
-       if (l2 == 0) {
-               return l1 * cost_del;
+       if (ZSTR_LEN(string2) == 0) {
+               return ZSTR_LEN(string1) * cost_del;
        }
 
-       if ((l1 > LEVENSHTEIN_MAX_LENGTH) || (l2 > LEVENSHTEIN_MAX_LENGTH)) {
-               return -1;
-       }
-       p1 = safe_emalloc((l2 + 1), sizeof(zend_long), 0);
-       p2 = safe_emalloc((l2 + 1), sizeof(zend_long), 0);
+       p1 = safe_emalloc((ZSTR_LEN(string2) + 1), sizeof(zend_long), 0);
+       p2 = safe_emalloc((ZSTR_LEN(string2) + 1), sizeof(zend_long), 0);
 
-       for (i2 = 0; i2 <= l2; i2++) {
+       for (i2 = 0; i2 <= ZSTR_LEN(string2); i2++) {
                p1[i2] = i2 * cost_ins;
        }
-       for (i1 = 0; i1 < l1 ; i1++) {
+       for (i1 = 0; i1 < ZSTR_LEN(string1) ; i1++) {
                p2[0] = p1[0] + cost_del;
 
-               for (i2 = 0; i2 < l2; i2++) {
-                       c0 = p1[i2] + ((s1[i1] == s2[i2]) ? 0 : cost_rep);
+               for (i2 = 0; i2 < ZSTR_LEN(string2); i2++) {
+                       c0 = p1[i2] + ((ZSTR_VAL(string1)[i1] == ZSTR_VAL(string2)[i2]) ? 0 : cost_rep);
                        c1 = p1[i2 + 1] + cost_del;
                        if (c1 < c0) {
                                c0 = c1;
@@ -65,7 +59,7 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si
                p1 = p2;
                p2 = tmp;
        }
-       c0 = p1[l2];
+       c0 = p1[ZSTR_LEN(string2)];
 
        efree(p1);
        efree(p2);
@@ -74,56 +68,31 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si
 }
 /* }}} */
 
-/* {{{ custom_levdist */
-static int custom_levdist(char *str1, char *str2, char *callback_name)
-{
-       php_error_docref(NULL, E_WARNING, "The general Levenshtein support is not there yet");
-       /* not there yet */
-
-       return -1;
-}
-/* }}} */
-
 /* {{{ Calculate Levenshtein distance between two strings */
 PHP_FUNCTION(levenshtein)
 {
-       int argc = ZEND_NUM_ARGS();
-       char *str1, *str2;
-       char *callback_name;
-       size_t str1_len, str2_len, callback_len;
-       zend_long cost_ins, cost_rep, cost_del;
-       zend_long distance = -1;
-
-       switch (argc) {
-               case 2: /* just two strings: use maximum performance version */
-                       if (zend_parse_parameters(2, "ss", &str1, &str1_len, &str2, &str2_len) == FAILURE) {
-                               RETURN_THROWS();
-                       }
-                       distance = reference_levdist(str1, str1_len, str2, str2_len, 1, 1, 1);
-                       break;
-
-               case 5: /* more general version: calc cost by ins/rep/del weights */
-                       if (zend_parse_parameters(5, "sslll", &str1, &str1_len, &str2, &str2_len, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
-                               RETURN_THROWS();
-                       }
-                       distance = reference_levdist(str1, str1_len, str2, str2_len, cost_ins, cost_rep, cost_del);
-                       break;
-
-               case 3: /* most general version: calc cost by user-supplied function */
-                       if (zend_parse_parameters(3, "sss", &str1, &str1_len, &str2, &str2_len, &callback_name, &callback_len) == FAILURE) {
-                               RETURN_THROWS();
-                       }
-                       distance = custom_levdist(str1, str2, callback_name);
-                       break;
+       zend_string *string1, *string2;
+       zend_long cost_ins = 1;
+       zend_long cost_rep = 1;
+       zend_long cost_del = 1;
+       zend_long distance = 0;
+
+       if (zend_parse_parameters(ZEND_NUM_ARGS(), "SS|lll", &string1, &string2, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
+               RETURN_THROWS();
+       }
 
-               default:
-                       WRONG_PARAM_COUNT;
+       if (ZSTR_LEN(string1) > LEVENSHTEIN_MAX_LENGTH) {
+               zend_argument_value_error(1, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1);
+               RETURN_THROWS();
        }
 
-       if (distance < 0 && /* TODO */ ZEND_NUM_ARGS() != 3) {
-               php_error_docref(NULL, E_WARNING, "Argument string(s) too long");
+       if (ZSTR_LEN(string2) > LEVENSHTEIN_MAX_LENGTH) {
+               zend_argument_value_error(2, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1);
+               RETURN_THROWS();
        }
 
+       distance = reference_levdist(string1, string2, cost_ins, cost_rep, cost_del);
+
        RETURN_LONG(distance);
 }
 /* }}} */
index b4b7c03e7f5800ea264e9d9648545db9f772eb35..41b559f8cbdf983440b325b616b9a91043ebf3a6 100644 (file)
@@ -3,76 +3,61 @@ levenshtein() function test
 --FILE--
 <?php
 
-function test_me($title,$expect,$text1,$text2,$cost1="",$cost2="",$cost3="") {
-
-  if ($cost1=="") {
-          $result=levenshtein($text1,$text2);
-  }
-  elseif ($cost2=="") {
-          $result=levenshtein($text1,$text2,$cost1);
-  }
-  else  {
-    $result=levenshtein($text1,$text2,$cost1,$cost2,$cost3);
-  }
-        if($result==$expect) return 0;
-
-        echo "$title: result is $result instead of $expect ";
-        echo "for '$text1'/'$text2' ";
-        if($cost1) echo "($cost1:$cost2:$cost3)";
-        echo "\n";
-
-        return 1;
-}
-
-$n=0;
-
-$n += test_me("equal"      , 0, "12345", "12345");
-$n += test_me("1st empty"  , 3,      "",   "xzy");
-$n += test_me("2nd empty"  , 3,   "xzy",      "");
-$n += test_me("both empty" , 0,      "",      "");
-$n += test_me("1 char"     , 1,     "1",     "2");
-$n += test_me("2 char swap", 2,    "12",    "21");
-
-$n += test_me("inexpensive delete",  2, "2121",   "11", 2, 1, 1);
-$n += test_me("expensive delete"  , 10, "2121",   "11", 2, 1, 5);
-$n += test_me("inexpensive insert",  2,   "11", "2121", 1, 1, 1);
-$n += test_me("expensive insert"  , 10,   "11", "2121", 5, 1, 1);
-
-$n += test_me("expensive replace"     , 3, "111", "121", 2, 3, 2);
-$n += test_me("very expensive replace", 4, "111", "121", 2, 9, 2);
-
-$n += test_me("bug #7368", 2,     "13458",    "12345");
-$n += test_me("bug #7368", 2,      "1345",     "1234");
-
-$n += test_me("bug #6562", 1,    "debugg",    "debug");
-$n += test_me("bug #6562", 1,    "ddebug",    "debug");
-$n += test_me("bug #6562", 2,   "debbbug",    "debug");
-$n += test_me("bug #6562", 1, "debugging", "debuging");
-
-$n += test_me("bug #16473", 2, "a", "bc");
-$n += test_me("bug #16473", 2, "xa", "xbc");
-$n += test_me("bug #16473", 2, "xax", "xbcx");
-$n += test_me("bug #16473", 2, "ax", "bcx");
-
-$n += test_me("custom", -1, "111", "121", "my_levcode");
-$n += test_me("lt maxlength1", 254, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu", "A");
-$n += test_me("gt maxlength1", -1, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv", "A");
-
-$n += test_me("lt maxlength2", 254, "A", "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu");
-$n += test_me("gt maxlength2", -1, "A",  "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv");
-
-echo ($n==0)?"all passed\n":"$n failed\n";
-
-var_dump(levenshtein("", "", -1, -1, -1));
-var_dump(levenshtein("", "", 10, 10, 10));
+echo '--- Equal ---' . \PHP_EOL;
+var_dump(levenshtein('12345', '12345'));
+
+echo '--- First string empty ---' . \PHP_EOL;
+var_dump(levenshtein('', 'xyz'));
+echo '--- Second string empty ---' . \PHP_EOL;
+var_dump(levenshtein('xyz', ''));
+echo '--- Both empty ---' . \PHP_EOL;
+var_dump(levenshtein('', ''));
+var_dump(levenshtein('', '', 10, 10, 10));
+
+echo '--- 1 character ---' . \PHP_EOL;
+var_dump(levenshtein('1', '2'));
+echo '--- 2 character swapped ---' . \PHP_EOL;
+var_dump(levenshtein('12', '21'));
+
+echo '--- Inexpensive deletion ---' . \PHP_EOL;
+var_dump(levenshtein('2121', '11', 2));
+echo '--- Expensive deletion ---' . \PHP_EOL;
+var_dump(levenshtein('2121', '11', 2, 1, 5));
+
+echo '--- Inexpensive insertion ---' . \PHP_EOL;
+var_dump(levenshtein('11', '2121'));
+echo '--- Expensive insertion ---' . \PHP_EOL;
+var_dump(levenshtein('11', '2121', 5));
+
+echo '--- Expensive replacement ---' . \PHP_EOL;
+var_dump(levenshtein('111', '121', 2, 3, 2));
+echo '--- Very expensive replacement ---' . \PHP_EOL;
+var_dump(levenshtein('111', '121', 2, 9, 2));
 
 ?>
---EXPECTF--
-Warning: levenshtein(): The general Levenshtein support is not there yet in %s on line %d
-
-Warning: levenshtein(): Argument string(s) too long in %s on line %d
-
-Warning: levenshtein(): Argument string(s) too long in %s on line %d
-all passed
+--EXPECT--
+--- Equal ---
+int(0)
+--- First string empty ---
+int(3)
+--- Second string empty ---
+int(3)
+--- Both empty ---
 int(0)
 int(0)
+--- 1 character ---
+int(1)
+--- 2 character swapped ---
+int(2)
+--- Inexpensive deletion ---
+int(2)
+--- Expensive deletion ---
+int(10)
+--- Inexpensive insertion ---
+int(2)
+--- Expensive insertion ---
+int(10)
+--- Expensive replacement ---
+int(3)
+--- Very expensive replacement ---
+int(4)
diff --git a/ext/standard/tests/strings/levenshtein_bug_16473.phpt b/ext/standard/tests/strings/levenshtein_bug_16473.phpt
new file mode 100644 (file)
index 0000000..94c2650
--- /dev/null
@@ -0,0 +1,16 @@
+--TEST--
+levenshtein() bug 16473
+--FILE--
+<?php
+
+var_dump(levenshtein('a', 'bc'));
+var_dump(levenshtein('xa', 'xbc'));
+var_dump(levenshtein('xax', 'xbcx'));
+var_dump(levenshtein('ax', 'bcx'));
+
+?>
+--EXPECT--
+int(2)
+int(2)
+int(2)
+int(2)
diff --git a/ext/standard/tests/strings/levenshtein_bug_6562.phpt b/ext/standard/tests/strings/levenshtein_bug_6562.phpt
new file mode 100644 (file)
index 0000000..31f07f4
--- /dev/null
@@ -0,0 +1,16 @@
+--TEST--
+levenshtein() bug 6562
+--FILE--
+<?php
+
+var_dump(levenshtein('debugg', 'debug'));
+var_dump(levenshtein('ddebug', 'debug'));
+var_dump(levenshtein('debbbug', 'debug'));
+var_dump(levenshtein('debugging', 'debuging'));
+
+?>
+--EXPECT--
+int(1)
+int(1)
+int(2)
+int(1)
diff --git a/ext/standard/tests/strings/levenshtein_bug_7368.phpt b/ext/standard/tests/strings/levenshtein_bug_7368.phpt
new file mode 100644 (file)
index 0000000..ffbb7cd
--- /dev/null
@@ -0,0 +1,12 @@
+--TEST--
+levenshtein() bug 7368
+--FILE--
+<?php
+
+var_dump(levenshtein('13458', '12345'));
+var_dump(levenshtein('1345', '1234'));
+
+?>
+--EXPECT--
+int(2)
+int(2)
diff --git a/ext/standard/tests/strings/levenshtein_error_conditions.phpt b/ext/standard/tests/strings/levenshtein_error_conditions.phpt
new file mode 100644 (file)
index 0000000..bd1b807
--- /dev/null
@@ -0,0 +1,31 @@
+--TEST--
+levenshtein() error conditions
+--FILE--
+<?php
+
+echo '--- String 1 ---' . \PHP_EOL;
+var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu', 'A'));
+try {
+    var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv', 'A'));
+} catch (\ValueError $e) {
+    echo $e->getMessage() . \PHP_EOL;
+}
+echo '--- String 2 ---' . \PHP_EOL;
+var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu'));
+try {
+    var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv'));
+} catch (\ValueError $e) {
+    echo $e->getMessage() . \PHP_EOL;
+}
+
+// TODO ValueError for negative costs?
+// var_dump(levenshtein("", "", -1, -1, -1));
+
+?>
+--EXPECT--
+--- String 1 ---
+int(254)
+levenshtein(): Argument #1 ($str1) must be less than 256 characters
+--- String 2 ---
+int(254)
+levenshtein(): Argument #2 ($str2) must be less than 256 characters