From ca15b22212d3ea30c931e4cb7f4e1be869ed36fc Mon Sep 17 00:00:00 2001 From: jim winstead Date: Sat, 5 Jan 2002 20:46:43 +0000 Subject: [PATCH] New memcpy()-based wordwrap() implementation. The simple case (single-character break, no forced break) appears to be about 60% faster, and there's simply no comparison for non-simple cases with non-trivial amounts of text. The old algorithm was O(n^2) (with an unfortunately large constant factor) because of the use of strncat(), the new one is O(n). Added some more tests, too. @ - Made wordwrap() significantly faster. (Jim) # test case: $t = join('',file('ChangeLog')); $w = wordwrap($t,10,"\n",1); # new code completes in less than a second. i'm still waiting for the # old code to finish. --- ext/standard/string.c | 173 ++++++++++------------- ext/standard/tests/strings/wordwrap.phpt | 13 ++ 2 files changed, 88 insertions(+), 98 deletions(-) diff --git a/ext/standard/string.c b/ext/standard/string.c index 6b897346e0..3149130089 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -613,9 +613,11 @@ PHP_FUNCTION(ltrim) Wraps buffer to selected number of characters using string break char */ PHP_FUNCTION(wordwrap) { - char *text, *breakchar = "\n", *newtext; + const char *text, *breakchar = "\n"; + char *newtext; int textlen, breakcharlen = 1, newtextlen; - long linelength = 75, i = 0, l = 0, pgr = 0, last = 0; + long current = 0, laststart = 0, lastspace = 0; + long linelength = 75; zend_bool docut = 0; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|lsb", &text, &textlen, &linelength, &breakchar, &breakcharlen, &docut) == FAILURE) { @@ -634,121 +636,96 @@ PHP_FUNCTION(wordwrap) additional storage space */ if (breakcharlen == 1 && !docut) { newtext = estrndup(text, textlen); - while (newtext[i] != '\0') { - /* prescan line to see if it is greater than linelength */ - l = 0; - while (newtext[i+l] != breakchar[0]) { - if (newtext[i+l] == '\0') { - l--; - break; - } - l++; + laststart = lastspace = 0; + for (current = 0; current < textlen; current++) { + if (text[current] == breakchar[0]) { + laststart = lastspace = current; } - - if (l >= linelength) { - pgr = l; - l = linelength; - - /* needs breaking; work backwards to find previous word */ - while (l >= 0) { - if (newtext[i+l] == ' ') { - newtext[i+l] = breakchar[0]; - break; - } - l--; - } - - if (l == -1) { - /* couldn't break is backwards, try looking forwards */ - l = linelength; - while (l <= pgr) { - if(newtext[i+l] == ' ') { - newtext[i+l] = breakchar[0]; - break; - } - l++; - } + else if (text[current] == ' ') { + if (current - laststart >= linelength) { + newtext[current] = breakchar[0]; + laststart = current; } + lastspace = current; + } + else if (current - laststart >= linelength + && laststart != lastspace) { + newtext[lastspace] = breakchar[0]; + laststart = lastspace; } - - i += l + 1; } RETURN_STRINGL(newtext, textlen, 0); } else { - /* Multiple character line break */ - newtextlen = textlen * (breakcharlen + 1) + 1; + /* Multiple character line break or forced cut */ + if (linelength > 0) { + newtextlen = textlen + (textlen/linelength) * breakcharlen + 1; + } + else { + newtextlen = textlen * (breakcharlen + 1) + 1; + } newtext = emalloc(newtextlen); - newtext[0] = '\0'; - i = 0; - while (text[i] != '\0') { - - /* prescan line to see if it is greater than linelength */ - l = 0; - while (text[i+l] != '\0') { - if (text[i+l] == breakchar[0]) { - if (breakcharlen == 1 || !strncmp(text+i+l, breakchar, breakcharlen)) - break; - } - l++; + /* now keep track of the actual new text length */ + newtextlen = 0; + + laststart = lastspace = 0; + for (current = 0; current < textlen; current++) { + /* when we hit an existing break, copy to new buffer, and + * fix up laststart and lastspace */ + if (text[current] == breakchar[0] + && current + breakcharlen < textlen + && !strncmp(text+current, breakchar, breakcharlen)) { + memcpy(newtext+newtextlen, text+laststart, current-laststart+breakcharlen); + newtextlen += current-laststart+breakcharlen; + current += breakcharlen - 1; + laststart = lastspace = current + 1; } - - if (l >= linelength) { - pgr = l; - l = linelength; - - /* needs breaking; work backwards to find previous word */ - while (l >= 0) { - if (text[i+l] == ' ') { - strncat(newtext, text+last, i+l-last); - strncat(newtext, breakchar, breakcharlen); - last = i + l + 1; - break; - } - l--; + /* if it is a space, check if it is at the line boundary, + * copy and insert a break, or just keep track of it */ + else if (text[current] == ' ') { + if (current - laststart >= linelength) { + memcpy(newtext+newtextlen, text+laststart, current-laststart); + newtextlen += current - laststart; + memcpy(newtext+newtextlen, breakchar, breakcharlen); + newtextlen += breakcharlen; + laststart = current + 1; } - - if (l == -1) { - /* couldn't break it backwards, try looking forwards */ - l = linelength - 1; - while (l <= pgr) { - if (!docut) { - if (text[i+l] == ' ') { - strncat(newtext, text+last, i+l-last); - strncat(newtext, breakchar, breakcharlen); - last = i + l + 1; - ++l; - break; - } - } - /* cut if longer than allowed */ - else { - if (text[i+l] == ' ' || l > i-last) { - strncat(newtext, text+last, i+l-last+1); - strncat(newtext, breakchar, breakcharlen); - last = i + l + 1; - ++l; - break; - } - } - l++; - } - } - i += l + 1; + lastspace = current; } - else { - i += (l ? l : 1); + /* if we are cutting, and we've accumulated enough + * characters, copy and insert a break. */ + else if (current - laststart >= linelength && docut) { + memcpy(newtext+newtextlen, text+laststart, current-laststart); + newtextlen += current - laststart; + memcpy(newtext+newtextlen, breakchar, breakcharlen); + newtextlen += breakcharlen; + laststart = lastspace = current; + } + /* if the current word puts us over the linelength, copy + * back up until the last space, insert a break, and move + * up the laststart */ + else if (current - laststart >= linelength + && laststart < lastspace) { + memcpy(newtext+newtextlen, text+laststart, lastspace-laststart); + newtextlen += lastspace - laststart; + memcpy(newtext+newtextlen, breakchar, breakcharlen); + newtextlen += breakcharlen; + laststart = lastspace = lastspace + 1; } } - if (i + l > last) { - strncat(newtext, text+last, i+l-last); + /* copy over any stragglers */ + if (laststart != current) { + memcpy(newtext+newtextlen, text+laststart, current-laststart); + newtextlen += current - laststart; } - RETURN_STRINGL(newtext, strlen(newtext), 0); + newtext[newtextlen] = '\0'; + + RETURN_STRINGL(newtext, newtextlen, 0); } } /* }}} */ diff --git a/ext/standard/tests/strings/wordwrap.phpt b/ext/standard/tests/strings/wordwrap.phpt index 4e30fb41bb..7fa85f40f8 100644 --- a/ext/standard/tests/strings/wordwrap.phpt +++ b/ext/standard/tests/strings/wordwrap.phpt @@ -11,6 +11,19 @@ $tests = <<