From f2c2a4be9e466f14677089efe33e20ca0b146809 Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" <cmb@php.net> Date: Thu, 21 Jul 2016 18:36:12 +0200 Subject: [PATCH] Fix #72330: CSV fields incorrectly split if escape char followed by UTF chars We must not forget to properly reset the state for multibyte characters following an escape character. --- NEWS | 4 ++++ ext/standard/file.c | 1 + ext/standard/tests/file/bug72330.phpt | 26 ++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) create mode 100644 ext/standard/tests/file/bug72330.phpt diff --git a/NEWS b/NEWS index fb08c0213e..c2cae0af64 100644 --- a/NEWS +++ b/NEWS @@ -44,6 +44,10 @@ PHP NEWS . Fixed bug #72222 (ReflectionClass::export doesn't handle array constants). (Nikita Nefedov) +- Standard: + . Fixed bug #72330 (CSV fields incorrectly split if escape char followed by + UTF chars). (cmb) + - SPL: . Fixed bug #72122 (IteratorIterator breaks '@' error suppression). (kinglozzer) diff --git a/ext/standard/file.c b/ext/standard/file.c index f8c4e0450b..d8471fff1c 100644 --- a/ext/standard/file.c +++ b/ext/standard/file.c @@ -2219,6 +2219,7 @@ PHPAPI void php_fgetcsv(php_stream *stream, char delimiter, char enclosure, char memcpy(tptr, hunk_begin, bptr - hunk_begin); tptr += (bptr - hunk_begin); hunk_begin = bptr; + state = 0; break; default: bptr += inc_len; diff --git a/ext/standard/tests/file/bug72330.phpt b/ext/standard/tests/file/bug72330.phpt new file mode 100644 index 0000000000..843032ae2d --- /dev/null +++ b/ext/standard/tests/file/bug72330.phpt @@ -0,0 +1,26 @@ +--TEST-- +Bug #72330 (CSV fields incorrectly split if escape char followed by UTF chars) +--SKIPIF-- +<?php +if (setlocale(LC_ALL, "en_US.utf8", "en_AU.utf8", "ko_KR.utf8", "zh_CN.utf8", "de_DE.utf8", "es_EC.utf8", "fr_FR.utf8", "ja_JP.utf8", "el_GR.utf8", "nl_NL.utf8") === false) { + die('skip available locales not usable'); +} +?> +--FILE-- +<?php +setlocale(LC_ALL, "en_US.utf8", "en_AU.utf8", "ko_KR.utf8", "zh_CN.utf8", "de_DE.utf8", "es_EC.utf8", "fr_FR.utf8", "ja_JP.utf8", "el_GR.utf8", "nl_NL.utf8"); + +$utf_1 = chr(0xD1) . chr(0x81); // U+0440; +$utf_2 = chr(0xD8) . chr(0x80); // U+0600 + +$string = '"first #' . $utf_1 . $utf_2 . '";"second"'; +$fields = str_getcsv($string, ';', '"', "#"); +var_dump($fields); +?> +--EXPECT-- +array(2) { + [0]=> + string(11) "first #ÑØ" + [1]=> + string(6) "second" +} -- 2.40.0