From 5cbbbd73a6acb6f96f5d6646aa7498d3dfb1706d Mon Sep 17 00:00:00 2001 From: Ammar Askar Date: Sun, 30 Jun 2019 01:54:43 -0400 Subject: [PATCH] bpo-29505: Add more fuzzing for re.compile, re.load and csv.reader (GH-14255) Add more fuzz testing for re.compile, re.load and csv.reader --- Lib/test/test_xxtestfuzz.py | 2 + .../dictionaries/fuzz_sre_compile.dict | 219 ++++++++++++++ .../fuzz_csv_reader_corpus/test.csv | Bin 0 -> 118 bytes .../fuzz_sre_compile_corpus/anchor_links | 1 + .../fuzz_sre_compile_corpus/characters | 1 + .../_xxtestfuzz/fuzz_sre_compile_corpus/isbn | 1 + .../fuzz_sre_compile_corpus/phone_number | 1 + Modules/_xxtestfuzz/fuzz_tests.txt | 3 + Modules/_xxtestfuzz/fuzzer.c | 281 +++++++++++++++++- 9 files changed, 493 insertions(+), 16 deletions(-) create mode 100644 Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict create mode 100644 Modules/_xxtestfuzz/fuzz_csv_reader_corpus/test.csv create mode 100644 Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links create mode 100644 Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters create mode 100644 Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn create mode 100644 Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number diff --git a/Lib/test/test_xxtestfuzz.py b/Lib/test/test_xxtestfuzz.py index 532f5fe72a..15924aaeff 100644 --- a/Lib/test/test_xxtestfuzz.py +++ b/Lib/test/test_xxtestfuzz.py @@ -16,6 +16,8 @@ class TestFuzzer(unittest.TestCase): _xxtestfuzz.run(b" ") _xxtestfuzz.run(b"x") _xxtestfuzz.run(b"1") + _xxtestfuzz.run(b"AAAAAAA") + _xxtestfuzz.run(b"AAAAAA\0") if __name__ == "__main__": diff --git a/Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict b/Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict new file mode 100644 index 0000000000..961306a879 --- /dev/null +++ b/Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict @@ -0,0 +1,219 @@ +"?" +"abc" +"()" +"[]" +"abc|def" +"abc|def|ghi" +"^xxx$" +"ab\\b\\d\\bcd" +"\\w|\\d" +"a*?" +"abc+" +"abc+?" +"xyz?" +"xyz??" 
+"xyz{0,1}" +"xyz{0,1}?" +"xyz{93}" +"xyz{1,32}" +"xyz{1,32}?" +"xyz{1,}" +"xyz{1,}?" +"a\\fb\\nc\\rd\\te\\vf" +"a\\nb\\bc" +"(?:foo)" +"(?: foo )" +"foo|(bar|baz)|quux" +"foo(?=bar)baz" +"foo(?!bar)baz" +"foo(?<=bar)baz" +"foo(?)" +"(?.)" +"(?.)\\k" diff --git a/Modules/_xxtestfuzz/fuzz_csv_reader_corpus/test.csv b/Modules/_xxtestfuzz/fuzz_csv_reader_corpus/test.csv new file mode 100644 index 0000000000000000000000000000000000000000..8b7887d0f1d2426354ec0d01fb06768604406dc0 GIT binary patch literal 118 zcmXwwNeX~45ClEv6~mkq8$tbmUm11KfN_oBf`3;C)}bkAs#j@sr5t^b;+GPOf(O5_Fsw=uRKQIfG-Vn@SIH^Pt&RC5#NlL@()@BrgB} literal 0 HcmV?d00001 diff --git a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links new file mode 100644 index 0000000000..d99247ccad --- /dev/null +++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links @@ -0,0 +1 @@ +XX] diff --git a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters new file mode 100644 index 0000000000..0c67ee7dfc --- /dev/null +++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters @@ -0,0 +1 @@ +XX^(Tim|Robert)\s+the\s+(Enchanter|Shrubber)$ diff --git a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn new file mode 100644 index 0000000000..cce8919e72 --- /dev/null +++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn @@ -0,0 +1 @@ +XX/((978[\--– ])?[0-9][0-9\--– ]{10}[\--– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])/ diff --git a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number new file mode 100644 index 0000000000..1e2efc5110 --- /dev/null +++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number @@ -0,0 +1 @@ +XX(\+1|1)?[ \-\.]?\(?(?[0-9]{3})\)?[ \-\.]?(?[0-9]{3})[ \-\.]?(?[0-9]{4})[ \.]*(ext|x)?[ \.]*(?[0-9]{0,5}) diff --git a/Modules/_xxtestfuzz/fuzz_tests.txt 
b/Modules/_xxtestfuzz/fuzz_tests.txt index f0121291ea..9d330a668e 100644 --- a/Modules/_xxtestfuzz/fuzz_tests.txt +++ b/Modules/_xxtestfuzz/fuzz_tests.txt @@ -2,3 +2,6 @@ fuzz_builtin_float fuzz_builtin_int fuzz_builtin_unicode fuzz_json_loads +fuzz_sre_compile +fuzz_sre_match +fuzz_csv_reader diff --git a/Modules/_xxtestfuzz/fuzzer.c b/Modules/_xxtestfuzz/fuzzer.c index e862a99cfb..16104e492a 100644 --- a/Modules/_xxtestfuzz/fuzzer.c +++ b/Modules/_xxtestfuzz/fuzzer.c @@ -81,8 +81,17 @@ static int fuzz_builtin_unicode(const char* data, size_t size) { #define MAX_JSON_TEST_SIZE 0x10000 -/* Initialized in LLVMFuzzerTestOneInput */ PyObject* json_loads_method = NULL; +/* Called by LLVMFuzzerTestOneInput for initialization */ +static int init_json_loads() { + /* Import json.loads */ + PyObject* json_module = PyImport_ImportModule("json"); + if (json_module == NULL) { + return 0; + } + json_loads_method = PyObject_GetAttrString(json_module, "loads"); + return json_loads_method != NULL; +} /* Fuzz json.loads(x) */ static int fuzz_json_loads(const char* data, size_t size) { /* Since python supports arbitrarily large ints in JSON, @@ -96,22 +105,227 @@ static int fuzz_json_loads(const char* data, size_t size) { return 0; } PyObject* parsed = PyObject_CallFunctionObjArgs(json_loads_method, input_bytes, NULL); + if (parsed == NULL) { + /* Ignore ValueError as the fuzzer will more than likely + generate some invalid json and values */ + if (PyErr_ExceptionMatches(PyExc_ValueError) || + /* Ignore RecursionError as the fuzzer generates long sequences of + arrays such as `[[[...` */ + PyErr_ExceptionMatches(PyExc_RecursionError) || + /* Ignore unicode errors, invalid byte sequences are common */ + PyErr_ExceptionMatches(PyExc_UnicodeDecodeError) + ) { + PyErr_Clear(); + } + } + Py_DECREF(input_bytes); + Py_XDECREF(parsed); + return 0; +} + +#define MAX_RE_TEST_SIZE 0x10000 + +PyObject* sre_compile_method = NULL; +PyObject* sre_error_exception = NULL; +int SRE_FLAG_DEBUG = 0; 
+/* Called by LLVMFuzzerTestOneInput for initialization */ +static int init_sre_compile() { + /* Import sre_compile.compile and sre.error */ + PyObject* sre_compile_module = PyImport_ImportModule("sre_compile"); + if (sre_compile_module == NULL) { + return 0; + } + sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile"); + if (sre_compile_method == NULL) { + return 0; + } + + PyObject* sre_constants = PyImport_ImportModule("sre_constants"); + if (sre_constants == NULL) { + return 0; + } + sre_error_exception = PyObject_GetAttrString(sre_constants, "error"); + if (sre_error_exception == NULL) { + return 0; + } + PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG"); + if (debug_flag == NULL) { + return 0; + } + SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag); + return 1; +} +/* Fuzz _sre.compile(x) */ +static int fuzz_sre_compile(const char* data, size_t size) { + /* Ignore really long regex patterns that will timeout the fuzzer */ + if (size > MAX_RE_TEST_SIZE) { + return 0; + } + /* We treat the first 2 bytes of the input as a number for the flags */ + if (size < 2) { + return 0; + } + uint16_t flags = ((uint16_t*) data)[0]; + /* We remove the SRE_FLAG_DEBUG if present. 
This is because it + prints to stdout which greatly decreases fuzzing speed */ + flags &= ~SRE_FLAG_DEBUG; + + /* Pull the pattern from the remaining bytes */ + PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2); + if (pattern_bytes == NULL) { + return 0; + } + PyObject* flags_obj = PyLong_FromUnsignedLong(flags); + if (flags_obj == NULL) { + Py_DECREF(pattern_bytes); + return 0; + } + + /* compiled = _sre.compile(data[2:], data[0:2]) */ + PyObject* compiled = PyObject_CallFunctionObjArgs( + sre_compile_method, pattern_bytes, flags_obj, NULL); /* Ignore ValueError as the fuzzer will more than likely - generate some invalid json and values */ - if (parsed == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) { + generate some invalid combination of flags */ + if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) { PyErr_Clear(); } - /* Ignore RecursionError as the fuzzer generates long sequences of - arrays such as `[[[...` */ - if (parsed == NULL && PyErr_ExceptionMatches(PyExc_RecursionError)) { + /* Ignore some common errors thrown by sre_parse: + Overflow, Assertion and Index */ + if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) || + PyErr_ExceptionMatches(PyExc_AssertionError) || + PyErr_ExceptionMatches(PyExc_IndexError)) + ) { PyErr_Clear(); } - /* Ignore unicode errors, invalid byte sequences are common */ - if (parsed == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { + /* Ignore re.error */ + if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) { PyErr_Clear(); } - Py_DECREF(input_bytes); - Py_XDECREF(parsed); + + Py_DECREF(pattern_bytes); + Py_DECREF(flags_obj); + Py_XDECREF(compiled); + return 0; +} + +/* Some random patterns used to test re.match. 
+ Be careful not to add catastrophically slow regexes here, we want to + exercise the matching code without causing timeouts.*/ +static const char* regex_patterns[] = { + ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]", + "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?", + "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$", + "(?:a*)*", "a{1,2}?" +}; +const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]); +PyObject** compiled_patterns = NULL; +/* Called by LLVMFuzzerTestOneInput for initialization */ +static int init_sre_match() { + PyObject* re_module = PyImport_ImportModule("re"); + if (re_module == NULL) { + return 0; + } + compiled_patterns = (PyObject**) PyMem_RawMalloc( + sizeof(PyObject*) * NUM_PATTERNS); + if (compiled_patterns == NULL) { + PyErr_NoMemory(); + return 0; + } + + /* Precompile all the regex patterns on the first run for faster fuzzing */ + for (size_t i = 0; i < NUM_PATTERNS; i++) { + PyObject* compiled = PyObject_CallMethod( + re_module, "compile", "y", regex_patterns[i]); + /* Bail if any of the patterns fail to compile */ + if (compiled == NULL) { + return 0; + } + compiled_patterns[i] = compiled; + } + return 1; +} +/* Fuzz re.match(x) */ +static int fuzz_sre_match(const char* data, size_t size) { + if (size < 1 || size > MAX_RE_TEST_SIZE) { + return 0; + } + /* Use the first byte as a uint8_t specifying the index of the + regex to use */ + unsigned char idx = (unsigned char) data[0]; + idx = idx % NUM_PATTERNS; + + /* Pull the string to match from the remaining bytes */ + PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1); + if (to_match == NULL) { + return 0; + } + + PyObject* pattern = compiled_patterns[idx]; + PyObject* match_callable = PyObject_GetAttrString(pattern, "match"); + + PyObject* matches = PyObject_CallFunctionObjArgs(match_callable, to_match, NULL); + + Py_XDECREF(matches); + Py_DECREF(match_callable); + Py_DECREF(to_match); + 
return 0; +} + +#define MAX_CSV_TEST_SIZE 0x10000 +PyObject* csv_module = NULL; +PyObject* csv_error = NULL; +/* Called by LLVMFuzzerTestOneInput for initialization */ +static int init_csv_reader() { + /* Import csv and csv.Error */ + csv_module = PyImport_ImportModule("csv"); + if (csv_module == NULL) { + return 0; + } + csv_error = PyObject_GetAttrString(csv_module, "Error"); + return csv_error != NULL; +} +/* Fuzz csv.reader([x]) */ +static int fuzz_csv_reader(const char* data, size_t size) { + if (size < 1 || size > MAX_CSV_TEST_SIZE) { + return 0; + } + /* Ignore non null-terminated strings since _csv can't handle + embedded nulls */ + if (memchr(data, '\0', size) == NULL) { + return 0; + } + + PyObject* s = PyUnicode_FromString(data); + /* Ignore exceptions until we have a valid string */ + if (s == NULL) { + PyErr_Clear(); + return 0; + } + + /* Split on \n so we can test multiple lines */ + PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n"); + if (lines == NULL) { + Py_DECREF(s); + return 0; + } + + PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines); + if (reader) { + /* Consume all of the reader as an iterator */ + PyObject* parsed_line; + while ((parsed_line = PyIter_Next(reader))) { + Py_DECREF(parsed_line); + } + } + + /* Ignore csv.Error because we're probably going to generate + some bad files (embedded new-lines, unterminated quotes etc) */ + if (PyErr_ExceptionMatches(csv_error)) { + PyErr_Clear(); + } + + Py_XDECREF(reader); + Py_DECREF(s); return 0; } @@ -152,12 +366,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { initialize CPython ourselves on the first run. 
*/ Py_InitializeEx(0); } -#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads) - if (json_loads_method == NULL) { - PyObject* json_module = PyImport_ImportModule("json"); - json_loads_method = PyObject_GetAttrString(json_module, "loads"); - } -#endif int rv = 0; @@ -171,7 +379,48 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { rv |= _run_fuzz(data, size, fuzz_builtin_unicode); #endif #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads) + static int JSON_LOADS_INITIALIZED = 0; + if (!JSON_LOADS_INITIALIZED && !init_json_loads()) { + PyErr_Print(); + abort(); + } else { + JSON_LOADS_INITIALIZED = 1; + } + rv |= _run_fuzz(data, size, fuzz_json_loads); +#endif +#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile) + static int SRE_COMPILE_INITIALIZED = 0; + if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) { + PyErr_Print(); + abort(); + } else { + SRE_COMPILE_INITIALIZED = 1; + } + + rv |= _run_fuzz(data, size, fuzz_sre_compile); +#endif +#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match) + static int SRE_MATCH_INITIALIZED = 0; + if (!SRE_MATCH_INITIALIZED && !init_sre_match()) { + PyErr_Print(); + abort(); + } else { + SRE_MATCH_INITIALIZED = 1; + } + + rv |= _run_fuzz(data, size, fuzz_sre_match); +#endif +#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader) + static int CSV_READER_INITIALIZED = 0; + if (!CSV_READER_INITIALIZED && !init_csv_reader()) { + PyErr_Print(); + abort(); + } else { + CSV_READER_INITIALIZED = 1; + } + + rv |= _run_fuzz(data, size, fuzz_csv_reader); #endif return rv; } -- 2.40.0