mirror of
				https://github.com/catchorg/Catch2.git
				synced 2025-10-31 12:17:11 +01:00 
			
		
		
		
	Modify XML encoder to hex-encode invalid UTF-8 sequences
There are still some holes, e.g. we leave surrogate pairs be even though they are not a part of valid UTF-8, but this might be for the better -- WTF-8 does support surrogate pairs inside text. Closes #1207
This commit is contained in:
		| @@ -1,5 +1,4 @@ | ||||
| #include "catch.hpp" | ||||
|  | ||||
| #include "internal/catch_xmlwriter.h" | ||||
|  | ||||
| #include <sstream> | ||||
| @@ -10,7 +9,7 @@ inline std::string encode( std::string const& str, Catch::XmlEncode::ForWhat for | ||||
|     return oss.str(); | ||||
| } | ||||
|  | ||||
| TEST_CASE( "XmlEncode" ) { | ||||
| TEST_CASE( "XmlEncode", "[XML]" ) { | ||||
|     SECTION( "normal string" ) { | ||||
|         REQUIRE( encode( "normal string" ) == "normal string" ); | ||||
|     } | ||||
| @@ -38,4 +37,76 @@ TEST_CASE( "XmlEncode" ) { | ||||
|     SECTION( "string with control char (x7F)" ) { | ||||
|         REQUIRE( encode( "[\x7F]" ) == "[\\x7F]" ); | ||||
|     } | ||||
| } | ||||
| } | ||||
|  | ||||
| // Thanks to Peter Bindels (dascandy) for some of the tests | ||||
| TEST_CASE("XmlEncode: UTF-8", "[XML][UTF-8]") { | ||||
|     SECTION("Valid utf-8 strings") { | ||||
|         CHECK(encode(u8"Here be 👾") == u8"Here be 👾"); | ||||
|         CHECK(encode(u8"šš") == u8"šš"); | ||||
|  | ||||
|         CHECK(encode("\xDF\xBF")         == "\xDF\xBF"); // 0x7FF | ||||
|         CHECK(encode("\xE0\xA0\x80")     == "\xE0\xA0\x80"); // 0x800 | ||||
|         CHECK(encode("\xED\x9F\xBF")     == "\xED\x9F\xBF"); // 0xD7FF | ||||
|         CHECK(encode("\xEE\x80\x80")     == "\xEE\x80\x80"); // 0xE000 | ||||
|         CHECK(encode("\xEF\xBF\xBF")     == "\xEF\xBF\xBF"); // 0xFFFF | ||||
|         CHECK(encode("\xF0\x90\x80\x80") == "\xF0\x90\x80\x80"); // 0x10000 | ||||
|         CHECK(encode("\xF4\x8F\xBF\xBF") == "\xF4\x8F\xBF\xBF"); // 0x10FFFF | ||||
|     } | ||||
|     SECTION("Invalid utf-8 strings") { | ||||
|         SECTION("Various broken strings") { | ||||
|             CHECK(encode("Here \xFF be 👾") == u8"Here \\xFF be 👾"); | ||||
|             CHECK(encode("\xFF") == "\\xFF"); | ||||
|             CHECK(encode("\xC5\xC5\xA0") == u8"\\xC5Š"); | ||||
|             CHECK(encode("\xF4\x90\x80\x80") == u8"\\xF4\\x90\\x80\\x80"); // 0x110000 -- out of unicode range | ||||
|         } | ||||
|  | ||||
|         SECTION("Overlong encodings") { | ||||
|             CHECK(encode("\xC0\x80") == u8"\\xC0\\x80"); // \0 | ||||
|             CHECK(encode("\xF0\x80\x80\x80") == u8"\\xF0\\x80\\x80\\x80"); // Super-over-long \0 | ||||
|             CHECK(encode("\xC1\xBF") == u8"\\xC1\\xBF"); // ASCII char as UTF-8 (0x7F) | ||||
|             CHECK(encode("\xE0\x9F\xBF") == u8"\\xE0\\x9F\\xBF"); // 0x7FF | ||||
|             CHECK(encode("\xF0\x8F\xBF\xBF") == u8"\\xF0\\x8F\\xBF\\xBF"); // 0xFFFF | ||||
|         } | ||||
|  | ||||
|         // Note that we actually don't modify surrogate pairs, as we do not do strict checking | ||||
|         SECTION("Surrogate pairs") { | ||||
|             CHECK(encode("\xED\xA0\x80") == "\xED\xA0\x80"); // Invalid surrogate half 0xD800 | ||||
|             CHECK(encode("\xED\xAF\xBF") == "\xED\xAF\xBF"); // Invalid surrogate half 0xDBFF | ||||
|             CHECK(encode("\xED\xB0\x80") == "\xED\xB0\x80"); // Invalid surrogate half 0xDC00 | ||||
|             CHECK(encode("\xED\xBF\xBF") == "\xED\xBF\xBF"); // Invalid surrogate half 0xDFFF | ||||
|         } | ||||
|  | ||||
|         SECTION("Invalid start byte") { | ||||
|             CHECK(encode("\x80") == u8"\\x80"); | ||||
|             CHECK(encode("\x81") == u8"\\x81"); | ||||
|             CHECK(encode("\xBC") == u8"\\xBC"); | ||||
|             CHECK(encode("\xBF") == u8"\\xBF"); | ||||
|             // Out of range | ||||
|             CHECK(encode("\xF5\x80\x80\x80") == u8"\\xF5\\x80\\x80\\x80"); | ||||
|             CHECK(encode("\xF6\x80\x80\x80") == u8"\\xF6\\x80\\x80\\x80"); | ||||
|             CHECK(encode("\xF7\x80\x80\x80") == u8"\\xF7\\x80\\x80\\x80"); | ||||
|         } | ||||
|  | ||||
|         SECTION("Missing continuation byte(s)") { | ||||
|             // Missing first continuation byte | ||||
|             CHECK(encode("\xDE") == u8"\\xDE"); | ||||
|             CHECK(encode("\xDF") == u8"\\xDF"); | ||||
|             CHECK(encode("\xE0") == u8"\\xE0"); | ||||
|             CHECK(encode("\xEF") == u8"\\xEF"); | ||||
|             CHECK(encode("\xF0") == u8"\\xF0"); | ||||
|             CHECK(encode("\xF4") == u8"\\xF4"); | ||||
|  | ||||
|             // Missing second continuation byte | ||||
|             CHECK(encode("\xE0\x80") == u8"\\xE0\\x80"); | ||||
|             CHECK(encode("\xE0\xBF") == u8"\\xE0\\xBF"); | ||||
|             CHECK(encode("\xE1\x80") == u8"\\xE1\\x80"); | ||||
|             CHECK(encode("\xF0\x80") == u8"\\xF0\\x80"); | ||||
|             CHECK(encode("\xF4\x80") == u8"\\xF4\\x80"); | ||||
|  | ||||
|             // Missing third continuation byte | ||||
|             CHECK(encode("\xF0\x80\x80") == u8"\\xF0\\x80\\x80"); | ||||
|             CHECK(encode("\xF4\x80\x80") == u8"\\xF4\\x80\\x80"); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Martin Hořeňovský
					Martin Hořeňovský