diff --git a/include/internal/catch_xmlwriter.cpp b/include/internal/catch_xmlwriter.cpp index a3316f46..221f1c63 100644 --- a/include/internal/catch_xmlwriter.cpp +++ b/include/internal/catch_xmlwriter.cpp @@ -7,51 +7,153 @@ #include "catch_xmlwriter.h" +#include "catch_enforce.h" + #include <iomanip> +using uchar = unsigned char; + namespace Catch { +namespace { + + // Total length in bytes (lead byte included, despite the name) of the utf-8 sequence announced by lead byte c + size_t trailingBytes(unsigned char c) { + if ((c & 0xE0) == 0xC0) { + return 2; + } + if ((c & 0xF0) == 0xE0) { + return 3; + } + if ((c & 0xF8) == 0xF0) { + return 4; + } + CATCH_INTERNAL_ERROR("Invalid multibyte utf-8 start byte encountered"); + } + + // Payload bits of lead byte c, i.e. the most significant bits of the encoded value + uint32_t headerValue(unsigned char c) { + if ((c & 0xE0) == 0xC0) { + return c & 0x1F; + } + if ((c & 0xF0) == 0xE0) { + return c & 0x0F; + } + if ((c & 0xF8) == 0xF0) { + return c & 0x07; + } + CATCH_INTERNAL_ERROR("Invalid multibyte utf-8 start byte encountered"); + } + + // Writes c to os as \xHH (two uppercase hex digits) and puts the stream's format flags and fill character back afterwards + void hexEscapeChar(std::ostream& os, unsigned char c) { + const std::ios_base::fmtflags savedFlags = os.flags(); + const char savedFill = os.fill(); + os << "\\x" + << std::uppercase << std::hex << std::setfill('0') << std::setw(2) + << static_cast<int>(c); + os.flags(savedFlags); + os.fill(savedFill); + } + +} // anonymous namespace + XmlEncode::XmlEncode( std::string const& str, ForWhat forWhat ) : m_str( str ), m_forWhat( forWhat ) {} void XmlEncode::encodeTo( std::ostream& os ) const { - // Apostrophe escaping not necessary if we always use " to write attributes // (see: http://www.w3.org/TR/xml/#syntax) + // Control characters and bytes that are not well-formed utf-8 are written as \xHH escapes - for( std::size_t i = 0; i < m_str.size(); ++ i ) { - char c = m_str[i]; - switch( c ) { - case '<': os << "&lt;"; break; - case '&': os << "&amp;"; break; + for( std::size_t idx = 0; idx < m_str.size(); ++ idx ) { + uchar c = m_str[idx]; + switch (c) { + case '<': os << "&lt;"; break; + case '&': os << "&amp;"; break; - case '>': - // See: http://www.w3.org/TR/xml/#syntax - if( i > 2 && m_str[i-1] == ']' && m_str[i-2] == ']' ) - os << "&gt;"; - else - os << c; + case '>': + // '>' must be escaped when it completes "]]>", including at the very start of the string (see: http://www.w3.org/TR/xml/#syntax) + if (idx >= 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']') + os << "&gt;"; + else + os << c; + break; + + case '\"': + if 
(m_forWhat == ForAttributes) + os << "&quot;"; + else + os << c; + break; + + default: + // Check for control characters and invalid utf-8 + + // Escape control characters in standard ascii + // see http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0 + if (c < 0x09 || (c > 0x0D && c < 0x20) || c == 0x7F) { + hexEscapeChar(os, c); break; + } - case '\"': - if( m_forWhat == ForAttributes ) - os << "&quot;"; - else - os << c; + // Plain ASCII: Write it to stream + if (c < 0x7F) { + os << c; break; + } - default: - // Escape control chars - based on contribution by @espenalb in PR #465 and - // by @mrpi PR #588 - if ( ( c >= 0 && c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c=='\x7F' ) { - // see http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0 - os << "\\x" << std::uppercase << std::hex << std::setfill('0') << std::setw(2) - << static_cast<int>( c ); - } - else - os << c; + // UTF-8 territory + // Check if the encoding is valid and if it is not, hex escape bytes. + // Important: We do not check the exact decoded values for validity, only the encoding format + // First check that this byte is a valid lead byte: + // This means that it is not encoded as 1111 1XXX + // Or as 10XX XXXX + if (c < 0xC0 || + c >= 0xF8) { + hexEscapeChar(os, c); + break; + } + + auto encBytes = trailingBytes(c); + // Are there enough bytes left to avoid accessing out-of-bounds memory? 
+ if (idx + encBytes - 1 >= m_str.size()) { + hexEscapeChar(os, c); + break; + } + // The header is valid, check data + // The remaining encBytes - 1 bytes must be valid utf-8 continuation bytes + // This means: bitpattern 10XX XXXX and the extracted value is sane (ish) + bool valid = true; + uint32_t value = headerValue(c); + for (std::size_t n = 1; n < encBytes; ++n) { + uchar nc = m_str[idx + n]; + valid &= ((nc & 0xC0) == 0x80); + value = (value << 6) | (nc & 0x3F); + } + + if ( + // Wrong bit pattern of following bytes + (!valid) || + // Overlong encodings (the value would fit in a shorter sequence) + (value < 0x80) || + (0x80 <= value && value < 0x800 && encBytes > 2) || + (0x800 <= value && value < 0x10000 && encBytes > 3) || + // Encoded value out of range + (value >= 0x110000) + ) { + hexEscapeChar(os, c); + break; + } + + // If we got here, this is in fact a valid(ish) utf-8 sequence + for (std::size_t n = 0; n < encBytes; ++n) { + os << m_str[idx + n]; + } + idx += encBytes - 1; + break; } } } diff --git a/include/internal/catch_xmlwriter.h b/include/internal/catch_xmlwriter.h index 76456f96..c4b1c035 100644 --- a/include/internal/catch_xmlwriter.h +++ b/include/internal/catch_xmlwriter.h @@ -56,7 +56,7 @@ namespace Catch { XmlWriter( std::ostream& os = Catch::cout() ); ~XmlWriter(); - + XmlWriter( XmlWriter const& ) = delete; XmlWriter& operator=( XmlWriter const& ) = delete; diff --git a/projects/SelfTest/Baselines/compact.sw.approved.txt b/projects/SelfTest/Baselines/compact.sw.approved.txt index b02a9e0e..a233aa6c 100644 --- a/projects/SelfTest/Baselines/compact.sw.approved.txt +++ b/projects/SelfTest/Baselines/compact.sw.approved.txt @@ -905,6 +905,48 @@ Xml.tests.cpp:: passed: encode( stringWithQuotes, Catch::XmlEncode: "don't "quote" me on that" Xml.tests.cpp:: passed: encode( "[/x01]" ) == "[//x01]" for: "[/x01]" == "[/x01]" Xml.tests.cpp:: passed: encode( "[/x7F]" ) == "[//x7F]" for: "[/x7F]" == "[/x7F]" +Xml.tests.cpp:: passed: encode(u8"Here be 👾") == u8"Here be 👾" for: "Here be 👾" == "Here be 👾" 
+Xml.tests.cpp:: passed: encode(u8"šš") == u8"šš" for: "šš" == "šš" +Xml.tests.cpp:: passed: encode("/xDF/xBF") == "/xDF/xBF" for: "߿" == "߿" +Xml.tests.cpp:: passed: encode("/xE0/xA0/x80") == "/xE0/xA0/x80" for: "ࠀ" == "ࠀ" +Xml.tests.cpp:: passed: encode("/xED/x9F/xBF") == "/xED/x9F/xBF" for: "퟿" == "퟿" +Xml.tests.cpp:: passed: encode("/xEE/x80/x80") == "/xEE/x80/x80" for: "" == "" +Xml.tests.cpp:: passed: encode("/xEF/xBF/xBF") == "/xEF/xBF/xBF" for: "￿" == "￿" +Xml.tests.cpp:: passed: encode("/xF0/x90/x80/x80") == "/xF0/x90/x80/x80" for: "𐀀" == "𐀀" +Xml.tests.cpp:: passed: encode("/xF4/x8F/xBF/xBF") == "/xF4/x8F/xBF/xBF" for: "􏿿" == "􏿿" +Xml.tests.cpp:: passed: encode("Here /xFF be 👾") == u8"Here //xFF be 👾" for: "Here /xFF be 👾" == "Here /xFF be 👾" +Xml.tests.cpp:: passed: encode("/xFF") == "//xFF" for: "/xFF" == "/xFF" +Xml.tests.cpp:: passed: encode("/xC5/xC5/xA0") == u8"//xC5Š" for: "/xC5Š" == "/xC5Š" +Xml.tests.cpp:: passed: encode("/xF4/x90/x80/x80") == u8"//xF4//x90//x80//x80" for: "/xF4/x90/x80/x80" == "/xF4/x90/x80/x80" +Xml.tests.cpp:: passed: encode("/xC0/x80") == u8"//xC0//x80" for: "/xC0/x80" == "/xC0/x80" +Xml.tests.cpp:: passed: encode("/xF0/x80/x80/x80") == u8"//xF0//x80//x80//x80" for: "/xF0/x80/x80/x80" == "/xF0/x80/x80/x80" +Xml.tests.cpp:: passed: encode("/xC1/xBF") == u8"//xC1//xBF" for: "/xC1/xBF" == "/xC1/xBF" +Xml.tests.cpp:: passed: encode("/xE0/x9F/xBF") == u8"//xE0//x9F//xBF" for: "/xE0/x9F/xBF" == "/xE0/x9F/xBF" +Xml.tests.cpp:: passed: encode("/xF0/x8F/xBF/xBF") == u8"//xF0//x8F//xBF//xBF" for: "/xF0/x8F/xBF/xBF" == "/xF0/x8F/xBF/xBF" +Xml.tests.cpp:: passed: encode("/xED/xA0/x80") == "/xED/xA0/x80" for: "" == "" +Xml.tests.cpp:: passed: encode("/xED/xAF/xBF") == "/xED/xAF/xBF" for: "" == "" +Xml.tests.cpp:: passed: encode("/xED/xB0/x80") == "/xED/xB0/x80" for: "" == "" +Xml.tests.cpp:: passed: encode("/xED/xBF/xBF") == "/xED/xBF/xBF" for: "" == "" +Xml.tests.cpp:: passed: encode("/x80") == u8"//x80" for: "/x80" == "/x80" 
+Xml.tests.cpp:: passed: encode("/x81") == u8"//x81" for: "/x81" == "/x81" +Xml.tests.cpp:: passed: encode("/xBC") == u8"//xBC" for: "/xBC" == "/xBC" +Xml.tests.cpp:: passed: encode("/xBF") == u8"//xBF" for: "/xBF" == "/xBF" +Xml.tests.cpp:: passed: encode("/xF5/x80/x80/x80") == u8"//xF5//x80//x80//x80" for: "/xF5/x80/x80/x80" == "/xF5/x80/x80/x80" +Xml.tests.cpp:: passed: encode("/xF6/x80/x80/x80") == u8"//xF6//x80//x80//x80" for: "/xF6/x80/x80/x80" == "/xF6/x80/x80/x80" +Xml.tests.cpp:: passed: encode("/xF7/x80/x80/x80") == u8"//xF7//x80//x80//x80" for: "/xF7/x80/x80/x80" == "/xF7/x80/x80/x80" +Xml.tests.cpp:: passed: encode("/xDE") == u8"//xDE" for: "/xDE" == "/xDE" +Xml.tests.cpp:: passed: encode("/xDF") == u8"//xDF" for: "/xDF" == "/xDF" +Xml.tests.cpp:: passed: encode("/xE0") == u8"//xE0" for: "/xE0" == "/xE0" +Xml.tests.cpp:: passed: encode("/xEF") == u8"//xEF" for: "/xEF" == "/xEF" +Xml.tests.cpp:: passed: encode("/xF0") == u8"//xF0" for: "/xF0" == "/xF0" +Xml.tests.cpp:: passed: encode("/xF4") == u8"//xF4" for: "/xF4" == "/xF4" +Xml.tests.cpp:: passed: encode("/xE0/x80") == u8"//xE0//x80" for: "/xE0/x80" == "/xE0/x80" +Xml.tests.cpp:: passed: encode("/xE0/xBF") == u8"//xE0//xBF" for: "/xE0/xBF" == "/xE0/xBF" +Xml.tests.cpp:: passed: encode("/xE1/x80") == u8"//xE1//x80" for: "/xE1/x80" == "/xE1/x80" +Xml.tests.cpp:: passed: encode("/xF0/x80") == u8"//xF0//x80" for: "/xF0/x80" == "/xF0/x80" +Xml.tests.cpp:: passed: encode("/xF4/x80") == u8"//xF4//x80" for: "/xF4/x80" == "/xF4/x80" +Xml.tests.cpp:: passed: encode("/xF0/x80/x80") == u8"//xF0//x80//x80" for: "/xF0/x80/x80" == "/xF0/x80/x80" +Xml.tests.cpp:: passed: encode("/xF4/x80/x80") == u8"//xF4//x80//x80" for: "/xF4/x80/x80" == "/xF4/x80/x80" ToStringVector.tests.cpp:: passed: Catch::Detail::stringify( empty ) == "{ }" for: "{ }" == "{ }" ToStringVector.tests.cpp:: passed: Catch::Detail::stringify( oneValue ) == "{ 42 }" for: "{ 42 }" == "{ 42 }" ToStringVector.tests.cpp:: passed: Catch::Detail::stringify( 
twoValues ) == "{ 42, 250 }" for: "{ 42, 250 }" == "{ 42, 250 }" diff --git a/projects/SelfTest/Baselines/console.std.approved.txt b/projects/SelfTest/Baselines/console.std.approved.txt index 6df0a222..bd5f62cf 100644 --- a/projects/SelfTest/Baselines/console.std.approved.txt +++ b/projects/SelfTest/Baselines/console.std.approved.txt @@ -1084,6 +1084,6 @@ due to unexpected exception with message: Why would you throw a std::string? =============================================================================== -test cases: 202 | 149 passed | 49 failed | 4 failed as expected -assertions: 1015 | 887 passed | 107 failed | 21 failed as expected +test cases: 203 | 150 passed | 49 failed | 4 failed as expected +assertions: 1057 | 929 passed | 107 failed | 21 failed as expected diff --git a/projects/SelfTest/Baselines/console.sw.approved.txt b/projects/SelfTest/Baselines/console.sw.approved.txt index 3a611b72..244bcffc 100644 --- a/projects/SelfTest/Baselines/console.sw.approved.txt +++ b/projects/SelfTest/Baselines/console.sw.approved.txt @@ -7112,6 +7112,305 @@ PASSED: with expansion: "[\x7F]" == "[\x7F]" +------------------------------------------------------------------------------- +XmlEncode: UTF-8 + Valid utf-8 strings +------------------------------------------------------------------------------- +Xml.tests.cpp: +............................................................................... 
+ +Xml.tests.cpp:: +PASSED: + CHECK( encode(u8"Here be 👾") == u8"Here be 👾" ) +with expansion: + "Here be 👾" == "Here be 👾" + +Xml.tests.cpp:: +PASSED: + CHECK( encode(u8"šš") == u8"šš" ) +with expansion: + "šš" == "šš" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xDF\xBF") == "\xDF\xBF" ) +with expansion: + "߿" == "߿" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xE0\xA0\x80") == "\xE0\xA0\x80" ) +with expansion: + "ࠀ" == "ࠀ" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xED\x9F\xBF") == "\xED\x9F\xBF" ) +with expansion: + "퟿" == "퟿" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xEE\x80\x80") == "\xEE\x80\x80" ) +with expansion: + "" == "" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xEF\xBF\xBF") == "\xEF\xBF\xBF" ) +with expansion: + "￿" == "￿" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF0\x90\x80\x80") == "\xF0\x90\x80\x80" ) +with expansion: + "𐀀" == "𐀀" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF4\x8F\xBF\xBF") == "\xF4\x8F\xBF\xBF" ) +with expansion: + "􏿿" == "􏿿" + +------------------------------------------------------------------------------- +XmlEncode: UTF-8 + Invalid utf-8 strings + Various broken strings +------------------------------------------------------------------------------- +Xml.tests.cpp: +............................................................................... 
+ +Xml.tests.cpp:: +PASSED: + CHECK( encode("Here \xFF be 👾") == u8"Here \\xFF be 👾" ) +with expansion: + "Here \xFF be 👾" == "Here \xFF be 👾" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xFF") == "\\xFF" ) +with expansion: + "\xFF" == "\xFF" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xC5\xC5\xA0") == u8"\\xC5Š" ) +with expansion: + "\xC5Š" == "\xC5Š" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF4\x90\x80\x80") == u8"\\xF4\\x90\\x80\\x80" ) +with expansion: + "\xF4\x90\x80\x80" == "\xF4\x90\x80\x80" + +------------------------------------------------------------------------------- +XmlEncode: UTF-8 + Invalid utf-8 strings + Overlong encodings +------------------------------------------------------------------------------- +Xml.tests.cpp: +............................................................................... + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xC0\x80") == u8"\\xC0\\x80" ) +with expansion: + "\xC0\x80" == "\xC0\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF0\x80\x80\x80") == u8"\\xF0\\x80\\x80\\x80" ) +with expansion: + "\xF0\x80\x80\x80" == "\xF0\x80\x80\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xC1\xBF") == u8"\\xC1\\xBF" ) +with expansion: + "\xC1\xBF" == "\xC1\xBF" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xE0\x9F\xBF") == u8"\\xE0\\x9F\\xBF" ) +with expansion: + "\xE0\x9F\xBF" == "\xE0\x9F\xBF" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF0\x8F\xBF\xBF") == u8"\\xF0\\x8F\\xBF\\xBF" ) +with expansion: + "\xF0\x8F\xBF\xBF" == "\xF0\x8F\xBF\xBF" + +------------------------------------------------------------------------------- +XmlEncode: UTF-8 + Invalid utf-8 strings + Surrogate pairs +------------------------------------------------------------------------------- +Xml.tests.cpp: +............................................................................... 
+ +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xED\xA0\x80") == "\xED\xA0\x80" ) +with expansion: + "" == "" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xED\xAF\xBF") == "\xED\xAF\xBF" ) +with expansion: + "" == "" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xED\xB0\x80") == "\xED\xB0\x80" ) +with expansion: + "" == "" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xED\xBF\xBF") == "\xED\xBF\xBF" ) +with expansion: + "" == "" + +------------------------------------------------------------------------------- +XmlEncode: UTF-8 + Invalid utf-8 strings + Invalid start byte +------------------------------------------------------------------------------- +Xml.tests.cpp: +............................................................................... + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\x80") == u8"\\x80" ) +with expansion: + "\x80" == "\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\x81") == u8"\\x81" ) +with expansion: + "\x81" == "\x81" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xBC") == u8"\\xBC" ) +with expansion: + "\xBC" == "\xBC" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xBF") == u8"\\xBF" ) +with expansion: + "\xBF" == "\xBF" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF5\x80\x80\x80") == u8"\\xF5\\x80\\x80\\x80" ) +with expansion: + "\xF5\x80\x80\x80" == "\xF5\x80\x80\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF6\x80\x80\x80") == u8"\\xF6\\x80\\x80\\x80" ) +with expansion: + "\xF6\x80\x80\x80" == "\xF6\x80\x80\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF7\x80\x80\x80") == u8"\\xF7\\x80\\x80\\x80" ) +with expansion: + "\xF7\x80\x80\x80" == "\xF7\x80\x80\x80" + +------------------------------------------------------------------------------- +XmlEncode: UTF-8 + Invalid utf-8 strings + Missing continuation byte(s) +------------------------------------------------------------------------------- +Xml.tests.cpp: +............................................................................... 
+ +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xDE") == u8"\\xDE" ) +with expansion: + "\xDE" == "\xDE" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xDF") == u8"\\xDF" ) +with expansion: + "\xDF" == "\xDF" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xE0") == u8"\\xE0" ) +with expansion: + "\xE0" == "\xE0" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xEF") == u8"\\xEF" ) +with expansion: + "\xEF" == "\xEF" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF0") == u8"\\xF0" ) +with expansion: + "\xF0" == "\xF0" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF4") == u8"\\xF4" ) +with expansion: + "\xF4" == "\xF4" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xE0\x80") == u8"\\xE0\\x80" ) +with expansion: + "\xE0\x80" == "\xE0\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xE0\xBF") == u8"\\xE0\\xBF" ) +with expansion: + "\xE0\xBF" == "\xE0\xBF" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xE1\x80") == u8"\\xE1\\x80" ) +with expansion: + "\xE1\x80" == "\xE1\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF0\x80") == u8"\\xF0\\x80" ) +with expansion: + "\xF0\x80" == "\xF0\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF4\x80") == u8"\\xF4\\x80" ) +with expansion: + "\xF4\x80" == "\xF4\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF0\x80\x80") == u8"\\xF0\\x80\\x80" ) +with expansion: + "\xF0\x80\x80" == "\xF0\x80\x80" + +Xml.tests.cpp:: +PASSED: + CHECK( encode("\xF4\x80\x80") == u8"\\xF4\\x80\\x80" ) +with expansion: + "\xF4\x80\x80" == "\xF4\x80\x80" + ------------------------------------------------------------------------------- array -> toString ------------------------------------------------------------------------------- @@ -8598,6 +8897,6 @@ Misc.tests.cpp:: PASSED: =============================================================================== -test cases: 202 | 136 passed | 62 failed | 4 failed as expected -assertions: 1029 | 887 passed | 121 failed | 21 failed as expected +test cases: 203 | 137 passed | 62 failed | 4 failed as expected 
+assertions: 1071 | 929 passed | 121 failed | 21 failed as expected diff --git a/projects/SelfTest/Baselines/junit.sw.approved.txt b/projects/SelfTest/Baselines/junit.sw.approved.txt index 3cdf5a4f..ea31fb6e 100644 --- a/projects/SelfTest/Baselines/junit.sw.approved.txt +++ b/projects/SelfTest/Baselines/junit.sw.approved.txt @@ -1,7 +1,7 @@ - + @@ -706,6 +706,12 @@ Exception.tests.cpp: + + + + + + diff --git a/projects/SelfTest/Baselines/xml.sw.approved.txt b/projects/SelfTest/Baselines/xml.sw.approved.txt index 69d35c83..2d24824a 100644 --- a/projects/SelfTest/Baselines/xml.sw.approved.txt +++ b/projects/SelfTest/Baselines/xml.sw.approved.txt @@ -7881,7 +7881,7 @@ Message from section two - +
@@ -7994,6 +7994,378 @@ Message from section two
+ +
+ + + encode(u8"Here be 👾") == u8"Here be 👾" + + + "Here be 👾" == "Here be 👾" + + + + + encode(u8"šš") == u8"šš" + + + "šš" == "šš" + + + + + encode("\xDF\xBF") == "\xDF\xBF" + + + "߿" == "߿" + + + + + encode("\xE0\xA0\x80") == "\xE0\xA0\x80" + + + "ࠀ" == "ࠀ" + + + + + encode("\xED\x9F\xBF") == "\xED\x9F\xBF" + + + "퟿" == "퟿" + + + + + encode("\xEE\x80\x80") == "\xEE\x80\x80" + + + "" == "" + + + + + encode("\xEF\xBF\xBF") == "\xEF\xBF\xBF" + + + "￿" == "￿" + + + + + encode("\xF0\x90\x80\x80") == "\xF0\x90\x80\x80" + + + "𐀀" == "𐀀" + + + + + encode("\xF4\x8F\xBF\xBF") == "\xF4\x8F\xBF\xBF" + + + "􏿿" == "􏿿" + + + +
+
+
+ + + encode("Here \xFF be 👾") == u8"Here \\xFF be 👾" + + + "Here \xFF be 👾" == "Here \xFF be 👾" + + + + + encode("\xFF") == "\\xFF" + + + "\xFF" == "\xFF" + + + + + encode("\xC5\xC5\xA0") == u8"\\xC5Š" + + + "\xC5Š" == "\xC5Š" + + + + + encode("\xF4\x90\x80\x80") == u8"\\xF4\\x90\\x80\\x80" + + + "\xF4\x90\x80\x80" == "\xF4\x90\x80\x80" + + + +
+ +
+
+
+ + + encode("\xC0\x80") == u8"\\xC0\\x80" + + + "\xC0\x80" == "\xC0\x80" + + + + + encode("\xF0\x80\x80\x80") == u8"\\xF0\\x80\\x80\\x80" + + + "\xF0\x80\x80\x80" == "\xF0\x80\x80\x80" + + + + + encode("\xC1\xBF") == u8"\\xC1\\xBF" + + + "\xC1\xBF" == "\xC1\xBF" + + + + + encode("\xE0\x9F\xBF") == u8"\\xE0\\x9F\\xBF" + + + "\xE0\x9F\xBF" == "\xE0\x9F\xBF" + + + + + encode("\xF0\x8F\xBF\xBF") == u8"\\xF0\\x8F\\xBF\\xBF" + + + "\xF0\x8F\xBF\xBF" == "\xF0\x8F\xBF\xBF" + + + +
+ +
+
+
+ + + encode("\xED\xA0\x80") == "\xED\xA0\x80" + + + "" == "" + + + + + encode("\xED\xAF\xBF") == "\xED\xAF\xBF" + + + "" == "" + + + + + encode("\xED\xB0\x80") == "\xED\xB0\x80" + + + "" == "" + + + + + encode("\xED\xBF\xBF") == "\xED\xBF\xBF" + + + "" == "" + + + +
+ +
+
+
+ + + encode("\x80") == u8"\\x80" + + + "\x80" == "\x80" + + + + + encode("\x81") == u8"\\x81" + + + "\x81" == "\x81" + + + + + encode("\xBC") == u8"\\xBC" + + + "\xBC" == "\xBC" + + + + + encode("\xBF") == u8"\\xBF" + + + "\xBF" == "\xBF" + + + + + encode("\xF5\x80\x80\x80") == u8"\\xF5\\x80\\x80\\x80" + + + "\xF5\x80\x80\x80" == "\xF5\x80\x80\x80" + + + + + encode("\xF6\x80\x80\x80") == u8"\\xF6\\x80\\x80\\x80" + + + "\xF6\x80\x80\x80" == "\xF6\x80\x80\x80" + + + + + encode("\xF7\x80\x80\x80") == u8"\\xF7\\x80\\x80\\x80" + + + "\xF7\x80\x80\x80" == "\xF7\x80\x80\x80" + + + +
+ +
+
+
+ + + encode("\xDE") == u8"\\xDE" + + + "\xDE" == "\xDE" + + + + + encode("\xDF") == u8"\\xDF" + + + "\xDF" == "\xDF" + + + + + encode("\xE0") == u8"\\xE0" + + + "\xE0" == "\xE0" + + + + + encode("\xEF") == u8"\\xEF" + + + "\xEF" == "\xEF" + + + + + encode("\xF0") == u8"\\xF0" + + + "\xF0" == "\xF0" + + + + + encode("\xF4") == u8"\\xF4" + + + "\xF4" == "\xF4" + + + + + encode("\xE0\x80") == u8"\\xE0\\x80" + + + "\xE0\x80" == "\xE0\x80" + + + + + encode("\xE0\xBF") == u8"\\xE0\\xBF" + + + "\xE0\xBF" == "\xE0\xBF" + + + + + encode("\xE1\x80") == u8"\\xE1\\x80" + + + "\xE1\x80" == "\xE1\x80" + + + + + encode("\xF0\x80") == u8"\\xF0\\x80" + + + "\xF0\x80" == "\xF0\x80" + + + + + encode("\xF4\x80") == u8"\\xF4\\x80" + + + "\xF4\x80" == "\xF4\x80" + + + + + encode("\xF0\x80\x80") == u8"\\xF0\\x80\\x80" + + + "\xF0\x80\x80" == "\xF0\x80\x80" + + + + + encode("\xF4\x80\x80") == u8"\\xF4\\x80\\x80" + + + "\xF4\x80\x80" == "\xF4\x80\x80" + + + +
+ +
+ +
@@ -9469,7 +9841,7 @@ loose text artifact - + - + diff --git a/projects/SelfTest/IntrospectiveTests/Xml.tests.cpp b/projects/SelfTest/IntrospectiveTests/Xml.tests.cpp index 9bbed258..c3886ab2 100644 --- a/projects/SelfTest/IntrospectiveTests/Xml.tests.cpp +++ b/projects/SelfTest/IntrospectiveTests/Xml.tests.cpp @@ -1,5 +1,4 @@ #include "catch.hpp" - #include "internal/catch_xmlwriter.h" #include <sstream> @@ -10,7 +9,7 @@ inline std::string encode( std::string const& str, Catch::XmlEncode::ForWhat for return oss.str(); } -TEST_CASE( "XmlEncode" ) { +TEST_CASE( "XmlEncode", "[XML]" ) { SECTION( "normal string" ) { REQUIRE( encode( "normal string" ) == "normal string" ); } @@ -38,4 +37,76 @@ TEST_CASE( "XmlEncode" ) { SECTION( "string with control char (x7F)" ) { REQUIRE( encode( "[\x7F]" ) == "[\\x7F]" ); } -} \ No newline at end of file +} + +// Thanks to Peter Bindels (dascandy) for some of the tests +TEST_CASE("XmlEncode: UTF-8", "[XML][UTF-8]") { + SECTION("Valid utf-8 strings") { + CHECK(encode(u8"Here be 👾") == u8"Here be 👾"); + CHECK(encode(u8"šš") == u8"šš"); + + CHECK(encode("\xDF\xBF") == "\xDF\xBF"); // 0x7FF + CHECK(encode("\xE0\xA0\x80") == "\xE0\xA0\x80"); // 0x800 + CHECK(encode("\xED\x9F\xBF") == "\xED\x9F\xBF"); // 0xD7FF + CHECK(encode("\xEE\x80\x80") == "\xEE\x80\x80"); // 0xE000 + CHECK(encode("\xEF\xBF\xBF") == "\xEF\xBF\xBF"); // 0xFFFF + CHECK(encode("\xF0\x90\x80\x80") == "\xF0\x90\x80\x80"); // 0x10000 + CHECK(encode("\xF4\x8F\xBF\xBF") == "\xF4\x8F\xBF\xBF"); // 0x10FFFF + } + SECTION("Invalid utf-8 strings") { + SECTION("Various broken strings") { + CHECK(encode("Here \xFF be 👾") == u8"Here \\xFF be 👾"); + CHECK(encode("\xFF") == "\\xFF"); + CHECK(encode("\xC5\xC5\xA0") == u8"\\xC5Š"); + CHECK(encode("\xF4\x90\x80\x80") == u8"\\xF4\\x90\\x80\\x80"); // 0x110000 -- out of unicode range + } + + SECTION("Overlong encodings") { + CHECK(encode("\xC0\x80") == u8"\\xC0\\x80"); // 2-byte overlong encoding of \0 + CHECK(encode("\xF0\x80\x80\x80") == u8"\\xF0\\x80\\x80\\x80"); // 
4-byte overlong encoding of \0 + CHECK(encode("\xC1\xBF") == u8"\\xC1\\xBF"); // 2-byte overlong encoding of an ASCII char (0x7F) + CHECK(encode("\xE0\x9F\xBF") == u8"\\xE0\\x9F\\xBF"); // 3-byte overlong encoding of 0x7FF + CHECK(encode("\xF0\x8F\xBF\xBF") == u8"\\xF0\\x8F\\xBF\\xBF"); // 4-byte overlong encoding of 0xFFFF + } + + // Note that we actually don't modify surrogate pairs, as we do not do strict checking + SECTION("Surrogate pairs") { + CHECK(encode("\xED\xA0\x80") == "\xED\xA0\x80"); // Invalid surrogate half 0xD800 + CHECK(encode("\xED\xAF\xBF") == "\xED\xAF\xBF"); // Invalid surrogate half 0xDBFF + CHECK(encode("\xED\xB0\x80") == "\xED\xB0\x80"); // Invalid surrogate half 0xDC00 + CHECK(encode("\xED\xBF\xBF") == "\xED\xBF\xBF"); // Invalid surrogate half 0xDFFF + } + + SECTION("Invalid start byte") { + CHECK(encode("\x80") == u8"\\x80"); + CHECK(encode("\x81") == u8"\\x81"); + CHECK(encode("\xBC") == u8"\\xBC"); + CHECK(encode("\xBF") == u8"\\xBF"); + // Lead bytes 0xF5-0xF7 can only encode values beyond 0x10FFFF + CHECK(encode("\xF5\x80\x80\x80") == u8"\\xF5\\x80\\x80\\x80"); + CHECK(encode("\xF6\x80\x80\x80") == u8"\\xF6\\x80\\x80\\x80"); + CHECK(encode("\xF7\x80\x80\x80") == u8"\\xF7\\x80\\x80\\x80"); + } + + SECTION("Missing continuation byte(s)") { + // Missing first continuation byte + CHECK(encode("\xDE") == u8"\\xDE"); + CHECK(encode("\xDF") == u8"\\xDF"); + CHECK(encode("\xE0") == u8"\\xE0"); + CHECK(encode("\xEF") == u8"\\xEF"); + CHECK(encode("\xF0") == u8"\\xF0"); + CHECK(encode("\xF4") == u8"\\xF4"); + + // Missing second continuation byte + CHECK(encode("\xE0\x80") == u8"\\xE0\\x80"); + CHECK(encode("\xE0\xBF") == u8"\\xE0\\xBF"); + CHECK(encode("\xE1\x80") == u8"\\xE1\\x80"); + CHECK(encode("\xF0\x80") == u8"\\xF0\\x80"); + CHECK(encode("\xF4\x80") == u8"\\xF4\\x80"); + + // Missing third continuation byte + CHECK(encode("\xF0\x80\x80") == u8"\\xF0\\x80\\x80"); + CHECK(encode("\xF4\x80\x80") == u8"\\xF4\\x80\\x80"); + } + } +}