From f37c95088cea5c1e87c0781bc2b7e26791fad928 Mon Sep 17 00:00:00 2001 From: Ludger Sprenker Date: Sat, 6 Feb 2016 14:31:24 +0100 Subject: [PATCH] Produce valid xml and preserve utf8 characters in attributes --- include/internal/catch_xmlwriter.hpp | 87 ++++++++++++++++++++++++++-- projects/SelfTest/MiscTests.cpp | 10 +++- 2 files changed, 91 insertions(+), 6 deletions(-) diff --git a/include/internal/catch_xmlwriter.hpp b/include/internal/catch_xmlwriter.hpp index c59725b0..1e1d6e95 100644 --- a/include/internal/catch_xmlwriter.hpp +++ b/include/internal/catch_xmlwriter.hpp @@ -16,12 +16,88 @@ #include #include #include +#include namespace Catch { + + // (see: https://en.wikipedia.org/wiki/UTF-8#Codepage_layout) + namespace Utf8 { + inline bool isSingleByteChar(unsigned char b) { + // Plain ASCII chars + return b <= 0x7F; + } + + inline bool isFollowByteInMultiByteChar(unsigned char b) { + return b >= 0x80 && b <= 0xBF; + } + + inline bool isFirstInTwoByteChar(unsigned char b) { + return b >= 0xC2 && b <= 0xDF; + } + + inline bool isFirstInThreeByteChar(unsigned char b) { + return b >= 0xE0 && b <= 0xEF; + } + + inline bool isFirstInFourByteChar(unsigned char b) { + return b >= 0xF0 && b <= 0xF4; + } + + inline bool isInvalidChar(unsigned char b) { + return b == 0xC0 || b == 0xC1 || b >= 0xF5; + } + + inline bool isValid(const char* str, size_t len) { + int outstandingBytesOfCurrentChar = 0; + + for( std::size_t i = 0; i < len; ++ i ) { + unsigned char b = static_cast( str[i] ); + + switch( outstandingBytesOfCurrentChar ) + { + case 0: + if( isSingleByteChar( b ) ) + outstandingBytesOfCurrentChar = 0; + else if( isFirstInTwoByteChar( b ) ) + outstandingBytesOfCurrentChar = 1; + else if( isFirstInThreeByteChar( b ) ) + outstandingBytesOfCurrentChar = 2; + else if( isFirstInFourByteChar( b ) ) + outstandingBytesOfCurrentChar = 3; + else + return false; + + break; + + case 1: + case 2: + case 3: + if( !isFollowByteInMultiByteChar( b ) ) + return false; + + outstandingBytesOfCurrentChar--; + break; + + default: + // outstandingBytesOfCurrentChar is negative: got follow byte when start byte was expected + return false; + } + + // explicit negative check (sould be fully redundant here) + assert( isInvalidChar( b ) == false ); + } + + return outstandingBytesOfCurrentChar == 0; + } + + inline bool isValid(const std::string& str) { + return isValid(str.c_str(), str.size()); + } + } class XmlEncode { public: - enum ForWhat { ForTextNodes, ForAttributes }; + enum ForWhat { ForTextNodes, ForAttributes }; XmlEncode( std::string const& str, ForWhat forWhat = ForTextNodes ) : m_str( str ), @@ -32,9 +108,12 @@ namespace Catch { // Apostrophe escaping not necessary if we always use " to write attributes // (see: http://www.w3.org/TR/xml/#syntax) + + // Preserve utf8 as it is the default on most platforms and in xml + bool isValidUtf8 = Utf8::isValid( m_str ); for( std::size_t i = 0; i < m_str.size(); ++ i ) { - char c = m_str[i]; + unsigned char c = static_cast( m_str[i] ); switch( c ) { case '<': os << "<"; break; case '&': os << "&"; break; @@ -56,8 +135,8 @@ namespace Catch { default: // Escape control chars - based on contribution by @espenalb in PR #465 - if ( ( c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c=='\x7F' ) - os << "&#x" << std::uppercase << std::hex << static_cast( c ); + if ( ( c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c == '\x7F' || (c > '\x7F' && !isValidUtf8) ) + os << "&#x" << std::uppercase << std::hex << static_cast( c ) << ';'; else os << c; } diff --git a/projects/SelfTest/MiscTests.cpp b/projects/SelfTest/MiscTests.cpp index cf7f48cf..685ca592 100644 --- a/projects/SelfTest/MiscTests.cpp +++ b/projects/SelfTest/MiscTests.cpp @@ -458,10 +458,16 @@ TEST_CASE( "XmlEncode" ) { REQUIRE( encode( stringWithQuotes, Catch::XmlEncode::ForAttributes ) == "don't "quote" me on that" ); } SECTION( "string with control char (1)" ) { - REQUIRE( encode( "[\x01]" ) == "[]" ); + REQUIRE( encode( "[\x01]" ) == "[]" ); } SECTION( "string with control char (x7F)" ) { - REQUIRE( encode( "[\x7F]" ) == "[]" ); + REQUIRE( encode( "[\x7F]" ) == "[]" ); + } + SECTION( "string with control char that is negativ on signed char (xFF)" ) { + REQUIRE( encode( "[\xFF]" ) == "[ÿ]" ); + } + SECTION( "string with utf8 multi byte char (german 'ae' umlaut)" ) { + REQUIRE( encode( "[\xC3\xA4]" ) == "[\xC3\xA4]" ); } }