Produce valid xml and preserve utf8 characters in attributes

This commit is contained in:
Ludger Sprenker 2016-02-06 14:31:24 +01:00
parent 3bd20bf2cd
commit f37c95088c
2 changed files with 91 additions and 6 deletions

View File

@ -16,9 +16,85 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <iomanip> #include <iomanip>
#include <cassert>
namespace Catch { namespace Catch {
// (see: https://en.wikipedia.org/wiki/UTF-8#Codepage_layout)
namespace Utf8 {

    // Byte-level helpers for classifying the bytes of a UTF-8 stream.
    // Ranges follow the UTF-8 code page layout: 0x00..0x7F are single-byte
    // characters, 0x80..0xBF are continuation bytes, 0xC2..0xF4 start
    // multi-byte characters, and 0xC0, 0xC1, 0xF5..0xFF never occur.

    // True if b is a complete character on its own (plain ASCII).
    inline bool isSingleByteChar(unsigned char b) {
        return b <= 0x7F;
    }

    // True if b has the generic continuation-byte form (10xxxxxx).
    inline bool isFollowByteInMultiByteChar(unsigned char b) {
        return b >= 0x80 && b <= 0xBF;
    }

    // True if b starts a two-byte character. 0xC0 and 0xC1 are excluded
    // because they could only start overlong encodings of ASCII.
    inline bool isFirstInTwoByteChar(unsigned char b) {
        return b >= 0xC2 && b <= 0xDF;
    }

    // True if b starts a three-byte character.
    inline bool isFirstInThreeByteChar(unsigned char b) {
        return b >= 0xE0 && b <= 0xEF;
    }

    // True if b starts a four-byte character. 0xF5 and above are excluded
    // because they would encode code points beyond U+10FFFF.
    inline bool isFirstInFourByteChar(unsigned char b) {
        return b >= 0xF0 && b <= 0xF4;
    }

    // True if b can never appear anywhere in a UTF-8 stream.
    inline bool isInvalidChar(unsigned char b) {
        return b == 0xC0 || b == 0xC1 || b >= 0xF5;
    }

    // Checks whether the len bytes starting at str form well-formed UTF-8.
    //
    // Besides the lead/continuation structure this also rejects sequences
    // that are structurally plausible but ill-formed:
    //   - overlong encodings       (E0 80..9F xx, F0 80..8F xx xx)
    //   - UTF-16 surrogates        (ED A0..BF xx)
    //   - code points > U+10FFFF   (F4 90..BF xx xx)
    // These are all caught by narrowing the permitted range of the first
    // continuation byte depending on the lead byte.
    //
    // An empty range (len == 0) is valid; str is not dereferenced then.
    // Returns false if the input ends in the middle of a multi-byte character.
    inline bool isValid(const char* str, std::size_t len) {
        int outstandingBytesOfCurrentChar = 0;
        // Permitted range for the next continuation byte.
        unsigned char followMin = 0x80;
        unsigned char followMax = 0xBF;

        for( std::size_t i = 0; i < len; ++i ) {
            const unsigned char b = static_cast<unsigned char>( str[i] );

            if( outstandingBytesOfCurrentChar == 0 ) {
                // Expecting the start of a new character.
                followMin = 0x80;
                followMax = 0xBF;
                if( isSingleByteChar( b ) ) {
                    // complete character, nothing outstanding
                }
                else if( isFirstInTwoByteChar( b ) ) {
                    outstandingBytesOfCurrentChar = 1;
                }
                else if( isFirstInThreeByteChar( b ) ) {
                    outstandingBytesOfCurrentChar = 2;
                    if( b == 0xE0 )
                        followMin = 0xA0;   // below would be an overlong encoding
                    else if( b == 0xED )
                        followMax = 0x9F;   // above would be a UTF-16 surrogate
                }
                else if( isFirstInFourByteChar( b ) ) {
                    outstandingBytesOfCurrentChar = 3;
                    if( b == 0xF0 )
                        followMin = 0x90;   // below would be an overlong encoding
                    else if( b == 0xF4 )
                        followMax = 0x8F;   // above would exceed U+10FFFF
                }
                else {
                    // Stray continuation byte or a byte that is never valid.
                    return false;
                }
            }
            else {
                // Inside a multi-byte character: must be a continuation byte
                // within the range allowed by the lead byte.
                if( b < followMin || b > followMax )
                    return false;
                // Only the first continuation byte is restricted further.
                followMin = 0x80;
                followMax = 0xBF;
                --outstandingBytesOfCurrentChar;
            }

            // Explicit negative check (should be fully redundant here).
            assert( isInvalidChar( b ) == false );
        }
        // A trailing, incomplete multi-byte character is invalid.
        return outstandingBytesOfCurrentChar == 0;
    }

    // Convenience overload validating the full contents of str,
    // including any embedded NUL bytes.
    inline bool isValid(const std::string& str) {
        return isValid( str.c_str(), str.size() );
    }
}
class XmlEncode { class XmlEncode {
public: public:
enum ForWhat { ForTextNodes, ForAttributes }; enum ForWhat { ForTextNodes, ForAttributes };
@ -33,8 +109,11 @@ namespace Catch {
// Apostrophe escaping not necessary if we always use " to write attributes // Apostrophe escaping not necessary if we always use " to write attributes
// (see: http://www.w3.org/TR/xml/#syntax) // (see: http://www.w3.org/TR/xml/#syntax)
// Preserve utf8 as it is the default on most platforms and in xml
bool isValidUtf8 = Utf8::isValid( m_str );
for( std::size_t i = 0; i < m_str.size(); ++ i ) { for( std::size_t i = 0; i < m_str.size(); ++ i ) {
char c = m_str[i]; unsigned char c = static_cast<unsigned char>( m_str[i] );
switch( c ) { switch( c ) {
case '<': os << "&lt;"; break; case '<': os << "&lt;"; break;
case '&': os << "&amp;"; break; case '&': os << "&amp;"; break;
@ -56,8 +135,8 @@ namespace Catch {
default: default:
// Escape control chars - based on contribution by @espenalb in PR #465 // Escape control chars - based on contribution by @espenalb in PR #465
if ( ( c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c=='\x7F' ) if ( ( c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c == '\x7F' || (c > '\x7F' && !isValidUtf8) )
os << "&#x" << std::uppercase << std::hex << static_cast<int>( c ); os << "&#x" << std::uppercase << std::hex << static_cast<int>( c ) << ';';
else else
os << c; os << c;
} }

View File

@ -458,10 +458,16 @@ TEST_CASE( "XmlEncode" ) {
REQUIRE( encode( stringWithQuotes, Catch::XmlEncode::ForAttributes ) == "don't &quot;quote&quot; me on that" ); REQUIRE( encode( stringWithQuotes, Catch::XmlEncode::ForAttributes ) == "don't &quot;quote&quot; me on that" );
} }
SECTION( "string with control char (1)" ) { SECTION( "string with control char (1)" ) {
REQUIRE( encode( "[\x01]" ) == "[&#x1]" ); REQUIRE( encode( "[\x01]" ) == "[&#x1;]" );
} }
SECTION( "string with control char (x7F)" ) { SECTION( "string with control char (x7F)" ) {
REQUIRE( encode( "[\x7F]" ) == "[&#x7F]" ); REQUIRE( encode( "[\x7F]" ) == "[&#x7F;]" );
}
SECTION( "string with control char that is negativ on signed char (xFF)" ) {
REQUIRE( encode( "[\xFF]" ) == "[&#xFF;]" );
}
SECTION( "string with utf8 multi byte char (german 'ae' umlaut)" ) {
REQUIRE( encode( "[\xC3\xA4]" ) == "[\xC3\xA4]" );
} }
} }