mirror of
https://github.com/catchorg/Catch2.git
synced 2024-11-17 11:12:25 +01:00
Produce valid xml and preserve utf8 characters in attributes
This commit is contained in:
parent
3bd20bf2cd
commit
f37c95088c
@ -16,12 +16,88 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iomanip>
|
||||
#include <cassert>
|
||||
|
||||
namespace Catch {
|
||||
|
||||
// (see: https://en.wikipedia.org/wiki/UTF-8#Codepage_layout)
|
||||
namespace Utf8 {
|
||||
inline bool isSingleByteChar(unsigned char b) {
|
||||
// Plain ASCII chars
|
||||
return b <= 0x7F;
|
||||
}
|
||||
|
||||
inline bool isFollowByteInMultiByteChar(unsigned char b) {
|
||||
return b >= 0x80 && b <= 0xBF;
|
||||
}
|
||||
|
||||
inline bool isFirstInTwoByteChar(unsigned char b) {
|
||||
return b >= 0xC2 && b <= 0xDF;
|
||||
}
|
||||
|
||||
inline bool isFirstInThreeByteChar(unsigned char b) {
|
||||
return b >= 0xE0 && b <= 0xEF;
|
||||
}
|
||||
|
||||
inline bool isFirstInFourByteChar(unsigned char b) {
|
||||
return b >= 0xF0 && b <= 0xF4;
|
||||
}
|
||||
|
||||
inline bool isInvalidChar(unsigned char b) {
|
||||
return b == 0xC0 || b == 0xC1 || b >= 0xF5;
|
||||
}
|
||||
|
||||
inline bool isValid(const char* str, size_t len) {
|
||||
int outstandingBytesOfCurrentChar = 0;
|
||||
|
||||
for( std::size_t i = 0; i < len; ++ i ) {
|
||||
unsigned char b = static_cast<unsigned char>( str[i] );
|
||||
|
||||
switch( outstandingBytesOfCurrentChar )
|
||||
{
|
||||
case 0:
|
||||
if( isSingleByteChar( b ) )
|
||||
outstandingBytesOfCurrentChar = 0;
|
||||
else if( isFirstInTwoByteChar( b ) )
|
||||
outstandingBytesOfCurrentChar = 1;
|
||||
else if( isFirstInThreeByteChar( b ) )
|
||||
outstandingBytesOfCurrentChar = 2;
|
||||
else if( isFirstInFourByteChar( b ) )
|
||||
outstandingBytesOfCurrentChar = 3;
|
||||
else
|
||||
return false;
|
||||
|
||||
break;
|
||||
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
if( !isFollowByteInMultiByteChar( b ) )
|
||||
return false;
|
||||
|
||||
outstandingBytesOfCurrentChar--;
|
||||
break;
|
||||
|
||||
default:
|
||||
// outstandingBytesOfCurrentChar is negative: got follow byte when start byte was expected
|
||||
return false;
|
||||
}
|
||||
|
||||
// explicit negative check (sould be fully redundant here)
|
||||
assert( isInvalidChar( b ) == false );
|
||||
}
|
||||
|
||||
return outstandingBytesOfCurrentChar == 0;
|
||||
}
|
||||
|
||||
inline bool isValid(const std::string& str) {
|
||||
return isValid(str.c_str(), str.size());
|
||||
}
|
||||
}
|
||||
|
||||
class XmlEncode {
|
||||
public:
|
||||
enum ForWhat { ForTextNodes, ForAttributes };
|
||||
enum ForWhat { ForTextNodes, ForAttributes };
|
||||
|
||||
XmlEncode( std::string const& str, ForWhat forWhat = ForTextNodes )
|
||||
: m_str( str ),
|
||||
@ -32,9 +108,12 @@ namespace Catch {
|
||||
|
||||
// Apostrophe escaping not necessary if we always use " to write attributes
|
||||
// (see: http://www.w3.org/TR/xml/#syntax)
|
||||
|
||||
// Preserve utf8 as it is the default on most platforms and in xml
|
||||
bool isValidUtf8 = Utf8::isValid( m_str );
|
||||
|
||||
for( std::size_t i = 0; i < m_str.size(); ++ i ) {
|
||||
char c = m_str[i];
|
||||
unsigned char c = static_cast<unsigned char>( m_str[i] );
|
||||
switch( c ) {
|
||||
case '<': os << "<"; break;
|
||||
case '&': os << "&"; break;
|
||||
@ -56,8 +135,8 @@ namespace Catch {
|
||||
|
||||
default:
|
||||
// Escape control chars - based on contribution by @espenalb in PR #465
|
||||
if ( ( c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c=='\x7F' )
|
||||
os << "&#x" << std::uppercase << std::hex << static_cast<int>( c );
|
||||
if ( ( c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c == '\x7F' || (c > '\x7F' && !isValidUtf8) )
|
||||
os << "&#x" << std::uppercase << std::hex << static_cast<int>( c ) << ';';
|
||||
else
|
||||
os << c;
|
||||
}
|
||||
|
@ -458,10 +458,16 @@ TEST_CASE( "XmlEncode" ) {
|
||||
REQUIRE( encode( stringWithQuotes, Catch::XmlEncode::ForAttributes ) == "don't "quote" me on that" );
|
||||
}
|
||||
SECTION( "string with control char (1)" ) {
|
||||
REQUIRE( encode( "[\x01]" ) == "[]" );
|
||||
REQUIRE( encode( "[\x01]" ) == "[]" );
|
||||
}
|
||||
SECTION( "string with control char (x7F)" ) {
|
||||
REQUIRE( encode( "[\x7F]" ) == "[]" );
|
||||
REQUIRE( encode( "[\x7F]" ) == "[]" );
|
||||
}
|
||||
SECTION( "string with control char that is negativ on signed char (xFF)" ) {
|
||||
REQUIRE( encode( "[\xFF]" ) == "[ÿ]" );
|
||||
}
|
||||
SECTION( "string with utf8 multi byte char (german 'ae' umlaut)" ) {
|
||||
REQUIRE( encode( "[\xC3\xA4]" ) == "[\xC3\xA4]" );
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user