mirror of
				https://github.com/catchorg/Catch2.git
				synced 2025-11-04 05:59:32 +01:00 
			
		
		
		
	Produce valid xml and preserve utf8 characters in attributes
This commit is contained in:
		@@ -16,9 +16,85 @@
 | 
			
		||||
#include <string>
 | 
			
		||||
#include <vector>
 | 
			
		||||
#include <iomanip>
 | 
			
		||||
#include <cassert>
 | 
			
		||||
 | 
			
		||||
namespace Catch {
 | 
			
		||||
   
 | 
			
		||||
    // (see: https://en.wikipedia.org/wiki/UTF-8#Codepage_layout)
 | 
			
		||||
    // Helpers for classifying bytes of a UTF-8 encoded string and for checking
    // whether a whole byte sequence is well-formed UTF-8.
    namespace Utf8 {
        // True for plain ASCII bytes, which encode a code point on their own.
        inline bool isSingleByteChar(unsigned char b) {
            return b <= 0x7F;
        }

        // True for bytes of the form 10xxxxxx, i.e. the second, third or
        // fourth byte of a multi-byte character.
        inline bool isFollowByteInMultiByteChar(unsigned char b) {
            return b >= 0x80 && b <= 0xBF;
        }

        // True for lead bytes of two-byte characters. 0xC0 and 0xC1 are
        // excluded because they could only start overlong encodings.
        inline bool isFirstInTwoByteChar(unsigned char b) {
            return b >= 0xC2 && b <= 0xDF;
        }

        // True for lead bytes of three-byte characters.
        inline bool isFirstInThreeByteChar(unsigned char b) {
            return b >= 0xE0 && b <= 0xEF;
        }

        // True for lead bytes of four-byte characters. Bytes above 0xF4 are
        // excluded because they would encode values beyond U+10FFFF.
        inline bool isFirstInFourByteChar(unsigned char b) {
            return b >= 0xF0 && b <= 0xF4;
        }

        // True for byte values that can never occur in well-formed UTF-8.
        inline bool isInvalidChar(unsigned char b) {
            return b == 0xC0 || b == 0xC1 || b >= 0xF5;
        }

        // Returns true if the `len` bytes starting at `str` form well-formed
        // UTF-8. An empty range is considered valid.
        //
        // Besides checking the lead/continuation byte structure, the first
        // continuation byte is range-checked for the lead bytes E0, ED, F0
        // and F4, so that overlong encodings, UTF-16 surrogates
        // (U+D800..U+DFFF) and code points above U+10FFFF are rejected.
        inline bool isValid(const char* str, std::size_t len) {
            // Number of continuation bytes still expected for the current character.
            int pendingFollowBytes = 0;
            // Inclusive range the next continuation byte has to fall into.
            unsigned char followMin = 0x80;
            unsigned char followMax = 0xBF;

            for( std::size_t i = 0; i < len; ++i ) {
                const unsigned char b = static_cast<unsigned char>( str[i] );

                if( pendingFollowBytes > 0 ) {
                    if( b < followMin || b > followMax )
                        return false;

                    // Only the first continuation byte is restricted;
                    // the remaining ones may use the full 80..BF range.
                    followMin = 0x80;
                    followMax = 0xBF;
                    --pendingFollowBytes;
                    continue;
                }

                // A new character starts here.
                if( isSingleByteChar( b ) )
                    continue;

                if( isFirstInTwoByteChar( b ) ) {
                    pendingFollowBytes = 1;
                }
                else if( isFirstInThreeByteChar( b ) ) {
                    pendingFollowBytes = 2;
                    if( b == 0xE0 )
                        followMin = 0xA0;   // E0 80..9F would be an overlong encoding
                    else if( b == 0xED )
                        followMax = 0x9F;   // ED A0..BF would encode a surrogate
                }
                else if( isFirstInFourByteChar( b ) ) {
                    pendingFollowBytes = 3;
                    if( b == 0xF0 )
                        followMin = 0x90;   // F0 80..8F would be an overlong encoding
                    else if( b == 0xF4 )
                        followMax = 0x8F;   // F4 90..BF would exceed U+10FFFF
                }
                else {
                    // Stray continuation byte, or a byte that is never valid
                    // (C0, C1, F5..FF).
                    return false;
                }
            }

            // A trailing, incomplete multi-byte character is invalid.
            return pendingFollowBytes == 0;
        }

        // Convenience overload validating the full contents of `str`
        // (including any embedded NUL characters).
        inline bool isValid(const std::string& str) {
            return isValid(str.c_str(), str.size());
        }
    }
 | 
			
		||||
 | 
			
		||||
    class XmlEncode {
 | 
			
		||||
    public:
 | 
			
		||||
        enum ForWhat { ForTextNodes, ForAttributes };        
 | 
			
		||||
@@ -33,8 +109,11 @@ namespace Catch {
 | 
			
		||||
            // Apostrophe escaping not necessary if we always use " to write attributes
 | 
			
		||||
            // (see: http://www.w3.org/TR/xml/#syntax)
 | 
			
		||||
           
 | 
			
		||||
            // Preserve utf8 as it is the default on most platforms and in xml
 | 
			
		||||
            bool isValidUtf8 = Utf8::isValid( m_str );
 | 
			
		||||
 | 
			
		||||
            for( std::size_t i = 0; i < m_str.size(); ++ i ) {
 | 
			
		||||
                char c = m_str[i];
 | 
			
		||||
                unsigned char c = static_cast<unsigned char>( m_str[i] );
 | 
			
		||||
                switch( c ) {
 | 
			
		||||
                    case '<':   os << "<"; break;
 | 
			
		||||
                    case '&':   os << "&"; break;
 | 
			
		||||
@@ -56,8 +135,8 @@ namespace Catch {
 | 
			
		||||
 | 
			
		||||
                    default:
 | 
			
		||||
                        // Escape control chars - based on contribution by @espenalb in PR #465
 | 
			
		||||
                        if ( ( c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c=='\x7F' )
 | 
			
		||||
                            os << "&#x" << std::uppercase << std::hex << static_cast<int>( c );
 | 
			
		||||
                        if ( ( c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c == '\x7F' || (c > '\x7F' && !isValidUtf8) )
 | 
			
		||||
                            os << "&#x" << std::uppercase << std::hex << static_cast<int>( c ) << ';';
 | 
			
		||||
                        else
 | 
			
		||||
                            os << c;
 | 
			
		||||
                }
 | 
			
		||||
 
 | 
			
		||||
@@ -458,10 +458,16 @@ TEST_CASE( "XmlEncode" ) {
 | 
			
		||||
        REQUIRE( encode( stringWithQuotes, Catch::XmlEncode::ForAttributes ) == "don't "quote" me on that" );
 | 
			
		||||
    }
 | 
			
		||||
    SECTION( "string with control char (1)" ) {
 | 
			
		||||
        REQUIRE( encode( "[\x01]" ) == "[]" );
 | 
			
		||||
        REQUIRE( encode( "[\x01]" ) == "[]" );
 | 
			
		||||
    }
 | 
			
		||||
    SECTION( "string with control char (x7F)" ) {
 | 
			
		||||
        REQUIRE( encode( "[\x7F]" ) == "[]" );
 | 
			
		||||
        REQUIRE( encode( "[\x7F]" ) == "[]" );
 | 
			
		||||
    }
 | 
			
		||||
    SECTION( "string with control char that is negativ on signed char (xFF)" ) {
 | 
			
		||||
        REQUIRE( encode( "[\xFF]" ) == "[ÿ]" );
 | 
			
		||||
    }
 | 
			
		||||
    SECTION( "string with utf8 multi byte char (german 'ae' umlaut)" ) {
 | 
			
		||||
        REQUIRE( encode( "[\xC3\xA4]" ) == "[\xC3\xA4]" );
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user