mirror of
https://github.com/catchorg/Catch2.git
synced 2025-08-24 23:45:40 +02:00
Improve performance of writing XML
As with the JSON writer, the old code was written for simplicity: for each char, it decided whether that char needed escaping or should be written as-is. The new code instead looks for characters that need escaping and batches the writes of characters that do not. This provides a 4-8x speedup (length dependent) for writing strings that do not need escaping, and keeps roughly the same performance for strings that do need escaping.
This commit is contained in:
@@ -47,7 +47,7 @@ namespace {
|
|||||||
|
|
||||||
void hexEscapeChar(std::ostream& os, unsigned char c) {
|
void hexEscapeChar(std::ostream& os, unsigned char c) {
|
||||||
std::ios_base::fmtflags f(os.flags());
|
std::ios_base::fmtflags f(os.flags());
|
||||||
os << "\\x"
|
os << "\\x"_sr
|
||||||
<< std::uppercase << std::hex << std::setfill('0') << std::setw(2)
|
<< std::uppercase << std::hex << std::setfill('0') << std::setw(2)
|
||||||
<< static_cast<int>(c);
|
<< static_cast<int>(c);
|
||||||
os.flags(f);
|
os.flags(f);
|
||||||
@@ -66,95 +66,111 @@ namespace {
|
|||||||
void XmlEncode::encodeTo( std::ostream& os ) const {
|
void XmlEncode::encodeTo( std::ostream& os ) const {
|
||||||
// Apostrophe escaping not necessary if we always use " to write attributes
|
// Apostrophe escaping not necessary if we always use " to write attributes
|
||||||
// (see: http://www.w3.org/TR/xml/#syntax)
|
// (see: http://www.w3.org/TR/xml/#syntax)
|
||||||
|
size_t last_start = 0;
|
||||||
|
auto write_to = [&]( size_t idx ) {
|
||||||
|
if ( last_start < idx ) {
|
||||||
|
os << m_str.substr( last_start, idx - last_start );
|
||||||
|
}
|
||||||
|
last_start = idx + 1;
|
||||||
|
};
|
||||||
|
|
||||||
for( std::size_t idx = 0; idx < m_str.size(); ++ idx ) {
|
for ( std::size_t idx = 0; idx < m_str.size(); ++idx ) {
|
||||||
unsigned char c = static_cast<unsigned char>(m_str[idx]);
|
unsigned char c = static_cast<unsigned char>( m_str[idx] );
|
||||||
switch (c) {
|
switch ( c ) {
|
||||||
case '<': os << "&lt;"; break;
|
case '<':
|
||||||
case '&': os << "&amp;"; break;
|
write_to( idx );
|
||||||
|
os << "&lt;"_sr;
|
||||||
|
break;
|
||||||
|
case '&':
|
||||||
|
write_to( idx );
|
||||||
|
os << "&amp;"_sr;
|
||||||
|
break;
|
||||||
|
|
||||||
case '>':
|
case '>':
|
||||||
// See: http://www.w3.org/TR/xml/#syntax
|
// See: http://www.w3.org/TR/xml/#syntax
|
||||||
if (idx > 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']')
|
if ( idx > 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']' ) {
|
||||||
os << "&gt;";
|
write_to( idx );
|
||||||
else
|
os << "&gt;"_sr;
|
||||||
os << c;
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\"':
|
case '\"':
|
||||||
if (m_forWhat == ForAttributes)
|
if ( m_forWhat == ForAttributes ) {
|
||||||
os << "&quot;";
|
write_to( idx );
|
||||||
else
|
os << "&quot;"_sr;
|
||||||
os << c;
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
// Check for control characters and invalid utf-8
|
// Check for control characters and invalid utf-8
|
||||||
|
|
||||||
// Escape control characters in standard ascii
|
// Escape control characters in standard ascii
|
||||||
// see http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0
|
// see
|
||||||
if (c < 0x09 || (c > 0x0D && c < 0x20) || c == 0x7F) {
|
// http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0
|
||||||
hexEscapeChar(os, c);
|
if ( c < 0x09 || ( c > 0x0D && c < 0x20 ) || c == 0x7F ) {
|
||||||
|
write_to( idx );
|
||||||
|
hexEscapeChar( os, c );
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Plain ASCII: Write it to stream
|
// Plain ASCII: Write it to stream
|
||||||
if (c < 0x7F) {
|
if ( c < 0x7F ) {
|
||||||
os << c;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// UTF-8 territory
|
// UTF-8 territory
|
||||||
// Check if the encoding is valid and if it is not, hex escape bytes.
|
// Check if the encoding is valid and if it is not, hex escape
|
||||||
// Important: We do not check the exact decoded values for validity, only the encoding format
|
// bytes. Important: We do not check the exact decoded values for
|
||||||
// First check that this byte is a valid lead byte:
|
// validity, only the encoding format. First check that this byte is
|
||||||
// This means that it is not encoded as 1111 1XXX
|
// a valid lead byte: This means that it is not encoded as 1111 1XXX
|
||||||
// Or as 10XX XXXX
|
// Or as 10XX XXXX
|
||||||
if (c < 0xC0 ||
|
if ( c < 0xC0 || c >= 0xF8 ) {
|
||||||
c >= 0xF8) {
|
write_to( idx );
|
||||||
hexEscapeChar(os, c);
|
hexEscapeChar( os, c );
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto encBytes = trailingBytes(c);
|
auto encBytes = trailingBytes( c );
|
||||||
// Are there enough bytes left to avoid accessing out-of-bounds memory?
|
// Are there enough bytes left to avoid accessing out-of-bounds
|
||||||
if (idx + encBytes - 1 >= m_str.size()) {
|
// memory?
|
||||||
hexEscapeChar(os, c);
|
if ( idx + encBytes - 1 >= m_str.size() ) {
|
||||||
|
write_to( idx );
|
||||||
|
hexEscapeChar( os, c );
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// The header is valid, check data
|
// The header is valid, check data
|
||||||
// The next encBytes bytes must together be a valid utf-8
|
// The next encBytes bytes must together be a valid utf-8
|
||||||
// This means: bitpattern 10XX XXXX and the extracted value is sane (ish)
|
// This means: bitpattern 10XX XXXX and the extracted value is sane
|
||||||
|
// (ish)
|
||||||
bool valid = true;
|
bool valid = true;
|
||||||
uint32_t value = headerValue(c);
|
uint32_t value = headerValue( c );
|
||||||
for (std::size_t n = 1; n < encBytes; ++n) {
|
for ( std::size_t n = 1; n < encBytes; ++n ) {
|
||||||
unsigned char nc = static_cast<unsigned char>(m_str[idx + n]);
|
unsigned char nc = static_cast<unsigned char>( m_str[idx + n] );
|
||||||
valid &= ((nc & 0xC0) == 0x80);
|
valid &= ( ( nc & 0xC0 ) == 0x80 );
|
||||||
value = (value << 6) | (nc & 0x3F);
|
value = ( value << 6 ) | ( nc & 0x3F );
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
// Wrong bit pattern of following bytes
|
// Wrong bit pattern of following bytes
|
||||||
(!valid) ||
|
( !valid ) ||
|
||||||
// Overlong encodings
|
// Overlong encodings
|
||||||
(value < 0x80) ||
|
( value < 0x80 ) ||
|
||||||
(0x80 <= value && value < 0x800 && encBytes > 2) ||
|
( 0x80 <= value && value < 0x800 && encBytes > 2 ) ||
|
||||||
(0x800 < value && value < 0x10000 && encBytes > 3) ||
|
( 0x800 < value && value < 0x10000 && encBytes > 3 ) ||
|
||||||
// Encoded value out of range
|
// Encoded value out of range
|
||||||
(value >= 0x110000)
|
( value >= 0x110000 ) ) {
|
||||||
) {
|
write_to( idx );
|
||||||
hexEscapeChar(os, c);
|
hexEscapeChar( os, c );
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we got here, this is in fact a valid(ish) utf-8 sequence
|
// If we got here, this is in fact a valid(ish) utf-8 sequence
|
||||||
for (std::size_t n = 0; n < encBytes; ++n) {
|
|
||||||
os << m_str[idx + n];
|
|
||||||
}
|
|
||||||
idx += encBytes - 1;
|
idx += encBytes - 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
write_to( m_str.size() );
|
||||||
}
|
}
|
||||||
|
|
||||||
std::ostream& operator << ( std::ostream& os, XmlEncode const& xmlEncode ) {
|
std::ostream& operator << ( std::ostream& os, XmlEncode const& xmlEncode ) {
|
||||||
|
@@ -7,13 +7,16 @@
|
|||||||
// SPDX-License-Identifier: BSL-1.0
|
// SPDX-License-Identifier: BSL-1.0
|
||||||
|
|
||||||
#include <catch2/catch_test_macros.hpp>
|
#include <catch2/catch_test_macros.hpp>
|
||||||
#include <catch2/internal/catch_xmlwriter.hpp>
|
#include <catch2/benchmark/catch_benchmark.hpp>
|
||||||
|
#include <catch2/generators/catch_generators.hpp>
|
||||||
#include <catch2/internal/catch_reusable_string_stream.hpp>
|
#include <catch2/internal/catch_reusable_string_stream.hpp>
|
||||||
|
#include <catch2/internal/catch_xmlwriter.hpp>
|
||||||
#include <catch2/matchers/catch_matchers_string.hpp>
|
#include <catch2/matchers/catch_matchers_string.hpp>
|
||||||
|
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
static std::string encode( std::string const& str, Catch::XmlEncode::ForWhat forWhat = Catch::XmlEncode::ForTextNodes ) {
|
static std::string encode( std::string const& str, Catch::XmlEncode::ForWhat forWhat = Catch::XmlEncode::ForTextNodes ) {
|
||||||
Catch::ReusableStringStream oss;
|
Catch::ReusableStringStream oss;
|
||||||
oss << Catch::XmlEncode( str, forWhat );
|
oss << Catch::XmlEncode( str, forWhat );
|
||||||
@@ -181,3 +184,20 @@ TEST_CASE("XmlWriter escapes attributes properly", "[XML][XmlWriter][approvals]"
|
|||||||
REQUIRE_THAT(stream.str(),
|
REQUIRE_THAT(stream.str(),
|
||||||
ContainsSubstring(R"(some-attribute="Special chars need escaping: &lt; > ' &quot; &amp;")"));
|
ContainsSubstring(R"(some-attribute="Special chars need escaping: &lt; > ' &quot; &amp;")"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE( "XmlWriter benchmarks", "[XML][XmlWriter][!benchmark]" ) {
|
||||||
|
const auto input_length = GENERATE( as<size_t>{}, 10, 100, 10'000 );
|
||||||
|
std::string test_input( input_length, 'a' );
|
||||||
|
BENCHMARK_ADVANCED( "write string, no-escaping, len=" +
|
||||||
|
std::to_string( input_length ) ) {
|
||||||
|
return encode( test_input );
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string escape_input( input_length, '\b' );
|
||||||
|
BENCHMARK_ADVANCED( "write string, all-escaped, len=" +
|
||||||
|
std::to_string( input_length ) ) {
|
||||||
|
return encode( escape_input );
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
Reference in New Issue
Block a user