Improve performance of writing XML

As with the JSON writer, the old code was written for simplicity: for each char, it decided whether the char needed escaping or should be written as-is, and wrote it out individually. The new code instead looks for characters that need escaping and batches the writes of the characters that do not. This provides a 4-8x speedup (length dependent) for writing strings that do not need escaping, and keeps roughly the same performance for strings that do.
2025-12-15 06:32:12 +01:00 · 2025-08-22 17:03:31 +02:00
parent fb2e4fbe41
commit f4e05a67bb
2 changed files with 84 additions and 48 deletions
--- a/src/catch2/internal/catch_xmlwriter.cpp
+++ b/src/catch2/internal/catch_xmlwriter.cpp
@@ -47,7 +47,7 @@ namespace {

    void hexEscapeChar(std::ostream& os, unsigned char c) {
        std::ios_base::fmtflags f(os.flags());
-        os << "\\x"
+        os << "\\x"_sr
            << std::uppercase << std::hex << std::setfill('0') << std::setw(2)
            << static_cast<int>(c);
        os.flags(f);
@@ -66,95 +66,111 @@ namespace {
    void XmlEncode::encodeTo( std::ostream& os ) const {
        // Apostrophe escaping not necessary if we always use " to write attributes
        // (see: http://www.w3.org/TR/xml/#syntax)
+        size_t last_start = 0;
+        auto write_to = [&]( size_t idx ) {
+            if ( last_start < idx ) {
+                os << m_str.substr( last_start, idx - last_start );
+            }
+            last_start = idx + 1;
+        };

-        for( std::size_t idx = 0; idx < m_str.size(); ++ idx ) {
-            unsigned char c = static_cast<unsigned char>(m_str[idx]);
-            switch (c) {
-            case '<':   os << "&lt;"; break;
-            case '&':   os << "&amp;"; break;
+        for ( std::size_t idx = 0; idx < m_str.size(); ++idx ) {
+            unsigned char c = static_cast<unsigned char>( m_str[idx] );
+            switch ( c ) {
+            case '<':
+                write_to( idx );
+                os << "&lt;"_sr;
+                break;
+            case '&':
+                write_to( idx );
+                os << "&amp;"_sr;
+                break;

            case '>':
                // See: http://www.w3.org/TR/xml/#syntax
-                if (idx > 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']')
-                    os << "&gt;";
-                else
-                    os << c;
+                if ( idx > 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']' ) {
+                    write_to( idx );
+                    os << "&gt;"_sr;
+                }
                break;

            case '\"':
-                if (m_forWhat == ForAttributes)
-                    os << "&quot;";
-                else
-                    os << c;
+                if ( m_forWhat == ForAttributes ) {
+                    write_to( idx );
+                    os << "&quot;"_sr;
+                }
                break;

            default:
                // Check for control characters and invalid utf-8

                // Escape control characters in standard ascii
-                // see http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0
-                if (c < 0x09 || (c > 0x0D && c < 0x20) || c == 0x7F) {
-                    hexEscapeChar(os, c);
+                // see
+                // http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0
+                if ( c < 0x09 || ( c > 0x0D && c < 0x20 ) || c == 0x7F ) {
+                    write_to( idx );
+                    hexEscapeChar( os, c );
                    break;
                }

                // Plain ASCII: Write it to stream
-                if (c < 0x7F) {
-                    os << c;
+                if ( c < 0x7F ) {
                    break;
                }

                // UTF-8 territory
-                // Check if the encoding is valid and if it is not, hex escape bytes.
-                // Important: We do not check the exact decoded values for validity, only the encoding format
-                // First check that this bytes is a valid lead byte:
-                // This means that it is not encoded as 1111 1XXX
+                // Check if the encoding is valid and if it is not, hex escape
+                // bytes. Important: We do not check the exact decoded values for
+                // validity, only the encoding format. First check that this byte is
+                // a valid lead byte: this means that it is not encoded as 1111 1XXX
                // Or as 10XX XXXX
-                if (c <  0xC0 ||
-                    c >= 0xF8) {
-                    hexEscapeChar(os, c);
+                if ( c < 0xC0 || c >= 0xF8 ) {
+                    write_to( idx );
+                    hexEscapeChar( os, c );
                    break;
                }

-                auto encBytes = trailingBytes(c);
-                // Are there enough bytes left to avoid accessing out-of-bounds memory?
-                if (idx + encBytes - 1 >= m_str.size()) {
-                    hexEscapeChar(os, c);
+                auto encBytes = trailingBytes( c );
+                // Are there enough bytes left to avoid accessing out-of-bounds
+                // memory?
+                if ( idx + encBytes - 1 >= m_str.size() ) {
+                    write_to( idx );
+                    hexEscapeChar( os, c );
                    break;
                }
                // The header is valid, check data
                // The next encBytes bytes must together be a valid utf-8
-                // This means: bitpattern 10XX XXXX and the extracted value is sane (ish)
+                // This means: bitpattern 10XX XXXX and the extracted value is sane
+                // (ish)
                bool valid = true;
-                uint32_t value = headerValue(c);
-                for (std::size_t n = 1; n < encBytes; ++n) {
-                    unsigned char nc = static_cast<unsigned char>(m_str[idx + n]);
-                    valid &= ((nc & 0xC0) == 0x80);
-                    value = (value << 6) | (nc & 0x3F);
+                uint32_t value = headerValue( c );
+                for ( std::size_t n = 1; n < encBytes; ++n ) {
+                    unsigned char nc = static_cast<unsigned char>( m_str[idx + n] );
+                    valid &= ( ( nc & 0xC0 ) == 0x80 );
+                    value = ( value << 6 ) | ( nc & 0x3F );
                }

                if (
                    // Wrong bit pattern of following bytes
-                    (!valid) ||
+                    ( !valid ) ||
                    // Overlong encodings
-                    (value < 0x80) ||
-                    (0x80 <= value && value < 0x800   && encBytes > 2) ||
-                    (0x800 < value && value < 0x10000 && encBytes > 3) ||
+                    ( value < 0x80 ) ||
+                    ( 0x80 <= value && value < 0x800 && encBytes > 2 ) ||
+                    ( 0x800 < value && value < 0x10000 && encBytes > 3 ) ||
                    // Encoded value out of range
-                    (value >= 0x110000)
-                    ) {
-                    hexEscapeChar(os, c);
+                    ( value >= 0x110000 ) ) {
+                    write_to( idx );
+                    hexEscapeChar( os, c );
                    break;
                }

                // If we got here, this is in fact a valid(ish) utf-8 sequence
-                for (std::size_t n = 0; n < encBytes; ++n) {
-                    os << m_str[idx + n];
-                }
                idx += encBytes - 1;
                break;
            }
        }
+
+        write_to( m_str.size() );
    }

    std::ostream& operator << ( std::ostream& os, XmlEncode const& xmlEncode ) {
--- a/tests/SelfTest/IntrospectiveTests/Xml.tests.cpp
+++ b/tests/SelfTest/IntrospectiveTests/Xml.tests.cpp
@@ -7,13 +7,16 @@
 // SPDX-License-Identifier: BSL-1.0

 #include <catch2/catch_test_macros.hpp>
-#include <catch2/internal/catch_xmlwriter.hpp>
-
+#include <catch2/benchmark/catch_benchmark.hpp>
+#include <catch2/generators/catch_generators.hpp>
 #include <catch2/internal/catch_reusable_string_stream.hpp>
+#include <catch2/internal/catch_xmlwriter.hpp>
 #include <catch2/matchers/catch_matchers_string.hpp>

 #include <sstream>

+namespace {
+
 static std::string encode( std::string const& str, Catch::XmlEncode::ForWhat forWhat = Catch::XmlEncode::ForTextNodes ) {
    Catch::ReusableStringStream oss;
    oss << Catch::XmlEncode( str, forWhat );
@@ -181,3 +184,20 @@ TEST_CASE("XmlWriter escapes attributes properly", "[XML][XmlWriter][approvals]"
    REQUIRE_THAT(stream.str(),
                 ContainsSubstring(R"(some-attribute="Special chars need escaping: &lt; > ' &quot; &amp;")"));
 }
+
+TEST_CASE( "XmlWriter benchmarks", "[XML][XmlWriter][!benchmark]" ) {
+    const auto input_length = GENERATE( as<size_t>{}, 10, 100, 10'000 );
+    std::string test_input( input_length, 'a' );
+    BENCHMARK_ADVANCED( "write string, no-escaping, len=" +
+                        std::to_string( input_length ) ) {
+        return encode( test_input );
+    };
+
+    std::string escape_input( input_length, '\b' );
+    BENCHMARK_ADVANCED( "write string, all-escaped, len=" +
+                        std::to_string( input_length ) ) {
+        return encode( escape_input );
+    };
+}
+
+} // namespace