From f4e05a67bb5a57ee6e051fc8c391b32cbf927f7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Ho=C5=99e=C5=88ovsk=C3=BD?=
 <martin.horenovsky@gmail.com>
Date: Fri, 22 Aug 2025 17:03:31 +0200
Subject: [PATCH] Improve performance of writing XML

As with the JSON writer, the old code was made to be simple and
for each char just decided whether it needs escaping, or should be
written as-is. The new code instead looks for characters that need
escaping and batches writes of characters that do not.

This provides 4-8x speedup (length dependent) for writing strings
that do not need escaping, and keeps roughly the same performance
for those that do need escaping.
---
 src/catch2/internal/catch_xmlwriter.cpp       | 108 ++++++++++--------
 .../SelfTest/IntrospectiveTests/Xml.tests.cpp |  24 +++-
 2 files changed, 84 insertions(+), 48 deletions(-)
diff --git a/src/catch2/internal/catch_xmlwriter.cpp b/src/catch2/internal/catch_xmlwriter.cpp
index ccf63a56..67126be1 100644
--- a/src/catch2/internal/catch_xmlwriter.cpp
+++ b/src/catch2/internal/catch_xmlwriter.cpp
@@ -47,7 +47,7 @@ namespace {
 
     void hexEscapeChar(std::ostream& os, unsigned char c) {
         std::ios_base::fmtflags f(os.flags());
-        os << "\\x"
+        os << "\\x"_sr
             << std::uppercase << std::hex << std::setfill('0') << std::setw(2)
             << static_cast<int>(c);
         os.flags(f);
@@ -66,95 +66,111 @@ namespace {
     void XmlEncode::encodeTo( std::ostream& os ) const {
         // Apostrophe escaping not necessary if we always use " to write attributes
         // (see: http://www.w3.org/TR/xml/#syntax)
+        size_t last_start = 0;
+        auto write_to = [&]( size_t idx ) {
+            if ( last_start < idx ) {
+                os << m_str.substr( last_start, idx - last_start );
+            }
+            last_start = idx + 1;
+        };
 
-        for( std::size_t idx = 0; idx < m_str.size(); ++ idx ) {
-            unsigned char c = static_cast<unsigned char>(m_str[idx]);
-            switch (c) {
-            case '<':   os << "&lt;"; break;
-            case '&':   os << "&amp;"; break;
+        for ( std::size_t idx = 0; idx < m_str.size(); ++idx ) {
+            unsigned char c = static_cast<unsigned char>( m_str[idx] );
+            switch ( c ) {
+            case '<':
+                write_to( idx );
+                os << "&lt;"_sr;
+                break;
+            case '&':
+                write_to( idx );
+                os << "&amp;"_sr;
+                break;
 
             case '>':
                 // See: http://www.w3.org/TR/xml/#syntax
-                if (idx > 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']')
-                    os << "&gt;";
-                else
-                    os << c;
+                if ( idx > 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']' ) {
+                    write_to( idx );
+                    os << "&gt;"_sr;
+                }
                 break;
 
             case '\"':
-                if (m_forWhat == ForAttributes)
-                    os << "&quot;";
-                else
-                    os << c;
+                if ( m_forWhat == ForAttributes ) {
+                    write_to( idx );
+                    os << "&quot;"_sr;
+                }
                 break;
 
             default:
                 // Check for control characters and invalid utf-8
 
                 // Escape control characters in standard ascii
-                // see http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0
-                if (c < 0x09 || (c > 0x0D && c < 0x20) || c == 0x7F) {
-                    hexEscapeChar(os, c);
+                // see
+                // http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0
+                if ( c < 0x09 || ( c > 0x0D && c < 0x20 ) || c == 0x7F ) {
+                    write_to( idx );
+                    hexEscapeChar( os, c );
                     break;
                 }
 
                 // Plain ASCII: Write it to stream
-                if (c < 0x7F) {
-                    os << c;
+                if ( c < 0x7F ) {
                     break;
                 }
 
                 // UTF-8 territory
-                // Check if the encoding is valid and if it is not, hex escape bytes.
-                // Important: We do not check the exact decoded values for validity, only the encoding format
-                // First check that this bytes is a valid lead byte:
-                // This means that it is not encoded as 1111 1XXX
+                // Check if the encoding is valid and if it is not, hex escape
+                // bytes. Important: We do not check the exact decoded values for
+                // validity, only the encoding format First check that this bytes is
+                // a valid lead byte: This means that it is not encoded as 1111 1XXX
                 // Or as 10XX XXXX
-                if (c <  0xC0 ||
-                    c >= 0xF8) {
-                    hexEscapeChar(os, c);
+                if ( c < 0xC0 || c >= 0xF8 ) {
+                    write_to( idx );
+                    hexEscapeChar( os, c );
                     break;
                 }
 
-                auto encBytes = trailingBytes(c);
-                // Are there enough bytes left to avoid accessing out-of-bounds memory?
-                if (idx + encBytes - 1 >= m_str.size()) {
-                    hexEscapeChar(os, c);
+                auto encBytes = trailingBytes( c );
+                // Are there enough bytes left to avoid accessing out-of-bounds
+                // memory?
+                if ( idx + encBytes - 1 >= m_str.size() ) {
+                    write_to( idx );
+                    hexEscapeChar( os, c );
                     break;
                 }
                 // The header is valid, check data
                 // The next encBytes bytes must together be a valid utf-8
-                // This means: bitpattern 10XX XXXX and the extracted value is sane (ish)
+                // This means: bitpattern 10XX XXXX and the extracted value is sane
+                // (ish)
                 bool valid = true;
-                uint32_t value = headerValue(c);
-                for (std::size_t n = 1; n < encBytes; ++n) {
-                    unsigned char nc = static_cast<unsigned char>(m_str[idx + n]);
-                    valid &= ((nc & 0xC0) == 0x80);
-                    value = (value << 6) | (nc & 0x3F);
+                uint32_t value = headerValue( c );
+                for ( std::size_t n = 1; n < encBytes; ++n ) {
+                    unsigned char nc = static_cast<unsigned char>( m_str[idx + n] );
+                    valid &= ( ( nc & 0xC0 ) == 0x80 );
+                    value = ( value << 6 ) | ( nc & 0x3F );
                 }
 
                 if (
                     // Wrong bit pattern of following bytes
-                    (!valid) ||
+                    ( !valid ) ||
                     // Overlong encodings
-                    (value < 0x80) ||
-                    (0x80 <= value && value < 0x800   && encBytes > 2) ||
-                    (0x800 < value && value < 0x10000 && encBytes > 3) ||
+                    ( value < 0x80 ) ||
+                    ( 0x80 <= value && value < 0x800 && encBytes > 2 ) ||
+                    ( 0x800 < value && value < 0x10000 && encBytes > 3 ) ||
                     // Encoded value out of range
-                    (value >= 0x110000)
-                    ) {
-                    hexEscapeChar(os, c);
+                    ( value >= 0x110000 ) ) {
+                    write_to( idx );
+                    hexEscapeChar( os, c );
                     break;
                 }
 
                 // If we got here, this is in fact a valid(ish) utf-8 sequence
-                for (std::size_t n = 0; n < encBytes; ++n) {
-                    os << m_str[idx + n];
-                }
                 idx += encBytes - 1;
                 break;
             }
         }
+
+        write_to( m_str.size() );
     }
 
     std::ostream& operator << ( std::ostream& os, XmlEncode const& xmlEncode ) {
diff --git a/tests/SelfTest/IntrospectiveTests/Xml.tests.cpp b/tests/SelfTest/IntrospectiveTests/Xml.tests.cpp
index b5982b85..a53bae35 100644
--- a/tests/SelfTest/IntrospectiveTests/Xml.tests.cpp
+++ b/tests/SelfTest/IntrospectiveTests/Xml.tests.cpp
@@ -7,13 +7,16 @@
 // SPDX-License-Identifier: BSL-1.0
 
 #include <catch2/catch_test_macros.hpp>
-#include <catch2/internal/catch_xmlwriter.hpp>
-
+#include <catch2/benchmark/catch_benchmark.hpp>
+#include <catch2/generators/catch_generators.hpp>
 #include <catch2/internal/catch_reusable_string_stream.hpp>
+#include <catch2/internal/catch_xmlwriter.hpp>
 #include <catch2/matchers/catch_matchers_string.hpp>
 
 #include <sstream>
 
+namespace {
+
 static std::string encode( std::string const& str, Catch::XmlEncode::ForWhat forWhat = Catch::XmlEncode::ForTextNodes ) {
     Catch::ReusableStringStream oss;
     oss << Catch::XmlEncode( str, forWhat );
@@ -181,3 +184,20 @@ TEST_CASE("XmlWriter escapes attributes properly", "[XML][XmlWriter][approvals]"
     REQUIRE_THAT(stream.str(),
                  ContainsSubstring(R"(some-attribute="Special chars need escaping: &lt; > ' &quot; &amp;")"));
 }
+
+TEST_CASE( "XmlWriter benchmarks", "[XML][XmlWriter][!benchmark]" ) {
+    const auto input_length = GENERATE( as<size_t>{}, 10, 100, 10'000 );
+    std::string test_input( input_length, 'a' );
+    BENCHMARK_ADVANCED( "write string, no-escaping, len=" +
+                        std::to_string( input_length ) ) {
+        return encode( test_input );
+    };
+
+    std::string escape_input( input_length, '\b' );
+    BENCHMARK_ADVANCED( "write string, all-escaped, len=" +
+                        std::to_string( input_length ) ) {
+        return encode( escape_input );
+    };
+}
+
+} // namespace