mirror of
https://github.com/catchorg/Catch2.git
synced 2025-01-12 04:43:29 +01:00
3b801c4fda
There are still some holes, e.g. we leave surrogate pairs be even though they are not a part of valid UTF-8, but this might be for the better -- WTF-8 does support surrogate pairs inside text. Closes #1207
285 lines
8.6 KiB
C++
285 lines
8.6 KiB
C++
/*
|
|
* Created by Phil on 19/07/2017.
|
|
*
|
|
* Distributed under the Boost Software License, Version 1.0. (See accompanying
|
|
* file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
*/
|
|
|
|
#include "catch_xmlwriter.h"
|
|
|
|
#include "catch_enforce.h"
|
|
|
|
#include <iomanip>
|
|
|
|
// Shorthand for the unsigned byte type used below when inspecting the raw
// bytes of a std::string (avoids sign issues with plain char comparisons).
using uchar = unsigned char;
|
|
|
|
namespace Catch {
|
|
|
|
namespace {
|
|
|
|
// Maps a UTF-8 lead byte to the value used by the caller as the full length
// of the encoded sequence, lead byte included:
//   110x xxxx -> 2,  1110 xxxx -> 3,  1111 0xxx -> 4.
// Any other bit pattern is reported through CATCH_INTERNAL_ERROR.
size_t trailingBytes(unsigned char c) {
    const bool isTwoByteLead   = (c & 0xE0) == 0xC0;
    const bool isThreeByteLead = (c & 0xF0) == 0xE0;
    const bool isFourByteLead  = (c & 0xF8) == 0xF0;

    // The three patterns are mutually exclusive, so at most one matches.
    if (isTwoByteLead) {
        return 2;
    }
    if (isThreeByteLead) {
        return 3;
    }
    if (isFourByteLead) {
        return 4;
    }
    CATCH_INTERNAL_ERROR("Invalid multibyte utf-8 start byte encountered");
}
|
|
|
|
// Extracts the payload bits carried by a UTF-8 lead byte, i.e. the bits left
// after stripping the length marker:
//   110x xxxx -> low 5 bits,  1110 xxxx -> low 4 bits,  1111 0xxx -> low 3 bits.
// Any other bit pattern is reported through CATCH_INTERNAL_ERROR.
uint32_t headerValue(unsigned char c) {
    uint32_t payloadMask = 0;
    if ((c & 0xE0) == 0xC0) {
        payloadMask = 0x1F;
    } else if ((c & 0xF0) == 0xE0) {
        payloadMask = 0x0F;
    } else if ((c & 0xF8) == 0xF0) {
        payloadMask = 0x07;
    } else {
        CATCH_INTERNAL_ERROR("Invalid multibyte utf-8 start byte encountered");
    }
    return static_cast<uint32_t>(c) & payloadMask;
}
|
|
|
|
// Writes `c` to `os` as a backslash-x escape followed by exactly two
// upper-case hexadecimal digits, e.g. the byte 0x1F is written as "\x1F".
//
// The digits are emitted as plain characters instead of going through
// std::hex / std::uppercase / std::setfill. Those manipulators are sticky:
// they would remain set on the caller's stream after this function returns
// and silently change how every later integer written to it is formatted.
void hexEscapeChar(std::ostream& os, unsigned char c) {
    static const char hexDigits[] = "0123456789ABCDEF";
    os << "\\x"
       << hexDigits[(c >> 4) & 0x0F]   // high nibble
       << hexDigits[c & 0x0F];         // low nibble
}
|
|
|
|
} // anonymous namespace
|
|
|
|
// Stores the text to be encoded together with the target context; the
// context is consulted by encodeTo() to decide whether double quotes need
// escaping.
XmlEncode::XmlEncode( std::string const& str, ForWhat forWhat )
:   m_str( str ),
    m_forWhat( forWhat )
{}
|
|
|
|
void XmlEncode::encodeTo( std::ostream& os ) const {
|
|
// Apostrophe escaping not necessary if we always use " to write attributes
|
|
// (see: http://www.w3.org/TR/xml/#syntax)
|
|
|
|
for( std::size_t idx = 0; idx < m_str.size(); ++ idx ) {
|
|
uchar c = m_str[idx];
|
|
switch (c) {
|
|
case '<': os << "<"; break;
|
|
case '&': os << "&"; break;
|
|
|
|
case '>':
|
|
// See: http://www.w3.org/TR/xml/#syntax
|
|
if (idx > 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']')
|
|
os << ">";
|
|
else
|
|
os << c;
|
|
break;
|
|
|
|
case '\"':
|
|
if (m_forWhat == ForAttributes)
|
|
os << """;
|
|
else
|
|
os << c;
|
|
break;
|
|
|
|
default:
|
|
// Check for control characters and invalid utf-8
|
|
|
|
// Escape control characters in standard ascii
|
|
// see http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0
|
|
if (c < 0x09 || (c > 0x0D && c < 0x20) || c == 0x7F) {
|
|
hexEscapeChar(os, c);
|
|
break;
|
|
}
|
|
|
|
// Plain ASCII: Write it to stream
|
|
if (c < 0x7F) {
|
|
os << c;
|
|
break;
|
|
}
|
|
|
|
// UTF-8 territory
|
|
// Check if the encoding is valid and if it is not, hex escape bytes.
|
|
// Important: We do not check the exact decoded values for validity, only the encoding format
|
|
// First check that this bytes is a valid lead byte:
|
|
// This means that it is not encoded as 1111 1XXX
|
|
// Or as 10XX XXXX
|
|
if (c < 0xC0 ||
|
|
c >= 0xF8) {
|
|
hexEscapeChar(os, c);
|
|
break;
|
|
}
|
|
|
|
auto encBytes = trailingBytes(c);
|
|
// Are there enough bytes left to avoid accessing out-of-bounds memory?
|
|
if (idx + encBytes - 1 >= m_str.size()) {
|
|
hexEscapeChar(os, c);
|
|
break;
|
|
}
|
|
// The header is valid, check data
|
|
// The next encBytes bytes must together be a valid utf-8
|
|
// This means: bitpattern 10XX XXXX and the extracted value is sane (ish)
|
|
bool valid = true;
|
|
uint32_t value = headerValue(c);
|
|
for (std::size_t n = 1; n < encBytes; ++n) {
|
|
uchar nc = m_str[idx + n];
|
|
valid &= ((nc & 0xC0) == 0x80);
|
|
value = (value << 6) | (nc & 0x3F);
|
|
}
|
|
|
|
if (
|
|
// Wrong bit pattern of following bytes
|
|
(!valid) ||
|
|
// Overlong encodings
|
|
(value < 0x80) ||
|
|
(0x80 <= value && value < 0x800 && encBytes > 2) ||
|
|
(0x800 < value && value < 0x10000 && encBytes > 3) ||
|
|
// Encoded value out of range
|
|
(value >= 0x110000)
|
|
) {
|
|
hexEscapeChar(os, c);
|
|
break;
|
|
}
|
|
|
|
// If we got here, this is in fact a valid(ish) utf-8 sequence
|
|
for (std::size_t n = 0; n < encBytes; ++n) {
|
|
os << m_str[idx + n];
|
|
}
|
|
idx += encBytes - 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Stream-insertion adaptor: encodes `xmlEncode` into `os` and returns the
// same stream so that insertions can be chained.
std::ostream& operator << ( std::ostream& os, XmlEncode const& xmlEncode ) {
    xmlEncode.encodeTo( os );
    return os;
}
|
|
|
|
// Creates a guard tied to `writer`; the destructor calls endElement() on it
// unless the guard has been moved from in the meantime.
XmlWriter::ScopedElement::ScopedElement( XmlWriter* writer )
:   m_writer( writer )
{}
|
|
|
|
// Move constructor: takes over the writer pointer and clears it in `other`,
// so that only the new guard ends the element on destruction.
XmlWriter::ScopedElement::ScopedElement( ScopedElement&& other ) noexcept
:   m_writer( other.m_writer )
{
    other.m_writer = nullptr;
}
|
|
// Move assignment: ends the element currently guarded by this object (if
// any), then takes over the writer pointer from `other` and clears it there.
//
// Self-assignment is a no-op. Without the guard, `x = std::move(x)` would
// end the element prematurely and then null out the writer pointer.
XmlWriter::ScopedElement& XmlWriter::ScopedElement::operator=( ScopedElement&& other ) noexcept {
    if ( this != &other ) {
        if ( m_writer ) {
            m_writer->endElement();
        }
        m_writer = other.m_writer;
        other.m_writer = nullptr;
    }
    return *this;
}
|
|
|
|
|
|
// Ends the guarded element, unless this guard no longer refers to a writer
// (m_writer is null after being moved from).
XmlWriter::ScopedElement::~ScopedElement() {
    if( m_writer ) {
        m_writer->endElement();
    }
}
|
|
|
|
// Forwards `text` and `indent` to the underlying writer's writeText() and
// returns this guard so that calls can be chained.
XmlWriter::ScopedElement& XmlWriter::ScopedElement::writeText( std::string const& text, bool indent ) {
    m_writer->writeText( text, indent );
    return *this;
}
|
|
|
|
// Attaches the writer to `os` and immediately emits the XML declaration.
XmlWriter::XmlWriter( std::ostream& os )
:   m_os( os )
{
    writeDeclaration();
}
|
|
|
|
// Closes every element that is still recorded in m_tags.
XmlWriter::~XmlWriter() {
    while( !m_tags.empty() ) {
        endElement();
    }
}
|
|
|
|
// Opens a new element called `name`.
//
// Any still-open start tag is closed first and a pending newline is flushed.
// Then "<name" is written at the current indentation and left open
// (m_tagIsOpen) so that attributes can follow. The name is pushed onto
// m_tags for the matching endElement() call.
//
// Returns *this to allow call chaining.
XmlWriter& XmlWriter::startElement( std::string const& name ) {
    ensureTagClosed();
    newlineIfNecessary();
    m_os << m_indent << '<' << name;
    m_tags.push_back( name );
    // One nesting level is two spaces. This has to match endElement(), which
    // strips exactly two characters from m_indent per level; appending fewer
    // makes its unsigned `size() - 2` wrap around so the indent never shrinks.
    m_indent += "  ";
    m_tagIsOpen = true;
    return *this;
}
|
|
|
|
// Starts an element called `name` and returns a guard object whose
// destructor ends it again.
XmlWriter::ScopedElement XmlWriter::scopedElement( std::string const& name ) {
    // The guard is created before the element is started, as in the
    // original statement order.
    ScopedElement guard( this );
    startElement( name );
    return guard;
}
|
|
|
|
// Ends the most recently started element.
//
// If its start tag is still open (nothing was written after the attributes)
// the element is finished as a self-closing tag "/>"; otherwise a separate
// closing tag is written at the reduced indentation. A line break follows,
// and the element name is removed from m_tags.
//
// Returns *this to allow call chaining.
XmlWriter& XmlWriter::endElement() {
    newlineIfNecessary();
    // Drop one nesting level (two characters) from the indentation.
    m_indent = m_indent.substr( 0, m_indent.size()-2 );
    if( !m_tagIsOpen ) {
        m_os << m_indent << "</" << m_tags.back() << ">";
    }
    else {
        m_os << "/>";
        m_tagIsOpen = false;
    }
    m_os << std::endl;
    m_tags.pop_back();
    return *this;
}
|
|
|
|
// Writes ` name="value"` into the stream, with the value XML-encoded for
// attribute context. Nothing is written when either the name or the value
// is empty.
//
// Returns *this to allow call chaining.
XmlWriter& XmlWriter::writeAttribute( std::string const& name, std::string const& attribute ) {
    if( name.empty() || attribute.empty() ) {
        return *this;
    }
    m_os << ' ' << name << "=\"" << XmlEncode( attribute, XmlEncode::ForAttributes ) << '"';
    return *this;
}
|
|
|
|
// Writes a boolean attribute as ` name="true"` or ` name="false"`.
//
// Returns *this to allow call chaining.
XmlWriter& XmlWriter::writeAttribute( std::string const& name, bool attribute ) {
    char const* const valueText = attribute ? "true" : "false";
    m_os << ' ' << name << "=\"" << valueText << '"';
    return *this;
}
|
|
|
|
// Writes `text` as XML-encoded character data. Empty text is ignored.
//
// A still-open start tag is closed first. The current indentation is
// written before the text only if `indent` is set and that start tag was
// still open on entry. Afterwards a newline is marked as pending.
//
// Returns *this to allow call chaining.
XmlWriter& XmlWriter::writeText( std::string const& text, bool indent ) {
    if( text.empty() ) {
        return *this;
    }

    // Remember the state before ensureTagClosed() resets it.
    bool const tagWasOpen = m_tagIsOpen;
    ensureTagClosed();
    if( tagWasOpen && indent ) {
        m_os << m_indent;
    }
    m_os << XmlEncode( text );
    m_needsNewline = true;
    return *this;
}
|
|
|
|
// Writes `text` as an XML comment at the current indentation and marks a
// newline as pending. A still-open start tag is closed first. The text is
// written as given, without any encoding.
//
// Returns *this to allow call chaining.
XmlWriter& XmlWriter::writeComment( std::string const& text ) {
    ensureTagClosed();
    m_os << m_indent
         << "<!--" << text << "-->";
    m_needsNewline = true;
    return *this;
}
|
|
|
|
// Emits an xml-stylesheet processing instruction of type text/xsl that
// points at `url`, followed by a newline. The url is written as given,
// without any encoding.
void XmlWriter::writeStylesheetRef( std::string const& url ) {
    m_os << "<?xml-stylesheet type=\"text/xsl\" href=\""
         << url
         << "\"?>\n";
}
|
|
|
|
// Closes a still-open start tag, then writes a single newline character.
//
// Returns *this to allow call chaining.
XmlWriter& XmlWriter::writeBlankLine() {
    ensureTagClosed();
    m_os << '\n';
    return *this;
}
|
|
|
|
void XmlWriter::ensureTagClosed() {
|
|
if( m_tagIsOpen ) {
|
|
m_os << ">" << std::endl;
|
|
m_tagIsOpen = false;
|
|
}
|
|
}
|
|
|
|
// Emits the XML declaration (version 1.0, UTF-8 encoding) followed by a
// newline.
void XmlWriter::writeDeclaration() {
    m_os << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
}
|
|
|
|
void XmlWriter::newlineIfNecessary() {
|
|
if( m_needsNewline ) {
|
|
m_os << std::endl;
|
|
m_needsNewline = false;
|
|
}
|
|
}
|
|
}
|