Modify XML encoder to hex-encode invalid UTF-8 sequences

There are still some holes, e.g. we leave surrogate pairs as they
are even though they are not part of valid UTF-8, but this might
be for the better -- WTF-8 does support surrogate pairs inside
text.

Closes #1207
This commit is contained in:
Martin Hořeňovský
2018-03-25 20:44:30 +02:00
parent e11508b48a
commit 3b801c4fda
8 changed files with 923 additions and 39 deletions

View File

@@ -7112,6 +7112,305 @@ PASSED:
with expansion:
"[\x7F]" == "[\x7F]"
-------------------------------------------------------------------------------
XmlEncode: UTF-8
Valid utf-8 strings
-------------------------------------------------------------------------------
Xml.tests.cpp:<line number>
...............................................................................
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode(u8"Here be 👾") == u8"Here be 👾" )
with expansion:
"Here be 👾" == "Here be 👾"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode(u8"šš") == u8"šš" )
with expansion:
"šš" == "šš"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xDF\xBF") == "\xDF\xBF" )
with expansion:
"߿" == "߿"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xE0\xA0\x80") == "\xE0\xA0\x80" )
with expansion:
"ࠀ" == "ࠀ"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xED\x9F\xBF") == "\xED\x9F\xBF" )
with expansion:
"퟿" == "퟿"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xEE\x80\x80") == "\xEE\x80\x80" )
with expansion:
"" == ""
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xEF\xBF\xBF") == "\xEF\xBF\xBF" )
with expansion:
"￿" == "￿"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF0\x90\x80\x80") == "\xF0\x90\x80\x80" )
with expansion:
"𐀀" == "𐀀"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF4\x8F\xBF\xBF") == "\xF4\x8F\xBF\xBF" )
with expansion:
"􏿿" == "􏿿"
-------------------------------------------------------------------------------
XmlEncode: UTF-8
Invalid utf-8 strings
Various broken strings
-------------------------------------------------------------------------------
Xml.tests.cpp:<line number>
...............................................................................
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("Here \xFF be 👾") == u8"Here \\xFF be 👾" )
with expansion:
"Here \xFF be 👾" == "Here \xFF be 👾"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xFF") == "\\xFF" )
with expansion:
"\xFF" == "\xFF"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xC5\xC5\xA0") == u8"\\xC5Š" )
with expansion:
"\xC5Š" == "\xC5Š"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF4\x90\x80\x80") == u8"\\xF4\\x90\\x80\\x80" )
with expansion:
"\xF4\x90\x80\x80" == "\xF4\x90\x80\x80"
-------------------------------------------------------------------------------
XmlEncode: UTF-8
Invalid utf-8 strings
Overlong encodings
-------------------------------------------------------------------------------
Xml.tests.cpp:<line number>
...............................................................................
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xC0\x80") == u8"\\xC0\\x80" )
with expansion:
"\xC0\x80" == "\xC0\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF0\x80\x80\x80") == u8"\\xF0\\x80\\x80\\x80" )
with expansion:
"\xF0\x80\x80\x80" == "\xF0\x80\x80\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xC1\xBF") == u8"\\xC1\\xBF" )
with expansion:
"\xC1\xBF" == "\xC1\xBF"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xE0\x9F\xBF") == u8"\\xE0\\x9F\\xBF" )
with expansion:
"\xE0\x9F\xBF" == "\xE0\x9F\xBF"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF0\x8F\xBF\xBF") == u8"\\xF0\\x8F\\xBF\\xBF" )
with expansion:
"\xF0\x8F\xBF\xBF" == "\xF0\x8F\xBF\xBF"
-------------------------------------------------------------------------------
XmlEncode: UTF-8
Invalid utf-8 strings
Surrogate pairs
-------------------------------------------------------------------------------
Xml.tests.cpp:<line number>
...............................................................................
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xED\xA0\x80") == "\xED\xA0\x80" )
with expansion:
"<22><><EFBFBD>" == "<22><><EFBFBD>"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xED\xAF\xBF") == "\xED\xAF\xBF" )
with expansion:
"<22><><EFBFBD>" == "<22><><EFBFBD>"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xED\xB0\x80") == "\xED\xB0\x80" )
with expansion:
"<22><><EFBFBD>" == "<22><><EFBFBD>"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xED\xBF\xBF") == "\xED\xBF\xBF" )
with expansion:
"<22><><EFBFBD>" == "<22><><EFBFBD>"
-------------------------------------------------------------------------------
XmlEncode: UTF-8
Invalid utf-8 strings
Invalid start byte
-------------------------------------------------------------------------------
Xml.tests.cpp:<line number>
...............................................................................
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\x80") == u8"\\x80" )
with expansion:
"\x80" == "\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\x81") == u8"\\x81" )
with expansion:
"\x81" == "\x81"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xBC") == u8"\\xBC" )
with expansion:
"\xBC" == "\xBC"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xBF") == u8"\\xBF" )
with expansion:
"\xBF" == "\xBF"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF5\x80\x80\x80") == u8"\\xF5\\x80\\x80\\x80" )
with expansion:
"\xF5\x80\x80\x80" == "\xF5\x80\x80\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF6\x80\x80\x80") == u8"\\xF6\\x80\\x80\\x80" )
with expansion:
"\xF6\x80\x80\x80" == "\xF6\x80\x80\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF7\x80\x80\x80") == u8"\\xF7\\x80\\x80\\x80" )
with expansion:
"\xF7\x80\x80\x80" == "\xF7\x80\x80\x80"
-------------------------------------------------------------------------------
XmlEncode: UTF-8
Invalid utf-8 strings
Missing continuation byte(s)
-------------------------------------------------------------------------------
Xml.tests.cpp:<line number>
...............................................................................
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xDE") == u8"\\xDE" )
with expansion:
"\xDE" == "\xDE"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xDF") == u8"\\xDF" )
with expansion:
"\xDF" == "\xDF"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xE0") == u8"\\xE0" )
with expansion:
"\xE0" == "\xE0"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xEF") == u8"\\xEF" )
with expansion:
"\xEF" == "\xEF"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF0") == u8"\\xF0" )
with expansion:
"\xF0" == "\xF0"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF4") == u8"\\xF4" )
with expansion:
"\xF4" == "\xF4"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xE0\x80") == u8"\\xE0\\x80" )
with expansion:
"\xE0\x80" == "\xE0\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xE0\xBF") == u8"\\xE0\\xBF" )
with expansion:
"\xE0\xBF" == "\xE0\xBF"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xE1\x80") == u8"\\xE1\\x80" )
with expansion:
"\xE1\x80" == "\xE1\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF0\x80") == u8"\\xF0\\x80" )
with expansion:
"\xF0\x80" == "\xF0\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF4\x80") == u8"\\xF4\\x80" )
with expansion:
"\xF4\x80" == "\xF4\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF0\x80\x80") == u8"\\xF0\\x80\\x80" )
with expansion:
"\xF0\x80\x80" == "\xF0\x80\x80"
Xml.tests.cpp:<line number>:
PASSED:
CHECK( encode("\xF4\x80\x80") == u8"\\xF4\\x80\\x80" )
with expansion:
"\xF4\x80\x80" == "\xF4\x80\x80"
-------------------------------------------------------------------------------
array<int, N> -> toString
-------------------------------------------------------------------------------
@@ -8598,6 +8897,6 @@ Misc.tests.cpp:<line number>:
PASSED:
===============================================================================
test cases: 202 | 136 passed | 62 failed | 4 failed as expected
assertions: 1029 | 887 passed | 121 failed | 21 failed as expected
test cases: 203 | 137 passed | 62 failed | 4 failed as expected
assertions: 1071 | 929 passed | 121 failed | 21 failed as expected