Modify XML encoder to hex-encode invalid UTF-8 sequences

There are still some holes, e.g. we leave encoded surrogate code points
as they are even though they are not part of valid UTF-8, but this might
be for the better -- WTF-8 does allow unpaired surrogates inside
text.

Closes #1207
This commit is contained in:
Martin Hořeňovský
2018-03-25 20:44:30 +02:00
parent e11508b48a
commit 3b801c4fda
8 changed files with 923 additions and 39 deletions

View File

@@ -7881,7 +7881,7 @@ Message from section two
<TestCase name="X/level/1/b" tags="[Tricky]" filename="projects/<exe-name>/UsageTests/Tricky.tests.cpp" >
<OverallResult success="true"/>
</TestCase>
<TestCase name="XmlEncode" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<TestCase name="XmlEncode" tags="[XML]" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Section name="normal string" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Expression success="true" type="REQUIRE" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
@@ -7994,6 +7994,378 @@ Message from section two
</Section>
<OverallResult success="true"/>
</TestCase>
<TestCase name="XmlEncode: UTF-8" tags="[UTF-8][XML]" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Section name="Valid utf-8 strings" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode(u8"Here be 👾") == u8"Here be 👾"
</Original>
<Expanded>
"Here be 👾" == "Here be 👾"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode(u8"šš") == u8"šš"
</Original>
<Expanded>
"šš" == "šš"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xDF\xBF") == "\xDF\xBF"
</Original>
<Expanded>
"߿" == "߿"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xE0\xA0\x80") == "\xE0\xA0\x80"
</Original>
<Expanded>
"ࠀ" == "ࠀ"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xED\x9F\xBF") == "\xED\x9F\xBF"
</Original>
<Expanded>
"퟿" == "퟿"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xEE\x80\x80") == "\xEE\x80\x80"
</Original>
<Expanded>
"" == ""
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xEF\xBF\xBF") == "\xEF\xBF\xBF"
</Original>
<Expanded>
"￿" == "￿"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF0\x90\x80\x80") == "\xF0\x90\x80\x80"
</Original>
<Expanded>
"𐀀" == "𐀀"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF4\x8F\xBF\xBF") == "\xF4\x8F\xBF\xBF"
</Original>
<Expanded>
"􏿿" == "􏿿"
</Expanded>
</Expression>
<OverallResults successes="9" failures="0" expectedFailures="0"/>
</Section>
<Section name="Invalid utf-8 strings" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Section name="Various broken strings" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("Here \xFF be 👾") == u8"Here \\xFF be 👾"
</Original>
<Expanded>
"Here \xFF be 👾" == "Here \xFF be 👾"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xFF") == "\\xFF"
</Original>
<Expanded>
"\xFF" == "\xFF"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xC5\xC5\xA0") == u8"\\xC5Š"
</Original>
<Expanded>
"\xC5Š" == "\xC5Š"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF4\x90\x80\x80") == u8"\\xF4\\x90\\x80\\x80"
</Original>
<Expanded>
"\xF4\x90\x80\x80" == "\xF4\x90\x80\x80"
</Expanded>
</Expression>
<OverallResults successes="4" failures="0" expectedFailures="0"/>
</Section>
<OverallResults successes="4" failures="0" expectedFailures="0"/>
</Section>
<Section name="Invalid utf-8 strings" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Section name="Overlong encodings" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xC0\x80") == u8"\\xC0\\x80"
</Original>
<Expanded>
"\xC0\x80" == "\xC0\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF0\x80\x80\x80") == u8"\\xF0\\x80\\x80\\x80"
</Original>
<Expanded>
"\xF0\x80\x80\x80" == "\xF0\x80\x80\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xC1\xBF") == u8"\\xC1\\xBF"
</Original>
<Expanded>
"\xC1\xBF" == "\xC1\xBF"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xE0\x9F\xBF") == u8"\\xE0\\x9F\\xBF"
</Original>
<Expanded>
"\xE0\x9F\xBF" == "\xE0\x9F\xBF"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF0\x8F\xBF\xBF") == u8"\\xF0\\x8F\\xBF\\xBF"
</Original>
<Expanded>
"\xF0\x8F\xBF\xBF" == "\xF0\x8F\xBF\xBF"
</Expanded>
</Expression>
<OverallResults successes="5" failures="0" expectedFailures="0"/>
</Section>
<OverallResults successes="5" failures="0" expectedFailures="0"/>
</Section>
<Section name="Invalid utf-8 strings" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Section name="Surrogate pairs" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xED\xA0\x80") == "\xED\xA0\x80"
</Original>
<Expanded>
"<22><><EFBFBD>" == "<22><><EFBFBD>"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xED\xAF\xBF") == "\xED\xAF\xBF"
</Original>
<Expanded>
"<22><><EFBFBD>" == "<22><><EFBFBD>"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xED\xB0\x80") == "\xED\xB0\x80"
</Original>
<Expanded>
"<22><><EFBFBD>" == "<22><><EFBFBD>"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xED\xBF\xBF") == "\xED\xBF\xBF"
</Original>
<Expanded>
"<22><><EFBFBD>" == "<22><><EFBFBD>"
</Expanded>
</Expression>
<OverallResults successes="4" failures="0" expectedFailures="0"/>
</Section>
<OverallResults successes="4" failures="0" expectedFailures="0"/>
</Section>
<Section name="Invalid utf-8 strings" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Section name="Invalid start byte" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\x80") == u8"\\x80"
</Original>
<Expanded>
"\x80" == "\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\x81") == u8"\\x81"
</Original>
<Expanded>
"\x81" == "\x81"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xBC") == u8"\\xBC"
</Original>
<Expanded>
"\xBC" == "\xBC"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xBF") == u8"\\xBF"
</Original>
<Expanded>
"\xBF" == "\xBF"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF5\x80\x80\x80") == u8"\\xF5\\x80\\x80\\x80"
</Original>
<Expanded>
"\xF5\x80\x80\x80" == "\xF5\x80\x80\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF6\x80\x80\x80") == u8"\\xF6\\x80\\x80\\x80"
</Original>
<Expanded>
"\xF6\x80\x80\x80" == "\xF6\x80\x80\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF7\x80\x80\x80") == u8"\\xF7\\x80\\x80\\x80"
</Original>
<Expanded>
"\xF7\x80\x80\x80" == "\xF7\x80\x80\x80"
</Expanded>
</Expression>
<OverallResults successes="7" failures="0" expectedFailures="0"/>
</Section>
<OverallResults successes="7" failures="0" expectedFailures="0"/>
</Section>
<Section name="Invalid utf-8 strings" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Section name="Missing continuation byte(s)" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xDE") == u8"\\xDE"
</Original>
<Expanded>
"\xDE" == "\xDE"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xDF") == u8"\\xDF"
</Original>
<Expanded>
"\xDF" == "\xDF"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xE0") == u8"\\xE0"
</Original>
<Expanded>
"\xE0" == "\xE0"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xEF") == u8"\\xEF"
</Original>
<Expanded>
"\xEF" == "\xEF"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF0") == u8"\\xF0"
</Original>
<Expanded>
"\xF0" == "\xF0"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF4") == u8"\\xF4"
</Original>
<Expanded>
"\xF4" == "\xF4"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xE0\x80") == u8"\\xE0\\x80"
</Original>
<Expanded>
"\xE0\x80" == "\xE0\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xE0\xBF") == u8"\\xE0\\xBF"
</Original>
<Expanded>
"\xE0\xBF" == "\xE0\xBF"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xE1\x80") == u8"\\xE1\\x80"
</Original>
<Expanded>
"\xE1\x80" == "\xE1\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF0\x80") == u8"\\xF0\\x80"
</Original>
<Expanded>
"\xF0\x80" == "\xF0\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF4\x80") == u8"\\xF4\\x80"
</Original>
<Expanded>
"\xF4\x80" == "\xF4\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF0\x80\x80") == u8"\\xF0\\x80\\x80"
</Original>
<Expanded>
"\xF0\x80\x80" == "\xF0\x80\x80"
</Expanded>
</Expression>
<Expression success="true" type="CHECK" filename="projects/<exe-name>/IntrospectiveTests/Xml.tests.cpp" >
<Original>
encode("\xF4\x80\x80") == u8"\\xF4\\x80\\x80"
</Original>
<Expanded>
"\xF4\x80\x80" == "\xF4\x80\x80"
</Expanded>
</Expression>
<OverallResults successes="13" failures="0" expectedFailures="0"/>
</Section>
<OverallResults successes="13" failures="0" expectedFailures="0"/>
</Section>
<OverallResult success="true"/>
</TestCase>
<TestCase name="array&lt;int, N> -> toString" tags="[array][containers][toString]" filename="projects/<exe-name>/UsageTests/ToStringVector.tests.cpp" >
<Expression success="true" type="REQUIRE" filename="projects/<exe-name>/UsageTests/ToStringVector.tests.cpp" >
<Original>
@@ -9469,7 +9841,7 @@ loose text artifact
</Section>
<OverallResult success="true"/>
</TestCase>
<OverallResults successes="887" failures="122" expectedFailures="21"/>
<OverallResults successes="929" failures="122" expectedFailures="21"/>
</Group>
<OverallResults successes="887" failures="121" expectedFailures="21"/>
<OverallResults successes="929" failures="121" expectedFailures="21"/>
</Catch>