[F] Fix incorrect behaviour in case of wide east asian symbols

This commit is contained in:
seleznevae 2019-09-08 09:55:56 +03:00
parent 6082281d0e
commit d944b8c364
9 changed files with 252 additions and 25 deletions

View File

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.0)
project(libfort VERSION 0.2.1)
project(libfort VERSION 0.2.2)
string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)"
"\\1.\\2" libfort_SOVERSION

View File

@ -1,3 +1,9 @@
## v0.2.2
### Bug fixes
- Fix incorrect behaviour in case of wide East Asian symbols.
## v0.2.1
### Bug fixes

View File

@ -480,6 +480,12 @@ utf8_nonnull utf8_weak void *utf8dup(const void *src);
// excluding the null terminating byte.
utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str);
// Visible width of the utf8 string str: the sum of utf8cwidth() over every
// codepoint that precedes the null terminator.
utf8_nonnull utf8_pure utf8_weak size_t utf8width(const void *str);
// Visible width of the codepoint c: 0 for the null codepoint, 2 for codepoints
// in the East Asian Fullwidth/Wide ranges listed in the implementation, and 1
// for every other codepoint.
utf8_nonnull utf8_pure utf8_weak int utf8cwidth(utf8_int32_t c);
// Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
// src2 respectively, case insensitive. Checking at most n bytes of each utf8
// string.
@ -823,6 +829,83 @@ size_t utf8len(const void *str)
return length;
}
// Visible width of a single codepoint.
// Returns 0 for the null codepoint, 2 for codepoints inside one of the
// Fullwidth/Wide ranges tabulated below, and 1 for everything else.
// Range data is taken from:
// https://unicode.org/Public/UNIDATA/EastAsianWidth.txt
// http://www.unicode.org/reports/tr11/tr11-33.html
int utf8cwidth(utf8_int32_t c)
{
    // Inclusive [first, last] codepoint ranges that take two columns.
    static const struct {
        utf8_int32_t first;
        utf8_int32_t last;
    } double_width[] = {
        // Fullwidth
        {0x3000, 0x3000},
        {0xFF01, 0xFF60},
        {0xFFE0, 0xFFE6},
        // Wide
        {0x1100, 0x115F},
        {0x11A3, 0x11A7},
        {0x11FA, 0x11FF},
        {0x2329, 0x232A},
        {0x2E80, 0x2E99},
        {0x2E9B, 0x2EF3},
        {0x2F00, 0x2FD5},
        {0x2FF0, 0x2FFB},
        {0x3001, 0x303E},
        {0x3041, 0x3096},
        {0x3099, 0x30FF},
        {0x3105, 0x312D},
        {0x3131, 0x318E},
        {0x3190, 0x31BA},
        {0x31C0, 0x31E3},
        {0x31F0, 0x321E},
        {0x3220, 0x3247},
        {0x3250, 0x32FE},
        {0x3300, 0x4DBF},
        {0x4E00, 0xA48C},
        {0xA490, 0xA4C6},
        {0xA960, 0xA97C},
        {0xAC00, 0xD7A3},
        {0xD7B0, 0xD7C6},
        {0xD7CB, 0xD7FB},
        {0xF900, 0xFAFF},
        {0xFE10, 0xFE19},
        {0xFE30, 0xFE52},
        {0xFE54, 0xFE66},
        {0xFE68, 0xFE6B},
        {0x1B000, 0x1B001},
        {0x1F200, 0x1F202},
        {0x1F210, 0x1F23A},
        {0x1F240, 0x1F248},
        {0x1F250, 0x1F251},
        // NOTE: the next two ranges overlap; together they cover
        // 0x20000..0x2FFFD.
        {0x20000, 0x2F73F},
        {0x2B740, 0x2FFFD},
        {0x30000, 0x3FFFD}
    };
    const size_t range_count = sizeof(double_width) / sizeof(double_width[0]);
    size_t i;

    // TODO: add non printable characters check
    if (c == 0)
        return 0;

    // Nothing below the first tabulated range is treated as wide.
    if (c < 0x1100)
        return 1;

    for (i = 0; i < range_count; ++i) {
        if (double_width[i].first <= c && c <= double_width[i].last)
            return 2;
    }

    return 1;
}
// Visible width of a null-terminated utf8 string: decodes the string one
// codepoint at a time with utf8codepoint() and adds up utf8cwidth() of each
// codepoint until the null codepoint is reached.
size_t utf8width(const void *str)
{
    size_t total = 0;
    utf8_int32_t cp = 0;

    for (str = utf8codepoint(str, &cp); cp != 0; str = utf8codepoint(str, &cp)) {
        total += utf8cwidth(cp);
    }

    return total;
}
int utf8ncasecmp(const void *src1, const void *src2, size_t n)
{
utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp;
@ -6217,8 +6300,8 @@ size_t string_buffer_raw_capacity(const f_string_buffer_t *buffer)
}
#ifdef FT_HAVE_UTF8
FT_INTERNAL
size_t ut8_width(const void *beg, const void *end)
static
size_t utf8_width(const void *beg, const void *end)
{
size_t sz = (size_t)((const char *)end - (const char *)beg);
char *tmp = (char *)F_MALLOC(sizeof(char) * (sz + 1));
@ -6227,7 +6310,7 @@ size_t ut8_width(const void *beg, const void *end)
memcpy(tmp, beg, sz);
tmp[sz] = '\0';
size_t result = utf8len(tmp);
size_t result = utf8width(tmp);
F_FREE(tmp);
return result;
}
@ -6277,7 +6360,7 @@ size_t buffer_text_visible_width(const f_string_buffer_t *buffer)
if (beg == NULL || end == NULL)
return max_length;
max_length = MAX(max_length, (size_t)ut8_width(beg, end));
max_length = MAX(max_length, (size_t)utf8_width(beg, end));
++n;
}
#endif /* FT_HAVE_WCHAR */
@ -6307,7 +6390,7 @@ buffer_substring(const f_string_buffer_t *buffer, size_t buffer_row, const void
case UTF8_BUF:
utf8_n_substring(buffer->str.u8str, '\n', buffer_row, begin, end);
if ((*(const char **)begin) && (*(const char **)end))
*str_it_width = ut8_width(*begin, *end);
*str_it_width = utf8_width(*begin, *end);
break;
#endif /* FT_HAVE_UTF8 */
default:

View File

@ -46,8 +46,8 @@ SOFTWARE.
#define LIBFORT_MAJOR_VERSION 0
#define LIBFORT_MINOR_VERSION 2
#define LIBFORT_REVISION 1
#define LIBFORT_VERSION_STR "0.2.1"
#define LIBFORT_REVISION 2
#define LIBFORT_VERSION_STR "0.2.2"
/*****************************************************************************

View File

@ -46,8 +46,8 @@ SOFTWARE.
#define LIBFORT_MAJOR_VERSION 0
#define LIBFORT_MINOR_VERSION 2
#define LIBFORT_REVISION 1
#define LIBFORT_VERSION_STR "0.2.1"
#define LIBFORT_REVISION 2
#define LIBFORT_VERSION_STR "0.2.2"
/*****************************************************************************

View File

@ -444,8 +444,8 @@ size_t string_buffer_raw_capacity(const f_string_buffer_t *buffer)
}
#ifdef FT_HAVE_UTF8
FT_INTERNAL
size_t ut8_width(const void *beg, const void *end)
static
size_t utf8_width(const void *beg, const void *end)
{
size_t sz = (size_t)((const char *)end - (const char *)beg);
char *tmp = (char *)F_MALLOC(sizeof(char) * (sz + 1));
@ -454,7 +454,7 @@ size_t ut8_width(const void *beg, const void *end)
memcpy(tmp, beg, sz);
tmp[sz] = '\0';
size_t result = utf8len(tmp);
size_t result = utf8width(tmp);
F_FREE(tmp);
return result;
}
@ -504,7 +504,7 @@ size_t buffer_text_visible_width(const f_string_buffer_t *buffer)
if (beg == NULL || end == NULL)
return max_length;
max_length = MAX(max_length, (size_t)ut8_width(beg, end));
max_length = MAX(max_length, (size_t)utf8_width(beg, end));
++n;
}
#endif /* FT_HAVE_WCHAR */
@ -534,7 +534,7 @@ buffer_substring(const f_string_buffer_t *buffer, size_t buffer_row, const void
case UTF8_BUF:
utf8_n_substring(buffer->str.u8str, '\n', buffer_row, begin, end);
if ((*(const char **)begin) && (*(const char **)end))
*str_it_width = ut8_width(*begin, *end);
*str_it_width = utf8_width(*begin, *end);
break;
#endif /* FT_HAVE_UTF8 */
default:

View File

@ -115,6 +115,12 @@ utf8_nonnull utf8_weak void *utf8dup(const void *src);
// excluding the null terminating byte.
utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str);
// Visible width of the utf8 string str: the sum of utf8cwidth() over every
// codepoint that precedes the null terminator.
utf8_nonnull utf8_pure utf8_weak size_t utf8width(const void *str);
// Visible width of the codepoint c: 0 for the null codepoint, 2 for codepoints
// in the East Asian Fullwidth/Wide ranges listed in the implementation, and 1
// for every other codepoint.
utf8_nonnull utf8_pure utf8_weak int utf8cwidth(utf8_int32_t c);
// Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
// src2 respectively, case insensitive. Checking at most n bytes of each utf8
// string.
@ -458,6 +464,83 @@ size_t utf8len(const void *str)
return length;
}
// Visible width of a single codepoint.
// Returns 0 for the null codepoint, 2 for codepoints inside one of the
// Fullwidth/Wide ranges tabulated below, and 1 for everything else.
// Range data is taken from:
// https://unicode.org/Public/UNIDATA/EastAsianWidth.txt
// http://www.unicode.org/reports/tr11/tr11-33.html
int utf8cwidth(utf8_int32_t c)
{
    // Inclusive [first, last] codepoint ranges that take two columns.
    static const struct {
        utf8_int32_t first;
        utf8_int32_t last;
    } double_width[] = {
        // Fullwidth
        {0x3000, 0x3000},
        {0xFF01, 0xFF60},
        {0xFFE0, 0xFFE6},
        // Wide
        {0x1100, 0x115F},
        {0x11A3, 0x11A7},
        {0x11FA, 0x11FF},
        {0x2329, 0x232A},
        {0x2E80, 0x2E99},
        {0x2E9B, 0x2EF3},
        {0x2F00, 0x2FD5},
        {0x2FF0, 0x2FFB},
        {0x3001, 0x303E},
        {0x3041, 0x3096},
        {0x3099, 0x30FF},
        {0x3105, 0x312D},
        {0x3131, 0x318E},
        {0x3190, 0x31BA},
        {0x31C0, 0x31E3},
        {0x31F0, 0x321E},
        {0x3220, 0x3247},
        {0x3250, 0x32FE},
        {0x3300, 0x4DBF},
        {0x4E00, 0xA48C},
        {0xA490, 0xA4C6},
        {0xA960, 0xA97C},
        {0xAC00, 0xD7A3},
        {0xD7B0, 0xD7C6},
        {0xD7CB, 0xD7FB},
        {0xF900, 0xFAFF},
        {0xFE10, 0xFE19},
        {0xFE30, 0xFE52},
        {0xFE54, 0xFE66},
        {0xFE68, 0xFE6B},
        {0x1B000, 0x1B001},
        {0x1F200, 0x1F202},
        {0x1F210, 0x1F23A},
        {0x1F240, 0x1F248},
        {0x1F250, 0x1F251},
        // NOTE: the next two ranges overlap; together they cover
        // 0x20000..0x2FFFD.
        {0x20000, 0x2F73F},
        {0x2B740, 0x2FFFD},
        {0x30000, 0x3FFFD}
    };
    const size_t range_count = sizeof(double_width) / sizeof(double_width[0]);
    size_t i;

    // TODO: add non printable characters check
    if (c == 0)
        return 0;

    // Nothing below the first tabulated range is treated as wide.
    if (c < 0x1100)
        return 1;

    for (i = 0; i < range_count; ++i) {
        if (double_width[i].first <= c && c <= double_width[i].last)
            return 2;
    }

    return 1;
}
// Visible width of a null-terminated utf8 string: decodes the string one
// codepoint at a time with utf8codepoint() and adds up utf8cwidth() of each
// codepoint until the null codepoint is reached.
size_t utf8width(const void *str)
{
    size_t total = 0;
    utf8_int32_t cp = 0;

    for (str = utf8codepoint(str, &cp); cp != 0; str = utf8codepoint(str, &cp)) {
        total += utf8cwidth(cp);
    }

    return total;
}
int utf8ncasecmp(const void *src1, const void *src2, size_t n)
{
utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp;

View File

@ -146,6 +146,23 @@ void test_bug_fixes(void)
ft_destroy_table(table);
}
#endif /* FT_HAVE_UTF8 */
#ifdef FT_HAVE_UTF8
    /* Issue 15: a one-cell table holding wide East Asian characters, rendered
     * with FT_NICE_STYLE, must match the framed text below exactly. */
    SCENARIO("Issue 15 - https://github.com/seleznevae/libfort/issues/15") {
        const char *expected =
            "╔════════════╗\n"
            "║ 視野無限廣 ║\n"
            "╚════════════╝\n";
        const char *rendered;
        ft_table_t *tbl = ft_create_table();

        ft_set_border_style(tbl, FT_NICE_STYLE);
        ft_u8write_ln(tbl, "視野無限廣");

        rendered = ft_to_u8string(tbl);
        assert_true(rendered != NULL);
        assert_str_equal(rendered, expected);

        ft_destroy_table(tbl);
    }
#endif /* FT_HAVE_UTF8 */
}
void test_table_basic(void)
@ -676,6 +693,23 @@ void test_utf8_table(void)
ft_destroy_table(table); \
}
/* Creates a one-cell FT_BASIC_STYLE table from `content`, renders it as a
 * UTF-8 string and compares it with the expected frame built around the same
 * content. Assigns to a `table` variable that must exist at the expansion
 * site. */
#define TEST_UTF8_BORDER_WIDE(content) \
    { \
        char expected[1024] = {'\0'}; \
        const char *rendered = NULL; \
        table = ft_create_table(); \
        assert_true(table != NULL); \
        assert(ft_set_border_style(table, FT_BASIC_STYLE) == 0); \
        assert_true(ft_u8write_ln(table, content) == FT_SUCCESS); \
        rendered = ft_to_u8string(table); \
        assert_true(rendered != NULL); \
        snprintf(expected, sizeof(expected), \
                 "+----------------------+\n" \
                 "| %s |\n" \
                 "+----------------------+\n", content); \
        assert_str_equal(rendered, expected); \
        ft_destroy_table(table); \
    }
TEST_UTF8_BORDER("1234567890");
TEST_UTF8_BORDER("Xylophmsik");
TEST_UTF8_BORDER("ψημένηζειθ");
@ -685,10 +719,11 @@ void test_utf8_table(void)
TEST_UTF8_BORDER("французких");
TEST_UTF8_BORDER("Benjamínúñ");
TEST_UTF8_BORDER("görmüştüçğ");
TEST_UTF8_BORDER("視野無限廣窗外有藍天");
TEST_UTF8_BORDER("いろはにほへとちりぬ");
TEST_UTF8_BORDER("𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼");
TEST_UTF8_BORDER_WIDE("視野無限廣窗外有藍天");
TEST_UTF8_BORDER_WIDE("いろはにほへとちりぬ");
TEST_UTF8_BORDER_WIDE("𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼");
#undef TEST_UTF8_BORDER
#undef TEST_UTF8_BORDER_WIDE
#define TEST_UTF8_STYLE(content) \
{ \
@ -709,6 +744,25 @@ void test_utf8_table(void)
assert_str_equal(table_str, table_str_etalon); \
ft_destroy_table(table); \
}
/* Creates a one-cell FT_BASIC_STYLE table from `content`, applies a yellow
 * foreground, red cell background and underlined text style to cell (0, 0),
 * renders it as a UTF-8 string and compares it with the expected frame that
 * carries the matching escape sequences. Assigns to a `table` variable that
 * must exist at the expansion site. */
#define TEST_UTF8_STYLE_WIDE(content) \
    { \
        char expected[1024] = {'\0'}; \
        const char *rendered = NULL; \
        table = ft_create_table(); \
        assert_true(table != NULL); \
        assert(ft_set_border_style(table, FT_BASIC_STYLE) == 0); \
        assert_true(ft_u8write_ln(table, content) == FT_SUCCESS); \
        assert(ft_set_cell_prop(table, 0, 0, FT_CPROP_CONT_FG_COLOR, FT_COLOR_YELLOW) == FT_SUCCESS); \
        assert(ft_set_cell_prop(table, 0, 0, FT_CPROP_CELL_BG_COLOR, FT_COLOR_RED) == FT_SUCCESS); \
        assert(ft_set_cell_prop(table, 0, 0, FT_CPROP_CONT_TEXT_STYLE, FT_TSTYLE_UNDERLINED) == FT_SUCCESS); \
        rendered = ft_to_u8string(table); \
        assert_true(rendered != NULL); \
        snprintf(expected, sizeof(expected), \
                 "+----------------------+\n" \
                 "|\033[41m \033[4m\033[33m%s\033[0m\033[41m \033[0m|\n" \
                 "+----------------------+\n", content); \
        assert_str_equal(rendered, expected); \
        ft_destroy_table(table); \
    }
TEST_UTF8_STYLE("1234567890");
TEST_UTF8_STYLE("Xylophmsik");
TEST_UTF8_STYLE("ψημένηζειθ");
@ -718,10 +772,11 @@ void test_utf8_table(void)
TEST_UTF8_STYLE("французких");
TEST_UTF8_STYLE("Benjamínúñ");
TEST_UTF8_STYLE("görmüştüçğ");
TEST_UTF8_STYLE("視野無限廣窗外有藍天");
TEST_UTF8_STYLE("いろはにほへとちりぬ");
TEST_UTF8_STYLE("𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼");
TEST_UTF8_STYLE_WIDE("視野無限廣窗外有藍天");
TEST_UTF8_STYLE_WIDE("いろはにほへとちりぬ");
TEST_UTF8_STYLE_WIDE("𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼");
#undef TEST_UTF8_STYLE
#undef TEST_UTF8_STYLE_WIDE
}
#endif /* FT_HAVE_UTF8 */

View File

@ -422,7 +422,7 @@ void test_buffer_text_visible_width(void)
buffer->str.u8str = (void *)"Numbers 01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890";
assert_true(buffer_text_visible_width(buffer) == 110);
buffer->str.u8str = (void *)"Chinese 視野無限廣, 窗外有藍天";
assert_true(buffer_text_visible_width(buffer) == 22);
assert_true(buffer_text_visible_width(buffer) == 32);
buffer->str.u8str = (void *)"German Falsches Üben von Xylophonmusik quält jeden größeren Zwerg";
assert_true(buffer_text_visible_width(buffer) == 68);
buffer->str.u8str = (void *)"Greek Ταχίστη αλώπηξ βαφής ψημένη γη, δρασκελίζει υπέρ νωθρού κυνός Takhístè";
@ -430,7 +430,7 @@ void test_buffer_text_visible_width(void)
buffer->str.u8str = (void *)"Irish Dḟuascail Íosa Úrṁac na hÓiġe Beannaiṫe pór Éaḃa agus Áḋaiṁ";
assert_true(buffer_text_visible_width(buffer) == 70);
buffer->str.u8str = (void *)"Japanese いろはにほへと ちりぬるを わかよたれそ つねならむ うゐ";
assert_true(buffer_text_visible_width(buffer) == 39);
assert_true(buffer_text_visible_width(buffer) == 64);
buffer->str.u8str = (void *)"Polish Pójdźże, kiń tę chmurność w głąb flaszy";
assert_true(buffer_text_visible_width(buffer) == 49);
buffer->str.u8str = (void *)"Portuguese Luís argüia à Júlia que «brações, fé, chá, óxido, pôr, zângão» eram palavras do português";
@ -446,7 +446,7 @@ void test_buffer_text_visible_width(void)
buffer->str.u8str = (void *)"Numbers 01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890";
assert_true(buffer_text_visible_width(buffer) == 110);
buffer->str.u8str = (void *)"Chinese 視野無限廣,\n 窗外有藍天";
assert_true(buffer_text_visible_width(buffer) == 16);
assert_true(buffer_text_visible_width(buffer) == 21);
buffer->str.u8str = (void *)"German Falsches Üben von Xy\nlophonmusik quält \njeden größeren Zwerg";
assert_true(buffer_text_visible_width(buffer) == 30);
buffer->str.u8str = (void *)"Greek Ταχίστη αλώπηξ βαφής\n ψημένη γη, δρασκελίζει\n υπέρ νωθρού \nκυνός Takhístè";
@ -454,7 +454,7 @@ void test_buffer_text_visible_width(void)
buffer->str.u8str = (void *)"Irish Dḟuascail Íosa Úrṁa\nc na hÓiġe Beannaiṫe\n pór Éaḃa agus Áḋaiṁ";
assert_true(buffer_text_visible_width(buffer) == 30);
buffer->str.u8str = (void *)"Japanese いろはにほへと ちり\nぬるを わかよたれそ つねならむ うゐ";
assert_true(buffer_text_visible_width(buffer) == 20);
assert_true(buffer_text_visible_width(buffer) == 35);
buffer->str.u8str = (void *)"Polish Pójdźże, kiń tę chmu\nrność w głąb flaszy";
assert_true(buffer_text_visible_width(buffer) == 30);
buffer->str.u8str = (void *)"Portuguese Luís argüia à Júlia\n que «brações, fé, chá,\n óxido, pôr, \nzângão» eram palavras\n do português";