From d944b8c36471730dd9b1a36b4a13bbcc0ca084e1 Mon Sep 17 00:00:00 2001 From: seleznevae Date: Sun, 8 Sep 2019 09:55:56 +0300 Subject: [PATCH] [F] Fix incorrect behaviour in case of wide east asian symbols --- CMakeLists.txt | 2 +- ChangeLog.md | 6 ++ lib/fort.c | 93 +++++++++++++++++++++++++++-- lib/fort.h | 4 +- src/fort.h | 4 +- src/string_buffer.c | 10 ++-- src/utf8.h | 83 +++++++++++++++++++++++++ tests/bb_tests/test_table_basic.c | 67 +++++++++++++++++++-- tests/wb_tests/test_string_buffer.c | 8 +-- 9 files changed, 252 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d5e93d0..ce5d703 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.0) -project(libfort VERSION 0.2.1) +project(libfort VERSION 0.2.2) string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\1.\\2" libfort_SOVERSION diff --git a/ChangeLog.md b/ChangeLog.md index 3e12343..d8e754e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,3 +1,9 @@ +## v0.2.2 + +### Bug fixes + +- Fix incorrect behaviour in case of wide east asian symbols. + ## v0.2.1 ### Bug fixes diff --git a/lib/fort.c b/lib/fort.c index f2b5c5f..74fe555 100644 --- a/lib/fort.c +++ b/lib/fort.c @@ -480,6 +480,12 @@ utf8_nonnull utf8_weak void *utf8dup(const void *src); // excluding the null terminating byte. utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str); +// Visible width of utf8string. +utf8_nonnull utf8_pure utf8_weak size_t utf8width(const void *str); + +// Visible width of codepoint. +utf8_nonnull utf8_pure utf8_weak int utf8cwidth(utf8_int32_t c); + // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 > // src2 respectively, case insensitive. Checking at most n bytes of each utf8 // string. @@ -823,6 +829,83 @@ size_t utf8len(const void *str) return length; } +// See +// https://unicode.org/Public/UNIDATA/EastAsianWidth.txt +// http://www.unicode.org/reports/tr11/tr11-33.html +int utf8cwidth(utf8_int32_t c) +{ + // TODO: add non printable characters check + if (c == 0) + return 0; + + if (c < 0x1100) + return 1; + + // Fullwidth + if ((0x3000 == c) || + (0xFF01 <= c && c <= 0xFF60) || + (0xFFE0 <= c && c <= 0xFFE6)) { + return 2; + } + + // Wide + if ((0x1100 <= c && c <= 0x115F) || + (0x11A3 <= c && c <= 0x11A7) || + (0x11FA <= c && c <= 0x11FF) || + (0x2329 <= c && c <= 0x232A) || + (0x2E80 <= c && c <= 0x2E99) || + (0x2E9B <= c && c <= 0x2EF3) || + (0x2F00 <= c && c <= 0x2FD5) || + (0x2FF0 <= c && c <= 0x2FFB) || + (0x3001 <= c && c <= 0x303E) || + (0x3041 <= c && c <= 0x3096) || + (0x3099 <= c && c <= 0x30FF) || + (0x3105 <= c && c <= 0x312D) || + (0x3131 <= c && c <= 0x318E) || + (0x3190 <= c && c <= 0x31BA) || + (0x31C0 <= c && c <= 0x31E3) || + (0x31F0 <= c && c <= 0x321E) || + (0x3220 <= c && c <= 0x3247) || + (0x3250 <= c && c <= 0x32FE) || + (0x3300 <= c && c <= 0x4DBF) || + (0x4E00 <= c && c <= 0xA48C) || + (0xA490 <= c && c <= 0xA4C6) || + (0xA960 <= c && c <= 0xA97C) || + (0xAC00 <= c && c <= 0xD7A3) || + (0xD7B0 <= c && c <= 0xD7C6) || + (0xD7CB <= c && c <= 0xD7FB) || + (0xF900 <= c && c <= 0xFAFF) || + (0xFE10 <= c && c <= 0xFE19) || + (0xFE30 <= c && c <= 0xFE52) || + (0xFE54 <= c && c <= 0xFE66) || + (0xFE68 <= c && c <= 0xFE6B) || + (0x1B000 <= c && c <= 0x1B001) || + (0x1F200 <= c && c <= 0x1F202) || + (0x1F210 <= c && c <= 0x1F23A) || + (0x1F240 <= c && c <= 0x1F248) || + (0x1F250 <= c && c <= 0x1F251) || + (0x20000 <= c && c <= 0x2F73F) || + (0x2B740 <= c && c <= 0x2FFFD) || + (0x30000 <= c && c <= 0x3FFFD)) { + return 2; + } + + return 1; +} + +size_t utf8width(const void *str) +{ + size_t length = 0; + utf8_int32_t c = 0; + + str = utf8codepoint(str, &c); + while (c != 0) { + length += utf8cwidth(c); + str = utf8codepoint(str, &c); + } + return length; +} + int utf8ncasecmp(const void *src1, const void *src2, size_t n) { utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp; @@ -6217,8 +6300,8 @@ size_t string_buffer_raw_capacity(const f_string_buffer_t *buffer) } #ifdef FT_HAVE_UTF8 -FT_INTERNAL -size_t ut8_width(const void *beg, const void *end) +static +size_t utf8_width(const void *beg, const void *end) { size_t sz = (size_t)((const char *)end - (const char *)beg); char *tmp = (char *)F_MALLOC(sizeof(char) * (sz + 1)); @@ -6227,7 +6310,7 @@ size_t ut8_width(const void *beg, const void *end) memcpy(tmp, beg, sz); tmp[sz] = '\0'; - size_t result = utf8len(tmp); + size_t result = utf8width(tmp); F_FREE(tmp); return result; } @@ -6277,7 +6360,7 @@ size_t buffer_text_visible_width(const f_string_buffer_t *buffer) if (beg == NULL || end == NULL) return max_length; - max_length = MAX(max_length, (size_t)ut8_width(beg, end)); + max_length = MAX(max_length, (size_t)utf8_width(beg, end)); ++n; } #endif /* FT_HAVE_WCHAR */ @@ -6307,7 +6390,7 @@ buffer_substring(const f_string_buffer_t *buffer, size_t buffer_row, const void case UTF8_BUF: utf8_n_substring(buffer->str.u8str, '\n', buffer_row, begin, end); if ((*(const char **)begin) && (*(const char **)end)) - *str_it_width = ut8_width(*begin, *end); + *str_it_width = utf8_width(*begin, *end); break; #endif /* FT_HAVE_UTF8 */ default: diff --git a/lib/fort.h b/lib/fort.h index 27a3a5f..f363479 100644 --- a/lib/fort.h +++ b/lib/fort.h @@ -46,8 +46,8 @@ SOFTWARE. #define LIBFORT_MAJOR_VERSION 0 #define LIBFORT_MINOR_VERSION 2 -#define LIBFORT_REVISION 1 -#define LIBFORT_VERSION_STR "0.2.1" +#define LIBFORT_REVISION 2 +#define LIBFORT_VERSION_STR "0.2.2" /***************************************************************************** diff --git a/src/fort.h b/src/fort.h index 27a3a5f..f363479 100644 --- a/src/fort.h +++ b/src/fort.h @@ -46,8 +46,8 @@ SOFTWARE. #define LIBFORT_MAJOR_VERSION 0 #define LIBFORT_MINOR_VERSION 2 -#define LIBFORT_REVISION 1 -#define LIBFORT_VERSION_STR "0.2.1" +#define LIBFORT_REVISION 2 +#define LIBFORT_VERSION_STR "0.2.2" /***************************************************************************** diff --git a/src/string_buffer.c b/src/string_buffer.c index af82b3c..02cf4bd 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -444,8 +444,8 @@ size_t string_buffer_raw_capacity(const f_string_buffer_t *buffer) } #ifdef FT_HAVE_UTF8 -FT_INTERNAL -size_t ut8_width(const void *beg, const void *end) +static +size_t utf8_width(const void *beg, const void *end) { size_t sz = (size_t)((const char *)end - (const char *)beg); char *tmp = (char *)F_MALLOC(sizeof(char) * (sz + 1)); @@ -454,7 +454,7 @@ size_t ut8_width(const void *beg, const void *end) memcpy(tmp, beg, sz); tmp[sz] = '\0'; - size_t result = utf8len(tmp); + size_t result = utf8width(tmp); F_FREE(tmp); return result; } @@ -504,7 +504,7 @@ size_t buffer_text_visible_width(const f_string_buffer_t *buffer) if (beg == NULL || end == NULL) return max_length; - max_length = MAX(max_length, (size_t)ut8_width(beg, end)); + max_length = MAX(max_length, (size_t)utf8_width(beg, end)); ++n; } #endif /* FT_HAVE_WCHAR */ @@ -534,7 +534,7 @@ buffer_substring(const f_string_buffer_t *buffer, size_t buffer_row, const void case UTF8_BUF: utf8_n_substring(buffer->str.u8str, '\n', buffer_row, begin, end); if ((*(const char **)begin) && (*(const char **)end)) - *str_it_width = ut8_width(*begin, *end); + *str_it_width = utf8_width(*begin, *end); break; #endif /* FT_HAVE_UTF8 */ default: diff --git a/src/utf8.h b/src/utf8.h index 4767c36..be0932e 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -115,6 +115,12 @@ utf8_nonnull utf8_weak void *utf8dup(const void *src); // excluding the null terminating byte. utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str); +// Visible width of utf8string. +utf8_nonnull utf8_pure utf8_weak size_t utf8width(const void *str); + +// Visible width of codepoint. +utf8_nonnull utf8_pure utf8_weak int utf8cwidth(utf8_int32_t c); + // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 > // src2 respectively, case insensitive. Checking at most n bytes of each utf8 // string. @@ -458,6 +464,83 @@ size_t utf8len(const void *str) return length; } +// See +// https://unicode.org/Public/UNIDATA/EastAsianWidth.txt +// http://www.unicode.org/reports/tr11/tr11-33.html +int utf8cwidth(utf8_int32_t c) +{ + // TODO: add non printable characters check + if (c == 0) + return 0; + + if (c < 0x1100) + return 1; + + // Fullwidth + if ((0x3000 == c) || + (0xFF01 <= c && c <= 0xFF60) || + (0xFFE0 <= c && c <= 0xFFE6)) { + return 2; + } + + // Wide + if ((0x1100 <= c && c <= 0x115F) || + (0x11A3 <= c && c <= 0x11A7) || + (0x11FA <= c && c <= 0x11FF) || + (0x2329 <= c && c <= 0x232A) || + (0x2E80 <= c && c <= 0x2E99) || + (0x2E9B <= c && c <= 0x2EF3) || + (0x2F00 <= c && c <= 0x2FD5) || + (0x2FF0 <= c && c <= 0x2FFB) || + (0x3001 <= c && c <= 0x303E) || + (0x3041 <= c && c <= 0x3096) || + (0x3099 <= c && c <= 0x30FF) || + (0x3105 <= c && c <= 0x312D) || + (0x3131 <= c && c <= 0x318E) || + (0x3190 <= c && c <= 0x31BA) || + (0x31C0 <= c && c <= 0x31E3) || + (0x31F0 <= c && c <= 0x321E) || + (0x3220 <= c && c <= 0x3247) || + (0x3250 <= c && c <= 0x32FE) || + (0x3300 <= c && c <= 0x4DBF) || + (0x4E00 <= c && c <= 0xA48C) || + (0xA490 <= c && c <= 0xA4C6) || + (0xA960 <= c && c <= 0xA97C) || + (0xAC00 <= c && c <= 0xD7A3) || + (0xD7B0 <= c && c <= 0xD7C6) || + (0xD7CB <= c && c <= 0xD7FB) || + (0xF900 <= c && c <= 0xFAFF) || + (0xFE10 <= c && c <= 0xFE19) || + (0xFE30 <= c && c <= 0xFE52) || + (0xFE54 <= c && c <= 0xFE66) || + (0xFE68 <= c && c <= 0xFE6B) || + (0x1B000 <= c && c <= 0x1B001) || + (0x1F200 <= c && c <= 0x1F202) || + (0x1F210 <= c && c <= 0x1F23A) || + (0x1F240 <= c && c <= 0x1F248) || + (0x1F250 <= c && c <= 0x1F251) || + (0x20000 <= c && c <= 0x2F73F) || + (0x2B740 <= c && c <= 0x2FFFD) || + (0x30000 <= c && c <= 0x3FFFD)) { + return 2; + } + + return 1; +} + +size_t utf8width(const void *str) +{ + size_t length = 0; + utf8_int32_t c = 0; + + str = utf8codepoint(str, &c); + while (c != 0) { + length += utf8cwidth(c); + str = utf8codepoint(str, &c); + } + return length; +} + int utf8ncasecmp(const void *src1, const void *src2, size_t n) { utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp; diff --git a/tests/bb_tests/test_table_basic.c b/tests/bb_tests/test_table_basic.c index 11de00d..d3f561f 100644 --- a/tests/bb_tests/test_table_basic.c +++ b/tests/bb_tests/test_table_basic.c @@ -146,6 +146,23 @@ void test_bug_fixes(void) ft_destroy_table(table); } #endif /* FT_HAVE_UTF8 */ + +#ifdef FT_HAVE_UTF8 + SCENARIO("Issue 15 - https://github.com/seleznevae/libfort/issues/15") { + ft_table_t *table = ft_create_table(); + ft_set_border_style(table, FT_NICE_STYLE); + + ft_u8write_ln(table, "視野無限廣"); + const char *table_str = ft_to_u8string(table); + assert_true(table_str != NULL); + const char *table_str_etalon = + "╔════════════╗\n" + "║ 視野無限廣 ║\n" + "╚════════════╝\n"; + assert_str_equal(table_str, table_str_etalon); + ft_destroy_table(table); + } +#endif /* FT_HAVE_UTF8 */ } void test_table_basic(void) @@ -676,6 +693,23 @@ void test_utf8_table(void) ft_destroy_table(table); \ } +#define TEST_UTF8_BORDER_WIDE(content) \ + { \ + table = ft_create_table(); \ + assert_true(table != NULL); \ + assert(ft_set_border_style(table, FT_BASIC_STYLE) == 0); \ + assert_true(ft_u8write_ln(table, content) == FT_SUCCESS); \ + const char *table_str = ft_to_u8string(table); \ + assert_true(table_str != NULL); \ + char table_str_etalon[1024] = {'\0'}; \ + snprintf(table_str_etalon, 1024, \ + "+----------------------+\n" \ + "| %s |\n" \ + "+----------------------+\n", content); \ + assert_str_equal(table_str, table_str_etalon); \ + ft_destroy_table(table); \ + } + TEST_UTF8_BORDER("1234567890"); TEST_UTF8_BORDER("Xylophmsik"); TEST_UTF8_BORDER("ψημένηζειθ"); @@ -685,10 +719,11 @@ void test_utf8_table(void) TEST_UTF8_BORDER("французких"); TEST_UTF8_BORDER("Benjamínúñ"); TEST_UTF8_BORDER("görmüştüçğ"); - TEST_UTF8_BORDER("視野無限廣窗外有藍天"); - TEST_UTF8_BORDER("いろはにほへとちりぬ"); - TEST_UTF8_BORDER("𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼"); + TEST_UTF8_BORDER_WIDE("視野無限廣窗外有藍天"); + TEST_UTF8_BORDER_WIDE("いろはにほへとちりぬ"); + TEST_UTF8_BORDER_WIDE("𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼"); #undef TEST_UTF8_BORDER +#undef TEST_UTF8_BORDER_WIDE #define TEST_UTF8_STYLE(content) \ { \ @@ -709,6 +744,25 @@ void test_utf8_table(void) assert_str_equal(table_str, table_str_etalon); \ ft_destroy_table(table); \ } +#define TEST_UTF8_STYLE_WIDE(content) \ + { \ + table = ft_create_table(); \ + assert_true(table != NULL); \ + assert(ft_set_border_style(table, FT_BASIC_STYLE) == 0); \ + assert_true(ft_u8write_ln(table, content) == FT_SUCCESS); \ + assert(ft_set_cell_prop(table, 0, 0, FT_CPROP_CONT_FG_COLOR, FT_COLOR_YELLOW) == FT_SUCCESS); \ + assert(ft_set_cell_prop(table, 0, 0, FT_CPROP_CELL_BG_COLOR, FT_COLOR_RED) == FT_SUCCESS); \ + assert(ft_set_cell_prop(table, 0, 0, FT_CPROP_CONT_TEXT_STYLE, FT_TSTYLE_UNDERLINED) == FT_SUCCESS); \ + const char *table_str = ft_to_u8string(table); \ + assert_true(table_str != NULL); \ + char table_str_etalon[1024] = {'\0'}; \ + snprintf(table_str_etalon, 1024, \ + "+----------------------+\n" \ + "|\033[41m \033[4m\033[33m%s\033[0m\033[41m \033[0m|\n" \ + "+----------------------+\n", content); \ + assert_str_equal(table_str, table_str_etalon); \ + ft_destroy_table(table); \ + } TEST_UTF8_STYLE("1234567890"); TEST_UTF8_STYLE("Xylophmsik"); TEST_UTF8_STYLE("ψημένηζειθ"); @@ -718,10 +772,11 @@ void test_utf8_table(void) TEST_UTF8_STYLE("французких"); TEST_UTF8_STYLE("Benjamínúñ"); TEST_UTF8_STYLE("görmüştüçğ"); - TEST_UTF8_STYLE("視野無限廣窗外有藍天"); - TEST_UTF8_STYLE("いろはにほへとちりぬ"); - TEST_UTF8_STYLE("𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼"); + TEST_UTF8_STYLE_WIDE("視野無限廣窗外有藍天"); + TEST_UTF8_STYLE_WIDE("いろはにほへとちりぬ"); + TEST_UTF8_STYLE_WIDE("𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼"); #undef TEST_UTF8_STYLE +#undef TEST_UTF8_STYLE_WIDE } #endif /* FT_HAVE_UTF8 */ diff --git a/tests/wb_tests/test_string_buffer.c b/tests/wb_tests/test_string_buffer.c index db6bc62..dd1f933 100644 --- a/tests/wb_tests/test_string_buffer.c +++ b/tests/wb_tests/test_string_buffer.c @@ -422,7 +422,7 @@ void test_buffer_text_visible_width(void) buffer->str.u8str = (void *)"Numbers 01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"; assert_true(buffer_text_visible_width(buffer) == 110); buffer->str.u8str = (void *)"Chinese 視野無限廣, 窗外有藍天"; - assert_true(buffer_text_visible_width(buffer) == 22); + assert_true(buffer_text_visible_width(buffer) == 32); buffer->str.u8str = (void *)"German Falsches Üben von Xylophonmusik quält jeden größeren Zwerg"; assert_true(buffer_text_visible_width(buffer) == 68); buffer->str.u8str = (void *)"Greek Ταχίστη αλώπηξ βαφής ψημένη γη, δρασκελίζει υπέρ νωθρού κυνός Takhístè"; @@ -430,7 +430,7 @@ void test_buffer_text_visible_width(void) buffer->str.u8str = (void *)"Irish D’ḟuascail Íosa Úrṁac na hÓiġe Beannaiṫe pór Éaḃa agus Áḋaiṁ"; assert_true(buffer_text_visible_width(buffer) == 70); buffer->str.u8str = (void *)"Japanese いろはにほへと ちりぬるを わかよたれそ つねならむ うゐ"; - assert_true(buffer_text_visible_width(buffer) == 39); + assert_true(buffer_text_visible_width(buffer) == 64); buffer->str.u8str = (void *)"Polish Pójdźże, kiń tę chmurność w głąb flaszy"; assert_true(buffer_text_visible_width(buffer) == 49); buffer->str.u8str = (void *)"Portuguese Luís argüia à Júlia que «brações, fé, chá, óxido, pôr, zângão» eram palavras do português"; @@ -446,7 +446,7 @@ void test_buffer_text_visible_width(void) buffer->str.u8str = (void *)"Numbers 01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"; assert_true(buffer_text_visible_width(buffer) == 110); buffer->str.u8str = (void *)"Chinese 視野無限廣,\n 窗外有藍天"; - assert_true(buffer_text_visible_width(buffer) == 16); + assert_true(buffer_text_visible_width(buffer) == 21); buffer->str.u8str = (void *)"German Falsches Üben von Xy\nlophonmusik quält \njeden größeren Zwerg"; assert_true(buffer_text_visible_width(buffer) == 30); buffer->str.u8str = (void *)"Greek Ταχίστη αλώπηξ βαφής\n ψημένη γη, δρασκελίζει\n υπέρ νωθρού \nκυνός Takhístè"; @@ -454,7 +454,7 @@ void test_buffer_text_visible_width(void) buffer->str.u8str = (void *)"Irish D’ḟuascail Íosa Úrṁa\nc na hÓiġe Beannaiṫe\n pór Éaḃa agus Áḋaiṁ"; assert_true(buffer_text_visible_width(buffer) == 30); buffer->str.u8str = (void *)"Japanese いろはにほへと ちり\nぬるを わかよたれそ つねならむ うゐ"; - assert_true(buffer_text_visible_width(buffer) == 20); + assert_true(buffer_text_visible_width(buffer) == 35); buffer->str.u8str = (void *)"Polish Pójdźże, kiń tę chmu\nrność w głąb flaszy"; assert_true(buffer_text_visible_width(buffer) == 30); buffer->str.u8str = (void *)"Portuguese Luís argüia à Júlia\n que «brações, fé, chá,\n óxido, pôr, \nzângão» eram palavras\n do português";