[F] Fix incorrect behaviour in case of wide east asian symbols

2019-09-08 09:55:56 +03:00
parent 6082281d0e
commit d944b8c364
9 changed files with 252 additions and 25 deletions
--- a/lib/fort.c
+++ b/lib/fort.c
@@ -480,6 +480,12 @@ utf8_nonnull utf8_weak void *utf8dup(const void *src);
 // excluding the null terminating byte.
 utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str);

+// Visible width of utf8string.
+utf8_nonnull utf8_pure utf8_weak size_t utf8width(const void *str);
+
+// Visible width of codepoint.
+utf8_nonnull utf8_pure utf8_weak int utf8cwidth(utf8_int32_t c);
+
 // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
 // src2 respectively, case insensitive. Checking at most n bytes of each utf8
 // string.
@@ -823,6 +829,83 @@ size_t utf8len(const void *str)
    return length;
 }

+// See
+// https://unicode.org/Public/UNIDATA/EastAsianWidth.txt
+// http://www.unicode.org/reports/tr11/tr11-33.html
+int utf8cwidth(utf8_int32_t c)
+{
+    // TODO: add non printable characters check
+    if (c == 0)
+        return 0;
+
+    if (c < 0x1100)
+        return 1;
+
+    // Fullwidth
+    if ((0x3000 == c) ||
+        (0xFF01 <= c && c <= 0xFF60) ||
+        (0xFFE0 <= c && c <= 0xFFE6)) {
+        return 2;
+    }
+
+    // Wide
+    if ((0x1100 <= c && c <= 0x115F) ||
+        (0x11A3 <= c && c <= 0x11A7) ||
+        (0x11FA <= c && c <= 0x11FF) ||
+        (0x2329 <= c && c <= 0x232A) ||
+        (0x2E80 <= c && c <= 0x2E99) ||
+        (0x2E9B <= c && c <= 0x2EF3) ||
+        (0x2F00 <= c && c <= 0x2FD5) ||
+        (0x2FF0 <= c && c <= 0x2FFB) ||
+        (0x3001 <= c && c <= 0x303E) ||
+        (0x3041 <= c && c <= 0x3096) ||
+        (0x3099 <= c && c <= 0x30FF) ||
+        (0x3105 <= c && c <= 0x312D) ||
+        (0x3131 <= c && c <= 0x318E) ||
+        (0x3190 <= c && c <= 0x31BA) ||
+        (0x31C0 <= c && c <= 0x31E3) ||
+        (0x31F0 <= c && c <= 0x321E) ||
+        (0x3220 <= c && c <= 0x3247) ||
+        (0x3250 <= c && c <= 0x32FE) ||
+        (0x3300 <= c && c <= 0x4DBF) ||
+        (0x4E00 <= c && c <= 0xA48C) ||
+        (0xA490 <= c && c <= 0xA4C6) ||
+        (0xA960 <= c && c <= 0xA97C) ||
+        (0xAC00 <= c && c <= 0xD7A3) ||
+        (0xD7B0 <= c && c <= 0xD7C6) ||
+        (0xD7CB <= c && c <= 0xD7FB) ||
+        (0xF900 <= c && c <= 0xFAFF) ||
+        (0xFE10 <= c && c <= 0xFE19) ||
+        (0xFE30 <= c && c <= 0xFE52) ||
+        (0xFE54 <= c && c <= 0xFE66) ||
+        (0xFE68 <= c && c <= 0xFE6B) ||
+        (0x1B000 <= c && c <= 0x1B001) ||
+        (0x1F200 <= c && c <= 0x1F202) ||
+        (0x1F210 <= c && c <= 0x1F23A) ||
+        (0x1F240 <= c && c <= 0x1F248) ||
+        (0x1F250 <= c && c <= 0x1F251) ||
+        (0x20000 <= c && c <= 0x2F73F) ||
+        (0x2B740 <= c && c <= 0x2FFFD) ||
+        (0x30000 <= c && c <= 0x3FFFD)) {
+        return 2;
+    }
+
+    return 1;
+}
+
+size_t utf8width(const void *str)
+{
+    size_t length = 0;
+    utf8_int32_t c = 0;
+
+    str = utf8codepoint(str, &c);
+    while (c != 0) {
+        length += utf8cwidth(c);
+        str = utf8codepoint(str, &c);
+    }
+    return length;
+}
+
 int utf8ncasecmp(const void *src1, const void *src2, size_t n)
 {
    utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp;
@@ -6217,8 +6300,8 @@ size_t string_buffer_raw_capacity(const f_string_buffer_t *buffer)
 }

 #ifdef FT_HAVE_UTF8
-FT_INTERNAL
-size_t ut8_width(const void *beg, const void *end)
+static
+size_t utf8_width(const void *beg, const void *end)
 {
    size_t sz = (size_t)((const char *)end - (const char *)beg);
    char *tmp = (char *)F_MALLOC(sizeof(char) * (sz + 1));
@@ -6227,7 +6310,7 @@ size_t ut8_width(const void *beg, const void *end)

    memcpy(tmp, beg, sz);
    tmp[sz] = '\0';
-    size_t result = utf8len(tmp);
+    size_t result = utf8width(tmp);
    F_FREE(tmp);
    return result;
 }
@@ -6277,7 +6360,7 @@ size_t buffer_text_visible_width(const f_string_buffer_t *buffer)
            if (beg == NULL || end == NULL)
                return max_length;

-            max_length = MAX(max_length, (size_t)ut8_width(beg, end));
+            max_length = MAX(max_length, (size_t)utf8_width(beg, end));
            ++n;
        }
 #endif /* FT_HAVE_WCHAR */
@@ -6307,7 +6390,7 @@ buffer_substring(const f_string_buffer_t *buffer, size_t buffer_row, const void
        case UTF8_BUF:
            utf8_n_substring(buffer->str.u8str, '\n', buffer_row, begin, end);
            if ((*(const char **)begin) && (*(const char **)end))
-                *str_it_width = ut8_width(*begin, *end);
+                *str_it_width = utf8_width(*begin, *end);
            break;
 #endif /* FT_HAVE_UTF8 */
        default:
--- a/lib/fort.h
+++ b/lib/fort.h
@@ -46,8 +46,8 @@ SOFTWARE.

 #define LIBFORT_MAJOR_VERSION 0
 #define LIBFORT_MINOR_VERSION 2
-#define LIBFORT_REVISION 1
-#define LIBFORT_VERSION_STR "0.2.1"
+#define LIBFORT_REVISION 2
+#define LIBFORT_VERSION_STR "0.2.2"


 /*****************************************************************************