treat ideographic characters (Chinese/Japanese) as words

author: corvid <corvid@lavabit.com> 2009-08-02 03:59:14 +0000
committer: corvid <corvid@lavabit.com> 2009-08-02 03:59:14 +0000
commit: 9208c861e0b2c2772f2291376f5632a4d5de6c02 (patch)
tree: 35261ff2cc862f8bc77501c8a828bc245a0c09dc /src/utf8.cc
parent: 1ee0cb3aca76ca7661f5498ea0a73ff3e71ec569 (diff)
1 files changed, 28 insertions, 0 deletions
diff --git a/src/utf8.cc b/src/utf8.cc
index 46a45fe4..47d8112b 100644
--- a/src/utf8.cc
+++ b/src/utf8.cc
@@ -11,6 +11,7 @@
 
 #include <fltk/utf.h>
 
+#include "../dlib/dlib.h"    /* TRUE/FALSE */
 #include "utf8.hh"
 
 // C++ functions with C linkage ----------------------------------------------
@@ -64,3 +65,30 @@ int a_Utf8_test(const char* src, unsigned int srclen)
 {
    return utf8test(src, srclen);
 }
+
+/*
+ * Does s point to a UTF-8-encoded ideographic character? Returns TRUE/FALSE;
+ * *len is set here for bytes below 0xe2, else len is passed to a_Utf8_decode.
+ * This is based on http://unicode.org/reports/tr14/#ID plus some guesses
+ * for what might make the most sense for Dillo. Surprisingly, they include
+ * Hangul Compatibility Jamo, but they're the experts, so I'll follow along.
+ */
+bool_t a_Utf8_ideographic(const char *s, const char *end, int *len)
+{
+   bool_t ret = FALSE;   /* result unless a range test below matches */
+
+   if ((uchar_t)*s >= 0xe2) {
+      /* Lead byte 0xe2 or higher, i.e. Unicode char >= U+2000: decode it. */
+      unsigned unicode = a_Utf8_decode(s, end, len);
+
+      if (unicode >= 0x2e80 &&
+           ((unicode <= 0xa4cf) ||                         /* U+2E80..U+A4CF */
+            (unicode >= 0xf900 && unicode <= 0xfaff) ||    /* U+F900..U+FAFF */
+            (unicode >= 0xff00 && unicode <= 0xff9f))) {   /* U+FF00..U+FF9F */
+         ret = TRUE;
+     }
+   } else {
+      *len = 1 + (int)a_Utf8_end_of_char(s, 0);   /* ret stays FALSE */
+   }
+   return ret;
+}
author	corvid <corvid@lavabit.com>	2009-08-02 03:59:14 +0000
committer	corvid <corvid@lavabit.com>	2009-08-02 03:59:14 +0000
commit	9208c861e0b2c2772f2291376f5632a4d5de6c02 (patch)
tree	35261ff2cc862f8bc77501c8a828bc245a0c09dc /src/utf8.cc
parent	1ee0cb3aca76ca7661f5498ea0a73ff3e71ec569 (diff)