diff options
author | corvid <corvid@lavabit.com> | 2009-08-02 03:59:14 +0000 |
---|---|---|
committer | corvid <corvid@lavabit.com> | 2009-08-02 03:59:14 +0000 |
commit | 9208c861e0b2c2772f2291376f5632a4d5de6c02 (patch) | |
tree | 35261ff2cc862f8bc77501c8a828bc245a0c09dc | |
parent | 1ee0cb3aca76ca7661f5498ea0a73ff3e71ec569 (diff) |
treat ideographic characters (Chinese/Japanese) as words
-rw-r--r-- | src/html.cc | 17 | ||||
-rw-r--r-- | src/utf8.cc | 28 | ||||
-rw-r--r-- | src/utf8.hh | 1 |
3 files changed, 44 insertions, 2 deletions
diff --git a/src/html.cc b/src/html.cc
index e5dde22f..8e85a0e0 100644
--- a/src/html.cc
+++ b/src/html.cc
@@ -1189,17 +1189,30 @@ static void Html_process_word(DilloHtml *html, char *word, int size)
          }
       }
       for (start = i = 0; Pword[i]; start = i) {
+         int len;
+
          if (isspace(Pword[i])) {
             while (Pword[++i] && isspace(Pword[i])) ;
             Html_process_space(html, Pword + start, i - start);
-         } else {
-            while (Pword[++i] && !isspace(Pword[i])) ;
+         } else if (a_Utf8_ideographic(Pword+i, Pword_end, &len)) {
+            i += len;
             ch = Pword[i];
             Pword[i] = '\0';
             HT2TB(html)->addText(Pword + start,
                                  html->styleEngine->wordStyle ());
             Pword[i] = ch;
             html->PrevWasSPC = false;
+         } else {
+            do {
+               i += len;
+            } while (Pword[i] && !isspace(Pword[i]) &&
+                     (!a_Utf8_ideographic(Pword+i, Pword_end, &len)));
+            ch = Pword[i];
+            Pword[i] = 0;
+            HT2TB(html)->addText(Pword + start,
+                                 html->styleEngine->wordStyle ());
+            Pword[i] = ch;
+            html->PrevWasSPC = false;
          }
       }
       if (word != Pword)
diff --git a/src/utf8.cc b/src/utf8.cc
index 46a45fe4..47d8112b 100644
--- a/src/utf8.cc
+++ b/src/utf8.cc
@@ -11,6 +11,7 @@
 
 #include <fltk/utf.h>
 
+#include "../dlib/dlib.h" /* TRUE/FALSE */
 #include "utf8.hh"
 
 // C++ functions with C linkage ----------------------------------------------
@@ -64,3 +65,30 @@ int a_Utf8_test(const char* src, unsigned int srclen)
 {
    return utf8test(src, srclen);
 }
+
+/*
+ * Does s point to a UTF-8-encoded ideographic character?
+ *
+ * This is based on http://unicode.org/reports/tr14/#ID plus some guesses
+ * for what might make the most sense for Dillo. Surprisingly, they include
+ * Hangul Compatibility Jamo, but they're the experts, so I'll follow along.
+ */
+bool_t a_Utf8_ideographic(const char *s, const char *end, int *len)
+{
+   bool_t ret = FALSE;
+
+   if ((uchar_t)*s >= 0xe2) {
+      /* Unicode char >= U+2000. */
+      unsigned unicode = a_Utf8_decode(s, end, len);
+
+      if (unicode >= 0x2e80 &&
+          ((unicode <= 0xa4cf) ||
+           (unicode >= 0xf900 && unicode <= 0xfaff) ||
+           (unicode >= 0xff00 && unicode <= 0xff9f))) {
+         ret = TRUE;
+      }
+   } else {
+      *len = 1 + (int)a_Utf8_end_of_char(s, 0);
+   }
+   return ret;
+}
diff --git a/src/utf8.hh b/src/utf8.hh
index e2f9f9a4..6e2b4169 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -19,6 +19,7 @@ uint_t a_Utf8_end_of_char(const char *str, uint_t i);
 uint_t a_Utf8_decode(const char*, const char* end, int* len);
 int a_Utf8_encode(unsigned int ucs, char *buf);
 int a_Utf8_test(const char* src, unsigned int srclen);
+bool_t a_Utf8_ideographic(const char *s, const char *end, int *len);
 
 #ifdef __cplusplus
 }