New function nextUtf8Char; usage in dw::Hyphenator and (partly) dw::Textblock.

author: Sebastian Geerken <devnull@localhost> 2012-12-13 22:43:15 +0100
committer: Sebastian Geerken <devnull@localhost> 2012-12-13 22:43:15 +0100
commit: f5380a56b1a6b83fea9b1c97140d4b1c8fe4ba49 (patch)
tree: 93a792dc006a046b4187acf95c6c8b79df97a920
parent: 1471c240d49b60ef081f50230f2eee8852793716 (diff)
6 files changed, 103 insertions, 29 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc
index 4fe9433b..cc92b77f 100644
--- a/dw/hyphenator.cc
+++ b/dw/hyphenator.cc
@@ -16,6 +16,7 @@
 using namespace lout::object;
 using namespace lout::container::typed;
 using namespace lout::misc;
+using namespace lout::unicode;
 
 namespace dw {
 
@@ -204,7 +205,7 @@ bool Hyphenator::isCharPartOfActualWord (char *s)
         (unsigned char)s[1] == 0x9f /* ß */ ));
 #endif
 
-   return lout::unicode::isAlpha (lout::unicode::decodeUtf8 (s));
+   return isAlpha (decodeUtf8 (s));
 }
 
 /**
@@ -315,9 +316,10 @@ void Hyphenator::hyphenateSingleWord(core::Platform *platform,
 
    // No hyphens in the first two chars or the last two.
    // Characters are not bytes, so UTF-8 characters must be counted.
-   int numBytes1Start = platform->nextGlyph (wordLc, 0);
-   int numBytes2Start = platform->nextGlyph (wordLc, numBytes1Start);
-   for (int i = 0; i < numBytes2Start; i++)
+   const char *bytesStart =  nextUtf8Char (nextUtf8Char (wordLc));
+   // TODO: Handle bytesStart == NULL (word of at most two characters, or invalid UTF-8).
+   int numBytesStart = bytesStart - wordLc;
+   for (int i = 0; i < numBytesStart; i++)
       points.set (i + 1, 0);
    
    int len = strlen (wordLc);
diff --git a/dw/textblock.cc b/dw/textblock.cc
index ba000e1a..816b2680 100644
--- a/dw/textblock.cc
+++ b/dw/textblock.cc
@@ -21,6 +21,7 @@
 #include "textblock.hh"
 #include "../lout/msg.h"
 #include "../lout/misc.hh"
+#include "../lout/unicode.hh"
 
 #include <stdio.h>
 #include <math.h>
@@ -34,6 +35,7 @@ static dw::core::style::Tooltip *hoverTooltip = NULL;
 
 
 using namespace lout;
+using namespace lout::unicode;
 
 namespace dw {
 
@@ -1425,13 +1427,12 @@ void Textblock::addText (const char *text, size_t len,
    // Count dividing characters.
    int numParts = 1;
 
-   for (int i = 0; i < (int)len;
-        i < (int)len && (i = layout->nextGlyph (text, i))) {
+   for (const char *s = text; s; s = nextUtf8Char (s, text + len - s)) {
       int foundDiv = -1;
       for (int j = 0; foundDiv == -1 && j < NUM_DIV_CHARS; j++) {
          int lDiv = strlen (divChars[j].s);
-         if (i <= (int)len - lDiv) {
-            if (memcmp (text + i, divChars[j].s, lDiv * sizeof (char)) == 0)
+         if (s  <= text + len - lDiv) {
+            if (memcmp (s, divChars[j].s, lDiv * sizeof (char)) == 0)
                foundDiv = j;
          }
       }
@@ -1466,13 +1467,12 @@ void Textblock::addText (const char *text, size_t len,
       partStart[0] = 0;
       partEnd[numParts - 1] = len;
 
-      for (int i = 0; i < (int)len;
-           i < (int)len && (i = layout->nextGlyph (text, i))) {
+      for (const char *s = text; s; s = nextUtf8Char (s, text + len - s)) {
          int foundDiv = -1;
          for (int j = 0; foundDiv == -1 && j < NUM_DIV_CHARS; j++) {
             int lDiv = strlen (divChars[j].s);
-            if (i <= (int)len - lDiv) {
-               if (memcmp (text + i, divChars[j].s, lDiv * sizeof (char)) == 0)
+            if (s <= text + len - lDiv) {
+               if (memcmp (s, divChars[j].s, lDiv * sizeof (char)) == 0)
                   foundDiv = j;
             }
          }
@@ -1490,8 +1490,8 @@ void Textblock::addText (const char *text, size_t len,
                unbreakableForMinWidth[n] =
                   divChars[foundDiv].unbreakableForMinWidth;
                canBeHyphenated[n + 1] = divChars[foundDiv].canBeHyphenated;
-               partEnd[n] = i;
-               partStart[n + 1] = i + lDiv;
+               partEnd[n] = s - text;
+               partStart[n + 1] = s - text + lDiv;
                n++;
                totalLenCharRemoved += lDiv;
             } else {
@@ -1505,8 +1505,8 @@ void Textblock::addText (const char *text, size_t len,
                   unbreakableForMinWidth[n] =
                      divChars[foundDiv].unbreakableForMinWidth;
                   canBeHyphenated[n + 1] = divChars[foundDiv].canBeHyphenated;
-                  partEnd[n] = i;
-                  partStart[n + 1] = i;
+                  partEnd[n] = s - text;
+                  partStart[n + 1] = s - text;
                   n++;
                }
 
@@ -1517,8 +1517,8 @@ void Textblock::addText (const char *text, size_t len,
                   unbreakableForMinWidth[n] =
                      divChars[foundDiv].unbreakableForMinWidth;
                   canBeHyphenated[n + 1] = divChars[foundDiv].canBeHyphenated;
-                  partEnd[n] = i + lDiv;
-                  partStart[n + 1] = i + lDiv;
+                  partEnd[n] = s - text + lDiv;
+                  partStart[n + 1] = s - text + lDiv;
                   n++;
                }
             }
diff --git a/lout/unicode.cc b/lout/unicode.cc
index 38d71494..7d2502dc 100644
--- a/lout/unicode.cc
+++ b/lout/unicode.cc
@@ -1,4 +1,7 @@
 #include "unicode.hh"
+#include "misc.hh"
+
+using namespace lout::misc;
 
 namespace lout {
 
@@ -47,22 +50,19 @@ bool isAlpha (int ch)
    return ch < 0x500 && (alpha[ch / 8] & (1 << (ch & 7)));
 }
 
-int decodeUtf8 (char *s)
+int decodeUtf8 (const char *s)
 {
    if((s[0] & 0x80) == 0)
       return s[0];
    else {
-      int mask = 0xe0, bits = 0xc0, done = 0, ch = 0, i = 0;
+      int mask = 0xe0, bits = 0xc0, done = 0, ch = 0;
       for(int j = 1; !done && j < 7;
           j++, mask = 0x80 | (mask >> 1), bits = 0x80 | (bits >> 1)) {
-         if(((unsigned char)s[i] & mask) == bits) {
+         if(((unsigned char)s[0] & mask) == bits) {
             done = 1;
-            ch = (unsigned char)s[i] & ~mask & 0xff;
-            i++;
-            for(int k = 0; k < j; k++) {
-               ch = (ch << 6) | ((unsigned char)s[i] & 0x3f);
-               i++;
-            }
+            ch = (unsigned char)s[0] & ~mask & 0xff;
+            for(int k = 0; k < j; k++)
+               ch = (ch << 6) | ((unsigned char)s[k + 1] & 0x3f);
          }
       }
 
@@ -70,6 +70,54 @@ int decodeUtf8 (char *s)
    }
 }
 
+static const char *_nextUtf8Char (const char *s) // Steps past the UTF-8 sequence starting at s, judged by its lead byte only.
+{
+   if (s == NULL) // NULL is passed through, so calls can be chained without checks.
+      return NULL;
+
+   const char *r;
+   if((s[0] & 0x80) == 0) // High bit clear: single-byte (ASCII) character.
+      r = s + 1;
+   else {
+      int mask = 0xe0, bits = 0xc0, done = 0; // First pattern tried: 110xxxxx.
+      for(int j = 1; !done && j < 7; // j = number of continuation bytes tried (1..6).
+          j++, mask = 0x80 | (mask >> 1), bits = 0x80 | (bits >> 1)) { // Next pattern has one more leading 1 bit.
+         if(((unsigned char)s[0] & mask) == bits) {
+            done = 1;
+            r = s + j + 1; // Lead byte plus j continuation bytes; the latter are not inspected.
+         }
+      }
+
+      if(!done) { // Lead byte 0x80..0xbf or 0xff: matches none of the patterns.
+         assertNotReached(); // Presumably the abort-style assertion from lout::misc -- TODO confirm.
+         return NULL;
+      }
+   }
+
+   return r;
+}
+
+const char *nextUtf8Char (const char *s) // NUL-terminated variant: start of the next character, or NULL if there is none.
+{
+   const char *r = _nextUtf8Char (s);
+   if (r != NULL && r[0] == 0) // Landed on the terminating NUL: no further character.
+      return NULL;
+   else
+      return r; // Also NULL when _nextUtf8Char returned NULL.
+}
+
+const char *nextUtf8Char (const char *s, int len) // Length-delimited variant: callers pass the bytes remaining from s as len.
+{
+   if (len <= 0) // Nothing left to step over.
+      return NULL;
+   
+   const char *r = _nextUtf8Char (s);
+   if (r != NULL && r - s >= len) // The step uses up all remaining bytes (or more): no next character.
+      return NULL;
+   else
+      return r; // Also NULL when _nextUtf8Char returned NULL.
+}
+
 } // namespace lout
 
 } // namespace unicode
diff --git a/lout/unicode.hh b/lout/unicode.hh
index 123e7aa3..42d06911 100644
--- a/lout/unicode.hh
+++ b/lout/unicode.hh
@@ -11,7 +11,11 @@ namespace unicode {
 
 bool isAlpha (int ch);
 
-int decodeUtf8 (char *s);
+int decodeUtf8 (const char *s);
+
+const char *nextUtf8Char (const char *s);
+
+const char *nextUtf8Char (const char *s, int len);
 
 } // namespace lout
 
diff --git a/test/Makefile.am b/test/Makefile.am
index 9589e09b..d8dd785e 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -23,7 +23,8 @@ noinst_PROGRAMS = \
 	cookies \
 	liang \
 	trie \
-	notsosimplevector
+	notsosimplevector \
+	unicode-test
 
 dw_anchors_test_SOURCES = dw_anchors_test.cc
 dw_anchors_test_LDADD = \
@@ -180,3 +181,7 @@ trie_LDADD = \
 notsosimplevector_SOURCES = notsosimplevector.cc
 
 notsosimplevector_LDADD = $(top_builddir)/lout/liblout.a
+
+unicode_test_SOURCES = unicode_test.cc
+
+unicode_test_LDADD = $(top_builddir)/lout/liblout.a
diff --git a/test/unicode_test.cc b/test/unicode_test.cc
new file mode 100644
index 00000000..3cd43710
--- /dev/null
+++ b/test/unicode_test.cc
@@ -0,0 +1,15 @@
+#include <string.h>
+#include <stdio.h>
+#include "../lout/unicode.hh"
+
+using namespace lout::unicode;
+
+int main (int argc, char *argv[]) // Prints byte offset, decoded code point and remaining text for each character of a sample string.
+{
+   const char *t = "abcäöüабв−‐"; // 1-, 2- and 3-byte sequences, provided this file is stored as UTF-8.
+
+   for (const char *s = t; s; s = nextUtf8Char (s, strlen (s))) // Ends when nextUtf8Char returns NULL.
+      printf ("%3d -> U+%04x ('%s')\n", (int)(s - t), decodeUtf8(s), s);
+   
+   return 0;
+}
author	Sebastian Geerken <devnull@localhost>	2012-12-13 22:43:15 +0100
committer	Sebastian Geerken <devnull@localhost>	2012-12-13 22:43:15 +0100
commit	f5380a56b1a6b83fea9b1c97140d4b1c8fe4ba49 (patch)
tree	93a792dc006a046b4187acf95c6c8b79df97a920
parent	1471c240d49b60ef081f50230f2eee8852793716 (diff)