diff options
author | Sebastian Geerken <devnull@localhost> | 2012-12-13 22:43:15 +0100 |
---|---|---|
committer | Sebastian Geerken <devnull@localhost> | 2012-12-13 22:43:15 +0100 |
commit | f5380a56b1a6b83fea9b1c97140d4b1c8fe4ba49 (patch) | |
tree | 93a792dc006a046b4187acf95c6c8b79df97a920 | |
parent | 1471c240d49b60ef081f50230f2eee8852793716 (diff) |
New function nextUtf8Char; usage in dw::Hyphenator and (partly) dw::Textblock.
-rw-r--r-- | dw/hyphenator.cc | 10 | ||||
-rw-r--r-- | dw/textblock.cc | 28 | ||||
-rw-r--r-- | lout/unicode.cc | 66 | ||||
-rw-r--r-- | lout/unicode.hh | 6 | ||||
-rw-r--r-- | test/Makefile.am | 7 | ||||
-rw-r--r-- | test/unicode_test.cc | 15 |
6 files changed, 103 insertions, 29 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc index 4fe9433b..cc92b77f 100644 --- a/dw/hyphenator.cc +++ b/dw/hyphenator.cc @@ -16,6 +16,7 @@ using namespace lout::object; using namespace lout::container::typed; using namespace lout::misc; +using namespace lout::unicode; namespace dw { @@ -204,7 +205,7 @@ bool Hyphenator::isCharPartOfActualWord (char *s) (unsigned char)s[1] == 0x9f /* ß */ )); #endif - return lout::unicode::isAlpha (lout::unicode::decodeUtf8 (s)); + return isAlpha (decodeUtf8 (s)); } /** @@ -315,9 +316,10 @@ void Hyphenator::hyphenateSingleWord(core::Platform *platform, // No hyphens in the first two chars or the last two. // Characters are not bytes, so UTF-8 characters must be counted. - int numBytes1Start = platform->nextGlyph (wordLc, 0); - int numBytes2Start = platform->nextGlyph (wordLc, numBytes1Start); - for (int i = 0; i < numBytes2Start; i++) + const char *bytesStart = nextUtf8Char (nextUtf8Char (wordLc)); + // TODO Check bytesStart == NULL; + int numBytesStart = bytesStart - wordLc; + for (int i = 0; i < numBytesStart; i++) points.set (i + 1, 0); int len = strlen (wordLc); diff --git a/dw/textblock.cc b/dw/textblock.cc index ba000e1a..816b2680 100644 --- a/dw/textblock.cc +++ b/dw/textblock.cc @@ -21,6 +21,7 @@ #include "textblock.hh" #include "../lout/msg.h" #include "../lout/misc.hh" +#include "../lout/unicode.hh" #include <stdio.h> #include <math.h> @@ -34,6 +35,7 @@ static dw::core::style::Tooltip *hoverTooltip = NULL; using namespace lout; +using namespace lout::unicode; namespace dw { @@ -1425,13 +1427,12 @@ void Textblock::addText (const char *text, size_t len, // Count dividing characters. int numParts = 1; - for (int i = 0; i < (int)len; - i < (int)len && (i = layout->nextGlyph (text, i))) { + for (const char *s = text; s; s = nextUtf8Char (s, text + len - s)) { int foundDiv = -1; for (int j = 0; foundDiv == -1 && j < NUM_DIV_CHARS; j++) { int lDiv = strlen (divChars[j].s); - if (i <= (int)len - lDiv) { - if (memcmp (text + i, divChars[j].s, lDiv * sizeof (char)) == 0) + if (s <= text + len - lDiv) { + if (memcmp (s, divChars[j].s, lDiv * sizeof (char)) == 0) foundDiv = j; } } @@ -1466,13 +1467,12 @@ void Textblock::addText (const char *text, size_t len, partStart[0] = 0; partEnd[numParts - 1] = len; - for (int i = 0; i < (int)len; - i < (int)len && (i = layout->nextGlyph (text, i))) { + for (const char *s = text; s; s = nextUtf8Char (s, text + len - s)) { int foundDiv = -1; for (int j = 0; foundDiv == -1 && j < NUM_DIV_CHARS; j++) { int lDiv = strlen (divChars[j].s); - if (i <= (int)len - lDiv) { - if (memcmp (text + i, divChars[j].s, lDiv * sizeof (char)) == 0) + if (s <= text + len - lDiv) { + if (memcmp (s, divChars[j].s, lDiv * sizeof (char)) == 0) foundDiv = j; } } @@ -1490,8 +1490,8 @@ void Textblock::addText (const char *text, size_t len, unbreakableForMinWidth[n] = divChars[foundDiv].unbreakableForMinWidth; canBeHyphenated[n + 1] = divChars[foundDiv].canBeHyphenated; - partEnd[n] = i; - partStart[n + 1] = i + lDiv; + partEnd[n] = s - text; + partStart[n + 1] = s - text + lDiv; n++; totalLenCharRemoved += lDiv; } else { @@ -1505,8 +1505,8 @@ void Textblock::addText (const char *text, size_t len, unbreakableForMinWidth[n] = divChars[foundDiv].unbreakableForMinWidth; canBeHyphenated[n + 1] = divChars[foundDiv].canBeHyphenated; - partEnd[n] = i; - partStart[n + 1] = i; + partEnd[n] = s - text; + partStart[n + 1] = s - text; n++; } @@ -1517,8 +1517,8 @@ void Textblock::addText (const char *text, size_t len, unbreakableForMinWidth[n] = divChars[foundDiv].unbreakableForMinWidth; canBeHyphenated[n + 1] = divChars[foundDiv].canBeHyphenated; - partEnd[n] = i + lDiv; - partStart[n + 1] = i + lDiv; + partEnd[n] = s - text + lDiv; + partStart[n + 1] = s - text + lDiv; n++; } } diff --git a/lout/unicode.cc b/lout/unicode.cc index 38d71494..7d2502dc 100644 --- a/lout/unicode.cc +++ b/lout/unicode.cc @@ -1,4 +1,7 @@ #include "unicode.hh" +#include "misc.hh" + +using namespace lout::misc; namespace lout { @@ -47,22 +50,19 @@ bool isAlpha (int ch) return ch < 0x500 && (alpha[ch / 8] & (1 << (ch & 7))); } -int decodeUtf8 (char *s) +int decodeUtf8 (const char *s) { if((s[0] & 0x80) == 0) return s[0]; else { - int mask = 0xe0, bits = 0xc0, done = 0, ch = 0, i = 0; + int mask = 0xe0, bits = 0xc0, done = 0, ch = 0; for(int j = 1; !done && j < 7; j++, mask = 0x80 | (mask >> 1), bits = 0x80 | (bits >> 1)) { - if(((unsigned char)s[i] & mask) == bits) { + if(((unsigned char)s[0] & mask) == bits) { done = 1; - ch = (unsigned char)s[i] & ~mask & 0xff; - i++; - for(int k = 0; k < j; k++) { - ch = (ch << 6) | ((unsigned char)s[i] & 0x3f); - i++; - } + ch = (unsigned char)s[0] & ~mask & 0xff; + for(int k = 0; k < j; k++) + ch = (ch << 6) | ((unsigned char)s[k + 1] & 0x3f); } } @@ -70,6 +70,54 @@ int decodeUtf8 (char *s) } } +static const char *_nextUtf8Char (const char *s) +{ + if (s == NULL) + return NULL; + + const char *r; + if((s[0] & 0x80) == 0) + r = s + 1; + else { + int mask = 0xe0, bits = 0xc0, done = 0; + for(int j = 1; !done && j < 7; + j++, mask = 0x80 | (mask >> 1), bits = 0x80 | (bits >> 1)) { + if(((unsigned char)s[0] & mask) == bits) { + done = 1; + r = s + j + 1; + } + } + + if(!done) { + assertNotReached(); + return NULL; + } + } + + return r; +} + +const char *nextUtf8Char (const char *s) +{ + const char *r = _nextUtf8Char (s); + if (r != NULL && r[0] == 0) + return NULL; + else + return r; +} + +const char *nextUtf8Char (const char *s, int len) +{ + if (len <= 0) + return NULL; + + const char *r = _nextUtf8Char (s); + if (r != NULL && r - s >= len) + return NULL; + else + return r; +} + } // namespace lout } // namespace unicode diff --git a/lout/unicode.hh b/lout/unicode.hh index 123e7aa3..42d06911 100644 --- a/lout/unicode.hh +++ b/lout/unicode.hh @@ -11,7 +11,11 @@ namespace unicode { bool isAlpha (int ch); -int decodeUtf8 (char *s); +int decodeUtf8 (const char *s); + +const char *nextUtf8Char (const char *s); + +const char *nextUtf8Char (const char *s, int len); } // namespace lout diff --git a/test/Makefile.am b/test/Makefile.am index 9589e09b..d8dd785e 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -23,7 +23,8 @@ noinst_PROGRAMS = \ cookies \ liang \ trie \ - notsosimplevector + notsosimplevector \ + unicode-test dw_anchors_test_SOURCES = dw_anchors_test.cc dw_anchors_test_LDADD = \ @@ -180,3 +181,7 @@ trie_LDADD = \ notsosimplevector_SOURCES = notsosimplevector.cc notsosimplevector_LDADD = $(top_builddir)/lout/liblout.a + +unicode_test_SOURCES = unicode_test.cc + +unicode_test_LDADD = $(top_builddir)/lout/liblout.a diff --git a/test/unicode_test.cc b/test/unicode_test.cc new file mode 100644 index 00000000..3cd43710 --- /dev/null +++ b/test/unicode_test.cc @@ -0,0 +1,15 @@ +#include <string.h> +#include <stdio.h> +#include "../lout/unicode.hh" + +using namespace lout::unicode; + +int main (int argc, char *argv[]) +{ + const char *t = "abcäöüабв−‐"; + + for (const char *s = t; s; s = nextUtf8Char (s, strlen (s))) + printf ("%3d -> U+%04x ('%s')\n", (int)(s - t), decodeUtf8(s), s); + + return 0; +} |