summaryrefslogtreecommitdiff
path: root/lout/unicode.cc
diff options
context:
space:
mode:
authorSebastian Geerken <devnull@localhost>2012-12-14 16:41:56 +0100
committerSebastian Geerken <devnull@localhost>2012-12-14 16:41:56 +0100
commit5799b9d93121a7b0c2cd3e70417192b71bcfe38b (patch)
tree20ba9d830362d400a1c2b115b1168c75e57f601f /lout/unicode.cc
parent875355853f1c6cbdeb960133bf463a36072a0999 (diff)
Made UTF-8 stuff more robust.
Diffstat (limited to 'lout/unicode.cc')
-rw-r--r--lout/unicode.cc109
1 files changed, 64 insertions, 45 deletions
diff --git a/lout/unicode.cc b/lout/unicode.cc
index 7d2502dc..2a147fca 100644
--- a/lout/unicode.cc
+++ b/lout/unicode.cc
@@ -54,53 +54,58 @@ int decodeUtf8 (const char *s)
{
if((s[0] & 0x80) == 0)
return s[0];
- else {
- int mask = 0xe0, bits = 0xc0, done = 0, ch = 0;
- for(int j = 1; !done && j < 7;
- j++, mask = 0x80 | (mask >> 1), bits = 0x80 | (bits >> 1)) {
- if(((unsigned char)s[0] & mask) == bits) {
- done = 1;
- ch = (unsigned char)s[0] & ~mask & 0xff;
- for(int k = 0; k < j; k++)
- ch = (ch << 6) | ((unsigned char)s[k + 1] & 0x3f);
- }
- }
-
- return ch;
- }
+ else if((s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+ return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
+ else if((s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+ && (s[2] & 0xc0) == 0x80)
+ return ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
+ else if((s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+ && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+ return ((s[0] & 0x0f) << 18) | ((s[1] & 0x3f) << 12)
+ | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
+ else
+ return '?';
}
-static const char *_nextUtf8Char (const char *s)
-{
- if (s == NULL)
- return NULL;
- const char *r;
- if((s[0] & 0x80) == 0)
- r = s + 1;
- else {
- int mask = 0xe0, bits = 0xc0, done = 0;
- for(int j = 1; !done && j < 7;
- j++, mask = 0x80 | (mask >> 1), bits = 0x80 | (bits >> 1)) {
- if(((unsigned char)s[0] & mask) == bits) {
- done = 1;
- r = s + j + 1;
- }
- }
-
- if(!done) {
- assertNotReached();
- return NULL;
- }
- }
-
- return r;
+int decodeUtf8 (const char *s, int len)
+{
+ if(len >= 1 && (s[0] & 0x80) == 0)
+ return s[0];
+ else if(len >= 2 && (s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+ return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
+ else if(len >= 3 && (s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+ && (s[2] & 0xc0) == 0x80)
+ return ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
+ else if(len >= 4 && (s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+ && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+ return ((s[0] & 0x0f) << 18) | ((s[1] & 0x3f) << 12)
+ | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
+ else
+ return '?';
}
const char *nextUtf8Char (const char *s)
{
- const char *r = _nextUtf8Char (s);
- if (r != NULL && r[0] == 0)
+ const char *r;
+
+ if (s == NULL || s[0] == 0)
+ r = NULL;
+ else if((s[0] & 0x80) == 0)
+ r = s + 1;
+ else if((s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+ r = s + 2;
+ else if((s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+ && (s[2] & 0xc0) == 0x80)
+ r = s + 3;
+ else if((s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+ && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+ r = s + 4;
+ else
+ // invalid UTF-8 sequence: treat as one byte.
+ r = s + 1;
+
+ if (r && r[0] == 0)
return NULL;
else
return r;
@@ -108,11 +113,25 @@ const char *nextUtf8Char (const char *s)
const char *nextUtf8Char (const char *s, int len)
{
- if (len <= 0)
- return NULL;
-
- const char *r = _nextUtf8Char (s);
- if (r != NULL && r - s >= len)
+ const char *r;
+
+ if (s == NULL || len <= 0)
+ r = NULL;
+ else if(len >= 1 && (s[0] & 0x80) == 0)
+ r = s + 1;
+ else if(len >= 2 && (s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+ r = s + 2;
+ else if(len >= 3 && (s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+ && (s[2] & 0xc0) == 0x80)
+ r = s + 3;
+ else if(len >= 4 && (s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+ && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+ r = s + 4;
+ else
+ // invalid UTF-8 sequence: treat as one byte.
+ r = s + 1;
+
+ if (r && r - s >= len)
return NULL;
else
return r;