Made UTF-8 stuff more robust: decodeUtf8 and nextUtf8Char now validate continuation bytes and tolerate malformed sequences.

author: Sebastian Geerken <devnull@localhost> 2012-12-14 16:41:56 +0100
committer: Sebastian Geerken <devnull@localhost> 2012-12-14 16:41:56 +0100
commit: 5799b9d93121a7b0c2cd3e70417192b71bcfe38b (patch)
tree: 20ba9d830362d400a1c2b115b1168c75e57f601f /lout/unicode.cc
parent: 875355853f1c6cbdeb960133bf463a36072a0999 (diff)
1 files changed, 64 insertions, 45 deletions
diff --git a/lout/unicode.cc b/lout/unicode.cc
index 7d2502dc..2a147fca 100644
--- a/lout/unicode.cc
+++ b/lout/unicode.cc
@@ -54,53 +54,58 @@ int decodeUtf8 (const char *s)
 {
    if((s[0] & 0x80) == 0)
       return s[0];
-   else {
-      int mask = 0xe0, bits = 0xc0, done = 0, ch = 0;
-      for(int j = 1; !done && j < 7;
-          j++, mask = 0x80 | (mask >> 1), bits = 0x80 | (bits >> 1)) {
-         if(((unsigned char)s[0] & mask) == bits) {
-            done = 1;
-            ch = (unsigned char)s[0] & ~mask & 0xff;
-            for(int k = 0; k < j; k++)
-               ch = (ch << 6) | ((unsigned char)s[k + 1] & 0x3f);
-         }
-      }
-
-      return ch;
-   }
+   else if((s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+      return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
+   else if((s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80)
+      return ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6)  | (s[2] & 0x3f);
+   else if((s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+      return ((s[0] & 0x0f) << 18) | ((s[1] & 0x3f) << 12)
+         | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
+   else
+      return '?';
 }
 
-static const char *_nextUtf8Char (const char *s)
-{
-   if (s == NULL)
-      return NULL;
 
-   const char *r;
-   if((s[0] & 0x80) == 0)
-      r = s + 1;
-   else {
-      int mask = 0xe0, bits = 0xc0, done = 0;
-      for(int j = 1; !done && j < 7;
-          j++, mask = 0x80 | (mask >> 1), bits = 0x80 | (bits >> 1)) {
-         if(((unsigned char)s[0] & mask) == bits) {
-            done = 1;
-            r = s + j + 1;
-         }
-      }
-
-      if(!done) {
-         assertNotReached();
-         return NULL;
-      }
-   }
-
-   return r;
+int decodeUtf8 (const char *s, int len)
+{
+   if(len >= 1 && (s[0] & 0x80) == 0)
+      return s[0];
+   else if(len >= 2 && (s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+      return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
+   else if(len >= 3 && (s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80)
+      return ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6)  | (s[2] & 0x3f);
+   else if(len >= 4 && (s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+      return ((s[0] & 0x0f) << 18) | ((s[1] & 0x3f) << 12)
+         | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
+   else
+      return '?';
 }
 
 const char *nextUtf8Char (const char *s)
 {
-   const char *r = _nextUtf8Char (s);
-   if (r != NULL && r[0] == 0)
+   const char *r;
+
+   if (s == NULL || s[0] == 0)
+      r = NULL;   
+   else if((s[0] & 0x80) == 0)
+      r = s + 1;
+   else if((s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+      r = s + 2;
+   else if((s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80)
+      r = s + 3;
+   else if((s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+      r = s + 4;
+   else
+      // invalid UTF-8 sequence: treat as one byte.
+      r = s + 1;
+
+   if (r && r[0] == 0)
       return NULL;
    else
       return r;
@@ -108,11 +113,25 @@ const char *nextUtf8Char (const char *s)
 
 const char *nextUtf8Char (const char *s, int len)
 {
-   if (len <= 0)
-      return NULL;
-   
-   const char *r = _nextUtf8Char (s);
-   if (r != NULL && r - s >= len)
+   const char *r;
+
+   if (s == NULL || len <= 0)
+      r = NULL;   
+   else if(len >= 1 && (s[0] & 0x80) == 0)
+      r = s + 1;
+   else if(len >= 2 && (s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+      r = s + 2;
+   else if(len >= 3 && (s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80)
+      r = s + 3;
+   else if(len >= 4 && (s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+      r = s + 4;
+   else
+      // invalid UTF-8 sequence: treat as one byte.
+      r = s + 1;
+
+   if (r && r - s >= len)
       return NULL;
    else
       return r;
author	Sebastian Geerken <devnull@localhost>	2012-12-14 16:41:56 +0100
committer	Sebastian Geerken <devnull@localhost>	2012-12-14 16:41:56 +0100
commit	5799b9d93121a7b0c2cd3e70417192b71bcfe38b (patch)
tree	20ba9d830362d400a1c2b115b1168c75e57f601f /lout/unicode.cc
parent	875355853f1c6cbdeb960133bf463a36072a0999 (diff)