respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)

When splitting long lines in plain text to avoid X11 coordinate overflows, we need to make sure that multibyte UTF-8 chars are not split. Additionally, combining chars such as accents should stay together with their base char.
author: Johannes Hofmann <Johannes.Hofmann@gmx.de> 2009-11-29 21:40:02 +0100
committer: Johannes Hofmann <Johannes.Hofmann@gmx.de> 2009-11-29 21:40:02 +0100
commit: a3daa9910dfbfc0fc6b57ec37ad712fbc19b1e01 (patch)
tree: 1e2aa63886a04d08ba18fe37441bb55580365e4f /src
parent: ba9c7b7e9afdfcc01b5a35c4c387642925b1bf9a (diff)
5 files changed, 57 insertions, 40 deletions
diff --git a/src/misc.c b/src/misc.c
index 7d9c40ee..4205f896 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -14,11 +14,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <assert.h>
 
 #include "utf8.hh"
 #include "msg.h"
 #include "misc.h"
-#include "utf8.hh"
 
 /*
  * Escape characters as %XX sequences.
@@ -51,38 +51,50 @@ char *a_Misc_escape_chars(const char *str, const char *esc_set)
 /*
  * Takes a string and converts any tabs to spaces.
  */
-char *a_Misc_expand_tabs(const char *str, int len)
+int
+a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
 {
-   int i = 0, j, pos = 0, old_pos, char_len;
+   int j, pos = 0, written = 0, old_pos, char_len;
    uint_t code;
-   char *val;
+   static const int combining_char_space = 32;
 
-   if (memchr(str, '\t', len) == NULL) {
-      val = dStrndup(str, len);
-   } else {
-      Dstr *New = dStr_new("");
-
-      while (i < len) {
-         code = a_Utf8_decode(&str[i], str + len, &char_len);
-
-         if (code == '\t') {
-            /* Fill with whitespaces until the next tab. */
-            old_pos = pos;
-            pos += TAB_SIZE - (pos % TAB_SIZE);
-            for (j = old_pos; j < pos; j++)
-               dStr_append_c(New, ' ');
-         } else {
-            dStr_append_l(New, &str[i], char_len);
-            pos++;
-         }
+   while (*start < end && written < buflen - TAB_SIZE - combining_char_space) {
+      code = a_Utf8_decode(*start, end, &char_len);
 
-         i += char_len;
+      if (code == '\t') {
+         /* Fill with whitespaces until the next tab. */
+         old_pos = pos;
+         pos += TAB_SIZE - (pos % TAB_SIZE);
+         for (j = old_pos; j < pos; j++)
+            buf[written++] = ' ';
+      } else {
+         assert(char_len <= 4);
+         for (j = 0; j < char_len; j++)
+            buf[written++] = (*start)[j];
+         pos++;
       }
 
-      val = New->str;
-      dStr_free(New, FALSE);
+      *start += char_len;
+   }
+
+   /* If the following chars are combining chars (e.g. accents), add them to
+    * the buffer. We have reserved combining_char_space bytes for this.
+    * If there are more combining chars than fit, we split anyway.
+    */
+   while (*start < end && written < buflen - 4) {
+      code = a_Utf8_decode(*start, end, &char_len);
+
+      if (! a_Utf8_combining_char(code))
+         break;
+
+      assert(char_len <= 4);
+      for (j = 0; j < char_len; j++)
+         buf[written++] = (*start)[j];
+    
+      *start += char_len;
    }
-   return val;
+
+   return written;
 }
 
 /* TODO: could use dStr ADT! */
diff --git a/src/misc.h b/src/misc.h
index d8de0238..0b4eaaa5 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -10,7 +10,7 @@ extern "C" {
 
 
 char *a_Misc_escape_chars(const char *str, const char *esc_set);
-char *a_Misc_expand_tabs(const char *str, int len);
+int a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen);
 int a_Misc_get_content_type_from_data(void *Data, size_t Size,const char **PT);
 int a_Misc_content_type_check(const char *EntryType, const char *DetectedType);
 void a_Misc_parse_content_type(const char *str, char **major, char **minor,
diff --git a/src/plain.cc b/src/plain.cc
index a610e174..394c4ca5 100644
--- a/src/plain.cc
+++ b/src/plain.cc
@@ -135,20 +135,16 @@ bool DilloPlain::PlainLinkReceiver::press (Widget *widget, int, int, int, int,
 
 void DilloPlain::addLine(char *Buf, uint_t BufSize)
 {
-   uint_t remaining;
-   char *dp, *data;
-   const uint_t maxWordLen = 128; // Limit word len to avoid X11 coordinate
-                                  // overflow with extremely long lines.
-   dp = data = a_Misc_expand_tabs(Buf, BufSize);
-   remaining = strlen(data);
-   while (remaining > maxWordLen) {
-      DW2TB(dw)->addText(dp, maxWordLen, widgetStyle);
-      remaining -= maxWordLen;
-      dp += maxWordLen;
-   }
-   DW2TB(dw)->addText(dp, widgetStyle);
+   int len;
+   char buf[128];
+   char *end = Buf + BufSize;
+
+   // Limit word len to avoid X11 coordinate
+   // overflow with extremely long lines.
+   while ((len = a_Misc_expand_tabs(&Buf, end, buf, sizeof(buf))))
+      DW2TB(dw)->addText(buf, len, widgetStyle);
+
    DW2TB(dw)->addParbreak(0, widgetStyle);
-   dFree(data);
 }
 
 /*
diff --git a/src/utf8.cc b/src/utf8.cc
index 47d8112b..0138c616 100644
--- a/src/utf8.cc
+++ b/src/utf8.cc
@@ -92,3 +92,11 @@ bool_t a_Utf8_ideographic(const char *s, const char *end, int *len)
    }
    return ret;
 }
+
+bool_t a_Utf8_combining_char(int unicode)
+{
+   return ((unicode >= 0x0300 && unicode <= 0x036f) ||
+           (unicode >= 0x1dc0 && unicode <= 0x1dff) ||
+           (unicode >= 0x20d0 && unicode <= 0x20ff) ||
+           (unicode >= 0xfe20 && unicode <= 0xfe2f));
+}
diff --git a/src/utf8.hh b/src/utf8.hh
index 6e2b4169..fd1fb87e 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -20,6 +20,7 @@ uint_t a_Utf8_decode(const char*, const char* end, int* len);
 int a_Utf8_encode(unsigned int ucs, char *buf);
 int a_Utf8_test(const char* src, unsigned int srclen);
 bool_t a_Utf8_ideographic(const char *s, const char *end, int *len);
+bool_t a_Utf8_combining_char(int unicode);
 
 #ifdef __cplusplus
 }
author	Johannes Hofmann <Johannes.Hofmann@gmx.de>	2009-11-29 21:40:02 +0100
committer	Johannes Hofmann <Johannes.Hofmann@gmx.de>	2009-11-29 21:40:02 +0100
commit	a3daa9910dfbfc0fc6b57ec37ad712fbc19b1e01 (patch)
tree	1e2aa63886a04d08ba18fe37441bb55580365e4f /src
parent	ba9c7b7e9afdfcc01b5a35c4c387642925b1bf9a (diff)