make tab expansion for plain text utf8 aware

In discussion with corvid <corvid@lavabit.com>.
author: Johannes Hofmann <Johannes.Hofmann@gmx.de> 2009-05-25 18:42:24 +0200
committer: Johannes Hofmann <Johannes.Hofmann@gmx.de> 2009-05-25 18:42:24 +0200
commit: 50260728b2e2d2c9e61a13b54b6b973bdc48fae0 (patch)
tree: 63b462a3d4cbbf445665714812331b2f7e968204 /src
parent: 6d62e8cf2ed9fe4eda942a59ba140b151b82b228 (diff)
3 files changed, 29 insertions, 14 deletions
diff --git a/src/misc.c b/src/misc.c
index 8cfb7003..d1a5352e 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -16,6 +16,7 @@
 #include <string.h>
 #include <ctype.h>
 
+#include "utf8.hh"
 #include "msg.h"
 #include "misc.h"
 
@@ -47,7 +48,6 @@ char *a_Misc_escape_chars(const char *str, const char *esc_set)
    return p;
 }
 
-
 #define TAB_SIZE 8
 /*
  * Takes a string and converts any tabs to spaces.
@@ -55,23 +55,27 @@ char *a_Misc_escape_chars(const char *str, const char *esc_set)
 char *a_Misc_expand_tabs(const char *str, int len)
 {
    Dstr *New = dStr_new("");
-   int i, j, pos, old_pos;
+   int i = 0, j, pos = 0, old_pos, char_len;
+   uint_t code;
    char *val;
 
-   if (len) {
-      for (pos = 0, i = 0; i < len; i++) {
-         if (str[i] == '\t') {
-            /* Fill with whitespaces until the next tab. */
-            old_pos = pos;
-            pos += TAB_SIZE - (pos % TAB_SIZE);
-            for (j = old_pos; j < pos; j++)
-               dStr_append_c(New, ' ');
-         } else {
-            dStr_append_c(New, str[i]);
-            pos++;
-         }
+   while (i < len) {
+      code = a_Utf8_decode(&str[i], str + len, &char_len);
+
+      if (code == '\t') {
+         /* Fill with whitespaces until the next tab. */
+         old_pos = pos;
+         pos += TAB_SIZE - (pos % TAB_SIZE);
+         for (j = old_pos; j < pos; j++)
+            dStr_append_c(New, ' ');
+      } else {
+         dStr_append_l(New, &str[i], char_len);
+         pos++;
       }
+
+      i += char_len;
    }
+
    val = New->str;
    dStr_free(New, FALSE);
    return val;
diff --git a/src/utf8.cc b/src/utf8.cc
index 261024fb..42f7e676 100644
--- a/src/utf8.cc
+++ b/src/utf8.cc
@@ -36,6 +36,16 @@ uint_t a_Utf8_end_of_char(const char *str, uint_t i)
 }
 
 /*
+ * Decode a single UTF-8-encoded character starting at str.
+ * The resulting Unicode value (in the range 0-0x10ffff) is returned,
+ * and len is set to the number of bytes in the UTF-8 encoding.
+ */
+uint_t a_Utf8_decode(const char* str, const char* end, int* len)
+{
+   return utf8decode(str, end, len);
+}
+
+/*
  * Write UTF-8 encoding of ucs into buf and return number of bytes written.
  */
 int a_Utf8_encode(unsigned int ucs, char *buf)
diff --git a/src/utf8.hh b/src/utf8.hh
index e861d600..cdf6b50a 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -9,6 +9,7 @@ extern "C" {
 #include "d_size.h"
 
 uint_t a_Utf8_end_of_char(const char *str, uint_t i);
+uint_t a_Utf8_decode(const char*, const char* end, int* len);
 int a_Utf8_encode(unsigned int ucs, char *buf);
 int a_Utf8_test(const char* src, unsigned int srclen);
author	Johannes Hofmann <Johannes.Hofmann@gmx.de>	2009-05-25 18:42:24 +0200
committer	Johannes Hofmann <Johannes.Hofmann@gmx.de>	2009-05-25 18:42:24 +0200
commit	50260728b2e2d2c9e61a13b54b6b973bdc48fae0 (patch)
tree	63b462a3d4cbbf445665714812331b2f7e968204 /src
parent	6d62e8cf2ed9fe4eda942a59ba140b151b82b228 (diff)