about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Johannes Hofmann <Johannes.Hofmann@gmx.de> 2009-11-29 21:40:02 +0100
committer Johannes Hofmann <Johannes.Hofmann@gmx.de> 2009-11-29 21:40:02 +0100
commit a3daa9910dfbfc0fc6b57ec37ad712fbc19b1e01 (patch)
tree 1e2aa63886a04d08ba18fe37441bb55580365e4f
parent ba9c7b7e9afdfcc01b5a35c4c387642925b1bf9a (diff)
respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
When splitting long lines of plain text to avoid X11 coordinate overflows, we need to make sure that multibyte UTF-8 characters are not split. Additionally, combining characters such as accents should stay together with their base character.
-rw-r--r-- src/misc.c 64
-rw-r--r-- src/misc.h 2
-rw-r--r-- src/plain.cc 22
-rw-r--r-- src/utf8.cc 8
-rw-r--r-- src/utf8.hh 1
5 files changed, 57 insertions, 40 deletions
diff --git a/src/misc.c b/src/misc.c
index 7d9c40ee..4205f896 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -14,11 +14,11 @@
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
+#include <assert.h>
#include "utf8.hh"
#include "msg.h"
#include "misc.h"
-#include "utf8.hh"
/*
* Escape characters as %XX sequences.
@@ -51,38 +51,50 @@ char *a_Misc_escape_chars(const char *str, const char *esc_set)
/*
* Takes a string and converts any tabs to spaces.
*/
-char *a_Misc_expand_tabs(const char *str, int len)
+int
+a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
{
- int i = 0, j, pos = 0, old_pos, char_len;
+ int j, pos = 0, written = 0, old_pos, char_len;
uint_t code;
- char *val;
+ static const int combining_char_space = 32;
- if (memchr(str, '\t', len) == NULL) {
- val = dStrndup(str, len);
- } else {
- Dstr *New = dStr_new("");
-
- while (i < len) {
- code = a_Utf8_decode(&str[i], str + len, &char_len);
-
- if (code == '\t') {
- /* Fill with whitespaces until the next tab. */
- old_pos = pos;
- pos += TAB_SIZE - (pos % TAB_SIZE);
- for (j = old_pos; j < pos; j++)
- dStr_append_c(New, ' ');
- } else {
- dStr_append_l(New, &str[i], char_len);
- pos++;
- }
+ while (*start < end && written < buflen - TAB_SIZE - combining_char_space) {
+ code = a_Utf8_decode(*start, end, &char_len);
- i += char_len;
+ if (code == '\t') {
+ /* Fill with whitespaces until the next tab. */
+ old_pos = pos;
+ pos += TAB_SIZE - (pos % TAB_SIZE);
+ for (j = old_pos; j < pos; j++)
+ buf[written++] = ' ';
+ } else {
+ assert(char_len <= 4);
+ for (j = 0; j < char_len; j++)
+ buf[written++] = (*start)[j];
+ pos++;
}
- val = New->str;
- dStr_free(New, FALSE);
+ *start += char_len;
+ }
+
+ /* If following chars are combining chars (e.g. accents) add them to the
+ * buffer. We have reserved combining_char_space bytes for this.
+ * If there should be more combining chars, we split nevertheless.
+ */
+ while (*start < end && written < buflen - 4) {
+ code = a_Utf8_decode(*start, end, &char_len);
+
+ if (! a_Utf8_combining_char(code))
+ break;
+
+ assert(char_len <= 4);
+ for (j = 0; j < char_len; j++)
+ buf[written++] = (*start)[j];
+
+ *start += char_len;
}
- return val;
+
+ return written;
}
/* TODO: could use dStr ADT! */
diff --git a/src/misc.h b/src/misc.h
index d8de0238..0b4eaaa5 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -10,7 +10,7 @@ extern "C" {
char *a_Misc_escape_chars(const char *str, const char *esc_set);
-char *a_Misc_expand_tabs(const char *str, int len);
+int a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen);
int a_Misc_get_content_type_from_data(void *Data, size_t Size,const char **PT);
int a_Misc_content_type_check(const char *EntryType, const char *DetectedType);
void a_Misc_parse_content_type(const char *str, char **major, char **minor,
diff --git a/src/plain.cc b/src/plain.cc
index a610e174..394c4ca5 100644
--- a/src/plain.cc
+++ b/src/plain.cc
@@ -135,20 +135,16 @@ bool DilloPlain::PlainLinkReceiver::press (Widget *widget, int, int, int, int,
void DilloPlain::addLine(char *Buf, uint_t BufSize)
{
- uint_t remaining;
- char *dp, *data;
- const uint_t maxWordLen = 128; // Limit word len to avoid X11 coordinate
- // overflow with extremely long lines.
- dp = data = a_Misc_expand_tabs(Buf, BufSize);
- remaining = strlen(data);
- while (remaining > maxWordLen) {
- DW2TB(dw)->addText(dp, maxWordLen, widgetStyle);
- remaining -= maxWordLen;
- dp += maxWordLen;
- }
- DW2TB(dw)->addText(dp, widgetStyle);
+ int len;
+ char buf[128];
+ char *end = Buf + BufSize;
+
+ // Limit word len to avoid X11 coordinate
+ // overflow with extremely long lines.
+ while ((len = a_Misc_expand_tabs(&Buf, end, buf, sizeof(buf))))
+ DW2TB(dw)->addText(buf, len, widgetStyle);
+
DW2TB(dw)->addParbreak(0, widgetStyle);
- dFree(data);
}
/*
diff --git a/src/utf8.cc b/src/utf8.cc
index 47d8112b..0138c616 100644
--- a/src/utf8.cc
+++ b/src/utf8.cc
@@ -92,3 +92,11 @@ bool_t a_Utf8_ideographic(const char *s, const char *end, int *len)
}
return ret;
}
+
+bool_t a_Utf8_combining_char(int unicode)
+{
+ return ((unicode >= 0x0300 && unicode <= 0x036f) ||
+ (unicode >= 0x1dc0 && unicode <= 0x1dff) ||
+ (unicode >= 0x20d0 && unicode <= 0x20ff) ||
+ (unicode >= 0xfe20 && unicode <= 0xfe2f));
+}
diff --git a/src/utf8.hh b/src/utf8.hh
index 6e2b4169..fd1fb87e 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -20,6 +20,7 @@ uint_t a_Utf8_decode(const char*, const char* end, int* len);
int a_Utf8_encode(unsigned int ucs, char *buf);
int a_Utf8_test(const char* src, unsigned int srclen);
bool_t a_Utf8_ideographic(const char *s, const char *end, int *len);
+bool_t a_Utf8_combining_char(int unicode);
#ifdef __cplusplus
}