diff options
author | Johannes Hofmann <Johannes.Hofmann@gmx.de> | 2009-11-29 21:40:02 +0100 |
---|---|---|
committer | Johannes Hofmann <Johannes.Hofmann@gmx.de> | 2009-11-29 21:40:02 +0100 |
commit | a3daa9910dfbfc0fc6b57ec37ad712fbc19b1e01 (patch) | |
tree | 1e2aa63886a04d08ba18fe37441bb55580365e4f /src | |
parent | ba9c7b7e9afdfcc01b5a35c4c387642925b1bf9a (diff) |
respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
When splitting long lines in plain text to avoid X11 coordinate
overflows, we need to make sure that multibyte UTF-8 chars are not
split.
Additionally, combining chars such as accents should stay together with
their base char.
Diffstat (limited to 'src')
-rw-r--r-- | src/misc.c | 64 | ||||
-rw-r--r-- | src/misc.h | 2 | ||||
-rw-r--r-- | src/plain.cc | 22 | ||||
-rw-r--r-- | src/utf8.cc | 8 | ||||
-rw-r--r-- | src/utf8.hh | 1 |
5 files changed, 57 insertions, 40 deletions
@@ -14,11 +14,11 @@ #include <stdlib.h> #include <string.h> #include <ctype.h> +#include <assert.h> #include "utf8.hh" #include "msg.h" #include "misc.h" -#include "utf8.hh" /* * Escape characters as %XX sequences. @@ -51,38 +51,50 @@ char *a_Misc_escape_chars(const char *str, const char *esc_set) /* * Takes a string and converts any tabs to spaces. */ -char *a_Misc_expand_tabs(const char *str, int len) +int +a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen) { - int i = 0, j, pos = 0, old_pos, char_len; + int j, pos = 0, written = 0, old_pos, char_len; uint_t code; - char *val; + static const int combining_char_space = 32; - if (memchr(str, '\t', len) == NULL) { - val = dStrndup(str, len); - } else { - Dstr *New = dStr_new(""); - - while (i < len) { - code = a_Utf8_decode(&str[i], str + len, &char_len); - - if (code == '\t') { - /* Fill with whitespaces until the next tab. */ - old_pos = pos; - pos += TAB_SIZE - (pos % TAB_SIZE); - for (j = old_pos; j < pos; j++) - dStr_append_c(New, ' '); - } else { - dStr_append_l(New, &str[i], char_len); - pos++; - } + while (*start < end && written < buflen - TAB_SIZE - combining_char_space) { + code = a_Utf8_decode(*start, end, &char_len); - i += char_len; + if (code == '\t') { + /* Fill with whitespaces until the next tab. */ + old_pos = pos; + pos += TAB_SIZE - (pos % TAB_SIZE); + for (j = old_pos; j < pos; j++) + buf[written++] = ' '; + } else { + assert(char_len <= 4); + for (j = 0; j < char_len; j++) + buf[written++] = (*start)[j]; + pos++; } - val = New->str; - dStr_free(New, FALSE); + *start += char_len; + } + + /* If following chars are combining chars (e.g. accents) add them to the + * buffer. We have reserved combining_char_space bytes for this. + * If there should be more combining chars, we split nevertheless. + */ + while (*start < end && written < buflen - 4) { + code = a_Utf8_decode(*start, end, &char_len); + + if (! 
a_Utf8_combining_char(code)) + break; + + assert(char_len <= 4); + for (j = 0; j < char_len; j++) + buf[written++] = (*start)[j]; + + *start += char_len; } - return val; + + return written; } /* TODO: could use dStr ADT! */ @@ -10,7 +10,7 @@ extern "C" { char *a_Misc_escape_chars(const char *str, const char *esc_set); -char *a_Misc_expand_tabs(const char *str, int len); +int a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen); int a_Misc_get_content_type_from_data(void *Data, size_t Size,const char **PT); int a_Misc_content_type_check(const char *EntryType, const char *DetectedType); void a_Misc_parse_content_type(const char *str, char **major, char **minor, diff --git a/src/plain.cc b/src/plain.cc index a610e174..394c4ca5 100644 --- a/src/plain.cc +++ b/src/plain.cc @@ -135,20 +135,16 @@ bool DilloPlain::PlainLinkReceiver::press (Widget *widget, int, int, int, int, void DilloPlain::addLine(char *Buf, uint_t BufSize) { - uint_t remaining; - char *dp, *data; - const uint_t maxWordLen = 128; // Limit word len to avoid X11 coordinate - // overflow with extremely long lines. - dp = data = a_Misc_expand_tabs(Buf, BufSize); - remaining = strlen(data); - while (remaining > maxWordLen) { - DW2TB(dw)->addText(dp, maxWordLen, widgetStyle); - remaining -= maxWordLen; - dp += maxWordLen; - } - DW2TB(dw)->addText(dp, widgetStyle); + int len; + char buf[128]; + char *end = Buf + BufSize; + + // Limit word len to avoid X11 coordinate + // overflow with extremely long lines. 
+ while ((len = a_Misc_expand_tabs(&Buf, end, buf, sizeof(buf)))) + DW2TB(dw)->addText(buf, len, widgetStyle); + DW2TB(dw)->addParbreak(0, widgetStyle); - dFree(data); } /* diff --git a/src/utf8.cc b/src/utf8.cc index 47d8112b..0138c616 100644 --- a/src/utf8.cc +++ b/src/utf8.cc @@ -92,3 +92,11 @@ bool_t a_Utf8_ideographic(const char *s, const char *end, int *len) } return ret; } + +bool_t a_Utf8_combining_char(int unicode) +{ + return ((unicode >= 0x0300 && unicode <= 0x036f) || + (unicode >= 0x1dc0 && unicode <= 0x1dff) || + (unicode >= 0x20d0 && unicode <= 0x20ff) || + (unicode >= 0xfe20 && unicode <= 0xfe2f)); +} diff --git a/src/utf8.hh b/src/utf8.hh index 6e2b4169..fd1fb87e 100644 --- a/src/utf8.hh +++ b/src/utf8.hh @@ -20,6 +20,7 @@ uint_t a_Utf8_decode(const char*, const char* end, int* len); int a_Utf8_encode(unsigned int ucs, char *buf); int a_Utf8_test(const char* src, unsigned int srclen); bool_t a_Utf8_ideographic(const char *s, const char *end, int *len); +bool_t a_Utf8_combining_char(int unicode); #ifdef __cplusplus } |