diff options
author | sgeerken <devnull@localhost> | 2012-07-12 13:15:20 +0200 |
---|---|---|
committer | sgeerken <devnull@localhost> | 2012-07-12 13:15:20 +0200 |
commit | 2e9c1b2b921a249743b139e7cea3449e642ae30e (patch) | |
tree | 256f8b387dda15ded6d9407c98711ccca4760a2a /lout | |
parent | fa1c054f17f6c71a5314e4c3e638c15aa33ec09c (diff) |
new file "unicode.hh" and "unicode.cc" for Unicode and UTF-8 issues
Diffstat (limited to 'lout')
-rw-r--r-- | lout/Makefile.am | 2 | ||||
-rw-r--r-- | lout/unicode.cc | 75 | ||||
-rw-r--r-- | lout/unicode.hh | 20 |
3 files changed, 97 insertions, 0 deletions
// Formatted listing of the files touched by this commit.
//
// lout/Makefile.am: liblout_a_SOURCES additionally lists unicode.cc and
// unicode.hh (placed after signal.hh and before msg.h).

// ---------------------------------------------------------------------------
// lout/unicode.hh
// ---------------------------------------------------------------------------
#ifndef __UNICODE_HH__
#define __UNICODE_HH__

namespace lout {

/**
 * \brief Stuff dealing with Unicode characters: UTF-8, character classes etc.
 *
 */
namespace unicode {

/**
 * \brief Returns whether a Unicode code point is an alphabetic character.
 *
 * Only code points below U+0500 are classified; anything outside the range
 * 0 .. 0x4ff (including negative values) yields false.
 */
bool isAlpha (int ch);

/**
 * \brief Decodes the UTF-8 sequence at the start of a string.
 *
 * \param s Non-null pointer to a NUL-terminated byte string.
 * \return The decoded code point, or 0 if the string is empty or does not
 *         start with a well-formed sequence.
 */
int decodeUtf8 (const char *s);

} // namespace unicode

} // namespace lout

#endif // __UNICODE_HH__

// ---------------------------------------------------------------------------
// lout/unicode.cc  (in the tree this file starts with: #include "unicode.hh")
// ---------------------------------------------------------------------------

namespace lout {

namespace unicode {

// First code point that is NOT covered by the bitmap below.
static const int alphaLimit = 0x500;

// One bit per code point: code point c is alphabetic iff bit (c & 7) of
// alpha[c / 8] is set. The array is sized exactly, so a surplus initializer
// is a compile error; each block must contribute (block size / 8) bytes,
// otherwise every following block is shifted.
static const unsigned char alpha[alphaLimit / 8] = {
   // 0000-007F: C0 Controls and Basic Latin (16 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
   // 0080-00FF: C1 Controls and Latin-1 Supplement (16 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff,
   // 0100-017F: Latin Extended-A (16 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   // 0180-024F: Latin Extended-B (26 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff,
   // 0250-02AF: IPA Extensions (12 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff,
   // 02B0-02FF: Spacing Modifier Letters (10 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00,
   // 0300-036F: Combining Diacritical Marks (14 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   // 0370-03FF: Greek and Coptic (18 bytes)
   // 0xd7 covers 0388-038F: letters at 0388-038A, 038C, 038E, 038F;
   // 038B and 038D are unassigned.
   0xcf, 0x00, 0x40, 0xd7, 0xff, 0xff, 0xfb, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff,
   // 0400-04FF: Cyrillic (32 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0x03, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/**
 * Returns whether a given unicode character is an alphabetic character.
 *
 * Code points outside 0 .. alphaLimit - 1 are never reported as alphabetic;
 * the lower bound check keeps negative values from indexing before the table.
 */
bool isAlpha (int ch)
{
   return ch >= 0 && ch < alphaLimit &&
      (alpha[ch / 8] & (1 << (ch & 7))) != 0;
}

/**
 * Decodes the UTF-8 sequence at the start of the NUL-terminated string s
 * (which must not be a null pointer) and returns its code point.
 *
 * Sequences with one to five continuation bytes are accepted, i.e. also the
 * historical five- and six-byte forms; the longest form carries 31 payload
 * bits and therefore still fits into a non-negative int. Overlong encodings
 * and surrogates are not rejected.
 *
 * Returns 0 when the string is empty, when the first byte is not a valid
 * lead byte (a stray continuation byte, 0xfe or 0xff), or when a
 * continuation byte is missing or malformed. Because the terminating NUL is
 * not a continuation byte, a truncated sequence never causes a read beyond
 * the end of the string.
 */
int decodeUtf8 (const char *s)
{
   const unsigned char *p = (const unsigned char *) s;
   const int lead = p[0];

   if ((lead & 0x80) == 0)
      return lead;

   // "mask" selects the length marker of the lead byte, "bits" is the value
   // this marker must have for a sequence with numCont continuation bytes:
   // 110xxxxx -> 1, 1110xxxx -> 2, ..., 1111110x -> 5.
   int mask = 0xe0, bits = 0xc0;
   for (int numCont = 1; numCont <= 5; numCont++) {
      if ((lead & mask) == bits) {
         int ch = lead & ~mask & 0xff;
         for (int k = 1; k <= numCont; k++) {
            if ((p[k] & 0xc0) != 0x80)
               return 0;
            ch = (ch << 6) | (p[k] & 0x3f);
         }
         return ch;
      }
      mask = 0x80 | (mask >> 1);
      bits = 0x80 | (bits >> 1);
   }

   return 0;
}

} // namespace unicode

} // namespace lout