diff options
Diffstat (limited to 'lout/unicode.cc')
-rw-r--r-- | lout/unicode.cc | 75 |
1 files changed, 75 insertions, 0 deletions
diff --git a/lout/unicode.cc b/lout/unicode.cc new file mode 100644 index 00000000..38d71494 --- /dev/null +++ b/lout/unicode.cc @@ -0,0 +1,75 @@ +#include "unicode.hh" + +namespace lout { + +namespace unicode { + +static unsigned char alpha[0x500] = { + // 0000-007F: C0 Controls and Basic Latin + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, + // 0080-00FF: C1 Controls and Latin-1 Supplement + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, + // 0100-017F: Latin Extended-A + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + // 0180-024F: Latin Extended-B + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, + // 0250–02AF: IPA Extensions + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + // 02B0–02FF: Spacing Modifier Letters + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + // 0300–036F: Combining Diacritical Marks + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + // 0370–03FF: Greek and Coptic + 0xcf, 0x00, 0x40, 0x7d, 0xff, 0xff, 0xfb, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + // 0400–04FF: Cyrillic + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x03, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +/** + * Returns whether a given unicode character is an alphabetic character. + */ +bool isAlpha (int ch) +{ + return ch < 0x500 && (alpha[ch / 8] & (1 << (ch & 7))); +} + +int decodeUtf8 (char *s) +{ + if((s[0] & 0x80) == 0) + return s[0]; + else { + int mask = 0xe0, bits = 0xc0, done = 0, ch = 0, i = 0; + for(int j = 1; !done && j < 7; + j++, mask = 0x80 | (mask >> 1), bits = 0x80 | (bits >> 1)) { + if(((unsigned char)s[i] & mask) == bits) { + done = 1; + ch = (unsigned char)s[i] & ~mask & 0xff; + i++; + for(int k = 0; k < j; k++) { + ch = (ch << 6) | ((unsigned char)s[i] & 0x3f); + i++; + } + } + } + + return ch; + } +} + +} // namespace lout + +} // namespace unicode |