aboutsummaryrefslogtreecommitdiff
path: root/lout/unicode.cc
diff options
context:
space:
mode:
author sgeerken <devnull@localhost> 2012-07-12 13:15:20 +0200
committer sgeerken <devnull@localhost> 2012-07-12 13:15:20 +0200
commit 2e9c1b2b921a249743b139e7cea3449e642ae30e (patch)
tree 256f8b387dda15ded6d9407c98711ccca4760a2a /lout/unicode.cc
parent fa1c054f17f6c71a5314e4c3e638c15aa33ec09c (diff)
new file "unicode.hh" and "unicode.cc" for Unicode and UTF-8 issues
Diffstat (limited to 'lout/unicode.cc')
-rw-r--r-- lout/unicode.cc 75
1 files changed, 75 insertions, 0 deletions
diff --git a/lout/unicode.cc b/lout/unicode.cc
new file mode 100644
index 00000000..38d71494
--- /dev/null
+++ b/lout/unicode.cc
@@ -0,0 +1,75 @@
+#include "unicode.hh"
+
+namespace lout {
+
+namespace unicode {
+
/*
 * Bitmap of the code points U+0000..U+04FF that isAlpha() reports as
 * alphabetic. Bit (ch & 7) of byte (ch / 8) is set when code point "ch"
 * is alphabetic, so the table needs exactly 0x500 / 8 bytes. Each section
 * below must contain (block size / 8) bytes; an extra or missing byte
 * silently shifts every following block.
 */
static const unsigned char alpha[0x500 / 8] = {
   // 0000-007F: C0 Controls and Basic Latin (16 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
   // 0080-00FF: C1 Controls and Latin-1 Supplement (16 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff,
   // 0100-017F: Latin Extended-A (16 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   // 0180-024F: Latin Extended-B (26 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff,
   // 0250-02AF: IPA Extensions (12 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff,
   // 02B0-02FF: Spacing Modifier Letters (10 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00,
   // 0300-036F: Combining Diacritical Marks (14 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   // 0370-03FF: Greek and Coptic (18 bytes)
   // The fourth byte covers U+0388..U+038F: letters at 0388, 0389, 038A,
   // 038C, 038E and 038F; 038B and 038D are unassigned.
   0xcf, 0x00, 0x40, 0xd7, 0xff, 0xff, 0xfb, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff,
   // 0400-04FF: Cyrillic (32 bytes)
   // U+0482..U+0489 (thousands sign and combining marks) are cleared.
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0x03, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/**
 * Returns whether a given unicode character is an alphabetic character.
 *
 * Only code points in the range U+0000..U+04FF are covered by the lookup
 * table; any value outside that range (including negative values) is
 * reported as non-alphabetic.
 */
bool isAlpha (int ch)
{
   // The lower bound check keeps negative values from indexing before the
   // start of the table.
   return ch >= 0 && ch < 0x500 && (alpha[ch / 8] & (1 << (ch & 7))) != 0;
}
+
/**
 * Decodes the first UTF-8 encoded character of a NUL-terminated string.
 *
 * Returns the decoded code point. A single byte below 0x80 is returned
 * unchanged, so the empty string yields 0. Besides the 2- to 4-byte forms,
 * the historical 5- and 6-byte forms are still accepted. Overlong
 * encodings and surrogate code points are not rejected.
 *
 * Returns 0 when "s" is NULL, when the first byte is not a valid lead byte
 * (a stray continuation byte, 0xFE or 0xFF), or when the sequence is cut
 * short by a byte that is not a continuation byte (such as the terminating
 * NUL).
 */
int decodeUtf8 (char *s)
{
   if (!s)
      return 0;

   // Work on unsigned bytes so that values >= 0x80 do not sign-extend.
   const unsigned char *p = (const unsigned char *) s;
   const unsigned char lead = p[0];

   if ((lead & 0x80) == 0)
      return lead;

   // Try the lead byte patterns 110xxxxx, 1110xxxx, ..., 1111110x in turn;
   // numCont is the number of continuation bytes the pattern announces.
   // The search stops at five continuation bytes: 0xFE would announce six,
   // i.e. 36 payload bits, which does not fit into an int.
   int mask = 0xe0, bits = 0xc0;
   for (int numCont = 1; numCont < 6; numCont++) {
      if ((lead & mask) == bits) {
         int ch = lead & ~mask & 0xff;
         for (int k = 1; k <= numCont; k++) {
            // Every continuation byte must look like 10xxxxxx. This also
            // stops at the terminating NUL instead of reading past it.
            if ((p[k] & 0xc0) != 0x80)
               return 0;
            ch = (ch << 6) | (p[k] & 0x3f);
         }
         return ch;
      }
      mask = 0x80 | (mask >> 1);
      bits = 0x80 | (bits >> 1);
   }

   return 0;
}
+
+} // namespace unicode
+
+} // namespace lout