1 files changed, 190 insertions, 0 deletions
diff --git a/lout/unicode.cc b/lout/unicode.cc
new file mode 100644
index 0000000..da9c27a
--- /dev/null
+++ b/lout/unicode.cc
@@ -0,0 +1,190 @@
+/*
+ * RTFL (originally part of dillo)
+ *
+ * Copyright 2012, 2013 Sebastian Geerken <sgeerken@dillo.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version; with the following exception:
+ *
+ * The copyright holders of RTFL give you permission to link this file
+ * statically or dynamically against all versions of the graphviz
+ * library, which are published by AT&T Corp. under one of the following
+ * licenses:
+ *
+ * - Common Public License version 1.0 as published by International
+ *   Business Machines Corporation (IBM), or
+ * - Eclipse Public License version 1.0 as published by the Eclipse
+ *   Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "unicode.hh"
+#include "misc.hh"
+
+using namespace lout::misc;
+
+namespace lout {
+
+namespace unicode {
+
+static unsigned char alpha[0x500] = {
+   // 0000-007F: C0 Controls and Basic Latin
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
+   // 0080-00FF: C1 Controls and Latin-1 Supplement
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff,
+   // 0100-017F: Latin Extended-A
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   // 0180-024F: Latin Extended-B
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff,
+   // 0250–02AF: IPA Extensions
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff,
+   // 02B0–02FF: Spacing Modifier Letters
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0x00, 0x00,
+   // 0300–036F: Combining Diacritical Marks
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   // 0370–03FF: Greek and Coptic
+   0xcf, 0x00, 0x40, 0x7d, 0xff, 0xff, 0xfb, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff,
+   // 0400–04FF: Cyrillic
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0x03, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/**
+ * Returns whether a given unicode character is an alphabetic character.
+ */
+bool isAlpha (int ch)
+{
+   return ch < 0x500 && (alpha[ch / 8] & (1 << (ch & 7)));
+}
+
+int decodeUtf8 (const char *s)
+{
+   if((s[0] & 0x80) == 0)
+      return s[0];
+   else if((s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+      return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
+   else if((s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80)
+      return ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6)  | (s[2] & 0x3f);
+   else if((s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+      return ((s[0] & 0x0f) << 18) | ((s[1] & 0x3f) << 12)
+         | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
+   else
+      // Treat as ISO-8859-1 / ISO-8859-15 / Windows-1252
+      return s[0];
+}
+
+
+int decodeUtf8 (const char *s, int len)
+{
+   if(len >= 1 && (s[0] & 0x80) == 0)
+      return s[0];
+   else if(len >= 2 && (s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+      return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
+   else if(len >= 3 && (s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80)
+      return ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6)  | (s[2] & 0x3f);
+   else if(len >= 4 && (s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+      return ((s[0] & 0x0f) << 18) | ((s[1] & 0x3f) << 12)
+         | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
+   else
+      // Treat as ISO-8859-1 / ISO-8859-15 / Windows-1252
+      return s[0];
+}
+
+const char *nextUtf8Char (const char *s)
+{
+   const char *r;
+
+   if (s == NULL || s[0] == 0)
+      r = NULL;
+   else if((s[0] & 0x80) == 0)
+      r = s + 1;
+   else if((s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+      r = s + 2;
+   else if((s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80)
+      r = s + 3;
+   else if((s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+      r = s + 4;
+   else
+      // invalid UTF-8 sequence: treat as one byte.
+      r = s + 1;
+
+   if (r && r[0] == 0)
+      return NULL;
+   else
+      return r;
+}
+
+const char *nextUtf8Char (const char *s, int len)
+{
+   const char *r;
+
+   if (s == NULL || len <= 0)
+      r = NULL;
+   else if(len >= 1 && (s[0] & 0x80) == 0)
+      r = s + 1;
+   else if(len >= 2 && (s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80)
+      r = s + 2;
+   else if(len >= 3 && (s[0] & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80)
+      r = s + 3;
+   else if(len >= 4 && (s[0] & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80
+           && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80)
+      r = s + 4;
+   else
+      // invalid UTF-8 sequence: treat as one byte.
+      r = s + 1;
+
+   if (r && r - s >= len)
+      return NULL;
+   else
+      return r;
+}
+
+int numUtf8Chars (const char *s)
+{
+   int numUtf8 = 0;
+   for (const char *r = s; r; r = nextUtf8Char (r))
+      numUtf8++;
+   return numUtf8;
+}
+
+int numUtf8Chars (const char *s, int len)
+{
+   int numUtf8 = 0;
+   for (const char *r = s; len > 0 && r; r = nextUtf8Char (r, len))
+      numUtf8++;
+   return numUtf8;
+}
+
+} // namespace lout
+
+} // namespace unicode