summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- dw/hyphenator.cc 5
-rw-r--r-- lout/Makefile.am 2
-rw-r--r-- lout/unicode.cc 75
-rw-r--r-- lout/unicode.hh 20
-rw-r--r-- test/liang.cc 2
5 files changed, 104 insertions, 0 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc
index 4acee185..1c64c310 100644
--- a/dw/hyphenator.cc
+++ b/dw/hyphenator.cc
@@ -1,6 +1,7 @@
#include "hyphenator.hh"
#include "../lout/misc.hh"
+#include "../lout/unicode.hh"
#include <stdio.h>
#include <string.h>
@@ -192,6 +193,7 @@ bool Hyphenator::isHyphenationCandidate (const char *word)
*/
bool Hyphenator::isCharPartOfActualWord (char *s)
{
+#if 0
// Return true when "s" points to a letter.
return (s[0] >= 'a' && s[0] <= 'z') ||
// UTF-8: starts with 0xc3
@@ -200,6 +202,9 @@ bool Hyphenator::isCharPartOfActualWord (char *s)
(unsigned char)s[1] == 0xb6 /* ö */ ||
(unsigned char)s[1] == 0xbc /* ü */ ||
(unsigned char)s[1] == 0x9f /* ß */ ));
+#endif
+
+ return lout::unicode::isAlpha (lout::unicode::decodeUtf8 (s));
}
/**
diff --git a/lout/Makefile.am b/lout/Makefile.am
index a3f947db..bef9696e 100644
--- a/lout/Makefile.am
+++ b/lout/Makefile.am
@@ -15,4 +15,6 @@ liblout_a_SOURCES = \
object.hh \
signal.cc \
signal.hh \
+ unicode.cc \
+ unicode.hh \
msg.h
diff --git a/lout/unicode.cc b/lout/unicode.cc
new file mode 100644
index 00000000..38d71494
--- /dev/null
+++ b/lout/unicode.cc
@@ -0,0 +1,75 @@
+#include "unicode.hh"
+
+namespace lout {
+
+namespace unicode {
+
+/** First code point that is no longer covered by the "alpha" bitmap. */
+static const int ALPHA_LIMIT = 0x500;
+
+/**
+ * Bitmap of alphabetic characters for U+0000..U+04FF.
+ *
+ * Bit (ch & 7) of alpha[ch / 8] is set when the code point "ch" is treated
+ * as a letter. Each section below must contain exactly (block size / 8)
+ * bytes; the array bound makes the compiler reject surplus entries, which
+ * would otherwise silently shift all following blocks.
+ */
+static const unsigned char alpha[ALPHA_LIMIT / 8] = {
+   // 0000-007F: C0 Controls and Basic Latin (16 bytes)
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
+   // 0080-00FF: C1 Controls and Latin-1 Supplement (16 bytes)
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff,
+   // 0100-017F: Latin Extended-A (16 bytes)
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   // 0180-024F: Latin Extended-B (26 bytes)
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff,
+   // 0250-02AF: IPA Extensions (12 bytes)
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff,
+   // 02B0-02FF: Spacing Modifier Letters (10 bytes)
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0x00, 0x00,
+   // 0300-036F: Combining Diacritical Marks (14 bytes)
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   // 0370-03FF: Greek and Coptic (18 bytes).
+   // The 4th byte covers U+0388..U+038F: letters are 0388, 0389, 038A,
+   // 038C, 038E and 038F (038B and 038D are unassigned), hence 0xd7.
+   0xcf, 0x00, 0x40, 0xd7, 0xff, 0xff, 0xfb, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff,
+   // 0400-04FF: Cyrillic (32 bytes).
+   // U+0482..U+0489 (thousands sign and combining marks) are not letters.
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0x03, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/**
+ * Returns whether a given unicode character is an alphabetic character.
+ *
+ * Only code points in the range [0, ALPHA_LIMIT) are covered by the table;
+ * anything outside that range, including negative values, is reported as
+ * non-alphabetic.
+ */
+bool isAlpha (int ch)
+{
+   // Reject out-of-range values on both sides before indexing the table.
+   if (ch < 0 || ch >= ALPHA_LIMIT)
+      return false;
+
+   return (alpha[ch / 8] & (1 << (ch & 7))) != 0;
+}
+
+/**
+ * Decodes the UTF-8 sequence starting at "s" and returns its code point.
+ *
+ * "s" must point into a NUL-terminated string; it is neither modified nor
+ * advanced. Lead bytes announcing one to five continuation bytes are
+ * accepted (this includes the legacy 5- and 6-byte forms, whose values
+ * still fit into a non-negative 32-bit int). Overlong encodings are not
+ * rejected.
+ *
+ * Returns 0 when the input is malformed: a stray continuation byte, the
+ * bytes 0xFE/0xFF, or a sequence that ends (e.g. at the terminating NUL)
+ * before all announced continuation bytes have been seen.
+ */
+int decodeUtf8 (char *s)
+{
+   const unsigned char *bytes = reinterpret_cast<const unsigned char *> (s);
+   const unsigned char lead = bytes[0];
+
+   // Plain ASCII: the byte is the code point.
+   if (lead < 0x80)
+      return lead;
+
+   // The number of 1 bits following the top bit of the lead byte equals
+   // the number of continuation bytes: 110xxxxx -> 1, 1110xxxx -> 2, ...
+   int numCont = 0;
+   for (unsigned char probe = 0x40; probe != 0 && (lead & probe);
+        probe >>= 1)
+      numCont++;
+
+   // 0 means a stray continuation byte (10xxxxxx). More than 5 (0xFE,
+   // 0xFF) would need more than 31 payload bits and overflow the result.
+   if (numCont < 1 || numCont > 5)
+      return 0;
+
+   // Payload bits of the lead byte are those below the terminating 0 bit.
+   int ch = lead & (0x3f >> numCont);
+
+   for (int i = 1; i <= numCont; i++) {
+      // Every continuation byte must look like 10xxxxxx. Checking before
+      // moving on also stops at the terminating NUL, so a truncated
+      // sequence never causes a read beyond the end of the string.
+      if ((bytes[i] & 0xc0) != 0x80)
+         return 0;
+      ch = (ch << 6) | (bytes[i] & 0x3f);
+   }
+
+   return ch;
+}
+
+} // namespace unicode
+
+} // namespace lout
diff --git a/lout/unicode.hh b/lout/unicode.hh
new file mode 100644
index 00000000..123e7aa3
--- /dev/null
+++ b/lout/unicode.hh
@@ -0,0 +1,20 @@
+#ifndef __UNICODE_HH__
+#define __UNICODE_HH__
+
+namespace lout {
+
+/**
+ * \brief Stuff dealing with Unicode characters: UTF-8, character classes etc.
+ *
+ */
+namespace unicode {
+
+/**
+ * \brief Returns whether the Unicode code point "ch" is an alphabetic
+ *    character.
+ *
+ * The lookup table behind this function only covers code points below
+ * 0x500; any value at or above that limit is reported as non-alphabetic.
+ */
+bool isAlpha (int ch);
+
+/**
+ * \brief Decodes the UTF-8 encoded character starting at "s" and returns
+ *    its code point.
+ *
+ * A first byte below 0x80 is returned as it is. If the first byte is not
+ * recognised as the start of a multi-byte sequence, 0 is returned. The
+ * pointer is not advanced.
+ */
+int decodeUtf8 (char *s);
+
+} // namespace unicode
+
+} // namespace lout
+
+#endif // __UNICODE_HH__
diff --git a/test/liang.cc b/test/liang.cc
index 026568d0..56e3bfea 100644
--- a/test/liang.cc
+++ b/test/liang.cc
@@ -30,6 +30,7 @@ int main (int argc, char *argv[])
hyphenateWord (&p, "JAHRHUNDERTROMAN");
hyphenateWord (&p, "„Jahrhundertroman“");
hyphenateWord (&p, "währenddessen");
+ hyphenateWord (&p, "„währenddessen“");
hyphenateWord (&p, "Ückendorf");
hyphenateWord (&p, "über");
hyphenateWord (&p, "aber");
@@ -41,6 +42,7 @@ int main (int argc, char *argv[])
hyphenateWord (&p, "„Grundstücksverkehrsgenehmigungszuständigkeits"
"übertragungsverordnung“");
hyphenateWord (&p, "Grundstücksverkehrsgenehmigungszuständigkeit");
+ hyphenateWord (&p, "„Grundstücksverkehrsgenehmigungszuständigkeit“");
hyphenateWord (&p, "(6R,7R)-7-[2-(2-Amino-4-thiazolyl)-glyoxylamido]-3-"
"(2,5-dihydro-6-hydroxy-2-methyl-5-oxo-1,2,4-triazin-3-yl-"
"thiomethyl)-8-oxo-5-thia-1-azabicyclo[4.2.0]oct-2-en-2-"