diff options
-rw-r--r-- | dw/hyphenator.cc | 5 | ||||
-rw-r--r-- | lout/Makefile.am | 2 | ||||
-rw-r--r-- | lout/unicode.cc | 102 | ||||
-rw-r--r-- | lout/unicode.hh | 20 | ||||
-rw-r--r-- | test/liang.cc | 2 |
5 files changed, 131 insertions, 0 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc
index 4acee185..1c64c310 100644
--- a/dw/hyphenator.cc
+++ b/dw/hyphenator.cc
@@ -1,6 +1,7 @@
 #include "hyphenator.hh"
 
 #include "../lout/misc.hh"
+#include "../lout/unicode.hh"
 
 #include <stdio.h>
 #include <string.h>
@@ -192,6 +193,7 @@ bool Hyphenator::isHyphenationCandidate (const char *word)
  */
 bool Hyphenator::isCharPartOfActualWord (char *s)
 {
+#if 0
    // Return true when "s" points to a letter.
    return (s[0] >= 'a' && s[0] <= 'z') ||
       // UTF-8: starts with 0xc3
@@ -200,6 +202,9 @@ bool Hyphenator::isCharPartOfActualWord (char *s)
        (unsigned char)s[1] == 0xb6 /* ö */ ||
        (unsigned char)s[1] == 0xbc /* ü */ ||
        (unsigned char)s[1] == 0x9f /* ß */ ));
+#endif
+
+   return lout::unicode::isAlpha (lout::unicode::decodeUtf8 (s));
 }
 
 /**
diff --git a/lout/Makefile.am b/lout/Makefile.am
index a3f947db..bef9696e 100644
--- a/lout/Makefile.am
+++ b/lout/Makefile.am
@@ -15,4 +15,6 @@ liblout_a_SOURCES = \
 	object.hh \
 	signal.cc \
 	signal.hh \
+	unicode.cc \
+	unicode.hh \
 	msg.h
diff --git a/lout/unicode.cc b/lout/unicode.cc
new file mode 100644
index 00000000..38d71494
--- /dev/null
+++ b/lout/unicode.cc
@@ -0,0 +1,102 @@
+#include "unicode.hh"
+
+namespace lout {
+
+namespace unicode {
+
+/**
+ * \brief Bitmap of the alphabetic characters in U+0000..U+04FF.
+ *
+ * There is one bit per code point: character "ch" is represented by bit
+ * "ch & 7" of byte "ch / 8". The array is sized exactly, so that the
+ * compiler rejects an initializer with too many bytes; surplus bytes in
+ * one block would silently shift all following blocks.
+ */
+static const unsigned char alpha[0x500 / 8] = {
+   // 0000-007F: C0 Controls and Basic Latin
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
+   // 0080-00FF: C1 Controls and Latin-1 Supplement
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff,
+   // 0100-017F: Latin Extended-A
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   // 0180-024F: Latin Extended-B
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff,
+   // 0250–02AF: IPA Extensions
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff,
+   // 02B0–02FF: Spacing Modifier Letters
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0x00, 0x00,
+   // 0300–036F: Combining Diacritical Marks
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+   // 0370–03FF: Greek and Coptic (144 code points = 18 bytes)
+   0xcf, 0x00, 0x40, 0xd7, 0xff, 0xff, 0xfb, 0xff, // 0xd7: U+0388..038F
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff,
+   // 0400–04FF: Cyrillic
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0x03, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/**
+ * Returns whether a given unicode character is an alphabetic character.
+ *
+ * Only U+0000..U+04FF are covered by the bitmap; any other value
+ * (including negative ones) yields false.
+ */
+bool isAlpha (int ch)
+{
+   if (ch < 0 || ch >= 0x500)
+      return false;
+
+   return (alpha[ch / 8] & (1 << (ch & 7))) != 0;
+}
+
+/**
+ * Decodes the UTF-8 sequence starting at "s" and returns the code point.
+ *
+ * Sequences of one to four bytes are accepted. 0 is returned when "s" does
+ * not start with a valid lead byte, or when a continuation byte is missing;
+ * the latter check also prevents reading beyond the terminating null byte
+ * of a truncated string.
+ */
+int decodeUtf8 (const char *s)
+{
+   const unsigned char *p = (const unsigned char *) s;
+   int numCont, ch;
+
+   if ((p[0] & 0x80) == 0x00)
+      return p[0];
+   else if ((p[0] & 0xe0) == 0xc0) {
+      numCont = 1;
+      ch = p[0] & 0x1f;
+   } else if ((p[0] & 0xf0) == 0xe0) {
+      numCont = 2;
+      ch = p[0] & 0x0f;
+   } else if ((p[0] & 0xf8) == 0xf0) {
+      numCont = 3;
+      ch = p[0] & 0x07;
+   } else
+      return 0;
+
+   for (int i = 1; i <= numCont; i++) {
+      if ((p[i] & 0xc0) != 0x80)
+         return 0;
+      ch = (ch << 6) | (p[i] & 0x3f);
+   }
+
+   return ch;
+}
+
+} // namespace unicode
+
+} // namespace lout
diff --git a/lout/unicode.hh b/lout/unicode.hh
new file mode 100644
index 00000000..123e7aa3
--- /dev/null
+++ b/lout/unicode.hh
@@ -0,0 +1,20 @@
+#ifndef __UNICODE_HH__
+#define __UNICODE_HH__
+
+namespace lout {
+
+/**
+ * \brief Stuff dealing with Unicode characters: UTF-8, character classes etc.
+ *
+ */
+namespace unicode {
+
+bool isAlpha (int ch);
+
+int decodeUtf8 (const char *s);
+
+} // namespace unicode
+
+} // namespace lout
+
+#endif // __UNICODE_HH__
diff --git a/test/liang.cc b/test/liang.cc
index 026568d0..56e3bfea 100644
--- a/test/liang.cc
+++ b/test/liang.cc
@@ -30,6 +30,7 @@ int main (int argc, char *argv[])
    hyphenateWord (&p, "JAHRHUNDERTROMAN");
    hyphenateWord (&p, "„Jahrhundertroman“");
    hyphenateWord (&p, "währenddessen");
+   hyphenateWord (&p, "„währenddessen“");
    hyphenateWord (&p, "Ückendorf");
    hyphenateWord (&p, "über");
    hyphenateWord (&p, "aber");
@@ -41,6 +42,7 @@ int main (int argc, char *argv[])
    hyphenateWord (&p, "„Grundstücksverkehrsgenehmigungszuständigkeits"
                   "übertragungsverordnung“");
    hyphenateWord (&p, "Grundstücksverkehrsgenehmigungszuständigkeit");
+   hyphenateWord (&p, "„Grundstücksverkehrsgenehmigungszuständigkeit“");
    hyphenateWord (&p, "(6R,7R)-7-[2-(2-Amino-4-thiazolyl)-glyoxylamido]-3-"
                   "(2,5-dihydro-6-hydroxy-2-methyl-5-oxo-1,2,4-triazin-3-yl-"
                   "thiomethyl)-8-oxo-5-thia-1-azabicyclo[4.2.0]oct-2-en-2-"