diff options
-rw-r--r-- | dw/hyphenator.cc | 54 | ||||
-rw-r--r-- | dw/hyphenator.hh | 2 | ||||
-rw-r--r-- | test/liang.cc | 9 |
3 files changed, 60 insertions, 5 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc
index eb465a65..258bbafb 100644
--- a/dw/hyphenator.cc
+++ b/dw/hyphenator.cc
@@ -179,6 +179,26 @@ bool Hyphenator::isHyphenationCandidate (const char *word)
 }

 /**
+ * Test whether the character on which "s" points (UTF-8) is an actual
+ * part of the word. Other characters at the beginning and end are
+ * ignored.
+ *
+ * TODO Currently only suitable for English and German.
+ * TODO Only lowercase. (Uppercase not needed.)
+ */
+bool Hyphenator::isCharPartOfActualWord (char *s)
+{
+   // Return true when "s" points to a letter.
+   return (s[0] >= 'a' && s[0] <= 'z') ||
+      // UTF-8: starts with 0xc3
+      ((unsigned char)s[0] == 0xc3 &&
+       ((unsigned char)s[1] == 0xa4 /* ä */ ||
+        (unsigned char)s[1] == 0xb6 /* ö */ ||
+        (unsigned char)s[1] == 0xbc /* ü */ ||
+        (unsigned char)s[1] == 0x9f /* ß */ ));
+}
+
+/**
  * Given a word, returns a list of the possible hyphenation points.
  */
 int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
@@ -190,13 +210,39 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)

    char *wordLc = platform->textToLower (word, strlen (word));

+   // Determine "actual" word. See isCharPartOfActualWord for exact definition.
+
+   // Only this actual word is used, and "startActualWord" is added to the
+   // break positions, so that these refer to the total word.
+   int startActualWord = 0;
+   while (wordLc[startActualWord] &&
+          !isCharPartOfActualWord (wordLc + startActualWord))
+      startActualWord = platform->nextGlyph (wordLc, startActualWord);
+
+   if (wordLc[startActualWord] == 0) {
+      // No letters etc in word: do not hyphenate at all.
+      delete wordLc;
+      *numBreaks = 0;
+      return NULL;
+   }
+
+   int endActualWord = startActualWord, i = endActualWord;
+   while (wordLc[i]) {
+      if (isCharPartOfActualWord (wordLc + i))
+         endActualWord = i;
+      i = platform->nextGlyph (wordLc, i);
+   }
+
+   endActualWord = platform->nextGlyph (wordLc, endActualWord);
+   wordLc[endActualWord] = 0;
+
    // If the word is an exception, get the stored points.
    Vector <Integer> *exceptionalBreaks;
-   ConstString key (wordLc);
+   ConstString key (wordLc + startActualWord);
    if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) {
       int *result = new int[exceptionalBreaks->size()];
       for (int i = 0; i < exceptionalBreaks->size(); i++)
-         result[i] = exceptionalBreaks->get(i)->getValue();
+         result[i] = exceptionalBreaks->get(i)->getValue() + startActualWord;
       delete wordLc;
       *numBreaks = exceptionalBreaks->size();
       return result;
@@ -211,7 +257,7 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)

    char work[strlen (word) + 3];
    strcpy (work, ".");
-   strcat (work, wordLc);
+   strcat (work, wordLc + startActualWord);
    delete wordLc;
    strcat (work, ".");

@@ -252,7 +298,7 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
    for (int i = 0; i < n; i++) {
       if (points.get(i + 2) % 2) {
          breakPos.increase ();
-         breakPos.set (breakPos.size() - 1, i + 1);
+         breakPos.set (breakPos.size() - 1, i + 1 + startActualWord);
       }
    }

diff --git a/dw/hyphenator.hh b/dw/hyphenator.hh
index 9ef3e306..bf123e5a 100644
--- a/dw/hyphenator.hh
+++ b/dw/hyphenator.hh
@@ -30,6 +30,8 @@ private:
    void insertPattern (char *s);
    void insertException (char *s);

+   bool isCharPartOfActualWord (char *s);
+
 public:
    Hyphenator (core::Platform *platform,
                const char *patFile, const char *excFile);
diff --git a/test/liang.cc b/test/liang.cc
index 5cee0b5f..026568d0 100644
--- a/test/liang.cc
+++ b/test/liang.cc
@@ -1,3 +1,4 @@
+
 #include "../dw/fltkcore.hh"
 #include "../dw/hyphenator.hh"

@@ -9,7 +10,7 @@ void hyphenateWord (dw::core::Platform *p, const char *word)
    int *breakPos = h->hyphenateWord (word, &numBreaks);
    for (int i = 0; i < numBreaks + 1; i++) {
       if (i != 0)
-         printf ("\xc2\xad");
+         printf (" \xc2\xad ");
       int start = (i == 0 ? 0 : breakPos[i - 1]);
       int end = (i == numBreaks ? strlen (word) : breakPos[i]);
       for (int j = start; j < end; j++)
@@ -24,8 +25,10 @@ int main (int argc, char *argv[])
 {
    dw::fltk::FltkPlatform p;

+   hyphenateWord (&p, "...");
    hyphenateWord (&p, "Jahrhundertroman");
    hyphenateWord (&p, "JAHRHUNDERTROMAN");
+   hyphenateWord (&p, "„Jahrhundertroman“");
    hyphenateWord (&p, "währenddessen");
    hyphenateWord (&p, "Ückendorf");
    hyphenateWord (&p, "über");
@@ -38,6 +41,10 @@ int main (int argc, char *argv[])
    hyphenateWord (&p, "„Grundstücksverkehrsgenehmigungszuständigkeits"
                   "übertragungsverordnung“");
    hyphenateWord (&p, "Grundstücksverkehrsgenehmigungszuständigkeit");
+   hyphenateWord (&p, "(6R,7R)-7-[2-(2-Amino-4-thiazolyl)-glyoxylamido]-3-"
+                  "(2,5-dihydro-6-hydroxy-2-methyl-5-oxo-1,2,4-triazin-3-yl-"
+                  "thiomethyl)-8-oxo-5-thia-1-azabicyclo[4.2.0]oct-2-en-2-"
+                  "carbonsäure-7²-(Z)-(O-methyloxim)");

    return 0;
 }