diff options
author | sgeerken <devnull@localhost> | 2012-07-10 23:19:04 +0200 |
---|---|---|
committer | sgeerken <devnull@localhost> | 2012-07-10 23:19:04 +0200 |
commit | af4420e3bbd894c7ed21dc676dfdd35d7149bd32 (patch) | |
tree | 3058368b0ea93a452b32e8f11c927ed894f766c0 /dw | |
parent | 8b1467671cb6c48c7d1abf43c4cbd7ad7b0f5aa3 (diff) |
For hyphenation, all characters not belonging to the actual word, e.g. punctuation marks, are ignored. Currently this only works for English and German.
Diffstat (limited to 'dw')
-rw-r--r-- | dw/hyphenator.cc | 54 | ||||
-rw-r--r-- | dw/hyphenator.hh | 2 |
2 files changed, 52 insertions, 4 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc index eb465a65..258bbafb 100644 --- a/dw/hyphenator.cc +++ b/dw/hyphenator.cc @@ -179,6 +179,26 @@ bool Hyphenator::isHyphenationCandidate (const char *word) } /** + * Test whether the character on which "s" points (UTF-8) is an actual + * part of the word. Other characters at the beginning and end are + * ignored. + * + * TODO Currently only suitable for English and German. + * TODO Only lowercase. (Uppercase not needed.) + */ +bool Hyphenator::isCharPartOfActualWord (char *s) +{ + // Return true when "s" points to a letter. + return (s[0] >= 'a' && s[0] <= 'z') || + // UTF-8: starts with 0xc3 + ((unsigned char)s[0] == 0xc3 && + ((unsigned char)s[1] == 0xa4 /* ä */ || + (unsigned char)s[1] == 0xb6 /* ö */ || + (unsigned char)s[1] == 0xbc /* ü */ || + (unsigned char)s[1] == 0x9f /* ß */ )); +} + +/** * Given a word, returns a list of the possible hyphenation points. */ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks) @@ -190,13 +210,39 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks) char *wordLc = platform->textToLower (word, strlen (word)); + // Determine "actual" word. See isCharPartOfActualWord for exact definition. + + // Only this actual word is used, and "startActualWord" is added to the + // break positions, so that these refer to the total word. + int startActualWord = 0; + while (wordLc[startActualWord] && + !isCharPartOfActualWord (wordLc + startActualWord)) + startActualWord = platform->nextGlyph (wordLc, startActualWord); + + if (wordLc[startActualWord] == 0) { + // No letters etc in word: do not hyphenate at all. 
+ delete wordLc; + *numBreaks = 0; + return NULL; + } + + int endActualWord = startActualWord, i = endActualWord; + while (wordLc[i]) { + if (isCharPartOfActualWord (wordLc + i)) + endActualWord = i; + i = platform->nextGlyph (wordLc, i); + } + + endActualWord = platform->nextGlyph (wordLc, endActualWord); + wordLc[endActualWord] = 0; + // If the word is an exception, get the stored points. Vector <Integer> *exceptionalBreaks; - ConstString key (wordLc); + ConstString key (wordLc + startActualWord); if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) { int *result = new int[exceptionalBreaks->size()]; for (int i = 0; i < exceptionalBreaks->size(); i++) - result[i] = exceptionalBreaks->get(i)->getValue(); + result[i] = exceptionalBreaks->get(i)->getValue() + startActualWord; delete wordLc; *numBreaks = exceptionalBreaks->size(); return result; @@ -211,7 +257,7 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks) char work[strlen (word) + 3]; strcpy (work, "."); - strcat (work, wordLc); + strcat (work, wordLc + startActualWord); delete wordLc; strcat (work, "."); @@ -252,7 +298,7 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks) for (int i = 0; i < n; i++) { if (points.get(i + 2) % 2) { breakPos.increase (); - breakPos.set (breakPos.size() - 1, i + 1); + breakPos.set (breakPos.size() - 1, i + 1 + startActualWord); } } diff --git a/dw/hyphenator.hh b/dw/hyphenator.hh index 9ef3e306..bf123e5a 100644 --- a/dw/hyphenator.hh +++ b/dw/hyphenator.hh @@ -30,6 +30,8 @@ private: void insertPattern (char *s); void insertException (char *s); + bool isCharPartOfActualWord (char *s); + public: Hyphenator (core::Platform *platform, const char *patFile, const char *excFile); |