diff options
author | Sebastian Geerken <devnull@localhost> | 2012-12-06 15:11:57 +0100 |
---|---|---|
committer | Sebastian Geerken <devnull@localhost> | 2012-12-06 15:11:57 +0100 |
commit | a914d5ffedd10004ae86b55aaaf58df270c4d960 (patch) | |
tree | 7501afdfb31c82b8cb4d1ca9d0f1314b6c63c976 | |
parent | a9136a867918e4bb7aa2080f2960c6920b405258 (diff) |
Split up words in hyphenation.
-rw-r--r-- | dw/hyphenator.cc | 127 | ||||
-rw-r--r-- | dw/hyphenator.hh | 2 | ||||
-rw-r--r-- | test/hyphenate-nbsp.html | 3 | ||||
-rw-r--r-- | test/liang.cc | 1 |
4 files changed, 73 insertions, 60 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc index b2e557c5..cec5a9d8 100644 --- a/dw/hyphenator.cc +++ b/dw/hyphenator.cc @@ -230,54 +230,78 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks) char *wordLc = platform->textToLower (word, strlen (word)); - // Determine "actual" word. See isCharPartOfActualWord for exact definition. - - // Only this actual word is used, and "startActualWord" is added to the - // break positions, so that these refer to the total word. - int startActualWord = 0; - while (wordLc[startActualWord] && - !isCharPartOfActualWord (wordLc + startActualWord)) - startActualWord = platform->nextGlyph (wordLc, startActualWord); - - if (wordLc[startActualWord] == 0) { - // No letters etc in word: do not hyphenate at all. - free (wordLc); - *numBreaks = 0; - return NULL; - } + int start = 0; + SimpleVector <int> breakPos (1); + + // Split the original word up, ignore anything but characters, and + // collect all break points, so that they fit to the original + // word. (The latter is what the offset in the call of + // hyphenateSingleWord() is for.) + while (true) { + while (wordLc[start] && !isCharPartOfActualWord (wordLc + start)) + start = platform->nextGlyph (wordLc, start); + + if (wordLc[start] == 0) + break; + + int end = start, i = end; + while (wordLc[i]) { + if (!isCharPartOfActualWord (wordLc + i)) + break; + else + end = i; + i = platform->nextGlyph (wordLc, i); + } + end = platform->nextGlyph (wordLc, end); - int endActualWord = startActualWord, i = endActualWord; - while (wordLc[i]) { - if (isCharPartOfActualWord (wordLc + i)) - endActualWord = i; - i = platform->nextGlyph (wordLc, i); + int nextStart; + if (wordLc[end]) { + nextStart = platform->nextGlyph (wordLc, end); + wordLc[end] = 0; + } else + nextStart = end; + + hyphenateSingleWord (wordLc + start, start, &breakPos); + start = nextStart; } - endActualWord = platform->nextGlyph (wordLc, endActualWord); - wordLc[endActualWord] = 0; + free (wordLc); + + *numBreaks = breakPos.size (); + if (*numBreaks == 0) + return NULL; + else { + return breakPos.detachArray (); + } +} +/** + * Hyphenate a single word, which only consists of lowercase + * characters. Store break positions + "offset" in "breakPos". + */ +void Hyphenator::hyphenateSingleWord(char *wordLc, int offset, + SimpleVector <int> *breakPos) +{ // If the word is an exception, get the stored points. Vector <Integer> *exceptionalBreaks; - ConstString key (wordLc + startActualWord); + ConstString key (wordLc); if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) { - int *result = new int[exceptionalBreaks->size()]; - for (int i = 0; i < exceptionalBreaks->size(); i++) - result[i] = exceptionalBreaks->get(i)->getValue() + startActualWord; - free (wordLc); - *numBreaks = exceptionalBreaks->size(); - return result; + for (int i = 0; i < exceptionalBreaks->size(); i++) { + breakPos->increase (); + breakPos->set (breakPos->size() - 1, + exceptionalBreaks->get(i)->getValue() + offset); + } + return; } + // trie == NULL means that there is no pattern file. - if (trie == NULL) { - free (wordLc); - *numBreaks = 0; - return NULL; - } + if (trie == NULL) + return; - char work[strlen (word) + 3]; + char work[strlen (wordLc) + 3]; strcpy (work, "."); - strcat (work, wordLc + startActualWord); + strcat (work, wordLc); strcat (work, "."); int l = strlen (work); @@ -300,38 +324,25 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks) // No hyphens in the first two chars or the last two. // Characters are not bytes, so UTF-8 characters must be counted. - int numBytes1Start = platform->nextGlyph (wordLc + startActualWord, 0); - int numBytes2Start = platform->nextGlyph (wordLc + startActualWord, - numBytes1Start); + int numBytes1Start = platform->nextGlyph (wordLc, 0); + int numBytes2Start = platform->nextGlyph (wordLc, numBytes1Start); for (int i = 0; i < numBytes2Start; i++) points.set (i + 1, 0); - int len = strlen (wordLc + startActualWord); - int numBytes1End = platform->prevGlyph (wordLc + startActualWord, len); - int numBytes2End = platform->prevGlyph (wordLc + startActualWord, - numBytes1End); + int len = strlen (wordLc); + int numBytes1End = platform->prevGlyph (wordLc, len); + int numBytes2End = platform->prevGlyph (wordLc, numBytes1End); for (int i = 0; i < len - numBytes2End; i++) points.set (points.size() - 2 - i, 0); - // Examine the points to build the pieces list. - SimpleVector <int> breakPos (1); - - int n = lout::misc::min ((int)strlen (word), points.size () - 2); + // Examine the points to build the break point list. + int n = lout::misc::min ((int)strlen (wordLc), points.size () - 2); for (int i = 0; i < n; i++) { if (points.get(i + 2) % 2) { - breakPos.increase (); - breakPos.set (breakPos.size() - 1, i + 1 + startActualWord); + breakPos->increase (); + breakPos->set (breakPos->size() - 1, i + 1 + offset); } } - - free (wordLc); - - *numBreaks = breakPos.size (); - if (*numBreaks == 0) - return NULL; - else { - return breakPos.detachArray (); - } } Trie::TrieNode TrieBuilder::trieNodeNull = {'\0', 0, NULL}; diff --git a/dw/hyphenator.hh b/dw/hyphenator.hh index 743a9073..b02265ec 100644 --- a/dw/hyphenator.hh +++ b/dw/hyphenator.hh @@ -105,6 +105,8 @@ class Hyphenator: public lout::object::Object void insertPattern (TrieBuilder *trieBuilder, char *s); void insertException (char *s); + void hyphenateSingleWord(char *wordLc, int offset, + lout::misc::SimpleVector <int> *breakPos); bool isCharPartOfActualWord (char *s); public: diff --git a/test/hyphenate-nbsp.html b/test/hyphenate-nbsp.html index 16e4d298..1bfd9a25 100644 --- a/test/hyphenate-nbsp.html +++ b/test/hyphenate-nbsp.html @@ -1,4 +1,3 @@ <div style="font-size:10em" lang=de> -Weiß -kurz und knapp +Weiß kurz und knapp www.dillo.org </div> diff --git a/test/liang.cc b/test/liang.cc index 1f08f95c..b1161d9f 100644 --- a/test/liang.cc +++ b/test/liang.cc @@ -51,6 +51,7 @@ int main (int argc, char *argv[]) hyphenateWord (&p, "Nordrhein-Westfalen"); hyphenateWord (&p, "kurz\xc2\xa0und\xc2\xa0knapp"); hyphenateWord (&p, "weiß"); + hyphenateWord (&p, "www.dillo.org"); return 0; } |