Split up words in hyphenation.

author: Sebastian Geerken <devnull@localhost> 2012-12-06 15:11:57 +0100
committer: Sebastian Geerken <devnull@localhost> 2012-12-06 15:11:57 +0100
commit: a914d5ffedd10004ae86b55aaaf58df270c4d960 (patch)
tree: 7501afdfb31c82b8cb4d1ca9d0f1314b6c63c976
parent: a9136a867918e4bb7aa2080f2960c6920b405258 (diff)
4 files changed, 73 insertions, 60 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc
index b2e557c5..cec5a9d8 100644
--- a/dw/hyphenator.cc
+++ b/dw/hyphenator.cc
@@ -230,54 +230,78 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
 
    char *wordLc = platform->textToLower (word, strlen (word));
 
-   // Determine "actual" word. See isCharPartOfActualWord for exact definition.
-
-   // Only this actual word is used, and "startActualWord" is added to the
-   // break positions, so that these refer to the total word.
-   int startActualWord = 0;
-   while (wordLc[startActualWord] &&
-          !isCharPartOfActualWord (wordLc + startActualWord))
-      startActualWord = platform->nextGlyph (wordLc, startActualWord);
-
-   if (wordLc[startActualWord] == 0) {
-      // No letters etc in word: do not hyphenate at all.
-      free (wordLc);
-      *numBreaks = 0;
-      return NULL;
-   }
+   int start = 0;
+   SimpleVector <int> breakPos (1);
+
+   // Split the original word up, ignore anything but characters, and
+   // collect all break points, so that they fit to the original
+   // word. (The latter is what the offset in the call of
+   // hyphenateSingleWord() is for.)
+   while (true) {
+      while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
+         start = platform->nextGlyph (wordLc, start);
+      
+      if (wordLc[start] == 0)
+         break;
+
+      int end = start, i = end;
+      while (wordLc[i]) {
+         if (!isCharPartOfActualWord (wordLc + i))
+            break;
+         else
+            end = i;
+         i = platform->nextGlyph (wordLc, i);
+      }
+      end = platform->nextGlyph (wordLc, end);
 
-   int endActualWord = startActualWord, i = endActualWord;
-   while (wordLc[i]) {
-      if (isCharPartOfActualWord (wordLc + i))
-         endActualWord = i;
-      i = platform->nextGlyph (wordLc, i);
+      int nextStart;
+      if (wordLc[end]) {
+         nextStart = platform->nextGlyph (wordLc, end);
+         wordLc[end] = 0;
+      } else
+         nextStart = end;
+
+      hyphenateSingleWord (wordLc + start, start, &breakPos);
+      start = nextStart;
    }
 
-   endActualWord = platform->nextGlyph (wordLc, endActualWord);
-   wordLc[endActualWord] = 0;
+   free (wordLc);
+
+   *numBreaks = breakPos.size ();
+   if (*numBreaks == 0)
+      return NULL;
+   else {
+      return breakPos.detachArray ();
+   }
+}
 
+/**
+ * Hyphenate a single word, which only consists of lowercase
+ * characters. Store break positions + "offset" in "breakPos".
+ */
+void Hyphenator::hyphenateSingleWord(char *wordLc, int offset,
+                                    SimpleVector <int> *breakPos)
+{
    // If the word is an exception, get the stored points.
    Vector <Integer> *exceptionalBreaks;
-   ConstString key (wordLc + startActualWord);
+   ConstString key (wordLc);
    if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) {
-      int *result = new int[exceptionalBreaks->size()];
-      for (int i = 0; i < exceptionalBreaks->size(); i++)
-         result[i] = exceptionalBreaks->get(i)->getValue() + startActualWord;
-      free (wordLc);
-      *numBreaks = exceptionalBreaks->size();
-      return result;
+      for (int i = 0; i < exceptionalBreaks->size(); i++) {
+         breakPos->increase ();
+         breakPos->set (breakPos->size() - 1,
+                        exceptionalBreaks->get(i)->getValue() + offset);
+      }
+      return;
    }
 
+
    // trie == NULL means that there is no pattern file.
-   if (trie == NULL) {
-      free (wordLc);
-      *numBreaks = 0;
-      return NULL;
-   }
+   if (trie == NULL)
+      return;
 
-   char work[strlen (word) + 3];
+   char work[strlen (wordLc) + 3];
    strcpy (work, ".");
-   strcat (work, wordLc + startActualWord);
+   strcat (work, wordLc);
    strcat (work, ".");
 
    int l = strlen (work);
@@ -300,38 +324,25 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
 
    // No hyphens in the first two chars or the last two.
    // Characters are not bytes, so UTF-8 characters must be counted.
-   int numBytes1Start = platform->nextGlyph (wordLc + startActualWord, 0);
-   int numBytes2Start = platform->nextGlyph (wordLc + startActualWord,
-                                             numBytes1Start);
+   int numBytes1Start = platform->nextGlyph (wordLc, 0);
+   int numBytes2Start = platform->nextGlyph (wordLc, numBytes1Start);
    for (int i = 0; i < numBytes2Start; i++)
       points.set (i + 1, 0);
    
-   int len = strlen (wordLc + startActualWord);
-   int numBytes1End = platform->prevGlyph (wordLc + startActualWord, len);
-   int numBytes2End = platform->prevGlyph (wordLc + startActualWord,
-                                           numBytes1End);
+   int len = strlen (wordLc);
+   int numBytes1End = platform->prevGlyph (wordLc, len);
+   int numBytes2End = platform->prevGlyph (wordLc, numBytes1End);
    for (int i = 0; i < len - numBytes2End; i++)
       points.set (points.size() - 2 - i, 0);
 
-   // Examine the points to build the pieces list.
-   SimpleVector <int> breakPos (1);
-
-   int n = lout::misc::min ((int)strlen (word), points.size () - 2);
+   // Examine the points to build the break point list.
+   int n = lout::misc::min ((int)strlen (wordLc), points.size () - 2);
    for (int i = 0; i < n; i++) {
       if (points.get(i + 2) % 2) {
-         breakPos.increase ();
-         breakPos.set (breakPos.size() - 1, i + 1 + startActualWord);
+         breakPos->increase ();
+         breakPos->set (breakPos->size() - 1, i + 1 + offset);
       }
    }
-
-   free (wordLc);
-
-   *numBreaks = breakPos.size ();
-   if (*numBreaks == 0)
-      return NULL;
-   else {
-      return breakPos.detachArray ();
-   }
 }
 
 Trie::TrieNode TrieBuilder::trieNodeNull = {'\0', 0, NULL};
diff --git a/dw/hyphenator.hh b/dw/hyphenator.hh
index 743a9073..b02265ec 100644
--- a/dw/hyphenator.hh
+++ b/dw/hyphenator.hh
@@ -105,6 +105,8 @@ class Hyphenator: public lout::object::Object
    void insertPattern (TrieBuilder *trieBuilder, char *s);
    void insertException (char *s);
 
+   void hyphenateSingleWord(char *wordLc, int offset,
+                            lout::misc::SimpleVector <int> *breakPos);
    bool isCharPartOfActualWord (char *s);
 
 public:
diff --git a/test/hyphenate-nbsp.html b/test/hyphenate-nbsp.html
index 16e4d298..1bfd9a25 100644
--- a/test/hyphenate-nbsp.html
+++ b/test/hyphenate-nbsp.html
@@ -1,4 +1,3 @@
 <div style="font-size:10em" lang=de>
-Weiß
-kurz&nbsp;und&nbsp;knapp
+Weiß kurz&nbsp;und&nbsp;knapp www.dillo.org
 </div>
diff --git a/test/liang.cc b/test/liang.cc
index 1f08f95c..b1161d9f 100644
--- a/test/liang.cc
+++ b/test/liang.cc
@@ -51,6 +51,7 @@ int main (int argc, char *argv[])
    hyphenateWord (&p, "Nordrhein-Westfalen");
    hyphenateWord (&p, "kurz\xc2\xa0und\xc2\xa0knapp");
    hyphenateWord (&p, "weiß");
+   hyphenateWord (&p, "www.dillo.org");
 
    return 0;
 }
author	Sebastian Geerken <devnull@localhost>	2012-12-06 15:11:57 +0100
committer	Sebastian Geerken <devnull@localhost>	2012-12-06 15:11:57 +0100
commit	a914d5ffedd10004ae86b55aaaf58df270c4d960 (patch)
tree	7501afdfb31c82b8cb4d1ca9d0f1314b6c63c976
parent	a9136a867918e4bb7aa2080f2960c6920b405258 (diff)