At least at the beginning, UTF-8 characters are counted correctly. Ückendorf is now hyphenated correctly.

author: sgeerken <devnull@localhost> 2012-07-11 14:15:04 +0200
committer: sgeerken <devnull@localhost> 2012-07-11 14:15:04 +0200
commit: 3cef8224ed1a1a3267adf5a5d3f1bdcf4c1b5eca (patch)
tree: 2689b169db86211ed86f158e4e2e3eb37326b380 /dw/hyphenator.cc
parent: de3847bff7e3d0215d4f8cfae210315871a6f4d5 (diff)
1 files changed, 9 insertions, 3 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc
index 7a1f5df7..4acee185 100644
--- a/dw/hyphenator.cc
+++ b/dw/hyphenator.cc
@@ -262,7 +262,6 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
    char work[strlen (word) + 3];
    strcpy (work, ".");
    strcat (work, wordLc + startActualWord);
-   delete wordLc;
    strcat (work, ".");
    
    int l = strlen (work);
@@ -290,8 +289,13 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
    }  
 
    // No hyphens in the first two chars or the last two.
-   points.set (1, 0);
-   points.set (2, 0);
+   // Characters are not bytes, so UTF-8 characters must be counted.
+   int numBytes1 = platform->nextGlyph (wordLc + startActualWord, 0);
+   int numBytes2 = platform->nextGlyph (wordLc + startActualWord, numBytes1);
+   for (int i = 0; i < numBytes2; i++)
+      points.set (i + 1, 0);
+
+   // TODO: Characters, not bytes (as above).
    points.set (points.size() - 2, 0);
    points.set (points.size() - 3, 0);
    
@@ -306,6 +310,8 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
       }
    }
 
+   delete wordLc;
+
    *numBreaks = breakPos.size ();
    if (*numBreaks == 0)
       return NULL;
author	sgeerken <devnull@localhost>	2012-07-11 14:15:04 +0200
committer	sgeerken <devnull@localhost>	2012-07-11 14:15:04 +0200
commit	3cef8224ed1a1a3267adf5a5d3f1bdcf4c1b5eca (patch)
tree	2689b169db86211ed86f158e4e2e3eb37326b380 /dw/hyphenator.cc
parent	de3847bff7e3d0215d4f8cfae210315871a6f4d5 (diff)