aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--dw/hyphenator.cc54
-rw-r--r--dw/hyphenator.hh2
-rw-r--r--test/liang.cc9
3 files changed, 60 insertions, 5 deletions
diff --git a/dw/hyphenator.cc b/dw/hyphenator.cc
index eb465a65..258bbafb 100644
--- a/dw/hyphenator.cc
+++ b/dw/hyphenator.cc
@@ -179,6 +179,26 @@ bool Hyphenator::isHyphenationCandidate (const char *word)
}
/**
+ * Test whether the character on which "s" points (UTF-8) is an actual
+ * part of the word. Other characters at the beginning and end are
+ * ignored.
+ *
+ * TODO Currently only suitable for English and German.
+ * TODO Only lowercase. (Uppercase not needed.)
+ */
+bool Hyphenator::isCharPartOfActualWord (char *s)
+{
+ // Return true when "s" points to a letter.
+ return (s[0] >= 'a' && s[0] <= 'z') ||
+ // UTF-8: starts with 0xc3
+ ((unsigned char)s[0] == 0xc3 &&
+ ((unsigned char)s[1] == 0xa4 /* ä */ ||
+ (unsigned char)s[1] == 0xb6 /* ö */ ||
+ (unsigned char)s[1] == 0xbc /* ü */ ||
+ (unsigned char)s[1] == 0x9f /* ß */ ));
+}
+
+/**
* Given a word, returns a list of the possible hyphenation points.
*/
int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
@@ -190,13 +210,39 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
char *wordLc = platform->textToLower (word, strlen (word));
+ // Determine "actual" word. See isCharPartOfActualWord for exact definition.
+
+ // Only this actual word is used, and "startActualWord" is added to the
+ // break positions, so that these refer to the total word.
+ int startActualWord = 0;
+ while (wordLc[startActualWord] &&
+ !isCharPartOfActualWord (wordLc + startActualWord))
+ startActualWord = platform->nextGlyph (wordLc, startActualWord);
+
+ if (wordLc[startActualWord] == 0) {
+ // No letters etc in word: do not hyphenate at all.
+ delete wordLc;
+ *numBreaks = 0;
+ return NULL;
+ }
+
+ int endActualWord = startActualWord, i = endActualWord;
+ while (wordLc[i]) {
+ if (isCharPartOfActualWord (wordLc + i))
+ endActualWord = i;
+ i = platform->nextGlyph (wordLc, i);
+ }
+
+ endActualWord = platform->nextGlyph (wordLc, endActualWord);
+ wordLc[endActualWord] = 0;
+
// If the word is an exception, get the stored points.
Vector <Integer> *exceptionalBreaks;
- ConstString key (wordLc);
+ ConstString key (wordLc + startActualWord);
if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) {
int *result = new int[exceptionalBreaks->size()];
for (int i = 0; i < exceptionalBreaks->size(); i++)
- result[i] = exceptionalBreaks->get(i)->getValue();
+ result[i] = exceptionalBreaks->get(i)->getValue() + startActualWord;
delete wordLc;
*numBreaks = exceptionalBreaks->size();
return result;
@@ -211,7 +257,7 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
char work[strlen (word) + 3];
strcpy (work, ".");
- strcat (work, wordLc);
+ strcat (work, wordLc + startActualWord);
delete wordLc;
strcat (work, ".");
@@ -252,7 +298,7 @@ int *Hyphenator::hyphenateWord(const char *word, int *numBreaks)
for (int i = 0; i < n; i++) {
if (points.get(i + 2) % 2) {
breakPos.increase ();
- breakPos.set (breakPos.size() - 1, i + 1);
+ breakPos.set (breakPos.size() - 1, i + 1 + startActualWord);
}
}
diff --git a/dw/hyphenator.hh b/dw/hyphenator.hh
index 9ef3e306..bf123e5a 100644
--- a/dw/hyphenator.hh
+++ b/dw/hyphenator.hh
@@ -30,6 +30,8 @@ private:
void insertPattern (char *s);
void insertException (char *s);
+ bool isCharPartOfActualWord (char *s);
+
public:
Hyphenator (core::Platform *platform,
const char *patFile, const char *excFile);
diff --git a/test/liang.cc b/test/liang.cc
index 5cee0b5f..026568d0 100644
--- a/test/liang.cc
+++ b/test/liang.cc
@@ -1,3 +1,4 @@
+
#include "../dw/fltkcore.hh"
#include "../dw/hyphenator.hh"
@@ -9,7 +10,7 @@ void hyphenateWord (dw::core::Platform *p, const char *word)
int *breakPos = h->hyphenateWord (word, &numBreaks);
for (int i = 0; i < numBreaks + 1; i++) {
if (i != 0)
- printf ("\xc2\xad");
+ printf (" \xc2\xad ");
int start = (i == 0 ? 0 : breakPos[i - 1]);
int end = (i == numBreaks ? strlen (word) : breakPos[i]);
for (int j = start; j < end; j++)
@@ -24,8 +25,10 @@ int main (int argc, char *argv[])
{
dw::fltk::FltkPlatform p;
+ hyphenateWord (&p, "...");
hyphenateWord (&p, "Jahrhundertroman");
hyphenateWord (&p, "JAHRHUNDERTROMAN");
+ hyphenateWord (&p, "„Jahrhundertroman“");
hyphenateWord (&p, "währenddessen");
hyphenateWord (&p, "Ückendorf");
hyphenateWord (&p, "über");
@@ -38,6 +41,10 @@ int main (int argc, char *argv[])
hyphenateWord (&p, "„Grundstücksverkehrsgenehmigungszuständigkeits"
"übertragungsverordnung“");
hyphenateWord (&p, "Grundstücksverkehrsgenehmigungszuständigkeit");
+ hyphenateWord (&p, "(6R,7R)-7-[2-(2-Amino-4-thiazolyl)-glyoxylamido]-3-"
+ "(2,5-dihydro-6-hydroxy-2-methyl-5-oxo-1,2,4-triazin-3-yl-"
+ "thiomethyl)-8-oxo-5-thia-1-azabicyclo[4.2.0]oct-2-en-2-"
+ "carbonsäure-7²-(Z)-(O-methyloxim)");
return 0;
}