/* * Dillo Widget * * Copyright 2012-2013 Sebastian Geerken , * Johannes Hofmann * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "hyphenator.hh" #include "../lout/misc.hh" #include "../lout/unicode.hh" #include #include #include #define LEN 1000 /* * This is (or at least began as) a direct translation of Ned Batchelder's * public-domain Python implementation at * http://nedbatchelder.com/code/modules/hyphenate.py */ using namespace lout::object; using namespace lout::container::typed; using namespace lout::misc; using namespace lout::unicode; namespace dw { HashTable *Hyphenator::hyphenators = new HashTable (true, true); Hyphenator::Hyphenator (const char *patFile, const char *excFile, int pack) { trie = NULL; // As long we are not sure whether a pattern file can be read. int bufLen = strlen (patFile) + 5 + 1; char *buf = new char[bufLen]; snprintf(buf, bufLen, "%s.trie", patFile); FILE *trieF = fopen (buf, "r"); delete[] buf; if (trieF) { trie = new Trie (); if (trie->load (trieF) != 0) { delete trie; trie = NULL; } fclose (trieF); } if (trie == NULL) { TrieBuilder trieBuilder(pack); FILE *patF = fopen (patFile, "r"); if (patF) { while (!feof (patF)) { char buf[LEN + 1]; char *s = fgets (buf, LEN, patF); if (s && s[0] != '%') { // ignore lines starting with '%' as comment // TODO Better exit with an error, when the line is too long. int l = strlen (s); if (s[l - 1] == '\n') s[l - 1] = 0; insertPattern (&trieBuilder, s); } } trie = trieBuilder.createTrie (); fclose (patF); } } exceptions = NULL; // Again, only instantiated when needed. FILE *excF = fopen (excFile, "r"); if (excF) { exceptions = new HashTable > (true, true); while (!feof (excF)) { char buf[LEN + 1]; char *s = fgets (buf, LEN, excF); if (s && s[0] != '%') { // ignore lines starting with '%' as comment // TODO Better exit with an error, when the line is too long. int l = strlen (s); if (s[l - 1] == '\n') s[l - 1] = 0; insertException (s); } } fclose (excF); } } Hyphenator::~Hyphenator () { delete trie; delete exceptions; } Hyphenator *Hyphenator::getHyphenator (const char *lang) { String *langString = new String (lang); Hyphenator *hyphenator = hyphenators->get (langString); if (hyphenator) delete langString; else { int patFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1; char *patFile = new char[patFileLen]; snprintf (patFile, patFileLen, "%s/hyphenation/%s.pat", DILLO_LIBDIR, lang); int excFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1; char *excFile = new char[excFileLen]; snprintf (excFile, excFileLen, "%s/hyphenation/%s.exc", DILLO_LIBDIR, lang); //printf ("Loading hyphenation patterns for language '%s' from '%s' and " // "exceptions from '%s' ...\n", lang, patFile, excFile); hyphenator = new Hyphenator (patFile, excFile); hyphenators->put (langString, hyphenator); delete[] patFile; delete[] excFile; } //lout::misc::StringBuffer sb; //hyphenators->intoStringBuffer (&sb); //printf ("%s\n", sb.getChars ()); return hyphenator; } void Hyphenator::insertPattern (TrieBuilder *trieBuilder, char *s) { // Convert the a pattern like 'a1bc3d4' into a string of chars 'abcd' // and a list of points [ 0, 1, 0, 3, 4 ]. int l = strlen (s); char *chars = new char[l + 1]; SimpleVector points (1); // TODO numbers consisting of multiple digits? // TODO Encoding: This implementation works exactly like the Python // implementation, based on UTF-8. Does this always work? int numChars = 0; for (int i = 0; s[i]; i++) { if (s[i] >= '0' && s[i] <= '9') { points.setSize(numChars + 1, '0'); points.set(numChars, s[i]); } else { chars[numChars++] = s[i]; } } chars[numChars] = 0; points.setSize(numChars + 2, '0'); points.set(numChars + 1, '\0'); // Insert the pattern into the tree. Each character finds a dict // another level down in the tree, and leaf nodes have the list of // points. //printf("insertPattern %s\n", chars); trieBuilder->insert (chars, points.getArray ()); delete[] chars; } void Hyphenator::insertException (char *s) { Vector *breaks = new Vector (1, true); int len = strlen (s); for (int i = 0; i < len - 1; i++) if((unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad) breaks->put (new Integer (i - 2 * breaks->size())); char *noHyphens = new char[len - 2 * breaks->size() + 1]; int j = 0; for (int i = 0; i < len; ) { if(i < len - 1 && (unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad) i += 2; else noHyphens[j++] = s[i++]; } noHyphens[j] = 0; exceptions->put (new String (noHyphens), breaks); delete[] noHyphens; } /** * Simple test to avoid much costs. Passing it does not mean that the word * can be hyphenated. */ bool Hyphenator::isHyphenationCandidate (const char *word) { // Short words aren't hyphenated. return (strlen (word) > 4); // TODO UTF-8? } /** * Test whether the character on which "s" points (UTF-8) is an actual * part of the word. Other characters at the beginning and end are * ignored. * * TODO Currently only suitable for English and German. * TODO Only lowercase. (Uppercase not needed.) */ bool Hyphenator::isCharPartOfActualWord (char *s) { #if 0 // Return true when "s" points to a letter. return (s[0] >= 'a' && s[0] <= 'z') || // UTF-8: starts with 0xc3 ((unsigned char)s[0] == 0xc3 && ((unsigned char)s[1] == 0xa4 /* ä */ || (unsigned char)s[1] == 0xb6 /* ö */ || (unsigned char)s[1] == 0xbc /* ü */ || (unsigned char)s[1] == 0x9f /* ß */ )); #endif return isAlpha (decodeUtf8 (s)); } /** * Given a word, returns a list of the possible hyphenation points. */ int *Hyphenator::hyphenateWord(core::Platform *platform, const char *word, int *numBreaks) { if ((trie == NULL && exceptions == NULL) || !isHyphenationCandidate (word)) { *numBreaks = 0; return NULL; } char *wordLc = platform->textToLower (word, strlen (word)); int start = 0; SimpleVector breakPos (1); // Split the original word up, ignore anything but characters, and // collect all break points, so that they fit to the original // word. (The latter is what the offset in the call of // hyphenateSingleWord() is for.) while (true) { while (wordLc[start] && !isCharPartOfActualWord (wordLc + start)) start = platform->nextGlyph (wordLc, start); if (wordLc[start] == 0) break; int end = start, i = end; while (wordLc[i]) { if (!isCharPartOfActualWord (wordLc + i)) break; else end = i; i = platform->nextGlyph (wordLc, i); } end = platform->nextGlyph (wordLc, end); int nextStart; if (wordLc[end]) { nextStart = platform->nextGlyph (wordLc, end); wordLc[end] = 0; } else nextStart = end; hyphenateSingleWord (platform, wordLc + start, start, &breakPos); start = nextStart; } free (wordLc); *numBreaks = breakPos.size (); if (*numBreaks == 0) return NULL; else { return breakPos.detachArray (); } } /** * Hyphenate a single word, which only consists of lowercase * characters. Store break positions + "offset" in "breakPos". */ void Hyphenator::hyphenateSingleWord(core::Platform *platform, char *wordLc, int offset, SimpleVector *breakPos) { // If the word is an exception, get the stored points. Vector *exceptionalBreaks; ConstString key (wordLc); if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) { for (int i = 0; i < exceptionalBreaks->size(); i++) { breakPos->increase (); breakPos->set (breakPos->size() - 1, exceptionalBreaks->get(i)->getValue() + offset); } return; } // trie == NULL means that there is no pattern file. if (trie == NULL) return; char *work = new char[strlen (wordLc) + 3]; strcpy (work, "."); strcat (work, wordLc); strcat (work, "."); int l = strlen (work); SimpleVector points (l + 1); points.setSize (l + 1, 0); for (int i = 0; i < l; i++) { int state = trie->root; for (int j = i; j < l && trie->validState (state); j++) { const char *p = trie->getData((unsigned char) work[j], &state); if (p) { for (int k = 0; p[k]; k++) points.set(i + k, lout::misc::max (points.get (i + k), p[k] - '0')); } } } delete[] work; // No hyphens in the first two chars or the last two. // Characters are not bytes, so UTF-8 characters must be counted. const char *s = nextUtf8Char (wordLc); if (s != NULL && (s = nextUtf8Char (s)) != NULL) { // First two characters. int bytesStart = s - wordLc; for (int i = 0; i < bytesStart; i++) points.set (i + 1, 0); // Last two characters: instead of iterating back from the end, // we simply iterate from the start to the end and count the // characters. int lenBytes = strlen (wordLc); int lenUtf8 = numUtf8Chars (wordLc); int bytesEnd = 0; s = wordLc; for (int i = 0; s; s = nextUtf8Char (s), i++) { if (i == lenUtf8 - 2) bytesEnd = lenBytes - (s - wordLc); } for (int i = 0; i < bytesEnd; i++) points.set (points.size() - 2 - i, 0); } // Examine the points to build the break point list. int n = lout::misc::min ((int)strlen (wordLc), points.size () - 2); for (int i = 0; i < n; i++) { if (points.get(i + 2) % 2) { breakPos->increase (); breakPos->set (breakPos->size() - 1, i + 1 + offset); } } } Trie::TrieNode TrieBuilder::trieNodeNull = {'\0', 0, NULL}; TrieBuilder::TrieBuilder (int pack) { this->pack = pack; dataList = new SimpleVector (10000); stateStack = new SimpleVector (10); tree = new SimpleVector (20000); dataZone = new ZoneAllocator (1024); stateStackPush(0); } TrieBuilder::~TrieBuilder () { delete dataList; delete stateStack; delete tree; delete dataZone; } void TrieBuilder::insert (const char *key, const char *value) { dataList->increase (); dataList->getLastRef ()->key = (unsigned char *) strdup(key); dataList->getLastRef ()->value = dataZone->strdup (value); } int TrieBuilder::keyCompare (const void *p1, const void *p2) { DataEntry *pd1 = (DataEntry *) p1; DataEntry *pd2 = (DataEntry *) p2; return strcmp ((char *) pd1->key, (char *) pd2->key); } int TrieBuilder::insertState (StackEntry *state, bool root) { int i, j; if (state->count == 0) return 0; if (root) { i = 0; // we reseve slot 0 for the root state } else { /* The bigger pack is the more slots we check and the smaller * the trie will be, but CPU consumption also increases. * Reasonable values for pack seemt to be between 256 and 1024. */ i = tree->size () - pack + 2 * state->count; if (i < 256) // reserve first 256 entries for the root state i = 256; } for (;; i++) { if (i + 256 > tree->size ()) tree->setSize (i + 256, trieNodeNull); for (j = 1; j < 256; j++) { Trie::TrieNode *tn = tree->getRef(i + j); if (tn->c == j || ((state->next[j] || state->data[j]) && tn->c != 0)) break; } if (j == 256) // found a suitable slot break; } for (int j = 1; j < 256; j++) { Trie::TrieNode *tn = tree->getRef(i + j); if (state->next[j] || state->data[j]) { tn->c = j; tn->next = state->next[j]; tn->data = state->data[j]; } } assert (root || i >= 256); assert (!root || i == 0); return i; } void TrieBuilder::stateStackPush (unsigned char c) { stateStack->increase (); StackEntry *e = stateStack->getLastRef (); memset (e, 0, sizeof (StackEntry)); e->c = c; } int TrieBuilder::stateStackPop () { int next = insertState (stateStack->getLastRef (), stateStack->size () == 1); unsigned char c = stateStack->getLastRef ()->c; const char *data = stateStack->getLastRef ()->data1; stateStack->setSize (stateStack->size () - 1); if (stateStack->size () > 0) { assert (stateStack->getLastRef ()->next[c] == 0); assert (stateStack->getLastRef ()->data[c] == NULL); stateStack->getLastRef ()->next[c] = next; stateStack->getLastRef ()->data[c] = data; stateStack->getLastRef ()->count++; } return next; } Trie *TrieBuilder::createTrie () { // we need to sort the patterns as byte strings not as unicode qsort (dataList->getArray (), dataList->size (), sizeof (DataEntry), keyCompare); for (int i = 0; i < dataList->size (); i++) { insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value); free (dataList->getRef (i)->key); } while (stateStack->size ()) stateStackPop (); int size = tree->size (); Trie *trie = new Trie(tree->detachArray(), size, true, dataZone); dataZone = NULL; return trie; } void TrieBuilder::insertSorted (unsigned char *s, const char *data) { int len = strlen((char*)s); for (int i = 0; i < len; i++) { if (stateStack->size () > i + 1 && stateStack->getRef (i + 1)->c != s[i]) { for (int j = stateStack->size () - 1; j >= i + 1; j--) stateStackPop(); } if (i + 1 >= stateStack->size ()) stateStackPush(s[i]); } while (stateStack->size () > len + 1) stateStackPop(); assert (stateStack->size () == len + 1); stateStack->getLastRef ()->data1 = data; } Trie::Trie (TrieNode *array, int size, bool freeArray, ZoneAllocator *dataZone) { this->array = array; this->size = size; this->freeArray = freeArray; this->dataZone = dataZone; } Trie::~Trie () { delete dataZone; if (freeArray) free(array); } void Trie::save (FILE *file) { for (int i = 0; i < size; i++) { Trie::TrieNode *tn = &array[i]; if (tn->data) fprintf(file, "%u, %u, %s\n", tn->c, tn->next, tn->data); else fprintf(file, "%u, %u\n", tn->c, tn->next); } } int Trie::load (FILE *file) { int next, c, maxNext = 0; SimpleVector tree (100); dataZone = new ZoneAllocator (1024); while (!feof (file)) { char buf[LEN + 1]; char *s = fgets (buf, LEN, file); if (!s) continue; char data[LEN + 1]; int n = sscanf (s, "%u, %u, %s", &c, &next, data); if (n >= 2 && c >= 0 && c < 256 && next >= 0) { tree.increase (); tree.getLastRef ()->c = c; tree.getLastRef ()->next = next; if (n >= 3) tree.getLastRef ()->data = dataZone->strdup (data); else tree.getLastRef ()->data = NULL; if (next > maxNext) maxNext = next; } else { goto error; } } if (maxNext >= tree.size ()) goto error; size = tree.size (); array = tree.detachArray (); freeArray = true; return 0; error: delete dataZone; dataZone = NULL; return 1; } } // namespace dw