about | summary | refs | log | tree | commit | diff
path: root/dw/hyphenator.hh
diff options
context:
space:
mode:
Diffstat (limited to 'dw/hyphenator.hh')
-rw-r--r-- dw/hyphenator.hh 88
1 files changed, 82 insertions, 6 deletions
diff --git a/dw/hyphenator.hh b/dw/hyphenator.hh
index 47530467..eadcf081 100644
--- a/dw/hyphenator.hh
+++ b/dw/hyphenator.hh
@@ -7,9 +7,84 @@
namespace dw {
+class Trie { // Trie stored as one flat TrieNode array; a lookup state is an integer index into it (see getData).
+ public:
+ struct TrieNode { // One slot of the flat node array.
+ unsigned char c; // Byte this slot was stored for; getData compares it with the byte being looked up.
+ uint16_t next; // Index of the follow-up state; getData treats 0 as "no follow-up". 16 bits wide, so it cannot hold an index above 65535.
+ const char *data; // Payload handed back by getData when the slot matches.
+ };
+
+ private:
+ TrieNode *array; // Node array, addressed as array[state + c] in getData.
+ int size; // Exclusive upper bound for a valid state (see validState).
+ bool freeArray; // NOTE(review): presumably tells the destructor whether to release `array` -- confirm in hyphenator.cc.
+ lout::misc::ZoneAllocator *dataZone; // NOTE(review): presumably the storage behind the `data` strings -- confirm in hyphenator.cc.
+
+ public:
+ Trie (TrieNode *array = NULL, int size = 0, bool freeArray = false, // All arguments are optional; the defaults describe an empty trie (no array, size 0).
+ lout::misc::ZoneAllocator *dataZone = NULL);
+ ~Trie (); // NOTE(review): no copy constructor/assignment is declared next to this destructor; if it frees `array`, copying a Trie would be unsafe -- confirm.
+
+ static const int root = 0; // State index at which a lookup starts.
+ inline bool validState (int state) { return state >= 0 && state < size; }; // True when state lies in [0, size).
+ inline const char *getData (unsigned char c, int *state) // Follows byte c from *state: on a match returns the slot's data and advances *state, otherwise returns NULL.
+ {
+ if (!validState (*state)) // Also rejects the -1 sentinel written below.
+ return NULL;
+
+ TrieNode *tn = array + *state + c; // Slot for byte c sits c entries after the state's index. NOTE(review): only *state is range-checked, not *state + c; presumably the array is padded by the builder -- confirm.
+
+ if (tn->c == c) { // The slot counts as a match only if it records the same byte.
+ *state = tn->next > 0 ? tn->next : -1; // next == 0 means no follow-up state; -1 makes the next call fail validState.
+ return tn->data;
+ } else {
+ *state = -1; // No match: invalidate the state so later calls return NULL.
+ return NULL;
+ }
+ };
+ void save (FILE *file); // Declared only; body is outside this header.
+ int load (FILE *file); // Declared only; the meaning of the int result is not visible here.
+};
+
+class TrieBuilder { // Accepts key/value pairs through insert() and hands out a Trie through createTrie().
+ private:
+ struct StackEntry { // Element type of stateStack.
+ unsigned char c; // NOTE(review): presumably the byte that led to this state (cf. stateStackPush (unsigned char c)) -- confirm.
+ int count;
+ int next[256]; // 256 entries, i.e. one per possible unsigned char value.
+ const char *data[256]; // Same 256-entry layout as next.
+ const char *data1;
+ };
+
+ struct DataEntry { // Element type of dataList: one key/value pair.
+ unsigned char *key;
+ const char *value;
+ };
+
+ int pack; // NOTE(review): presumably set from the constructor argument of the same name; its meaning is not visible in this header.
+ static Trie::TrieNode trieNodeNull; // Shared static node; NOTE(review): presumably the filler for unused array slots -- confirm.
+ lout::misc::SimpleVector <Trie::TrieNode> *tree; // Vector of trie nodes under construction.
+ lout::misc::SimpleVector <DataEntry> *dataList; // Vector of key/value pairs.
+ lout::misc::SimpleVector <StackEntry> *stateStack; // Vector used with stateStackPush / stateStackPop.
+ lout::misc::ZoneAllocator *dataZone; // Same allocator type that the Trie constructor accepts as dataZone.
+
+ static int keyCompare (const void *p1, const void *p2); // Signature matches a qsort()-style comparator; NOTE(review): presumably orders DataEntry items by key -- confirm.
+ void stateStackPush (unsigned char c);
+ int stateStackPop ();
+ int insertState (StackEntry *state, bool root);
+ void insertSorted (unsigned char *key, const char *value);
+
+ public:
+ TrieBuilder (int pack);
+ ~TrieBuilder ();
+
+ void insert (const char *key, const char *value); // Adds one key/value pair.
+ Trie *createTrie(); // Returns a pointer to a Trie; NOTE(review): ownership of the result is not stated here -- confirm.
+};
+
class Hyphenator: public lout::object::Object
{
-private:
static lout::container::typed::HashTable
<lout::object::TypedPair <lout::object::TypedPointer <core::Platform>,
lout::object::ConstString>,
@@ -21,26 +96,27 @@ private:
* independent, but based on UTF-8. Clarify? Change?
*/
core::Platform *platform;
- lout::container::typed::HashTable <lout::object::Integer,
- lout::container::typed::Collection
- <lout::object::Integer> > *tree;
+ Trie *trie;
+
lout::container::typed::HashTable <lout::object::ConstString,
lout::container::typed::Vector
<lout::object::Integer> > *exceptions;
- void insertPattern (char *s);
+
+ void insertPattern (TrieBuilder *trieBuilder, char *s);
void insertException (char *s);
bool isCharPartOfActualWord (char *s);
public:
Hyphenator (core::Platform *platform,
- const char *patFile, const char *excFile);
+ const char *patFile, const char *excFile, int pack = 256);
~Hyphenator();
static Hyphenator *getHyphenator (core::Platform *platform,
const char *language);
static bool isHyphenationCandidate (const char *word);
int *hyphenateWord(const char *word, int *numBreaks);
+ void saveTrie (FILE *fp) { trie->save (fp); };
};
} // namespace dw