diff options
Diffstat (limited to 'dw/hyphenator.hh')
-rw-r--r-- | dw/hyphenator.hh | 88 |
1 files changed, 82 insertions, 6 deletions
diff --git a/dw/hyphenator.hh b/dw/hyphenator.hh
index 47530467..eadcf081 100644
--- a/dw/hyphenator.hh
+++ b/dw/hyphenator.hh
@@ -7,9 +7,84 @@
 
 namespace dw {
 
+class Trie {  // Trie kept in one flat TrieNode array; slots are addressed as state + character (see getData).
+ public:
+   struct TrieNode {
+      unsigned char c;   // character this slot is for; getData() requires it to equal the looked-up character
+      uint16_t next;     // follow-up state; getData() maps 0 to "no follow-up" (-1)
+      const char *data;  // value getData() returns for a matching slot
+   };
+   // Members below are presumably initialised from the constructor arguments of the same names.
+ private:
+   TrieNode *array;  // node slots, indexed by state + character in getData()
+   int size;         // exclusive upper bound for valid states (see validState)
+   bool freeArray;   // presumably: whether the destructor frees array -- TODO confirm
+   lout::misc::ZoneAllocator *dataZone;  // presumably owns the strings TrieNode::data points to -- TODO confirm
+   // All constructor arguments have defaults: NULL array, size 0, freeArray false, NULL dataZone.
+ public:
+   Trie (TrieNode *array = NULL, int size = 0, bool freeArray = false,
+         lout::misc::ZoneAllocator *dataZone = NULL);
+   ~Trie ();
+   // root (0) is presumably the state a lookup starts from; validState() accepts 0 <= state < size.
+   static const int root = 0;
+   inline bool validState (int state) { return state >= 0 && state < size; };
+   inline const char *getData (unsigned char c, int *state)  // Looks up c from *state; *state is updated in place.
+   {  // Returns the matching slot's data, or NULL when *state is invalid or no slot matches.
+      if (!validState (*state))  // invalid state: *state is left unchanged
+         return NULL;
+      // Candidate slot sits at offset *state + c. NOTE(review): only *state is range-checked, not *state + c -- confirm the array has room.
+      TrieNode *tn = array + *state + c;
+      // The slot matches only if it records the same character.
+      if (tn->c == c) {
+         *state = tn->next > 0 ? tn->next : -1;  // next == 0 means no follow-up state
+         return tn->data;
+      } else {
+         *state = -1;  // no match: mark the state invalid
+         return NULL;
+      }
+   };
+   void save (FILE *file);  // definition not shown; presumably writes the trie to file -- TODO confirm
+   int load (FILE *file);   // definition not shown; presumably reads a trie from file, meaning of the int result unknown here
+};
+// TrieBuilder takes key/value strings through insert() and hands out a Trie* from createTrie(); definitions are not part of this header.
+class TrieBuilder {
+ private:
+   struct StackEntry {  // per-character tables of 256 entries each (next, data)
+      unsigned char c;
+      int count;
+      int next[256];
+      const char *data[256];
+      const char *data1;
+   };
+   // A key with its value; element type of dataList.
+   struct DataEntry {
+      unsigned char *key;
+      const char *value;
+   };
+
+   int pack;  // presumably the constructor's pack argument; meaning not visible here
+   static Trie::TrieNode trieNodeNull;
+   lout::misc::SimpleVector <Trie::TrieNode> *tree;
+   lout::misc::SimpleVector <DataEntry> *dataList;
+   lout::misc::SimpleVector <StackEntry> *stateStack;
+   lout::misc::ZoneAllocator *dataZone;
+
+   static int keyCompare (const void *p1, const void *p2);
+   void stateStackPush (unsigned char c);
+   int stateStackPop ();
+   int insertState (StackEntry *state, bool root);
+   void insertSorted (unsigned char *key, const char *value);
+
+ public:
+   TrieBuilder (int pack);
+   ~TrieBuilder ();
+
+   void insert (const char *key, const char *value);
+   Trie *createTrie();  // returns a Trie*; ownership of the result is not visible here
+};
+
 class Hyphenator: public lout::object::Object
 {
-private:
    static lout::container::typed::HashTable
    <lout::object::TypedPair <lout::object::TypedPointer <core::Platform>,
                              lout::object::ConstString>,
@@ -21,26 +96,27 @@ private:
     * independent, but based on UTF-8. Clarify? Change?
     */
    core::Platform *platform;
-   lout::container::typed::HashTable <lout::object::Integer,
-                                      lout::container::typed::Collection
-                                      <lout::object::Integer> > *tree;
+   Trie *trie;  // saveTrie() forwards to trie->save(); presumably filled from the patterns given to insertPattern() -- TODO confirm
+
    lout::container::typed::HashTable <lout::object::ConstString,
                                       lout::container::typed::Vector
                                       <lout::object::Integer> > *exceptions;
-   void insertPattern (char *s);
+
+   void insertPattern (TrieBuilder *trieBuilder, char *s);  // definition not shown; presumably adds pattern s to trieBuilder
    void insertException (char *s);
 
    bool isCharPartOfActualWord (char *s);
 
 public:
    Hyphenator (core::Platform *platform,
-               const char *patFile, const char *excFile);
+               const char *patFile, const char *excFile, int pack = 256);  // pack defaults to 256; presumably handed to TrieBuilder -- TODO confirm
    ~Hyphenator();
 
    static Hyphenator *getHyphenator (core::Platform *platform,
                                      const char *language);
    static bool isHyphenationCandidate (const char *word);
    int *hyphenateWord(const char *word, int *numBreaks);
+   void saveTrie (FILE *fp) { trie->save (fp); };  // forwards to Trie::save(); trie is not checked for NULL here
 };
 
 } // namespace dw