diff options
Diffstat (limited to 'common/rtfl_findrepeat.cc')
-rw-r--r-- | common/rtfl_findrepeat.cc | 534 |
1 files changed, 534 insertions, 0 deletions
diff --git a/common/rtfl_findrepeat.cc b/common/rtfl_findrepeat.cc new file mode 100644 index 0000000..d464db8 --- /dev/null +++ b/common/rtfl_findrepeat.cc @@ -0,0 +1,534 @@ +/* + * RTFL + * + * Copyright 2014, 2015 Sebastian Geerken <sgeerken@dillo.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/* + * This program searches for identical sequences in a stream of RTFL + * messages (actually, any stream) and marks the beginnings and ends + * with an RTFL mark (obj-mark). You should first filter out other + * lines, so run + * + * $ ... | grep '^\[rtfl[^\]]*\]' | rtfl-findrepeat + * + * or use the script rtfl-objfilter. + * + * For options, see printHelp(). + * + * Warning: This program is highly experimental, and especially rather + * inefficient. Some ideas: + * + * - When finding a suitable lenght ("-l find"), a smaller number of + * lines is in many cases sufficient, so use "head". + * + * - Unless the length is searched for ("-l find"), hashing multiple + * lines (as many as are searched as minimum), as done in the + * searching algorithm by Rabin and Karp, may increase the speed. + */ + +#include <unistd.h> +#include <string.h> +#include <stdio.h> +#include "tools.hh" +#include "../lout/object.hh" +#include "../lout/container.hh" + +using namespace lout::misc; +using namespace lout::object; +using namespace lout::container::typed; + +enum { MAX_LINE_SIZE = 1000 }; + +class Region: public Comparable +{ + int first, num; + +public: + inline Region (int first, int num) { this->first = first; this->num = num; } + + bool equals (Object *other); + int hashValue (); + void intoStringBuffer (StringBuffer *sb); + int compareTo(Comparable *other); + + inline int getFirst () { return first; } + inline int getNum () { return num; } + inline Region *cloneRegion () { return new Region (first, num); } + + inline bool subSetOf (Region *other) + { return first >= other->first && first + num <= other->first + other->num; } +}; + +class Mark: public Object +{ +public: + enum Type { START, END }; + +private: + Type type; + int majorNo, minorNo, length; + +public: + inline Mark (Type type, int majorNo, int minorNo, int length) + { this->type = type; this->majorNo = majorNo; this->minorNo = minorNo; + this->length = length; } + + void intoStringBuffer (StringBuffer *sb); + + inline Type getType () { return type; } + inline int getMajorNo () { return majorNo; } + inline int getMinorNo () { return minorNo; } + inline int getLength () { return length; } +}; + +bool Region::equals (Object *other) +{ + Region *otherRegion = (Region*)other; + return first == otherRegion->first && num == otherRegion->num; +} + +int Region::hashValue () +{ + return first ^ num; +} + +void Region::intoStringBuffer (StringBuffer *sb) +{ + char buf[32]; + + sb->append ("("); + snprintf (buf, 32, "%d", first); + sb->append (buf); + sb->append ("..."); + snprintf (buf, 32, "%d", first + num - 1); + sb->append (buf); + sb->append (")"); +} + +void Mark::intoStringBuffer (StringBuffer *sb) +{ + char buf[32]; + + + sb->append ("("); + sb->append (type == START ? "START" : "END"); + sb->append (" / "); + snprintf (buf, 32, "%d", majorNo); + sb->append (buf); + sb->append (" / "); + snprintf (buf, 32, "%d", minorNo); + sb->append (buf); + sb->append (")"); +} + +int Region::compareTo(Comparable *other) +{ + Region *otherRegion = (Region*)other; + return first - otherRegion->first; +} + +// ---------------------------------------------------------------------- + +static bool debug = false; + +static void printHelp (const char *argv0) +{ + fprintf + (stderr, "Usage: %s <options>\n" + "\n" + "Options:\n" + " -l <n> Search for sequence of at least <n> lines.\n" + " -c <n> Search for sequence repeated at least <n> times.\n" + "\n" + "If an arguments is 'f' or 'find', the maximal value for this is\n" + "determined (possibly with the other argument set to a concrete\n" + "number).\n" + "\n" + "See RTFL documentation for more details.\n", + argv0); +} + +// ---------------------------------------------------------------------- + +static void readFile (FILE *file, Vector<String> *lines, + HashTable<String, Vector<Integer> > *lineNosByLines) +{ + char buf[MAX_LINE_SIZE + 1]; + + for (int lineNo = 0; fgets (buf, MAX_LINE_SIZE + 1, file); lineNo++) { + size_t l = strlen (buf); + if (buf[l - 1] == '\n') buf[l - 1] = 0; + + String *line = new String (buf); + + Vector<Integer> *lineNos = lineNosByLines->get (line); + if (lineNos == NULL) { + lineNos = new Vector<Integer> (1, true); + // Note: key is dublicated. + lineNosByLines->put (new String (buf), lineNos); + } + lineNos->put (new Integer (lineNo)); + + lines->put (line); + } +} + +static int findRegions (Vector<String> *lines, + HashTable<String, Vector<Integer> > *lineNosByLines, + List <HashSet<Region> > *allSetsOfRegions, + int minLength, int minCount) +{ + int effMinLength = minLength == -1 ? 2 :minLength; + int effMinCount = minCount == -1 ? 2 : minCount; + int maxLength = 0, maxCount = 0; + + HashTable<Region, HashSet<Region> > *setsOfRegionsByRegion = + new HashTable<Region, HashSet<Region> > (false, false); + + List <HashSet<Region> > *tmpAllSetsOfRegions = + new List<HashSet<Region> > (false); + + for (int lineNo1 = 0; lineNo1 < lines->size (); lineNo1++) { + String *line = lines->get (lineNo1); + Vector<Integer> *lineNos = lineNosByLines->get (line); + + // Examine only lines after this. + Integer lineNo1Key (lineNo1); + + for (int linesNoIndex = lineNos->bsearch (&lineNo1Key, true) + 1; + linesNoIndex < lineNos->size (); linesNoIndex++) { + int lineNo2 = lineNos->get(linesNoIndex)->getValue (); + int numMatching = 1; + while (lineNo2 + numMatching < lines->size () && + lines->get(lineNo1 + numMatching)->equals + (lines->get(lineNo2 + numMatching))) { + numMatching++; + + if (numMatching >= effMinLength) { + //printf ("equal: (%d...%d) and (%d...%d)\n", + // lineNo1, lineNo1 + numMatching - 1, + // lineNo2, lineNo2 + numMatching - 1); + + Region r1 (lineNo1, numMatching), r2 (lineNo2, numMatching); + HashSet<Region> *setOfRegions; + + if ((setOfRegions = setsOfRegionsByRegion->get (&r1))) { + if (!setsOfRegionsByRegion->contains (&r2)) { + assert (!setOfRegions->contains (&r2)); + Region *rr2 = r2.cloneRegion (); + setOfRegions->put (rr2); + setsOfRegionsByRegion->put (rr2, setOfRegions); + } + } else if ((setOfRegions = setsOfRegionsByRegion->get (&r2))) { + if (!setsOfRegionsByRegion->contains (&r1)) { + assert (!setOfRegions->contains (&r1)); + Region *rr1 = r1.cloneRegion (); + setOfRegions->put (rr1); + setsOfRegionsByRegion->put (rr1, setOfRegions); + } + } else { + Region *rr1 = r1.cloneRegion (), *rr2 = r2.cloneRegion (); + setOfRegions = new HashSet<Region> (false); + setOfRegions->put (rr1); + setOfRegions->put (rr2); + setsOfRegionsByRegion->put (rr1, setOfRegions); + setsOfRegionsByRegion->put (rr2, setOfRegions); + tmpAllSetsOfRegions->append (setOfRegions); + } + + if (debug) { + StringBuffer sb; + setsOfRegionsByRegion->intoStringBuffer (&sb); + printf ("findRegions: setsOfRegionsByRegion = %s\n", + sb.getChars ()); + } + } + } + } + } + + delete setsOfRegionsByRegion; + + for (Iterator<HashSet<Region> > it1 = tmpAllSetsOfRegions->iterator (); + it1.hasNext (); ) { + HashSet<Region> *set = it1.getNext (); + if (set->size () >= effMinCount) { + allSetsOfRegions->append (set); + maxCount = max (maxCount, set->size ()); + + if (minLength == -1) { + for (Iterator<Region> it2 = set->iterator (); it2.hasNext (); ) { + Region *r = it2.getNext (); + maxLength = max (maxLength, r->getNum ()); + } + } + } else + delete set; + } + + delete tmpAllSetsOfRegions; + + if (minLength == -1) + return maxLength; + else if (minCount == -1) + return maxCount; + else + return -1; +} + +static void sortListsOfRegions (List <HashSet<Region> > *allSetsOfRegions, + List <Vector<Region> > *allListsOfRegions) +{ + for (Iterator<HashSet<Region> > it1 = allSetsOfRegions->iterator (); + it1.hasNext (); ) { + HashSet<Region> *set = it1.getNext (); + Vector<Region> *list = new Vector<Region> (1, true); + + for (Iterator<Region> it2 = set->iterator (); it2.hasNext (); ) { + Region *r = it2.getNext (); + list->put (r); + } + + if (debug) { + StringBuffer sb; + list->intoStringBuffer (&sb); + printf ("sortListsOfRegions: list = %s\n", sb.getChars ()); + } + + list->sort (); + allListsOfRegions->append (list); + } +} + +static void cleanupRegions (List <Vector <Region> > *allListsOfRegions) +{ + HashTable<List<Integer>, Vector<Vector<Region> > > *allListsOfLists = + new HashTable<List<Integer>, Vector<Vector<Region> > > (true, true); + + for (Iterator<Vector <Region> > it = allListsOfRegions->iterator (); + it.hasNext (); ) { + Vector<Region> *list = it.getNext (); + + List<Integer> *key = new List<Integer> (true); + for (int i = 1; i < list->size (); i++) + key->append (new Integer (list->get(i)->getFirst () - + list->get(i - 1)->getFirst ())); + + Vector<Vector<Region> > *listOfLists = allListsOfLists->get (key); + if (listOfLists) + delete key; + else { + listOfLists = new Vector<Vector<Region> > (1, false); + allListsOfLists->put (key, listOfLists); + } + + listOfLists->put (list); + } + + allListsOfRegions->clear (); + + if (debug) { + StringBuffer sb; + allListsOfLists->intoStringBuffer (&sb); + printf ("cleanupRegions: allListsOfLists = %s\n", sb.getChars ()); + } + + for (Iterator<List<Integer> > it = allListsOfLists->iterator (); + it.hasNext (); ) { + List<Integer> *key = it.getNext (); + Vector<Vector<Region> > *listOfLists = allListsOfLists->get (key); + + if (debug) { + StringBuffer sb; + listOfLists->intoStringBuffer (&sb); + printf ("cleanupRegions: listOfLists = %s\n", sb.getChars ()); + } + + for (int i = 0; i < listOfLists->size (); i++) { + Vector<Region> *list1 = listOfLists->get (i); + Region *r1 = list1->get (0); + bool redundant = false; + for (int j = 0; j < listOfLists->size () && !redundant; j++) { + if (i != j) { + Vector<Region> *list2 = listOfLists->get (j); + if (list2 != NULL) { + Region *r2 = list2->get (0); + if (r1->subSetOf (r2)) + redundant = true; + } + } + } + + if (redundant) { + listOfLists->put (NULL, i); + delete list1; + } else + allListsOfRegions->append (list1); + } + } + + delete allListsOfLists; +} + +// ---------------------------------------------------------------------- + +int main (int argc, char *argv[]) +{ + int minLength = 2, minCount = 2; + int opt; + + while ((opt = getopt(argc, argv, "c:dl:")) != -1) { + switch (opt) { + case 'c': + if (strcmp (optarg, "f") == 0 || strcmp (optarg, "find") == 0) + minCount = -1; + else + minCount = atoi (optarg); + break; + + case 'd': + debug = true; + break; + + case 'l': + if (strcmp (optarg, "f") == 0 || strcmp (optarg, "find") == 0) + minLength = -1; + else + minLength = atoi (optarg); + break; + + default: + printHelp (argv[0]); + return 1; + } + } + + Vector<String> *lines = new Vector<String> (8, true); + HashTable<String, Vector<Integer> > *lineNosByLines = + new HashTable<String, Vector<Integer> > (true, true); + + readFile (stdin, lines, lineNosByLines); + + List <HashSet<Region> > *allSetsOfRegions = + new List<HashSet<Region> > (true); + + int numFound = findRegions (lines, lineNosByLines, allSetsOfRegions, + minLength, minCount); + + if (debug) { + StringBuffer sb; + allSetsOfRegions->intoStringBuffer (&sb); + printf ("main: allSetsOfRegions = %s\n", sb.getChars ()); + } + + delete lineNosByLines; + + if (numFound != -1) { + delete allSetsOfRegions; + printf ("%d\n", numFound); + } else { + List <Vector<Region> > *allListsOfRegions = + new List <Vector<Region> > (false); // TODO Memory leak! + + sortListsOfRegions (allSetsOfRegions, allListsOfRegions); + + delete allSetsOfRegions; + + if (debug) { + StringBuffer sb; + allListsOfRegions->intoStringBuffer (&sb); + printf ("(a) main: allListsOfRegions = %s\n", sb.getChars ()); + } + + cleanupRegions (allListsOfRegions); + + if (debug) { + StringBuffer sb; + allListsOfRegions->intoStringBuffer (&sb); + printf ("(b) main: allListsOfRegions = %s\n", sb.getChars ()); + } + + HashTable<Integer, List<Mark> > *marksByLineNo = + new HashTable<Integer, List<Mark> > (true, true); + + int majorNo = 0; + for (Iterator<Vector<Region> > it1 = allListsOfRegions->iterator (); + it1.hasNext (); ) { + Vector<Region> *list = it1.getNext (); + int minorNo = 0; + + for (Iterator<Region> it2 = list->iterator (); it2.hasNext (); ) { + Region *r = it2.getNext (); + + for (int typeNo = 0; typeNo < 2; typeNo++) { + Mark::Type type = typeNo == 0 ? Mark::START : Mark::END; + int lineNo = + r->getFirst () + (type == Mark::START ? 0 : r->getNum ()); + Integer lineNoKey (lineNo); + + List<Mark> *list = marksByLineNo->get (&lineNoKey); + if (list == NULL) { + list = new List<Mark> (true); + marksByLineNo->put (new Integer (lineNo), list); + } + + list->append (new Mark (type, majorNo, minorNo, r->getNum ())); + } + + + minorNo++; + } + + majorNo++; + } + + delete allListsOfRegions; + + if (debug) { + StringBuffer sb; + marksByLineNo->intoStringBuffer (&sb); + printf ("main: marksByLineNo = %s\n", sb.getChars ()); + } + + for (int lineNo = 0; lineNo < lines->size (); lineNo++) { + Integer lineNoKey (lineNo); + List<Mark> *list = marksByLineNo->get (&lineNoKey); + if (list) { + for (Iterator<Mark> it = list->iterator (); it.hasNext (); ) { + Mark *m = it.getNext (); + char buf[200]; + rtfl::tools::numToRoman (m->getMajorNo () + 1, buf, + sizeof (buf)); + // Certainly no ':' or '\' in the message, so no quoting + // necessary. + printf ("[rtfl-obj-1.0]n:0:0:mark:findrepeat:findrepeat:0:" + "Sequence %s (length %d), %d%s occurence -- %s\n", + buf, m->getLength (), m->getMinorNo () + 1, + rtfl::tools::numSuffix (m->getMinorNo () + 1), + m->getType () == Mark::START ? "start" : "end"); + } + } + + String *line = lines->get (lineNo); + puts (line->chars ()); + } + + delete marksByLineNo; + } + + delete lines; +} |