Generalization of soft-hyphen handling to "hard" hyphens, em-dashes and similar dividing characters. For now, it works for soft hyphens only.

author: Sebastian Geerken <devnull@localhost> 2012-11-09 21:00:10 +0100
committer: Sebastian Geerken <devnull@localhost> 2012-11-09 21:00:10 +0100
commit: ba12d00e4304138098beec462bd9c7ab5063b0bd (patch)
tree: bfe3c1fbefc8fbb3822882da1302c42c0a266b30
parent: 5e000618f0774aa24605ab984c46411e745a1574 (diff)
3 files changed, 141 insertions, 55 deletions
diff --git a/dw/textblock.cc b/dw/textblock.cc
index 095ad219..367e154a 100644
--- a/dw/textblock.cc
+++ b/dw/textblock.cc
@@ -40,6 +40,10 @@ namespace dw {
 
 int Textblock::CLASS_ID = -1;
 
+Textblock::DivSign Textblock::divSigns[NUM_DIV_SIGNS] = {
+   { "\xc2\xad", true, false, PENALTY_HYPHEN, -1 }
+};
+
 Textblock::Textblock (bool limitTextWidth, int penaltyHyphen)
 {
    registerName ("dw::Textblock", &CLASS_ID);
@@ -86,7 +90,7 @@ Textblock::Textblock (bool limitTextWidth, int penaltyHyphen)
    availDescent = 0;
 
    this->limitTextWidth = limitTextWidth;
-   this->penaltyHyphen = penaltyHyphen;
+   penalties[PENALTY_HYPHEN] = penaltyHyphen;
 
    for (int layer = 0; layer < core::HIGHLIGHT_NUM_LAYERS; layer++) {
       /* hlStart[layer].index > hlEnd[layer].index means no highlighting */
@@ -1418,77 +1422,146 @@ void Textblock::calcTextSize (const char *text, size_t len,
 }
 
 /**
- * Add a word to the page structure. If it contains soft hyphens, it is
- * divided.
+ * Add a word to the page structure. If it contains dividing
+ * characters (hard or soft hyphens, em-dashes, etc.), it is divided.
  */
 void Textblock::addText (const char *text, size_t len,
                          core::style::Style *style)
 {
    PRINTF ("[%p] ADD_TEXT (%d characters)\n", this, (int)len);
 
-   // Count hyphens.
-   int numHyphens = 0;
-   for (int i = 0; i < (int)len - 1; i++)
-      // (0xc2, 0xad) is the UTF-8 representation of a soft hyphen (Unicode
-      // 0xc2).
-      if((unsigned char)text[i] == 0xc2 && (unsigned char)text[i + 1] == 0xad)
-         numHyphens++;
-
-   if (numHyphens == 0) {
-      // Simple (and common) case: no soft hyphens. May still be hyphenated
-      // automatically.
+   // Count dividing characters.
+   int numParts = 1;
+
+   for (int i = 0; i < (int)len;
+        i < (int)len && (i = layout->nextGlyph (text, i))) {
+      int foundDiv = -1;
+      for (int j = 0; foundDiv == -1 && j < NUM_DIV_SIGNS; j++) {
+         int lDiv = strlen (divSigns[j].s);
+         if (i <= (int)len - lDiv) {
+            if (memcmp (text + i, divSigns[j].s, lDiv * sizeof (char)) == 0)
+               foundDiv = j;
+         }
+      }
+
+      if (foundDiv != -1) {
+         if (divSigns[foundDiv].penaltyIndexLeft != -1)
+            numParts ++;
+         if (divSigns[foundDiv].penaltyIndexRight != -1)
+            numParts ++;
+      }
+   }
+
+   if (numParts == 1) {
+      // Simple (and common) case: no dividing characters. May still
+      // be hyphenated automatically.
       core::Requisition size;
       calcTextSize (text, len, style, &size);
       addText0 (text, len, true, style, &size);
    } else {
-      PRINTF("HYPHENATION: '");
+      PRINTF ("HYPHENATION: '");
       for (size_t i = 0; i < len; i++)
          PUTCHAR(text[i]);
-      PRINTF("', with %d hyphen(s)\n", numHyphens);
+      PRINTF ("', with %d parts\n", numParts);
 
       // Store hyphen positions.
-      int n = 0, hyphenPos[numHyphens], breakPos[numHyphens];
-      for (size_t i = 0; i < len - 1; i++)
-         if((unsigned char)text[i] == 0xc2 &&
-            (unsigned char)text[i + 1] == 0xad) {
-            hyphenPos[n] = i;
-            breakPos[n] = i - 2 * n;
-            n++;
+      int n = 0, totalLenSignRemoved = 0;
+      int partPenalty[numParts], partStart[numParts], partEnd[numParts];
+      partPenalty[numParts - 1] = INT_MAX;
+      partStart[0] = 0;
+      partEnd[numParts - 1] = len;
+
+      for (int i = 0; i < (int)len;
+           i < (int)len && (i = layout->nextGlyph (text, i))) {
+         int foundDiv = -1;
+         for (int j = 0; foundDiv == -1 && j < NUM_DIV_SIGNS; j++) {
+            int lDiv = strlen (divSigns[j].s);
+            if (i <= (int)len - lDiv) {
+               if (memcmp (text + i, divSigns[j].s, lDiv * sizeof (char)) == 0)
+                  foundDiv = j;
+            }
+         }
+         
+         if (foundDiv != -1) {
+            int lDiv = strlen (divSigns[foundDiv].s);
+            
+            if (divSigns[foundDiv].signRemoved) {
+               assert (divSigns[foundDiv].penaltyIndexLeft != -1);
+               assert (divSigns[foundDiv].penaltyIndexRight == -1);
+
+               partPenalty[n] = penalties[divSigns[foundDiv].penaltyIndexLeft];
+               partEnd[n] = i;
+               partStart[n + 1] = i + lDiv;
+               n++;
+               totalLenSignRemoved += lDiv;
+            } else {
+               assert (divSigns[foundDiv].penaltyIndexLeft != -1 ||
+                       divSigns[foundDiv].penaltyIndexRight != -1);
+
+               if (divSigns[foundDiv].penaltyIndexLeft != -1) {
+                  partPenalty[n] =
+                     penalties[divSigns[foundDiv].penaltyIndexLeft];
+                  partEnd[n] = i;
+                  partStart[n + 1] = i;
+                  n++;
+               }
+
+               if (divSigns[foundDiv].penaltyIndexRight != -1) {
+                  partPenalty[n] =
+                     penalties[divSigns[foundDiv].penaltyIndexRight];
+                  partEnd[n] = i + lDiv;
+                  partStart[n + 1] = i + lDiv;
+                  n++;
+               }
+            }
          }
+      }
+
+      // Get text without removed characters, e. g. hyphens.
+      const char *textWithoutHyphens;
+      char textWithoutHyphensBuf[len - totalLenSignRemoved];
+      int *partEndWithoutHyphens, partEndWithoutHyphensBuf[numParts];
 
-      // Get text without hyphens. (There are numHyphens + 1 parts in the word,
-      // and 2 * numHyphens bytes less, 2 for each hyphen, are needed.)
-      char textWithoutHyphens[len - 2 * numHyphens];
-      int start = 0; // related to "text"
-      for (int i = 0; i < numHyphens + 1; i++) {
-         int end = (i == numHyphens) ? len : hyphenPos[i];
-         memmove (textWithoutHyphens + start - 2 * i, text + start,
-                  end - start);
-         start = end + 2;
+      if (totalLenSignRemoved == 0) {
+         // No removed characters: take original arrays.
+         textWithoutHyphens = text;
+         partEndWithoutHyphens = partEnd;
+      } else {
+         // Copy into special buffers.
+         textWithoutHyphens = textWithoutHyphensBuf;
+         partEndWithoutHyphens = partEndWithoutHyphensBuf;
+
+         int n = 0;
+         for (int i = 0; i < numParts; i++) {
+            memmove (textWithoutHyphensBuf + n, text + partStart[i],
+                     partEnd[i] - partStart[i]);
+            n += partEnd[i] - partStart[i];
+            partEndWithoutHyphensBuf[i] = n;
+         }
       }
 
       PRINTF("H... without hyphens: '");
-      for (size_t i = 0; i < len - 2 * numHyphens; i++)
+      for (size_t i = 0; i < len - totalLenSignRemoved; i++)
          PUTCHAR(textWithoutHyphens[i]);
       PRINTF("'\n");
 
-      core::Requisition wordSize[numHyphens + 1];
-      calcTextSizes (textWithoutHyphens, len - 2 * numHyphens, style,
-                     numHyphens, breakPos, wordSize);
+      core::Requisition wordSize[numParts];
+      calcTextSizes (textWithoutHyphens, len - totalLenSignRemoved, style,
+                     numParts, partEndWithoutHyphens, wordSize);
 
       // Finished!
-      for (int i = 0; i < numHyphens + 1; i++) {
-         int start = (i == 0) ? 0 : hyphenPos[i - 1] + 2;
-         int end = (i == numHyphens) ? len : hyphenPos[i];
-         // Do not anymore hyphen automatically.
-         addText0 (text + start, end - start, false, style, &wordSize[i]);
+      for (int i = 0; i < numParts; i++) {
+         // Do not hyphenate automatically anymore. TODO Sometimes do.
+         addText0 (text + partStart[i], partEnd[i] - partStart[i],
+                   false, style, &wordSize[i]);
 
          PRINTF("H... [%d] '", i);
-         for (int j = start; j < end; j++)
+         for (int j = partStart[i]; j < partEnd[i]; j++)
             PUTCHAR(text[j]);
          PRINTF("' added\n");
 
-         if(i < numHyphens) {
+         if(i < numParts - 1) {
+            // TODO Here again. Consider also penalties.
             addHyphen ();
             PRINTF("H... yphen added\n");
          }
@@ -1498,22 +1571,23 @@ void Textblock::addText (const char *text, size_t len,
 
 void Textblock::calcTextSizes (const char *text, size_t textLen,
                                core::style::Style *style,
-                               int numBreaks, int *breakPos,
+                               int numParts, int *partEnd,
                                core::Requisition *wordSize)
 {
    // The size of the last part is calculated in a simple way.
-   int lastStart = breakPos[numBreaks - 1];
+   int lastStart = partEnd[numParts - 2];
    calcTextSize (text + lastStart, textLen - lastStart, style,
-                 &wordSize[numBreaks]);
+                 &wordSize[numParts - 1]);
 
    PRINTF("H... [%d] '", numBreaks);
    for (size_t i = 0; i < textLen - lastStart; i++)
       PUTCHAR(text[i + lastStart]);
    PRINTF("' -> %d\n", wordSize[numBreaks].width);
 
-   // The rest is more complicated. TODO Documentation.
-   for (int i = numBreaks - 1; i >= 0; i--) {
-      int start = (i == 0) ? 0 : breakPos[i - 1];
+   // The rest is more complicated. See dw-line-breaking, section
+   // "Hyphens".
+   for (int i = numParts - 2; i >= 0; i--) {
+      int start = (i == 0) ? 0 : partEnd[i - 1];
       calcTextSize (text + start, textLen - start, style, &wordSize[i]);
 
       PRINTF("H... [%d] '", i);
@@ -1521,7 +1595,7 @@ void Textblock::calcTextSizes (const char *text, size_t textLen,
          PUTCHAR(text[j + start]);
       PRINTF("' -> %d\n", wordSize[i].width);
 
-      for (int j = i + 1; j < numBreaks + 1; j++) {
+      for (int j = i + 1; j < numParts; j++) {
          wordSize[i].width -= wordSize[j].width;
          PRINTF("H...    - %d = %d\n", wordSize[j].width, wordSize[i].width);
       }
@@ -1707,7 +1781,7 @@ void Textblock::addHyphen ()
    if (wordIndex >= 0) {
       Word *word = words->getRef(wordIndex);
  
-      word->badnessAndPenalty.setPenalty (penaltyHyphen);
+      word->badnessAndPenalty.setPenalty (penalties[PENALTY_HYPHEN]);
       // TODO Optimize? Like spaces?
       word->hyphenWidth = layout->textWidth (word->style->font, "\xc2\xad", 2);
 
diff --git a/dw/textblock.hh b/dw/textblock.hh
index cc11eb9c..2a0eda0f 100644
--- a/dw/textblock.hh
+++ b/dw/textblock.hh
@@ -211,6 +211,18 @@ private:
       void print ();
    };
 
+   enum { PENALTY_HYPHEN, PENALTY_NUM };
+   enum { NUM_DIV_SIGNS = 1 };
+
+   typedef struct
+   {
+      const char *s;
+      bool signRemoved, canBeHyphenated;
+      int penaltyIndexLeft, penaltyIndexRight;
+   } DivSign;
+
+   static DivSign divSigns[NUM_DIV_SIGNS];
+
 protected:
    struct Line
    {
@@ -355,12 +367,12 @@ protected:
    bool mustQueueResize;
 
    /**
-    * The penalty for hyphens, multiplied by 100. So, 100 means
-    * 1.0. INT_MAX and INT_MIN are also allowed. See
+    * The penalties for hyphens and other dividing characters, multiplied
+    * by 100. So, 100 means 1.0. INT_MAX and INT_MIN are also allowed. See
     * dw::Textblock::BadnessAndPenalty::setPenalty for more
     * details. Set from preferences.
     */
-   int penaltyHyphen;
+   int penalties[PENALTY_NUM];
 
    bool limitTextWidth; /* from preferences */
 
diff --git a/dw/textblock_linebreaking.cc b/dw/textblock_linebreaking.cc
index ebe3d534..d111a5b5 100644
--- a/dw/textblock_linebreaking.cc
+++ b/dw/textblock_linebreaking.cc
@@ -699,7 +699,7 @@ int Textblock::hyphenateWord (int wordIndex)
          // Note: there are numBreaks + 1 word parts.
          if (i < numBreaks) {
             // TODO There should be a method fillHyphen.
-            w->badnessAndPenalty.setPenalty (penaltyHyphen);
+            w->badnessAndPenalty.setPenalty (penalties[PENALTY_HYPHEN]);
             w->hyphenWidth =
                layout->textWidth (origWord.style->font, "\xc2\xad", 2);
             PRINTF ("      [%d] + hyphen\n", wordIndex + i);
author	Sebastian Geerken <devnull@localhost>	2012-11-09 21:00:10 +0100
committer	Sebastian Geerken <devnull@localhost>	2012-11-09 21:00:10 +0100
commit	ba12d00e4304138098beec462bd9c7ab5063b0bd (patch)
tree	bfe3c1fbefc8fbb3822882da1302c42c0a266b30
parent	5e000618f0774aa24605ab984c46411e745a1574 (diff)