From afc695fb7f3db2a69840a94a56ddfacd26a9cedd Mon Sep 17 00:00:00 2001
From: corvid <corvid@lavabit.com>
Date: Fri, 29 May 2009 16:17:06 -0400
Subject: Improved recognition heuristics for plain text in utf8

---
 src/misc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'src/misc.c')

diff --git a/src/misc.c b/src/misc.c
index 27b7ff10..c28c359c 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -19,7 +19,7 @@
 #include "utf8.hh"
 #include "msg.h"
 #include "misc.h"
-
+#include "utf8.hh"
 
 /*
  * Escape characters as %XX sequences.
@@ -169,12 +169,16 @@ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
          if (ch > 190)
             ++non_ascci_text;
       }
-      if (bin_chars == 0) {
+      if (bin_chars == 0 && (non_ascci - non_ascci_text) <= Size/10) {
          /* Let's say text: if "rare" chars are <= 10% */
-         if ((non_ascci - non_ascci_text) <= Size/10)
+         Type = DT_TEXT_PLAIN;
+      } else if (Size > 0) {
+         /* a special check for UTF-8 */
+         Size = a_Utf8_end_of_char(p, Size - 1) + 1;
+         if (a_Utf8_test(p, Size) > 0)
             Type = DT_TEXT_PLAIN;
       }
-      if (Size == 256)
+      if (Size >= 256)
          st = 0;
    }
 
-- 
cgit v1.2.3