summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: corvid <corvid@lavabit.com> 2009-05-29 16:17:06 -0400
committer: corvid <corvid@lavabit.com> 2009-05-29 16:17:06 -0400
commit: afc695fb7f3db2a69840a94a56ddfacd26a9cedd (patch)
tree: 26292d30f610efb86a26c80d9de6e8b44d1feeec
parent: b130bd0e45c5bca8e6f703ad3d1e72ddbec71255 (diff)
Improved recognition heuristics for plain text in utf8
-rw-r--r-- src/misc.c 12
1 files changed, 8 insertions, 4 deletions
diff --git a/src/misc.c b/src/misc.c
index 27b7ff10..c28c359c 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -19,7 +19,7 @@
#include "utf8.hh"
#include "msg.h"
#include "misc.h"
-
+#include "utf8.hh"
/*
* Escape characters as %XX sequences.
@@ -169,12 +169,16 @@ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
if (ch > 190)
++non_ascci_text;
}
- if (bin_chars == 0) {
+ if (bin_chars == 0 && (non_ascci - non_ascci_text) <= Size/10) {
/* Let's say text: if "rare" chars are <= 10% */
- if ((non_ascci - non_ascci_text) <= Size/10)
+ Type = DT_TEXT_PLAIN;
+ } else if (Size > 0) {
+ /* a special check for UTF-8 */
+ Size = a_Utf8_end_of_char(p, Size - 1) + 1;
+ if (a_Utf8_test(p, Size) > 0)
Type = DT_TEXT_PLAIN;
}
- if (Size == 256)
+ if (Size >= 256)
st = 0;
}