summary refs log tree commit diff
path: root/src/misc.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/misc.c')
-rw-r--r-- src/misc.c 144
1 files changed, 126 insertions, 18 deletions
diff --git a/src/misc.c b/src/misc.c
index 1b70c1b6..1334d62c 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -93,6 +93,15 @@ static const ContentType_t MimeTypes[] = {
{ NULL, 0 }
};
+/*
+ * Content kinds produced by the data-sniffing heuristic.
+ * Each value is used directly as an index into MimeTypes[], so the
+ * numbering must follow the order of that table.
+ */
+typedef enum {
+   DT_OCTET_STREAM = 0,   /* fallback: unidentified binary data */
+   DT_TEXT_HTML    = 1,
+   DT_TEXT_PLAIN   = 2,
+   DT_IMAGE_GIF    = 3,
+   DT_IMAGE_PNG    = 4,
+   DT_IMAGE_JPG    = 5,
+} DetectedContentType;
+
/*
* Detects 'Content-Type' from a data stream sample.
*
@@ -105,10 +114,10 @@ static const ContentType_t MimeTypes[] = {
*/
int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
{
- int st = 1; /* default to "doubt' */
- int Type = 0; /* default to "application/octet-stream" */
+ size_t i, non_ascci, non_ascci_text, bin_chars;
char *p = Data;
- size_t i, non_ascci;
+ int st = 1; /* default to "doubt' */
+ DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */
/* HTML try */
for (i = 0; i < Size && isspace(p[i]); ++i);
@@ -119,36 +128,50 @@ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
/* this line is workaround for FTP through the Squid proxy */
(Size - i >= 17 && !dStrncasecmp(p+i, "<!-- HTML listing", 17))) {
- Type = 1;
+ Type = DT_TEXT_HTML;
st = 0;
/* Images */
} else if (Size >= 4 && !dStrncasecmp(p, "GIF8", 4)) {
- Type = 3;
+ Type = DT_IMAGE_GIF;
st = 0;
} else if (Size >= 4 && !dStrncasecmp(p, "\x89PNG", 4)) {
- Type = 4;
+ Type = DT_IMAGE_PNG;
st = 0;
} else if (Size >= 2 && !dStrncasecmp(p, "\xff\xd8", 2)) {
/* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking
* at the character representation should be machine independent. */
- Type = 5;
+ Type = DT_IMAGE_JPG;
st = 0;
/* Text */
} else {
- /* We'll assume "text/plain" if the set of chars above 127 is <= 10
- * in a 256-bytes sample. Better heuristics are welcomed! :-) */
- non_ascci = 0;
+ /* Heuristic for "text/plain"
+ * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251}
+ * All in the above set regard [00-31] as control characters.
+ * LATIN1: [7F-9F] unused
+ * CP-1251 {7F,98} unused (two characters).
+ *
+ * We'll use [0-31] as indicators of non-text content.
+ * Better heuristics are welcomed! :-) */
+
+ non_ascci = non_ascci_text = bin_chars = 0;
Size = MIN (Size, 256);
- for (i = 0; i < Size; i++)
- if ((uchar_t) p[i] > 127)
+ for (i = 0; i < Size; i++) {
+ int ch = (uchar_t) p[i];
+ if (ch < 32 && !isspace(ch))
+ ++bin_chars;
+ if (ch > 126)
++non_ascci;
- if (Size == 256) {
- Type = (non_ascci > 10) ? 0 : 2;
- st = 0;
- } else {
- Type = (non_ascci > 0) ? 0 : 2;
+ if (ch > 190)
+ ++non_ascci_text;
+ }
+ if (bin_chars == 0) {
+ /* Let's say text: if "rare" chars are <= 10% */
+ if ((non_ascci - non_ascci_text) <= Size/10)
+ Type = DT_TEXT_PLAIN;
}
+ if (Size == 256)
+ st = 0;
}
*PT = MimeTypes[Type].str;
@@ -156,6 +179,91 @@ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
}
/*
+ * Parse Content-Type string, e.g., "text/html; charset=utf-8".
+ */
+void a_Misc_parse_content_type(const char *str, char **major, char **minor,
+ char **charset)
+{
+ const char *s;
+
+ if (major)
+ *major = NULL;
+ if (minor)
+ *minor = NULL;
+ if (charset)
+ *charset = NULL;
+ if (!str)
+ return;
+
+ for (s = str; isalnum(*s) || (*s == '-'); s++);
+ if (major)
+ *major = dStrndup(str, s - str);
+
+ if (*s == '/') {
+ for (str = ++s; isalnum(*s) || (*s == '-'); s++);
+ if (minor)
+ *minor = dStrndup(str, s - str);
+ }
+
+ if (charset && *s) {
+ const char terminators[] = " ;\t";
+ const char key[] = "charset";
+
+ if ((s = dStristr(str, key)) &&
+ (s == str || strchr(terminators, s[-1]))) {
+ s += sizeof(key) - 1;
+ for ( ; *s == ' ' || *s == '\t'; ++s);
+ if (*s == '=') {
+ size_t len;
+ for (++s; *s == ' ' || *s == '\t'; ++s);
+ if ((len = strcspn(s, terminators))) {
+ if (*s == '"' && s[len-1] == '"' && len > 1) {
+ /* quoted string */
+ s++;
+ len -= 2;
+ }
+ *charset = dStrndup(s, len);
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Tell whether two Content-Type strings describe the same type.
+ *
+ * Two NULL/empty strings are equivalent; one NULL/empty string against
+ * a non-empty one is not. Otherwise both are parsed, and they are
+ * equivalent when major and minor parts match (ignoring case) and the
+ * charsets agree. A missing charset agrees with another missing
+ * charset or with "UTF-8".
+ *
+ * Return 0 if they are equivalent, and 1 otherwise.
+ */
+int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
+{
+   char *maj_a, *min_a, *cs_a;
+   char *maj_b, *min_b, *cs_b;
+   const int blank_a = (ct1 == NULL || *ct1 == '\0');
+   const int blank_b = (ct2 == NULL || *ct2 == '\0');
+   int differ = 1;
+
+   if (blank_a || blank_b)
+      return (blank_a && blank_b) ? 0 : 1;
+
+   a_Misc_parse_content_type(ct1, &maj_a, &min_a, &cs_a);
+   a_Misc_parse_content_type(ct2, &maj_b, &min_b, &cs_b);
+
+   if (maj_a && maj_b && min_a && min_b &&
+       dStrcasecmp(maj_a, maj_b) == 0 &&
+       dStrcasecmp(min_a, min_b) == 0) {
+      if (cs_a && cs_b) {
+         differ = (dStrcasecmp(cs_a, cs_b) != 0);
+      } else if (cs_a || cs_b) {
+         /* only one side names a charset: it has to be UTF-8 */
+         differ = (dStrcasecmp(cs_a ? cs_a : cs_b, "UTF-8") != 0);
+      } else {
+         differ = 0;
+      }
+   }
+
+   dFree(maj_a);
+   dFree(min_a);
+   dFree(cs_a);
+   dFree(maj_b);
+   dFree(min_b);
+   dFree(cs_b);
+
+   return differ;
+}
+
+/*
* Check the server-supplied 'Content-Type' against our detected type.
* (some servers seem to default to "text/plain").
*
@@ -177,7 +285,7 @@ int a_Misc_content_type_check(const char *EntryType, const char *DetectedType)
int i;
int st = -1;
- _MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType);
+ MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType);
if (!EntryType)
return 0; /* there's no mismatch without server type */