aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjcid <devnull@localhost>2008-05-29 02:19:08 +0200
committerjcid <devnull@localhost>2008-05-29 02:19:08 +0200
commit1e21147176d202f3c8399c280ca3a6b77f830433 (patch)
tree33dfd52c64dc0fc708391fd143db00150cc2fb14
parente4acbd9f0423a47efa7188d801bc58c2bdedf8ba (diff)
- Switched from charset to content-type for handling data.
- Fixed data guesser to detect ASCII, LATIN1, UTF8, KOI8-R, CP-1251 as text.
-rw-r--r--ChangeLog2
-rw-r--r--src/cache.c52
-rw-r--r--src/cache.h3
-rw-r--r--src/capi.c17
-rw-r--r--src/capi.h3
-rw-r--r--src/html.cc92
-rw-r--r--src/misc.c144
-rw-r--r--src/misc.h3
8 files changed, 230 insertions, 86 deletions
diff --git a/ChangeLog b/ChangeLog
index dc9a5032..f88144bb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -103,6 +103,7 @@ dillo-fltk2
- Switched Window::destroy to Window::delete, fixing side effects.
- Made zlib a configure requirement, and cleaned up configure.in.
- Fixed a segfault bug in Nav.c.
+ - Switched from charset to content-type for handling data.
Patches: place (AKA corvid)
+- Fixed a problem with locally-installed dpis.
- Added code for optional image loading (nice interface) very advanced!
@@ -114,6 +115,7 @@ dillo-fltk2
- Added a_Capi_get_flags(). It requests a cache entry's status as flags.
- Switched URL_DATA type from char* to a dStr.
- Implemented the file input control for forms.
+ - Fixed data guesser to detect ASCII, LATIN1, UTF8, KOI8-R, CP-1251 as text.
Patch: place, Jorge Arellano Cid
+- Fixed a cookies-related dillo freeze bug happening at:
http://www.fltk.org/newsgroups.php?gfltk.general+v:24912
diff --git a/src/cache.c b/src/cache.c
index dc8228f1..ade2bd6a 100644
--- a/src/cache.c
+++ b/src/cache.c
@@ -54,6 +54,7 @@ typedef struct {
const DilloUrl *Url; /* Cached Url. Url is used as a primary Key */
char *TypeDet; /* MIME type string (detected from data) */
char *TypeHdr; /* MIME type string as from the HTTP Header */
+ char *TypeMeta; /* MIME type string from META HTTP-EQUIV */
Dstr *Header; /* HTTP header */
const DilloUrl *Location; /* New URI for redirects */
Dstr *Data; /* Pointer to raw data */
@@ -202,6 +203,7 @@ static void Cache_entry_init(CacheEntry_t *NewEntry, const DilloUrl *Url)
NewEntry->Url = a_Url_dup(Url);
NewEntry->TypeDet = NULL;
NewEntry->TypeHdr = NULL;
+ NewEntry->TypeMeta = NULL;
NewEntry->Header = dStr_new("");
NewEntry->Location = NULL;
NewEntry->Data = dStr_sized_new(8*1024);
@@ -290,6 +292,7 @@ static void Cache_entry_free(CacheEntry_t *entry)
a_Url_free((DilloUrl *)entry->Url);
dFree(entry->TypeDet);
dFree(entry->TypeHdr);
+ dFree(entry->TypeMeta);
dStr_free(entry->Header, TRUE);
a_Url_free((DilloUrl *)entry->Location);
dStr_free(entry->Data, 1);
@@ -390,6 +393,47 @@ uint_t a_Cache_get_flags(const DilloUrl *url)
}
/*
+ * Get current content type.
+ * Precedence: the META HTTP-EQUIV type if set, else the HTTP header type,
+ * else the type detected from data (so the result is NULL when none of
+ * the three is set).
+ * The returned string is the entry's own field, not a copy.
+ */
+static const char *Cache_current_content_type(CacheEntry_t *entry)
+{
+ return entry->TypeMeta ? entry->TypeMeta : entry->TypeHdr ? entry->TypeHdr :
+ entry->TypeDet;
+}
+
+/*
+ * Get current Content-Type for cache entry found by URL.
+ * The entry is looked up with Cache_entry_search_with_redirect().
+ * Return: the entry's current content type (META type, else header type,
+ * else detected type), or NULL when no entry is found.
+ */
+const char *a_Cache_get_content_type(const DilloUrl *url)
+{
+ CacheEntry_t *entry = Cache_entry_search_with_redirect(url);
+
+ return (entry) ? Cache_current_content_type(entry) : NULL;
+}
+
+/*
+ * Change Content-Type for cache entry found by url.
+ * The value is stored as the entry's META type (a copy of 'ctype'). An
+ * already-set META type is only replaced when 'force' is TRUE.
+ * Return: the entry's META type after the call (which is the previous
+ * value when nothing was replaced), or NULL when no entry is found.
+ * The returned string is owned by the entry; a later replacing call frees
+ * it.
+ * NOTE(review): 'ctype' is handed to dStrdup() unchecked -- confirm that
+ * callers never pass NULL, or that dStrdup() accepts it.
+ */
+const char *a_Cache_set_content_type(const DilloUrl *url, const char *ctype,
+ bool_t force)
+{
+ const char *ret;
+ CacheEntry_t *entry = Cache_entry_search_with_redirect(url);
+
+ if (!entry) {
+ ret = NULL;
+ } else {
+ if (force == TRUE || entry->TypeMeta == NULL) {
+ dFree(entry->TypeMeta);
+ entry->TypeMeta = dStrdup(ctype);
+ }
+ ret = entry->TypeMeta;
+ }
+ return ret;
+}
+
+/*
* Get the pointer to the URL document, and its size, from the cache entry.
* Return: 1 cached, 0 not cached.
*/
@@ -831,7 +875,7 @@ static void Cache_process_queue(CacheEntry_t *entry)
if (!(entry->Flags & CA_GotContentType)) {
st = a_Misc_get_content_type_from_data(
entry->Data->str, entry->Data->len, &Type);
- _MSG("Cache: detected Content-Type '%s'\n", Type);
+ MSG("Cache: detected Content-Type '%s'\n", Type);
if (st == 0 || entry->Flags & CA_GotData) {
if (a_Misc_content_type_check(entry->TypeHdr, Type) < 0) {
MSG_HTTP("Content-Type '%s' doesn't match the real data.\n",
@@ -885,9 +929,9 @@ static void Cache_process_queue(CacheEntry_t *entry)
if (TypeMismatch) {
AbortEntry = TRUE;
} else {
- st = a_Web_dispatch_by_type(
- entry->TypeHdr ? entry->TypeHdr : entry->TypeDet,
- ClientWeb, &Client->Callback, &Client->CbData);
+ st = a_Web_dispatch_by_type(Cache_current_content_type(entry),
+ ClientWeb, &Client->Callback,
+ &Client->CbData);
if (st == -1) {
/* MIME type is not viewable */
if (ClientWeb->flags & WEB_RootUrl) {
diff --git a/src/cache.h b/src/cache.h
index 5dee12c1..08932550 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -59,6 +59,9 @@ struct _CacheClient {
void a_Cache_init(void);
int a_Cache_open_url(void *Web, CA_Callback_t Call, void *CbData);
int a_Cache_get_buf(const DilloUrl *Url, char **PBuf, int *BufSize);
+const char *a_Cache_get_content_type(const DilloUrl *url);
+const char *a_Cache_set_content_type(const DilloUrl *url, const char *ctype,
+ bool_t force);
uint_t a_Cache_get_flags(const DilloUrl *url);
void a_Cache_process_dbuf(int Op, const char *buf, size_t buf_size,
const DilloUrl *Url);
diff --git a/src/capi.c b/src/capi.c
index 70011224..3cb597d9 100644
--- a/src/capi.c
+++ b/src/capi.c
@@ -406,6 +406,23 @@ int a_Capi_get_buf(const DilloUrl *Url, char **PBuf, int *BufSize)
}
/*
+ * Get the Content-Type associated with the URL
+ * (thin wrapper: forwards to a_Cache_get_content_type() and returns its
+ * result unchanged).
+ */
+const char *a_Capi_get_content_type(const DilloUrl *url)
+{
+ return a_Cache_get_content_type(url);
+}
+
+/*
+ * Set the Content-Type for the URL.
+ * (thin wrapper: forwards all three arguments to a_Cache_set_content_type()
+ * and returns its result unchanged).
+ */
+const char *a_Capi_set_content_type(const DilloUrl *url, const char *ctype,
+ bool_t force)
+{
+ return a_Cache_set_content_type(url, ctype, force);
+}
+
+/*
* Send a dpi cmd.
* (For instance: add_bookmark, open_url, send_preferences, ...)
*/
diff --git a/src/capi.h b/src/capi.h
index 120eb3c1..0cb60863 100644
--- a/src/capi.h
+++ b/src/capi.h
@@ -24,6 +24,9 @@ extern "C" {
void a_Capi_init(void);
int a_Capi_open_url(DilloWeb *web, CA_Callback_t Call, void *CbData);
int a_Capi_get_buf(const DilloUrl *Url, char **PBuf, int *BufSize);
+const char *a_Capi_get_content_type(const DilloUrl *url);
+const char *a_Capi_set_content_type(const DilloUrl *url, const char *ctype,
+ bool_t force);
int a_Capi_get_flags(const DilloUrl *Url);
int a_Capi_dpi_send_cmd(DilloUrl *url, void *bw, char *cmd, char *server,
int flags);
diff --git a/src/html.cc b/src/html.cc
index dad4c0e8..65805325 100644
--- a/src/html.cc
+++ b/src/html.cc
@@ -302,7 +302,7 @@ private:
public: //BUG: for now everything is public
BrowserWindow *bw;
- DilloUrl *base_url;
+ DilloUrl *page_url, *base_url;
dw::core::Widget *dw; /* this is duplicated in the stack */
/* -------------------------------------------------------------------*/
@@ -311,9 +311,9 @@ public: //BUG: for now everything is public
size_t Buf_Consumed; /* amount of source from cache consumed */
Dstr *Local_Buf; /* source converted to displayable encoding (UTF-8) */
int Local_Ofs;
- char *charset;
- bool using_meta_charset; /* to handle multiple meta_charset tags */
Decode *decoder;
+ char *content_type, *charset;
+ bool stop_parser;
size_t CurrTagOfs;
size_t OldTagOfs, OldTagLine;
@@ -363,7 +363,7 @@ private:
void initDw(); /* Used by the constructor */
public:
- DilloHtml(BrowserWindow *bw, const DilloUrl *url, const char *charset);
+ DilloHtml(BrowserWindow *bw, const DilloUrl *url, const char *content_type);
~DilloHtml();
void connectSignals(dw::core::Widget *dw);
void write(char *Buf, int BufSize, int Eof);
@@ -430,11 +430,6 @@ typedef struct {
} TagInfo;
extern const TagInfo Tags[];
-/* todo: implement this as an URL/charset pair in a DList.
- * chances of this bare-bones implementation to fail are minimal though:
- * two ROOT pages using meta-charset, parsing HEAD section at the same time */
-static char *meta_charset = NULL;
-
/*-----------------------------------------------------------------------------
*-----------------------------------------------------------------------------
* Main Code
@@ -493,39 +488,12 @@ static DilloUrl *Html_url_new(DilloHtml *html,
}
/*
- * Get charset string from HTTP Content-Type string.
- */
-static char *Html_get_charset(const char *ct)
-{
- const char key[] = "charset";
- const char terminators[] = " ;\t";
- char *start;
- size_t len;
-
- if ((start = dStristr(ct, "charset")) &&
- (start == ct || strchr(terminators, start[-1]))) {
- start += sizeof(key) - 1;
- for ( ; *start == ' ' || *start == '\t'; ++start);
- if (*start == '=') {
- for (++start; *start == ' ' || *start == '\t'; ++start);
- _MSG("Html_get_charset: %s\n", start);
- if ((len = strcspn(start, terminators)))
- return dStrndup(start, len);
- }
- }
- return NULL;
-}
-
-/*
* Set callback function and callback data for the "html/text" MIME type.
*/
void *a_Html_text(const char *Type, void *P, CA_Callback_t *Call, void **Data)
{
DilloWeb *web = (DilloWeb*)P;
- char *charset = Html_get_charset(Type);
- DilloHtml *html = new DilloHtml(web->bw, web->url, charset);
-
- dFree(charset);
+ DilloHtml *html = new DilloHtml(web->bw, web->url, Type);
*Data = (void*)html;
*Call = (CA_Callback_t)Html_callback;
@@ -778,13 +746,14 @@ static int Html_level_to_fontsize(int level)
* Create and initialize a new DilloHtml class
*/
DilloHtml::DilloHtml(BrowserWindow *p_bw, const DilloUrl *url,
- const char *charset)
+ const char *content_type)
{
/* Init event receiver */
linkReceiver.html = this;
/* Init main variables */
bw = p_bw;
+ page_url = a_Url_dup(url);
base_url = a_Url_dup(url);
dw = NULL;
@@ -795,22 +764,14 @@ DilloHtml::DilloHtml(BrowserWindow *p_bw, const DilloUrl *url,
Local_Buf = dStr_new("");
Local_Ofs = 0;
- if (charset) {
- MSG("HTTP Content-Type gave charset as: %s\n", charset);
- }
- if (meta_charset) {
- MSG("META Content-Type gave charset as: %s\n", meta_charset);
- }
- if (meta_charset) {
- decoder = a_Decode_charset_init(meta_charset);
- this->charset = meta_charset;
- using_meta_charset = true;
- meta_charset = NULL;
- } else {
- decoder = a_Decode_charset_init(charset);
- this->charset = dStrdup(charset);
- using_meta_charset = false;
- }
+ MSG("HTML content type: %s\n", content_type);
+ this->content_type = dStrdup(content_type);
+
+ /* get charset */
+ a_Misc_parse_content_type(content_type, NULL, NULL, &charset);
+
+ decoder = a_Decode_charset_init(charset);
+ stop_parser = false;
CurrTagOfs = 0;
OldTagOfs = 0;
@@ -921,6 +882,7 @@ DilloHtml::~DilloHtml()
a_Bw_remove_doc(bw, this);
+ a_Url_free(page_url);
a_Url_free(base_url);
for (int i = 0; i < forms->size(); i++)
@@ -1027,6 +989,7 @@ void DilloHtml::freeParseData()
a_Decode_free(decoder);
dStr_free(Local_Buf, TRUE);
+ dFree(content_type);
dFree(charset);
}
@@ -3959,19 +3922,20 @@ static void Html_tag_open_meta(DilloHtml *html, const char *tag, int tagsize)
}
dStr_free(ds_msg, 1);
- } else if (!html->using_meta_charset &&
- !dStrcasecmp(equiv, "content-type") &&
+ } else if (!dStrcasecmp(equiv, "content-type") &&
(content = Html_get_attr(html, tag, tagsize, "content"))) {
- char *charset = Html_get_charset(content);
- if (charset) {
- if (!html->charset || dStrcasecmp(charset, html->charset)) {
- MSG("META Content-Type changes charset to: %s\n", charset);
- dFree(meta_charset);
- meta_charset = dStrdup(charset);
+ if (a_Misc_content_type_cmp(html->content_type, content)) {
+ const bool_t force = FALSE;
+ const char *new_content =
+ a_Capi_set_content_type(html->page_url, content, force);
+ /* Cannot ask cache whether the content type was changed, as
+ * this code in another bw might have already changed it for us.
+ */
+ if (a_Misc_content_type_cmp(html->content_type, new_content)) {
a_Nav_repush(html->bw);
+ html->stop_parser = true;
}
}
- dFree(charset);
}
}
}
@@ -5873,7 +5837,7 @@ static int Html_write_raw(DilloHtml *html, char *buf, int bufsize, int Eof)
* boundary. Iterate through tokens until end of buffer is reached. */
buf_index = 0;
token_start = buf_index;
- while (buf_index < bufsize) {
+ while ((buf_index < bufsize) && (html->stop_parser == false)) {
/* invariant: buf_index == bufsize || token_start == buf_index */
if (S_TOP(html)->parse_mode ==
diff --git a/src/misc.c b/src/misc.c
index 1b70c1b6..1334d62c 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -93,6 +93,15 @@ static const ContentType_t MimeTypes[] = {
{ NULL, 0 }
};
+typedef enum { /* values are used as indices into MimeTypes[] */
+ DT_OCTET_STREAM = 0, /* default: binary data */
+ DT_TEXT_HTML,
+ DT_TEXT_PLAIN,
+ DT_IMAGE_GIF,
+ DT_IMAGE_PNG,
+ DT_IMAGE_JPG,
+} DetectedContentType;
+
/*
* Detects 'Content-Type' from a data stream sample.
*
@@ -105,10 +114,10 @@ static const ContentType_t MimeTypes[] = {
*/
int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
{
- int st = 1; /* default to "doubt' */
- int Type = 0; /* default to "application/octet-stream" */
+ size_t i, non_ascci, non_ascci_text, bin_chars;
char *p = Data;
- size_t i, non_ascci;
+ int st = 1; /* default to "doubt' */
+ DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */
/* HTML try */
for (i = 0; i < Size && isspace(p[i]); ++i);
@@ -119,36 +128,50 @@ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
/* this line is workaround for FTP through the Squid proxy */
(Size - i >= 17 && !dStrncasecmp(p+i, "<!-- HTML listing", 17))) {
- Type = 1;
+ Type = DT_TEXT_HTML;
st = 0;
/* Images */
} else if (Size >= 4 && !dStrncasecmp(p, "GIF8", 4)) {
- Type = 3;
+ Type = DT_IMAGE_GIF;
st = 0;
} else if (Size >= 4 && !dStrncasecmp(p, "\x89PNG", 4)) {
- Type = 4;
+ Type = DT_IMAGE_PNG;
st = 0;
} else if (Size >= 2 && !dStrncasecmp(p, "\xff\xd8", 2)) {
/* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking
* at the character representation should be machine independent. */
- Type = 5;
+ Type = DT_IMAGE_JPG;
st = 0;
/* Text */
} else {
- /* We'll assume "text/plain" if the set of chars above 127 is <= 10
- * in a 256-bytes sample. Better heuristics are welcomed! :-) */
- non_ascci = 0;
+ /* Heuristic for "text/plain"
+ * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251}
+ * All in the above set regard [0-31] (decimal) as control characters.
+ * LATIN1: [7F-9F] (hex) unused
+ * CP-1251 {7F,98} (hex) unused (two characters).
+ *
+ * We'll use [0-31], whitespace excepted, as indicators of non-text content.
+ * Better heuristics are welcomed! :-) */
+
+ non_ascci = non_ascci_text = bin_chars = 0;
Size = MIN (Size, 256);
- for (i = 0; i < Size; i++)
- if ((uchar_t) p[i] > 127)
+ for (i = 0; i < Size; i++) {
+ int ch = (uchar_t) p[i];
+ if (ch < 32 && !isspace(ch))
+ ++bin_chars;
+ if (ch > 126)
++non_ascci;
- if (Size == 256) {
- Type = (non_ascci > 10) ? 0 : 2;
- st = 0;
- } else {
- Type = (non_ascci > 0) ? 0 : 2;
+ if (ch > 190)
+ ++non_ascci_text;
+ }
+ if (bin_chars == 0) {
+ /* Let's say text: if "rare" chars are <= 10% */
+ if ((non_ascci - non_ascci_text) <= Size/10)
+ Type = DT_TEXT_PLAIN;
}
+ if (Size == 256)
+ st = 0;
}
*PT = MimeTypes[Type].str;
@@ -156,6 +179,91 @@ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
}
/*
+ * Parse Content-Type string, e.g., "text/html; charset=utf-8".
+ * Each of 'major', 'minor' and 'charset' may be NULL when the caller is
+ * not interested in that part. Every non-NULL output is first set to NULL;
+ * a part that is produced is a new string made with dStrndup(), which the
+ * caller releases with dFree(). A NULL 'str' leaves all outputs NULL.
+ * - major: the leading run of alphanumerics and '-' (always produced for
+ * a non-NULL 'str', even when that run is empty).
+ * - minor: the same kind of run after a '/', produced only when the '/'
+ * directly follows the major type.
+ * NOTE(review): any other character ends the token, so "xhtml+xml"
+ * yields the minor type "xhtml" -- confirm this is intended.
+ * - charset: looked for only when text follows the type tokens. It is the
+ * value of a "charset" key found with dStristr() (presumably ignoring
+ * case) that starts the searched text or is preceded by space, tab or
+ * ';'. The value ends at the next of those characters, and one pair of
+ * surrounding double quotes is removed.
+ * NOTE(review): isalnum() receives a plain char here; bytes above 127 may
+ * be negative -- consider casting to unsigned char.
+ */
+void a_Misc_parse_content_type(const char *str, char **major, char **minor,
+ char **charset)
+{
+ const char *s;
+
+ if (major)
+ *major = NULL;
+ if (minor)
+ *minor = NULL;
+ if (charset)
+ *charset = NULL;
+ if (!str)
+ return;
+
+ for (s = str; isalnum(*s) || (*s == '-'); s++);
+ if (major)
+ *major = dStrndup(str, s - str);
+
+ if (*s == '/') {
+ for (str = ++s; isalnum(*s) || (*s == '-'); s++);
+ if (minor)
+ *minor = dStrndup(str, s - str);
+ }
+
+ if (charset && *s) {
+ const char terminators[] = " ;\t";
+ const char key[] = "charset";
+
+ if ((s = dStristr(str, key)) &&
+ (s == str || strchr(terminators, s[-1]))) {
+ s += sizeof(key) - 1;
+ for ( ; *s == ' ' || *s == '\t'; ++s);
+ if (*s == '=') {
+ size_t len;
+ for (++s; *s == ' ' || *s == '\t'; ++s);
+ if ((len = strcspn(s, terminators))) {
+ if (*s == '"' && s[len-1] == '"' && len > 1) {
+ /* quoted string */
+ s++;
+ len -= 2;
+ }
+ *charset = dStrndup(s, len);
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Compare two Content-Type strings.
+ * Return 0 if they are equivalent, and 1 otherwise.
+ * Two NULL/empty strings are equivalent; a NULL/empty one never matches a
+ * non-empty one. Otherwise both strings are parsed with
+ * a_Misc_parse_content_type() and are equivalent only when both have a
+ * major and a minor type that match under dStrcasecmp() (presumably
+ * ignoring case) and their charsets agree: both absent, both equal under
+ * dStrcasecmp(), or one absent while the other is "UTF-8" (i.e. a missing
+ * charset is treated as UTF-8).
+ */
+int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
+{
+ char *major1, *major2, *minor1, *minor2, *charset1, *charset2;
+ int ret;
+
+ if ((!ct1 || !*ct1) && (!ct2 || !*ct2))
+ return 0;
+ if ((!ct1 || !*ct1) || (!ct2 || !*ct2))
+ return 1;
+
+ a_Misc_parse_content_type(ct1, &major1, &minor1, &charset1);
+ a_Misc_parse_content_type(ct2, &major2, &minor2, &charset2);
+
+ if (major1 && major2 && !dStrcasecmp(major1, major2) &&
+ minor1 && minor2 && !dStrcasecmp(minor1, minor2) &&
+ ((!charset1 && !charset2) ||
+ (charset1 && charset2 && !dStrcasecmp(charset1, charset2)) ||
+ (!charset1 && charset2 && !dStrcasecmp(charset2, "UTF-8")) ||
+ (charset1 && !charset2 && !dStrcasecmp(charset1, "UTF-8")))) {
+ ret = 0;
+ } else {
+ ret = 1;
+ }
+ dFree(major1); dFree(major2);
+ dFree(minor1); dFree(minor2);
+ dFree(charset1); dFree(charset2);
+
+ return ret;
+}
+
+/*
* Check the server-supplied 'Content-Type' against our detected type.
* (some servers seem to default to "text/plain").
*
@@ -177,7 +285,7 @@ int a_Misc_content_type_check(const char *EntryType, const char *DetectedType)
int i;
int st = -1;
- _MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType);
+ MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType);
if (!EntryType)
return 0; /* there's no mismatch without server type */
diff --git a/src/misc.h b/src/misc.h
index 145b155b..c4d901ab 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -13,6 +13,9 @@ char *a_Misc_escape_chars(const char *str, char *esc_set);
char *a_Misc_expand_tabs(const char *str);
int a_Misc_get_content_type_from_data(void *Data, size_t Size,const char **PT);
int a_Misc_content_type_check(const char *EntryType, const char *DetectedType);
+void a_Misc_parse_content_type(const char *str, char **major, char **minor,
+ char **charset);
+int a_Misc_content_type_cmp(const char* ct1, const char *ct2);
int a_Misc_parse_geometry(char *geom, int *x, int *y, int *w, int *h);
char *a_Misc_encode_base64(const char *in);
Dstr *a_Misc_file2dstr(const char *filename);