diff options
-rw-r--r-- | ChangeLog | 2 | ||||
-rw-r--r-- | src/cache.c | 52 | ||||
-rw-r--r-- | src/cache.h | 3 | ||||
-rw-r--r-- | src/capi.c | 17 | ||||
-rw-r--r-- | src/capi.h | 3 | ||||
-rw-r--r-- | src/html.cc | 92 | ||||
-rw-r--r-- | src/misc.c | 144 | ||||
-rw-r--r-- | src/misc.h | 3 |
8 files changed, 230 insertions, 86 deletions
@@ -103,6 +103,7 @@ dillo-fltk2 - Switched Window::destroy to Window::delete, fixing side effects. - Made zlib a configure requirement, and cleaned up configure.in. - Fixed a segfault bug in Nav.c. + - Switched from charset to content-type for handling data. Patches: place (AKA corvid) +- Fixed a problem with locally-installed dpis. - Added code for optional image loading (nice interface) very advanced! @@ -114,6 +115,7 @@ dillo-fltk2 - Added a_Capi_get_flags(). It requests a cache entry's status as flags. - Switched URL_DATA type from char* to a dStr. - Implemented the file input control for forms. + - Fixed data guesser to detect ASCII, LATIN1, UTF8, KOI8-R, CP-1251 as text. Patch: place, Jorge Arellano Cid +- Fixed a cookies-related dillo freeze bug happening at: http://www.fltk.org/newsgroups.php?gfltk.general+v:24912 diff --git a/src/cache.c b/src/cache.c index dc8228f1..ade2bd6a 100644 --- a/src/cache.c +++ b/src/cache.c @@ -54,6 +54,7 @@ typedef struct { const DilloUrl *Url; /* Cached Url. Url is used as a primary Key */ char *TypeDet; /* MIME type string (detected from data) */ char *TypeHdr; /* MIME type string as from the HTTP Header */ + char *TypeMeta; /* MIME type string from META HTTP-EQUIV */ Dstr *Header; /* HTTP header */ const DilloUrl *Location; /* New URI for redirects */ Dstr *Data; /* Pointer to raw data */ @@ -202,6 +203,7 @@ static void Cache_entry_init(CacheEntry_t *NewEntry, const DilloUrl *Url) NewEntry->Url = a_Url_dup(Url); NewEntry->TypeDet = NULL; NewEntry->TypeHdr = NULL; + NewEntry->TypeMeta = NULL; NewEntry->Header = dStr_new(""); NewEntry->Location = NULL; NewEntry->Data = dStr_sized_new(8*1024); @@ -290,6 +292,7 @@ static void Cache_entry_free(CacheEntry_t *entry) a_Url_free((DilloUrl *)entry->Url); dFree(entry->TypeDet); dFree(entry->TypeHdr); + dFree(entry->TypeMeta); dStr_free(entry->Header, TRUE); a_Url_free((DilloUrl *)entry->Location); dStr_free(entry->Data, 1); @@ -390,6 +393,47 @@ uint_t a_Cache_get_flags(const DilloUrl *url) } /* + * Get current content type. + */ +static const char *Cache_current_content_type(CacheEntry_t *entry) +{ + return entry->TypeMeta ? entry->TypeMeta : entry->TypeHdr ? entry->TypeHdr : + entry->TypeDet; +} + +/* + * Get current Content-Type for cache entry found by URL. + */ +const char *a_Cache_get_content_type(const DilloUrl *url) +{ + CacheEntry_t *entry = Cache_entry_search_with_redirect(url); + + return (entry) ? Cache_current_content_type(entry) : NULL; +} + +/* + * Change Content-Type for cache entry found by url. + * Return new content type. + */ +const char *a_Cache_set_content_type(const DilloUrl *url, const char *ctype, + bool_t force) +{ + const char *ret; + CacheEntry_t *entry = Cache_entry_search_with_redirect(url); + + if (!entry) { + ret = NULL; + } else { + if (force == TRUE || entry->TypeMeta == NULL) { + dFree(entry->TypeMeta); + entry->TypeMeta = dStrdup(ctype); + } + ret = entry->TypeMeta; + } + return ret; +} + +/* * Get the pointer to the URL document, and its size, from the cache entry. * Return: 1 cached, 0 not cached. */ @@ -831,7 +875,7 @@ static void Cache_process_queue(CacheEntry_t *entry) if (!(entry->Flags & CA_GotContentType)) { st = a_Misc_get_content_type_from_data( entry->Data->str, entry->Data->len, &Type); - _MSG("Cache: detected Content-Type '%s'\n", Type); + MSG("Cache: detected Content-Type '%s'\n", Type); if (st == 0 || entry->Flags & CA_GotData) { if (a_Misc_content_type_check(entry->TypeHdr, Type) < 0) { MSG_HTTP("Content-Type '%s' doesn't match the real data.\n", @@ -885,9 +929,9 @@ static void Cache_process_queue(CacheEntry_t *entry) if (TypeMismatch) { AbortEntry = TRUE; } else { - st = a_Web_dispatch_by_type( - entry->TypeHdr ? entry->TypeHdr : entry->TypeDet, - ClientWeb, &Client->Callback, &Client->CbData); + st = a_Web_dispatch_by_type(Cache_current_content_type(entry), + ClientWeb, &Client->Callback, + &Client->CbData); if (st == -1) { /* MIME type is not viewable */ if (ClientWeb->flags & WEB_RootUrl) { diff --git a/src/cache.h b/src/cache.h index 5dee12c1..08932550 100644 --- a/src/cache.h +++ b/src/cache.h @@ -59,6 +59,9 @@ struct _CacheClient { void a_Cache_init(void); int a_Cache_open_url(void *Web, CA_Callback_t Call, void *CbData); int a_Cache_get_buf(const DilloUrl *Url, char **PBuf, int *BufSize); +const char *a_Cache_get_content_type(const DilloUrl *url); +const char *a_Cache_set_content_type(const DilloUrl *url, const char *ctype, + bool_t force); uint_t a_Cache_get_flags(const DilloUrl *url); void a_Cache_process_dbuf(int Op, const char *buf, size_t buf_size, const DilloUrl *Url); @@ -406,6 +406,23 @@ int a_Capi_get_buf(const DilloUrl *Url, char **PBuf, int *BufSize) } /* + * Get the Content-Type associated with the URL + */ +const char *a_Capi_get_content_type(const DilloUrl *url) +{ + return a_Cache_get_content_type(url); +} + +/* + * Set the Content-Type for the URL. + */ +const char *a_Capi_set_content_type(const DilloUrl *url, const char *ctype, + bool_t force) +{ + return a_Cache_set_content_type(url, ctype, force); +} + +/* * Send a dpi cmd. * (For instance: add_bookmark, open_url, send_preferences, ...) */ @@ -24,6 +24,9 @@ extern "C" { void a_Capi_init(void); int a_Capi_open_url(DilloWeb *web, CA_Callback_t Call, void *CbData); int a_Capi_get_buf(const DilloUrl *Url, char **PBuf, int *BufSize); +const char *a_Capi_get_content_type(const DilloUrl *url); +const char *a_Capi_set_content_type(const DilloUrl *url, const char *ctype, + bool_t force); int a_Capi_get_flags(const DilloUrl *Url); int a_Capi_dpi_send_cmd(DilloUrl *url, void *bw, char *cmd, char *server, int flags); diff --git a/src/html.cc b/src/html.cc index dad4c0e8..65805325 100644 --- a/src/html.cc +++ b/src/html.cc @@ -302,7 +302,7 @@ private: public: //BUG: for now everything is public BrowserWindow *bw; - DilloUrl *base_url; + DilloUrl *page_url, *base_url; dw::core::Widget *dw; /* this is duplicated in the stack */ /* -------------------------------------------------------------------*/ @@ -311,9 +311,9 @@ public: //BUG: for now everything is public size_t Buf_Consumed; /* amount of source from cache consumed */ Dstr *Local_Buf; /* source converted to displayable encoding (UTF-8) */ int Local_Ofs; - char *charset; - bool using_meta_charset; /* to handle multiple meta_charset tags */ Decode *decoder; + char *content_type, *charset; + bool stop_parser; size_t CurrTagOfs; size_t OldTagOfs, OldTagLine; @@ -363,7 +363,7 @@ private: void initDw(); /* Used by the constructor */ public: - DilloHtml(BrowserWindow *bw, const DilloUrl *url, const char *charset); + DilloHtml(BrowserWindow *bw, const DilloUrl *url, const char *content_type); ~DilloHtml(); void connectSignals(dw::core::Widget *dw); void write(char *Buf, int BufSize, int Eof); @@ -430,11 +430,6 @@ typedef struct { } TagInfo; extern const TagInfo Tags[]; -/* todo: implement this as an URL/charset pair in a DList. - * chances of this bare-bones implementation to fail are minimal though: - * two ROOT pages using meta-charset, parsing HEAD section at the same time */ -static char *meta_charset = NULL; - /*----------------------------------------------------------------------------- *----------------------------------------------------------------------------- * Main Code @@ -493,39 +488,12 @@ static DilloUrl *Html_url_new(DilloHtml *html, } /* - * Get charset string from HTTP Content-Type string. - */ -static char *Html_get_charset(const char *ct) -{ - const char key[] = "charset"; - const char terminators[] = " ;\t"; - char *start; - size_t len; - - if ((start = dStristr(ct, "charset")) && - (start == ct || strchr(terminators, start[-1]))) { - start += sizeof(key) - 1; - for ( ; *start == ' ' || *start == '\t'; ++start); - if (*start == '=') { - for (++start; *start == ' ' || *start == '\t'; ++start); - _MSG("Html_get_charset: %s\n", start); - if ((len = strcspn(start, terminators))) - return dStrndup(start, len); - } - } - return NULL; -} - -/* * Set callback function and callback data for the "html/text" MIME type. */ void *a_Html_text(const char *Type, void *P, CA_Callback_t *Call, void **Data) { DilloWeb *web = (DilloWeb*)P; - char *charset = Html_get_charset(Type); - DilloHtml *html = new DilloHtml(web->bw, web->url, charset); - - dFree(charset); + DilloHtml *html = new DilloHtml(web->bw, web->url, Type); *Data = (void*)html; *Call = (CA_Callback_t)Html_callback; @@ -778,13 +746,14 @@ static int Html_level_to_fontsize(int level) * Create and initialize a new DilloHtml class */ DilloHtml::DilloHtml(BrowserWindow *p_bw, const DilloUrl *url, - const char *charset) + const char *content_type) { /* Init event receiver */ linkReceiver.html = this; /* Init main variables */ bw = p_bw; + page_url = a_Url_dup(url); base_url = a_Url_dup(url); dw = NULL; @@ -795,22 +764,14 @@ DilloHtml::DilloHtml(BrowserWindow *p_bw, const DilloUrl *url, Local_Buf = dStr_new(""); Local_Ofs = 0; - if (charset) { - MSG("HTTP Content-Type gave charset as: %s\n", charset); - } - if (meta_charset) { - MSG("META Content-Type gave charset as: %s\n", meta_charset); - } - if (meta_charset) { - decoder = a_Decode_charset_init(meta_charset); - this->charset = meta_charset; - using_meta_charset = true; - meta_charset = NULL; - } else { - decoder = a_Decode_charset_init(charset); - this->charset = dStrdup(charset); - using_meta_charset = false; - } + MSG("HTML content type: %s\n", content_type); + this->content_type = dStrdup(content_type); + + /* get charset */ + a_Misc_parse_content_type(content_type, NULL, NULL, &charset); + + decoder = a_Decode_charset_init(charset); + stop_parser = false; CurrTagOfs = 0; OldTagOfs = 0; @@ -921,6 +882,7 @@ DilloHtml::~DilloHtml() a_Bw_remove_doc(bw, this); + a_Url_free(page_url); a_Url_free(base_url); for (int i = 0; i < forms->size(); i++) @@ -1027,6 +989,7 @@ void DilloHtml::freeParseData() a_Decode_free(decoder); dStr_free(Local_Buf, TRUE); + dFree(content_type); dFree(charset); } @@ -3959,19 +3922,20 @@ static void Html_tag_open_meta(DilloHtml *html, const char *tag, int tagsize) } dStr_free(ds_msg, 1); - } else if (!html->using_meta_charset && - !dStrcasecmp(equiv, "content-type") && + } else if (!dStrcasecmp(equiv, "content-type") && (content = Html_get_attr(html, tag, tagsize, "content"))) { - char *charset = Html_get_charset(content); - if (charset) { - if (!html->charset || dStrcasecmp(charset, html->charset)) { - MSG("META Content-Type changes charset to: %s\n", charset); - dFree(meta_charset); - meta_charset = dStrdup(charset); + if (a_Misc_content_type_cmp(html->content_type, content)) { + const bool_t force = FALSE; + const char *new_content = + a_Capi_set_content_type(html->page_url, content, force); + /* Cannot ask cache whether the content type was changed, as + * this code in another bw might have already changed it for us. + */ + if (a_Misc_content_type_cmp(html->content_type, new_content)) { a_Nav_repush(html->bw); + html->stop_parser = true; } } - dFree(charset); } } } @@ -5873,7 +5837,7 @@ static int Html_write_raw(DilloHtml *html, char *buf, int bufsize, int Eof) * boundary. Iterate through tokens until end of buffer is reached. */ buf_index = 0; token_start = buf_index; - while (buf_index < bufsize) { + while ((buf_index < bufsize) && (html->stop_parser == false)) { /* invariant: buf_index == bufsize || token_start == buf_index */ if (S_TOP(html)->parse_mode == @@ -93,6 +93,15 @@ static const ContentType_t MimeTypes[] = { { NULL, 0 } }; +typedef enum { + DT_OCTET_STREAM = 0, + DT_TEXT_HTML, + DT_TEXT_PLAIN, + DT_IMAGE_GIF, + DT_IMAGE_PNG, + DT_IMAGE_JPG, +} DetectedContentType; + /* * Detects 'Content-Type' from a data stream sample. * @@ -105,10 +114,10 @@ static const ContentType_t MimeTypes[] = { */ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT) { - int st = 1; /* default to "doubt' */ - int Type = 0; /* default to "application/octet-stream" */ + size_t i, non_ascci, non_ascci_text, bin_chars; char *p = Data; - size_t i, non_ascci; + int st = 1; /* default to "doubt' */ + DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */ /* HTML try */ for (i = 0; i < Size && isspace(p[i]); ++i); @@ -119,36 +128,50 @@ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT) /* this line is workaround for FTP through the Squid proxy */ (Size - i >= 17 && !dStrncasecmp(p+i, "<!-- HTML listing", 17))) { - Type = 1; + Type = DT_TEXT_HTML; st = 0; /* Images */ } else if (Size >= 4 && !dStrncasecmp(p, "GIF8", 4)) { - Type = 3; + Type = DT_IMAGE_GIF; st = 0; } else if (Size >= 4 && !dStrncasecmp(p, "\x89PNG", 4)) { - Type = 4; + Type = DT_IMAGE_PNG; st = 0; } else if (Size >= 2 && !dStrncasecmp(p, "\xff\xd8", 2)) { /* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking * at the character representation should be machine independent. */ - Type = 5; + Type = DT_IMAGE_JPG; st = 0; /* Text */ } else { - /* We'll assume "text/plain" if the set of chars above 127 is <= 10 - * in a 256-bytes sample. Better heuristics are welcomed! :-) */ - non_ascci = 0; + /* Heuristic for "text/plain" + * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251} + * All in the above set regard [00-31] as control characters. + * LATIN1: [7F-9F] unused + * CP-1251 {7F,98} unused (two characters). + * + * We'll use [0-31] as indicators of non-text content. + * Better heuristics are welcomed! :-) */ + + non_ascci = non_ascci_text = bin_chars = 0; Size = MIN (Size, 256); - for (i = 0; i < Size; i++) - if ((uchar_t) p[i] > 127) + for (i = 0; i < Size; i++) { + int ch = (uchar_t) p[i]; + if (ch < 32 && !isspace(ch)) + ++bin_chars; + if (ch > 126) ++non_ascci; - if (Size == 256) { - Type = (non_ascci > 10) ? 0 : 2; - st = 0; - } else { - Type = (non_ascci > 0) ? 0 : 2; + if (ch > 190) + ++non_ascci_text; + } + if (bin_chars == 0) { + /* Let's say text: if "rare" chars are <= 10% */ + if ((non_ascci - non_ascci_text) <= Size/10) + Type = DT_TEXT_PLAIN; } + if (Size == 256) + st = 0; } *PT = MimeTypes[Type].str; @@ -156,6 +179,91 @@ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT) } /* + * Parse Content-Type string, e.g., "text/html; charset=utf-8". + */ +void a_Misc_parse_content_type(const char *str, char **major, char **minor, + char **charset) +{ + const char *s; + + if (major) + *major = NULL; + if (minor) + *minor = NULL; + if (charset) + *charset = NULL; + if (!str) + return; + + for (s = str; isalnum(*s) || (*s == '-'); s++); + if (major) + *major = dStrndup(str, s - str); + + if (*s == '/') { + for (str = ++s; isalnum(*s) || (*s == '-'); s++); + if (minor) + *minor = dStrndup(str, s - str); + } + + if (charset && *s) { + const char terminators[] = " ;\t"; + const char key[] = "charset"; + + if ((s = dStristr(str, key)) && + (s == str || strchr(terminators, s[-1]))) { + s += sizeof(key) - 1; + for ( ; *s == ' ' || *s == '\t'; ++s); + if (*s == '=') { + size_t len; + for (++s; *s == ' ' || *s == '\t'; ++s); + if ((len = strcspn(s, terminators))) { + if (*s == '"' && s[len-1] == '"' && len > 1) { + /* quoted string */ + s++; + len -= 2; + } + *charset = dStrndup(s, len); + } + } + } + } +} + +/* + * Compare two Content-Type strings. + * Return 0 if they are equivalent, and 1 otherwise. + */ +int a_Misc_content_type_cmp(const char *ct1, const char *ct2) +{ + char *major1, *major2, *minor1, *minor2, *charset1, *charset2; + int ret; + + if ((!ct1 || !*ct1) && (!ct2 || !*ct2)) + return 0; + if ((!ct1 || !*ct1) || (!ct2 || !*ct2)) + return 1; + + a_Misc_parse_content_type(ct1, &major1, &minor1, &charset1); + a_Misc_parse_content_type(ct2, &major2, &minor2, &charset2); + + if (major1 && major2 && !dStrcasecmp(major1, major2) && + minor1 && minor2 && !dStrcasecmp(minor1, minor2) && + ((!charset1 && !charset2) || + (charset1 && charset2 && !dStrcasecmp(charset1, charset2)) || + (!charset1 && charset2 && !dStrcasecmp(charset2, "UTF-8")) || + (charset1 && !charset2 && !dStrcasecmp(charset1, "UTF-8")))) { + ret = 0; + } else { + ret = 1; + } + dFree(major1); dFree(major2); + dFree(minor1); dFree(minor2); + dFree(charset1); dFree(charset2); + + return ret; +} + +/* * Check the server-supplied 'Content-Type' against our detected type. * (some servers seem to default to "text/plain"). * @@ -177,7 +285,7 @@ int a_Misc_content_type_check(const char *EntryType, const char *DetectedType) int i; int st = -1; - _MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType); + MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType); if (!EntryType) return 0; /* there's no mismatch without server type */ @@ -13,6 +13,9 @@ char *a_Misc_escape_chars(const char *str, char *esc_set); char *a_Misc_expand_tabs(const char *str); int a_Misc_get_content_type_from_data(void *Data, size_t Size,const char **PT); int a_Misc_content_type_check(const char *EntryType, const char *DetectedType); +void a_Misc_parse_content_type(const char *str, char **major, char **minor, + char **charset); +int a_Misc_content_type_cmp(const char* ct1, const char *ct2); int a_Misc_parse_geometry(char *geom, int *x, int *y, int *w, int *h); char *a_Misc_encode_base64(const char *in); Dstr *a_Misc_file2dstr(const char *filename); |