/* * File: html.cc * * Copyright (C) 2005-2007 Jorge Arellano Cid * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. */ /* * Dillo HTML parsing routines */ /*----------------------------------------------------------------------------- * Includes *---------------------------------------------------------------------------*/ #include /* for isspace */ #include /* for memcpy and memmove */ #include #include /* for sprintf */ #include #include "bw.h" /* for BrowserWindow */ #include "msg.h" #include "binaryconst.h" #include "colors.h" #include "html_charrefs.h" #include "utf8.hh" #include "misc.h" #include "uicmd.hh" #include "history.h" #include "menu.hh" #include "prefs.h" #include "capi.h" #include "html.hh" #include "html_common.hh" #include "form.hh" #include "table.hh" #include "dw/textblock.hh" #include "dw/bullet.hh" #include "dw/listitem.hh" #include "dw/image.hh" #include "dw/ruler.hh" /*----------------------------------------------------------------------------- * Defines *---------------------------------------------------------------------------*/ /* Define to 1 to ignore white space immediately after an open tag, * and immediately before a close tag. 
*/ #define SGML_SPCDEL 0 #define TAB_SIZE 8 /*----------------------------------------------------------------------------- * Name spaces *---------------------------------------------------------------------------*/ using namespace lout; using namespace dw; using namespace dw::core; using namespace dw::core::ui; using namespace dw::core::style; /*----------------------------------------------------------------------------- * Typedefs *---------------------------------------------------------------------------*/ class DilloHtml; typedef void (*TagOpenFunct) (DilloHtml *html, const char *tag, int tagsize); typedef void (*TagCloseFunct) (DilloHtml *html); typedef enum { SEEK_ATTR_START, MATCH_ATTR_NAME, SEEK_TOKEN_START, SEEK_VALUE_START, SKIP_VALUE, GET_VALUE, FINISHED } DilloHtmlTagParsingState; typedef enum { HTML_LeftTrim = 1 << 0, HTML_RightTrim = 1 << 1, HTML_ParseEntities = 1 << 2 } DilloHtmlTagParsingFlags; /* * Exported function with C linkage. */ extern "C" { void *a_Html_text(const char *type, void *P, CA_Callback_t *Call,void **Data); } /*----------------------------------------------------------------------------- * Forward declarations *---------------------------------------------------------------------------*/ static int Html_write_raw(DilloHtml *html, char *buf, int bufsize, int Eof); static bool Html_load_image(BrowserWindow *bw, DilloUrl *url, const DilloUrl *requester, DilloImage *image); static void Html_callback(int Op, CacheClient_t *Client); static void Html_tag_cleanup_at_close(DilloHtml *html, int TagIdx); /*----------------------------------------------------------------------------- * Local Data *---------------------------------------------------------------------------*/ /* Parsing table structure */ typedef struct { const char *name; /* element name */ unsigned char Flags; /* flags (explained near the table data) */ char EndTag; /* Is it Required, Optional or Forbidden */ uchar_t TagLevel; /* Used to heuristically parse bad HTML */ 
TagOpenFunct open; /* Open function */ TagOpenFunct content; /* Content function */ TagCloseFunct close; /* Close function */ } TagInfo; extern const TagInfo Tags[]; /*----------------------------------------------------------------------------- *----------------------------------------------------------------------------- * Main Code *----------------------------------------------------------------------------- *---------------------------------------------------------------------------*/ /* * Collect HTML error strings. */ void DilloHtml::bugMessage(const char *format, ... ) { va_list argp; dStr_sprintfa(bw->page_bugs, "HTML warning: line %d, ", getCurrLineNumber()); va_start(argp, format); dStr_vsprintfa(bw->page_bugs, format, argp); va_end(argp); a_UIcmd_set_bug_prog(bw, ++bw->num_page_bugs); } /* * Wrapper for a_Url_new that adds an error detection message. * If use_base_url is TRUE, it uses base_url. Otherwise it uses html->base_url. */ DilloUrl *a_Html_url_new(DilloHtml *html, const char *url_str, const char *base_url, int use_base_url) { DilloUrl *url; int n_ic, n_ic_spc; url = a_Url_new(url_str, (use_base_url) ? base_url : URL_STR_(html->base_url)); if ((n_ic = URL_ILLEGAL_CHARS(url)) != 0) { const char *suffix = (n_ic) > 1 ? "s" : ""; n_ic_spc = URL_ILLEGAL_CHARS_SPC(url); if (n_ic == n_ic_spc) { BUG_MSG("URL has %d illegal space%s ('%s')\n", n_ic, suffix, url_str); } else if (n_ic_spc == 0) { BUG_MSG("URL has %d illegal byte%s in {00-1F, 7F-FF} range ('%s')\n", n_ic, suffix, url_str); } else { BUG_MSG("URL has %d illegal byte%s: " "%d space%s and %d in {00-1F, 7F-FF} range ('%s')\n", n_ic, suffix, n_ic_spc, n_ic_spc > 1 ? "s" : "", n_ic-n_ic_spc, url_str); } } return url; } /* * Set callback function and callback data for the "html/text" MIME type. 
*/ void *a_Html_text(const char *Type, void *P, CA_Callback_t *Call, void **Data) { DilloWeb *web = (DilloWeb*)P; DilloHtml *html = new DilloHtml(web->bw, web->url, Type); *Data = (void*)html; *Call = (CA_Callback_t)Html_callback; return (void*)html->dw; } static void Html_free(void *data) { delete ((DilloHtml*)data); } /* * Used by the "Load images" page menuitem. */ void a_Html_load_images(void *v_html, DilloUrl *pattern) { DilloHtml *html = (DilloHtml*)v_html; html->loadImages(pattern); } /* * Search for form */ static bool Html_contains_form(DilloHtml *html, void *v_form) { for (int i = 0; i < html->forms->size(); i++) { if (html->forms->get(i) == v_form) { return true; } } return false; } /* * Used by the "Submit form" form menuitem. */ void a_Html_form_submit(void *v_html, void *v_form) { DilloHtml *html = (DilloHtml*)v_html; if (Html_contains_form(html, v_form)) { /* it's still valid */ a_Html_form_submit2(v_form); } } /* * Used by the "Reset form" form menuitem. */ void a_Html_form_reset(void *v_html, void *v_form) { DilloHtml *html = (DilloHtml*)v_html; if (Html_contains_form(html, v_form)) { /* it's still valid */ a_Html_form_reset2(v_form); } } /* * Used by the "Show/Hide hiddens" form menuitem. */ void a_Html_form_display_hiddens(void *v_html, void *v_form, bool_t display) { DilloHtml *html = (DilloHtml*)v_html; if (Html_contains_form(html, v_form)) { /* it's still valid */ a_Html_form_display_hiddens2(v_form, (display != 0)); } } /* * Set the URL data for image maps. */ static void Html_set_link_coordinates(DilloHtml *html, int link, int x, int y) { char data[64]; if (x != -1) { snprintf(data, 64, "?%d,%d", x, y); a_Url_set_ismap_coords(html->links->get(link), data); } } /* * Create a new link, set it as the url's parent * and return the index. */ static int Html_set_new_link(DilloHtml *html, DilloUrl **url) { int nl = html->links->size(); html->links->increase(); html->links->set(nl, (*url) ? 
*url : NULL); return nl; } /* * Evaluates the ALIGN attribute (left|center|right|justify) and * sets the style at the top of the stack. */ void a_Html_tag_set_align_attr(DilloHtml *html, const char *tag, int tagsize) { const char *align; if ((align = a_Html_get_attr(html, tag, tagsize, "align"))) { TextAlignType textAlignType = TEXT_ALIGN_LEFT; if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f) BUG_MSG("The align attribute is obsolete in HTML5.\n"); if (dStrAsciiCasecmp (align, "left") == 0) textAlignType = TEXT_ALIGN_LEFT; else if (dStrAsciiCasecmp (align, "right") == 0) textAlignType = TEXT_ALIGN_RIGHT; else if (dStrAsciiCasecmp (align, "center") == 0) textAlignType = TEXT_ALIGN_CENTER; else if (dStrAsciiCasecmp (align, "justify") == 0) textAlignType = TEXT_ALIGN_JUSTIFY; #if 0 else if (dStrAsciiCasecmp (align, "char") == 0) { /* TODO: Actually not supported for

etc. */ v.textAlign = TEXT_ALIGN_STRING; if ((charattr = a_Html_get_attr(html, tag, tagsize, "char"))) { if (charattr[0] == 0) /* TODO: ALIGN=" ", and even ALIGN="&32;" will reult in * an empty string (don't know whether the latter is * correct, has to be clarified with the specs), so * that for empty strings, " " is assumed. */ style_attrs.textAlignChar = ' '; else style_attrs.textAlignChar = charattr[0]; } else /* TODO: Examine LANG attr of . */ style_attrs.textAlignChar = '.'; } #endif html->styleEngine->setNonCssHint(CSS_PROPERTY_TEXT_ALIGN, CSS_TYPE_ENUM, textAlignType); } } /* * Evaluates the VALIGN attribute (top|bottom|middle|baseline) and * sets the style in style_attrs. Returns true when set. */ bool a_Html_tag_set_valign_attr(DilloHtml *html, const char *tag, int tagsize) { const char *attr; VAlignType valign; if ((attr = a_Html_get_attr(html, tag, tagsize, "valign"))) { if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f) BUG_MSG("The valign attribute is obsolete in HTML5.\n"); if (dStrAsciiCasecmp (attr, "top") == 0) valign = VALIGN_TOP; else if (dStrAsciiCasecmp (attr, "bottom") == 0) valign = VALIGN_BOTTOM; else if (dStrAsciiCasecmp (attr, "baseline") == 0) valign = VALIGN_BASELINE; else valign = VALIGN_MIDDLE; html->styleEngine->setNonCssHint (CSS_PROPERTY_VERTICAL_ALIGN, CSS_TYPE_ENUM, valign); return true; } else return false; } /* * Create and add a new Textblock to the current Textblock */ static void Html_add_textblock(DilloHtml *html, int space) { Textblock *textblock = new Textblock (prefs.limit_text_width); HT2TB(html)->addParbreak (space, html->wordStyle ()); HT2TB(html)->addWidget (textblock, html->style ()); /* Works also for floats etc. 
*/ HT2TB(html)->addParbreak (space, html->wordStyle ()); S_TOP(html)->textblock = html->dw = textblock; S_TOP(html)->hand_over_break = true; } /* * Create and initialize a new DilloHtml class */ DilloHtml::DilloHtml(BrowserWindow *p_bw, const DilloUrl *url, const char *content_type) { /* Init main variables */ bw = p_bw; page_url = a_Url_dup(url); base_url = a_Url_dup(url); dw = NULL; /* Init event receiver */ linkReceiver.html = this; HT2LT(this)->connectLink (&linkReceiver); a_Bw_add_doc(p_bw, this); /* Init for-parsing variables */ Start_Buf = NULL; Start_Ofs = 0; _MSG("DilloHtml(): content type: %s\n", content_type); this->content_type = dStrdup(content_type); /* get charset */ a_Misc_parse_content_type(content_type, NULL, NULL, &charset); stop_parser = false; CurrOfs = OldOfs = 0; OldLine = 1; DocType = DT_NONE; /* assume Tag Soup 0.0! :-) */ DocTypeVersion = 0.0f; styleEngine = new StyleEngine (HT2LT (this), page_url, base_url); cssUrls = new misc::SimpleVector (1); stack = new misc::SimpleVector (16); stack->increase(); stack->getRef(0)->parse_mode = DILLO_HTML_PARSE_MODE_INIT; stack->getRef(0)->table_mode = DILLO_HTML_TABLE_MODE_NONE; stack->getRef(0)->table_border_mode = DILLO_HTML_TABLE_BORDER_SEPARATE; stack->getRef(0)->cell_text_align_set = false; stack->getRef(0)->display_none = false; stack->getRef(0)->list_type = HTML_LIST_NONE; stack->getRef(0)->list_number = 0; stack->getRef(0)->tag_idx = -1; /* MUST not be used */ stack->getRef(0)->textblock = NULL; stack->getRef(0)->table = NULL; stack->getRef(0)->ref_list_item = NULL; stack->getRef(0)->hand_over_break = false; InFlags = IN_NONE; Stash = dStr_new(""); StashSpace = false; pre_column = 0; PreFirstChar = false; PrevWasCR = false; InVisitedLink = false; ReqTagClose = false; TagSoup = true; loadCssFromStash = false; Num_HTML = Num_HEAD = Num_BODY = Num_TITLE = 0; attr_data = dStr_sized_new(1024); non_css_link_color = -1; non_css_visited_color = -1; visited_color = -1; /* Init page-handling variables 
*/ forms = new misc::SimpleVector (1); inputs_outside_form = new misc::SimpleVector (1); links = new misc::SimpleVector (64); images = new misc::SimpleVector (16); /* Initialize the main widget */ initDw(); /* Hook destructor to the dw delete call */ dw->setDeleteCallback(Html_free, this); } /* * Miscellaneous initializations for Dw */ void DilloHtml::initDw() { dReturn_if_fail (dw == NULL); /* Create the main widget */ dw = stack->getRef(0)->textblock = new Textblock (prefs.limit_text_width); bw->num_page_bugs = 0; dStr_truncate(bw->page_bugs, 0); } /* * Free memory used by the DilloHtml class. */ DilloHtml::~DilloHtml() { _MSG("::~DilloHtml(this=%p)\n", this); freeParseData(); a_Bw_remove_doc(bw, this); a_Url_free(page_url); a_Url_free(base_url); for (int i = 0; i < cssUrls->size(); i++) a_Url_free(cssUrls->get(i)); delete (cssUrls); for (int i = 0; i < forms->size(); i++) a_Html_form_delete (forms->get(i)); delete(forms); for (int i = 0; i < inputs_outside_form->size(); i++) a_Html_input_delete(inputs_outside_form->get(i)); delete(inputs_outside_form); for (int i = 0; i < links->size(); i++) a_Url_free(links->get(i)); delete (links); for (int i = 0; i < images->size(); i++) { DilloHtmlImage *img = images->get(i); a_Url_free(img->url); a_Image_unref(img->image); dFree(img); } delete (images); delete styleEngine; } /* * Process the newly arrived html and put it into the page structure. * (This function is called by Html_callback whenever there's new data) */ void DilloHtml::write(char *Buf, int BufSize, int Eof) { int token_start; char *buf = Buf + Start_Ofs; int bufsize = BufSize - Start_Ofs; _MSG("DilloHtml::write BufSize=%d Start_Ofs=%d\n", BufSize, Start_Ofs); #if 0 char *aux = dStrndup(Buf, BufSize); MSG(" {%s}\n", aux); dFree(aux); #endif /* Update Start_Buf. 
It may be used after the parser is stopped */ Start_Buf = Buf; dReturn_if (dw == NULL); dReturn_if (stop_parser == true); token_start = Html_write_raw(this, buf, bufsize, Eof); Start_Ofs += token_start; } /* * Return the line number of the tag/word being processed by the parser. * Also update the offsets. */ int DilloHtml::getCurrLineNumber() { int i, ofs, line; const char *p = Start_Buf; dReturn_val_if_fail(p != NULL, -1); /* Disable line counting for META hack. Buffers differ. */ dReturn_val_if((InFlags & IN_META_HACK), -1); ofs = CurrOfs; line = OldLine; for (i = OldOfs; i < ofs; ++i) if (p[i] == '\n' || (p[i] == '\r' && p[i+1] != '\n')) ++line; OldOfs = CurrOfs; OldLine = line; return line; } /* * Free parsing data. */ void DilloHtml::freeParseData() { delete(stack); dStr_free(Stash, TRUE); dStr_free(attr_data, TRUE); dFree(content_type); dFree(charset); } /* * Finish parsing a HTML page. Close the parser and close the client. * The class is not deleted here, it remains until the widget is destroyed. */ void DilloHtml::finishParsing(int ClientKey) { int si; dReturn_if (stop_parser == true); /* flag we've already parsed up to the last byte */ InFlags |= IN_EOF; /* force the close of elements left open (TODO: not for XHTML) */ while ((si = stack->size() - 1)) { if (stack->getRef(si)->tag_idx != -1) { Html_tag_cleanup_at_close(this, stack->getRef(si)->tag_idx); } } /* Nothing left to do with the parser. Clear all flags, except EOF. */ InFlags = IN_EOF; /* Remove this client from our active list */ a_Bw_close_client(bw, ClientKey); } /* * Allocate and insert form information. 
*/ int DilloHtml::formNew(DilloHtmlMethod method, const DilloUrl *action, DilloHtmlEnc enc, const char *charset) { // avoid data loss on repush after CSS stylesheets have been loaded bool enabled = bw->NumPendingStyleSheets == 0; DilloHtmlForm *form = a_Html_form_new (this, method, action, enc, charset, enabled); int nf = forms->size (); forms->increase (); forms->set (nf, form); _MSG("Html formNew: action=%s nform=%d\n", action, nf); return forms->size(); } /* * Get the current form. */ DilloHtmlForm *DilloHtml::getCurrentForm () { return forms->get (forms->size() - 1); } bool_t DilloHtml::unloadedImages() { for (int i = 0; i < images->size(); i++) { if (images->get(i)->image != NULL) { return TRUE; } } return FALSE; } /* * Load images if they were disabled. */ void DilloHtml::loadImages (const DilloUrl *pattern) { dReturn_if (a_Bw_expecting(bw)); /* If the user asked for a specific image, the user (NULL) is the requester, * and the domain mechanism will always permit the request. But if the user * just asked for all images (clicking "Load images"), use the page URL as * the requester so that the domain mechanism can act as a filter. * If the possible patterns become more complex, it might be good to have * the caller supply the requester instead. */ const DilloUrl *requester = pattern ? NULL : this->page_url; for (int i = 0; i < images->size(); i++) { DilloHtmlImage *hi = images->get(i); if (hi->image) { assert(hi->url); if ((!pattern) || (!a_Url_cmp(hi->url, pattern))) { if (Html_load_image(bw, hi->url, requester, hi->image)) { a_Image_unref (hi->image); hi->image = NULL; // web owns it now } } } } } /* * Save URL in a vector (may be loaded later). 
*/ void DilloHtml::addCssUrl(const DilloUrl *url) { int nu = cssUrls->size(); cssUrls->increase(); cssUrls->set(nu, a_Url_dup(url)); } bool DilloHtml::HtmlLinkReceiver::enter (Widget *widget, int link, int img, int x, int y) { BrowserWindow *bw = html->bw; _MSG(" ** "); if (link == -1) { _MSG(" Link LEAVE notify...\n"); a_UIcmd_set_msg(bw, ""); } else { _MSG(" Link ENTER notify...\n"); Html_set_link_coordinates(html, link, x, y); a_UIcmd_set_msg(bw, "%s", URL_STR(html->links->get(link))); } return true; } /* * Handle the "press" signal. */ bool DilloHtml::HtmlLinkReceiver::press (Widget *widget, int link, int img, int x, int y, EventButton *event) { BrowserWindow *bw = html->bw; int ret = false; DilloUrl *linkurl = NULL; _MSG("pressed button %d\n", event->button); if (event->button == 3) { // popup menus if (img != -1) { // image menu if (link != -1) linkurl = html->links->get(link); const bool_t loaded_img = (html->images->get(img)->image == NULL); a_UIcmd_image_popup(bw, html->images->get(img)->url, loaded_img, html->page_url, linkurl); ret = true; } else { if (link == -1) { a_UIcmd_page_popup(bw, bw->num_page_bugs != 0, html->cssUrls); ret = true; } else { a_UIcmd_link_popup(bw, html->links->get(link)); ret = true; } } } return ret; } /* * Handle the "click" signal. 
*/ bool DilloHtml::HtmlLinkReceiver::click (Widget *widget, int link, int img, int x, int y, EventButton *event) { BrowserWindow *bw = html->bw; if ((img != -1) && (html->images->get(img)->image)) { // clicked an image that has not already been loaded if (event->button == 1){ // load all instances of this image DilloUrl *pattern = html->images->get(img)->url; html->loadImages(pattern); return true; } } if (link != -1) { DilloUrl *url = html->links->get(link); _MSG("clicked on URL %d: %s\n", link, a_Url_str (url)); Html_set_link_coordinates(html, link, x, y); if (event->button == 1) { a_UIcmd_open_url(bw, url); } else if (event->button == 2) { if (prefs.middle_click_opens_new_tab) { int focus = prefs.focus_new_tab ? 1 : 0; if (event->state == SHIFT_MASK) focus = !focus; a_UIcmd_open_url_nt(bw, url, focus); } else a_UIcmd_open_url_nw(bw, url); } else { return false; } /* Change the link color to "visited" as visual feedback */ for (Widget *w = widget; w; w = w->getParent()) { _MSG(" ->%s\n", w->getClassName()); if (w->instanceOf(dw::Textblock::CLASS_ID)) { ((Textblock*)w)->changeLinkColor (link, html->visited_color); break; } } } return true; } /* * Initialize the stash buffer */ void a_Html_stash_init(DilloHtml *html) { S_TOP(html)->parse_mode = DILLO_HTML_PARSE_MODE_STASH; html->StashSpace = false; dStr_truncate(html->Stash, 0); } /* * This is M$ non-standard "smart quotes" (w1252). Now even deprecated by them! * * SGML for HTML4.01 defines c >= 128 and c <= 159 as UNUSED. * TODO: Probably I should remove this hack. --Jcid */ static int Html_ms_stupid_quotes_2ucs(int codepoint) { int ret; switch (codepoint) { case 145: case 146: ret = '\''; break; case 147: case 148: ret = '"'; break; case 149: ret = 176; break; case 150: case 151: ret = '-'; break; default: ret = codepoint; break; } return ret; } /* * Parse a numeric character reference (e.g., "/" or "/"). * The "&#" has already been consumed. 
*/ static const char *Html_parse_numeric_charref(DilloHtml *html, char *tok, bool_t is_attr, int *entsize) { static char buf[5]; char *s = tok; int n, codepoint = -1; errno = 0; if (*s == 'x' || *s == 'X') { if (isxdigit(*++s)) { /* strtol with base 16 accepts leading "0x" - we don't */ if (*s == '0' && s[1] == 'x') { s++; codepoint = 0; } else { codepoint = strtol(s, &s, 16); } } } else if (isdigit(*s)) { codepoint = strtol(s, &s, 10); } if (errno) codepoint = -1; if (*s == ';') s++; else { if (prefs.show_extra_warnings && (html->DocType == DT_XHTML || (html->DocType == DT_HTML && html->DocTypeVersion <= 4.01f))) { char c = *s; *s = '\0'; BUG_MSG("character reference '&#%s' lacks ';'\n", tok); *s = c; } /* Don't require ';' for old HTML, except that our current heuristic * is to require it in attributes to avoid cases like "©=1" found * in URLs. */ if (is_attr || html->DocType == DT_XHTML || (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)) { return NULL; } } if ((codepoint < 0x20 && codepoint != '\t' && codepoint != '\n' && codepoint != '\f') || (codepoint >= 0x7f && codepoint <= 0x9f) || (codepoint >= 0xd800 && codepoint <= 0xdfff) || codepoint > 0x10ffff || ((codepoint & 0xfffe) == 0xfffe) || (!(html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f) && codepoint > 0xffff)) { /* this catches null bytes, errors, codes out of range, disallowed * control chars, permanently undefined chars, and surrogates. */ char c = *s; *s = '\0'; BUG_MSG("numeric character reference '&#%s' is not valid.\n", tok); *s = c; codepoint = (codepoint >= 145 && codepoint <= 151) ? 
Html_ms_stupid_quotes_2ucs(codepoint) : -1; } if (codepoint != -1) { if (codepoint >= 128) { n = a_Utf8_encode(codepoint, buf); } else { n = 1; buf[0] = (char) codepoint; } assert(n < 5); buf[n] = '\0'; *entsize = s-tok+2; return buf; } else { return NULL; } } /* * Comparison function for binary search */ static int Html_charref_comp(const void *a, const void *b) { return strcmp(((Charref_t *)a)->ref, ((Charref_t *)b)->ref); } /* * Binary search of 'key' in charref list */ static Charref_t *Html_charref_search(char *key) { Charref_t RefKey; RefKey.ref = key; return (Charref_t*) bsearch(&RefKey, Charrefs, NumRef, sizeof(Charref_t), Html_charref_comp); } /* * Parse a named character reference (e.g., "&" or "…"). * The "&" has already been consumed. */ static const char *Html_parse_named_charref(DilloHtml *html, char *tok, bool_t is_attr, int *entsize) { Charref_t *p; char c; char *s = tok; const char *ret = NULL; while (*++s && (isalnum(*s) || strchr(":_.-", *s))) ; c = *s; *s = '\0'; if (c != ';') { if (prefs.show_extra_warnings && (html->DocType == DT_XHTML || (html->DocType == DT_HTML && html->DocTypeVersion <= 4.01f))) BUG_MSG("character reference '&%s' lacks ';'\n", tok); /* Don't require ';' for old HTML, except that our current heuristic * is to require it in attributes to avoid cases like "©=1" found * in URLs. */ if (is_attr || html->DocType == DT_XHTML || (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)) { return ret; } } if ((p = Html_charref_search(tok))) { ret = (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f) ? p->html5_str : p->html4_str; } if (!ret && html->DocType == DT_XHTML && !strcmp(tok, "apos")) ret = "'"; *s = c; if (c == ';') s++; if (!ret) { c = *s; *s = '\0'; BUG_MSG("undefined character reference &%s\n", tok); *s = c; } *entsize = s-tok+1; return ret; } /* * Given an entity, return the corresponding string. * Returns NULL if not a valid entity. 
* * The first character *token is assumed to be == '&' * * For valid entities, *entsize is set to the length of the parsed entity. */ static const char *Html_parse_entity(DilloHtml *html, const char *token, int toksize, int *entsize, bool_t is_attr) { const char *ret = NULL; char *tok; token++; tok = dStrndup(token, (uint_t)toksize); if (*tok == '#') { ret = Html_parse_numeric_charref(html, tok+1, is_attr, entsize); } else if (isalpha(*tok)) { ret = Html_parse_named_charref(html, tok, is_attr, entsize); } else if (prefs.show_extra_warnings && (!(html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f))) { // HTML5 doesn't mind literal '&'s. BUG_MSG("literal '&'\n"); } dFree(tok); return ret; } /* * Parse all the entities in a token. Takes the token and its length, and * returns a newly allocated string. */ char *a_Html_parse_entities(DilloHtml *html, const char *token, int toksize) { const char *esc_set = "&"; int i, s, entsize; char *str; s = strcspn(token, esc_set); if (s >= toksize) { /* no ampersands */ str = dStrndup(token, toksize); } else { Dstr *ds = dStr_sized_new(toksize); dStr_append_l(ds, token, s); for (i = s; i < toksize; i++) { const char *entstr; const bool_t is_attr = FALSE; if (token[i] == '&' && (entstr = Html_parse_entity(html, token+i, toksize-i, &entsize, is_attr))) { dStr_append(ds, entstr); i += entsize-1; } else { dStr_append_c(ds, token[i]); } } str = ds->str; dStr_free(ds, 0); } return str; } /* * For white-space: pre-line, we must break the line if encountering a newline. * Otherwise, collapse whitespace as usual. 
*/ static void Html_process_space_pre_line(DilloHtml *html, const char *space, int spacesize) { int i, breakCnt = 0; for (i = 0; i < spacesize; i++) { /* Support for "\r", "\n" and "\r\n" line breaks */ if (space[i] == '\r' || (space[i] == '\n' && !html->PrevWasCR)) { breakCnt++; html->PrevWasCR = (space[i] == '\r'); HT2TB(html)->addLinebreak (html->wordStyle ()); } } if (breakCnt == 0) { HT2TB(html)->addSpace(html->wordStyle ()); } } /* * Parse spaces */ static void Html_process_space(DilloHtml *html, const char *space, int spacesize) { char *spc; int i, offset; DilloHtmlParseMode parse_mode = S_TOP(html)->parse_mode; if (S_TOP(html)->display_none) { /* do nothing */ } else if (parse_mode == DILLO_HTML_PARSE_MODE_STASH) { html->StashSpace = (html->Stash->len > 0); } else if (parse_mode == DILLO_HTML_PARSE_MODE_VERBATIM) { dStr_append_l(html->Stash, space, spacesize); } else if (parse_mode == DILLO_HTML_PARSE_MODE_PRE) { int spaceCnt = 0; /* re-scan the string for characters that cause line breaks */ for (i = 0; i < spacesize; i++) { /* Support for "\r", "\n" and "\r\n" line breaks (skips the first) */ if (!html->PreFirstChar && (space[i] == '\r' || (space[i] == '\n' && !html->PrevWasCR))) { if (spaceCnt) { spc = dStrnfill(spaceCnt, ' '); HT2TB(html)->addText (spc, spaceCnt, html->wordStyle ()); dFree(spc); spaceCnt = 0; } HT2TB(html)->addLinebreak (html->wordStyle ()); html->pre_column = 0; } html->PreFirstChar = false; /* cr and lf should not be rendered -- they appear as a break */ switch (space[i]) { case '\r': case '\n': break; case '\t': if (prefs.show_extra_warnings) BUG_MSG("TAB character inside

\n");
            offset = TAB_SIZE - html->pre_column % TAB_SIZE;
            spaceCnt += offset;
            html->pre_column += offset;
            break;
         default:
            spaceCnt++;
            html->pre_column++;
            break;
         }

         html->PrevWasCR = (space[i] == '\r');
      }

      if (spaceCnt) {
         // add break possibility for the white-space:pre-wrap case
         HT2TB(html)->addBreakOption (html->wordStyle (), false);
         spc = dStrnfill(spaceCnt, ' ');
         HT2TB(html)->addText (spc, spaceCnt, html->wordStyle ());
         dFree(spc);
      }

   } else {
      if (SGML_SPCDEL) {
         /* SGML_SPCDEL ignores white space immediately after an open tag */
      } else if (html->wordStyle ()->whiteSpace == WHITE_SPACE_PRE_LINE) {
         Html_process_space_pre_line(html, space, spacesize);
      } else {
         HT2TB(html)->addSpace(html->wordStyle ());
      }

      if (parse_mode == DILLO_HTML_PARSE_MODE_STASH_AND_BODY)
         html->StashSpace = (html->Stash->len > 0);
   }
}

/*
 * Handles putting the word into its proper place
 *  > STASH and VERBATIM --> html->Stash only
 *  > STASH_AND_BODY     --> html->Stash and, as in the default case, addText()
 *  > otherwise it goes through addText()
 *
 * Entities are parsed (or not) according to parse_mode.
 * 'word' is a '\0'-terminated string; 'size' is the number of bytes of it
 * that are processed.
 * Nothing is done when the top of the stack has display_none set.
 */
static void Html_process_word(DilloHtml *html, const char *word, int size)
{
   int i, j, start;
   char *Pword;
   DilloHtmlParseMode parse_mode = S_TOP(html)->parse_mode;

   if (S_TOP(html)->display_none)
      return;

   /* First step: feed the stash, when the parse mode asks for it. */
   if (parse_mode == DILLO_HTML_PARSE_MODE_STASH ||
       parse_mode == DILLO_HTML_PARSE_MODE_STASH_AND_BODY) {
      /* a space seen before this word is materialized only now */
      if (html->StashSpace) {
         dStr_append_c(html->Stash, ' ');
         html->StashSpace = false;
      }
      Pword = a_Html_parse_entities(html, word, size);
      dStr_append(html->Stash, Pword);
      dFree(Pword);

   } else if (parse_mode == DILLO_HTML_PARSE_MODE_VERBATIM) {
      /* word goes in untouched, it is not processed here. */
      dStr_append_l(html->Stash, word, size);
   }

   /* Second step: feed the textblock (STASH_AND_BODY reaches the last
    * branch, so its word ends up in both places). */
   if (parse_mode == DILLO_HTML_PARSE_MODE_STASH ||
       parse_mode == DILLO_HTML_PARSE_MODE_VERBATIM) {
      /* skip until the closing instructions */

   } else if (parse_mode == DILLO_HTML_PARSE_MODE_PRE) {
      /* all this overhead is to catch white-space entities */
      Pword = a_Html_parse_entities(html, word, size);
      /* Split the parsed word into alternating runs: white-space runs go
       * to Html_process_space(), the rest is added as text.
       * NOTE(review): isspace() is called with a plain char here and below;
       * bytes >= 0x80 are negative where char is signed -- confirm this is
       * acceptable on the supported platforms. */
      for (start = i = 0; Pword[i]; start = i)
         if (isspace(Pword[i])) {
            while (Pword[++i] && isspace(Pword[i])) ;
            Html_process_space(html, Pword + start, i - start);
         } else {
            while (Pword[++i] && !isspace(Pword[i])) ;
            HT2TB(html)->addText(Pword + start, i - start, html->wordStyle ());
            /* the column advances by the number of bytes added */
            html->pre_column += i - start;
            html->PreFirstChar = false;
         }
      dFree(Pword);

   } else {
      const char *word2, *beyond_word2;

      /* Pword stays NULL unless entity parsing allocates a copy below. */
      Pword = NULL;
      if (!memchr(word,'&', size)) {
         /* No entities */
         word2 = word;
         beyond_word2 = word + size;
      } else {
         /* Collapse white-space entities inside the word (except &nbsp;) */
         Pword = a_Html_parse_entities(html, word, size);
         /* Collapse adjacent " \t\f\n\r" characters into a single space */
         /* (in place: j is the read index, i the write index; the loop ends
          * when the terminating '\0' has been copied) */
         for (i = j = 0; (Pword[i] = Pword[j]); ++i, ++j) {
            if (strchr(" \t\f\n\r", Pword[i])) {
               if (i == 0 || (i > 0 && Pword[i-1] != ' '))
                  Pword[i] = ' ';
               else
                  /* previous output char already is a space: drop this one
                   * (--i) and skip the white-space that follows it */
                  for (--i; Pword[j+1] && strchr(" \t\f\n\r", Pword[j+1]); ++j)
                     ;
            }
         }
         word2 = Pword;
         beyond_word2 = word2 + strlen(word2);
      }
      /* Tokenize word2; each iteration consumes one piece starting at
       * 'start' and leaves i just past it. */
      for (start = i = 0; word2[i]; start = i) {
         int len;

         if (isspace(word2[i])) {
            /* white-space run (can only come from parsed entities) */
            while (word2[++i] && isspace(word2[i])) ;
            Html_process_space(html, word2 + start, i - start);
         } else if (!strncmp(word2+i, utf8_zero_width_space, 3)) {
            /* zero-width space: its 3 bytes add no text, only a break
             * opportunity */
            i += 3;
            HT2TB(html)->addBreakOption(html->wordStyle (), false);
         } else if (a_Utf8_ideographic(word2+i, beyond_word2, &len)) {
            /* an ideographic character is added on its own, followed by
             * a break opportunity */
            i += len;
            HT2TB(html)->addText(word2 + start, i - start, html->wordStyle ());
            HT2TB(html)->addBreakOption(html->wordStyle (), false);
         } else {
            /* Plain text: advance character by character until one of the
             * cases above applies again.
             * NOTE(review): 'len' is only ever written by
             * a_Utf8_ideographic() (the failed test above on the first
             * pass, the loop condition afterwards); this relies on that
             * function storing the character length even when it returns
             * false -- it is defined elsewhere, confirm. */
            do {
               i += len;
            } while (word2[i] && !isspace(word2[i]) &&
                     strncmp(word2+i, utf8_zero_width_space, 3) &&
                     (!a_Utf8_ideographic(word2+i, beyond_word2, &len)));
            HT2TB(html)->addText(word2 + start, i - start, html->wordStyle ());
         }
      }
      /* true only when the entity-parsing branch allocated Pword */
      if (Pword == word2)
         dFree(Pword);
   }
}

/*
 * Does the tag in tagstr (e.g. "p") match the tag in the tag, tagsize
 * structure, with the initial < skipped over (e.g. "P align=center>")?
 *
 * The comparison is ASCII case-insensitive. After the name, 'tag' must
 * continue (within tagsize) with white space, '>' or '/'.
 */
static bool Html_match_tag(const char *tagstr, char *tag, int tagsize)
{
   int pos = 0;

   /* Walk both strings while the name and the tag buffer have bytes left. */
   while (pos < tagsize && tagstr[pos] != '\0') {
      if (D_ASCII_TOLOWER(tagstr[pos]) != D_ASCII_TOLOWER(tag[pos]))
         return false;
      pos++;
   }

   /* The tag buffer ended before a delimiter could be seen. */
   if (pos >= tagsize)
      return false;

   /* The test for '/' is for xml compatibility: "empty/>" will be matched. */
   return isspace(tag[pos]) || tag[pos] == '>' || tag[pos] == '/';
}

/*
 * This function is called after popping the stack, to
 * handle nested Textblock widgets: when the current widget (html->dw) is
 * not the textblock of the new stack top, it is flushed (after optionally
 * handing over its break) and the top's textblock becomes current.
 */
static void Html_eventually_pop_dw(DilloHtml *html, bool hand_over_break)
{
   /* Same textblock as before the pop: nothing to switch. */
   if (html->dw == S_TOP(html)->textblock)
      return;

   if (hand_over_break)
      HT2TB(html)->handOverBreak (html->style ());
   HT2TB(html)->flush ();
   html->dw = S_TOP(html)->textblock;
}

/*
 * Push a new stack item for the tag 'tag_idx'.
 * The new item inherits every attribute from the previous stack top.
 */
static void Html_push_tag(DilloHtml *html, int tag_idx)
{
   const int new_top = html->stack->size ();

   html->stack->increase ();

   /* Clone the former top item wholesale and then overwrite its tag index;
    * that's simpler than copying each field but the tag.  --Jcid */
   *html->stack->getRef(new_top) = *html->stack->getRef(new_top - 1);
   html->stack->getRef(new_top)->tag_idx = tag_idx;

   /* Keep the current widget in sync with the new stack top */
   html->dw = S_TOP(html)->textblock;
}

/*
 * Push the tag (used to force an element with optional open into the stack).
 * It first calls html->startElement() for the tag and then pushes it
 * with Html_push_tag().
 */
static void Html_force_push_tag(DilloHtml *html, int tag_idx)
{
   html->startElement (tag_idx);
   Html_push_tag(html, tag_idx);
}

/*
 * Pop the top tag in the stack:
 * end the element in the style engine, shrink the stack by one item and
 * let Html_eventually_pop_dw() deal with a possible textblock change.
 */
static void Html_real_pop_tag(DilloHtml *html)
{
   html->styleEngine->endElement (S_TOP(html)->tag_idx);

   /* Must be read before the stack shrinks; S_TOP changes afterwards. */
   const bool pending_break = S_TOP(html)->hand_over_break;

   html->stack->setSize (html->stack->size() - 1);
   Html_eventually_pop_dw(html, pending_break);
}

/*
 * Cleanup the stack to a given index.
 *
 * Items are popped from the top until the stack size equals 'idx' (so the
 * item at 'idx' is closed too). For each one its close function, if any,
 * is called before popping. Every item above 'idx' whose end tag is not
 * optional ('O') gets a "forcing close" bug message.
 */
static void Html_tag_cleanup_to_idx(DilloHtml *html, int idx)
{
   int s_sz;
   while ((s_sz = html->stack->size()) > idx) {
      int toptag_idx = S_TOP(html)->tag_idx;
      TagInfo toptag = Tags[toptag_idx];
      /* s_sz == idx + 1 is the requested tag itself: no warning for it */
      if (s_sz > idx + 1 && toptag.EndTag != 'O')
         BUG_MSG("  - forcing close of open tag: <%s>\n", toptag.name);
      /* Debug trace, indented by stack depth (it used to reference an
       * undeclared 'size' identifier). */
      _MSG("Close: %*s%s\n", s_sz," ", toptag.name);
      if (toptag.close)
         toptag.close(html);
      Html_real_pop_tag(html);
   }
}

/*
 * Default close function for tags.
 * (conditional cleanup of the stack)
 *
 * The stack is scanned from the top downwards (the bottom item, index 0,
 * is never considered) looking for the element that 'new_idx' closes:
 *   - An item with the same tag index is the match: the stack is cleaned
 *     up to (and including) it.
 *   - Items whose end tag is optional ('O') are skipped.
 *   - While closing BUTTON, SELECT or TEXTAREA with the respective IN_*
 *     flag set, any item is skipped, so these elements can close the
 *     tags inside them.
 *   - Otherwise, in w3c mode (prefs.w3c_plus_heuristics unset) the first
 *     remaining item stops the search: it's the tag that should have been
 *     closed. With heuristics enabled, the search only stops at an item
 *     whose TagLevel is >= the closing tag's level.
 *
 * When there's no match, a bug message is issued and the stack is left
 * untouched (the closing tag is ignored).
 */
static void Html_tag_cleanup_at_close(DilloHtml *html, int new_idx)
{
   static int i_BUTTON = a_Html_tag_index("button"),
              i_SELECT = a_Html_tag_index("select"),
              i_TEXTAREA = a_Html_tag_index("textarea");
   int w3c_mode = !prefs.w3c_plus_heuristics;
   int stack_idx, tag_idx, matched = 0, expected = 0;
   TagInfo new_tag = Tags[new_idx];

   /* Look for the candidate tag to close */
   stack_idx = html->stack->size();
   while (--stack_idx) {
      tag_idx = html->stack->getRef(stack_idx)->tag_idx;
      if (tag_idx == new_idx) {
         /* matching tag found */
         matched = 1;
         break;
      } else if (Tags[tag_idx].EndTag == 'O') {
         /* skip an optional tag */
         continue;
      } else if ((new_idx == i_BUTTON && html->InFlags & IN_BUTTON) ||
                 (new_idx == i_SELECT && html->InFlags & IN_SELECT) ||
                 (new_idx == i_TEXTAREA && html->InFlags & IN_TEXTAREA)) {
         /* let these elements close tags inside them */
         continue;
      } else if (w3c_mode || Tags[tag_idx].TagLevel >= new_tag.TagLevel) {
         /* this is the tag that should have been closed */
         expected = 1;
         break;
      }
   }

   if (matched) {
      Html_tag_cleanup_to_idx(html, stack_idx);
   } else if (expected) {
      /* tag_idx still holds the item that stopped the search */
      BUG_MSG("unexpected closing tag: </%s> -- expected </%s>.\n",
              new_tag.name, Tags[tag_idx].name);
   } else {
      BUG_MSG("unexpected closing tag: </%s>.\n", new_tag.name);
   }
}

/*
 * Avoid nesting and inter-nesting of BUTTON, SELECT and TEXTAREA,
 * by closing them before opening another.
 * This is not an HTML SPEC restriction, but it avoids lots of trouble
 * inside dillo (concurrent inputs), and makes almost no sense to have.
 *
 * 'new_idx' is the tag index of the element about to be opened. Nothing is
 * done unless it's one of the three elements above and one of the
 * IN_BUTTON, IN_SELECT or IN_TEXTAREA flags is set. Otherwise the still open
 * element is looked up in the stack (index 0 excluded), the stack is cleaned
 * up to it, and the three flags are cleared (even if it wasn't found).
 */
static void Html_tag_cleanup_nested_inputs(DilloHtml *html, int new_idx)
{
   static int i_BUTTON = a_Html_tag_index("button"),
              i_SELECT = a_Html_tag_index("select"),
              i_TEXTAREA = a_Html_tag_index("textarea");
   int stack_idx, u_idx, matched = 0;

   dReturn_if_fail(html->InFlags & (IN_BUTTON | IN_SELECT | IN_TEXTAREA));
   dReturn_if_fail(new_idx == i_BUTTON || new_idx == i_SELECT ||
                   new_idx == i_TEXTAREA);

   /* Get the unclosed tag index */
   u_idx = (html->InFlags & IN_BUTTON) ? i_BUTTON :
                 (html->InFlags & IN_SELECT) ? i_SELECT : i_TEXTAREA;

   /* Look for it inside the stack */
   stack_idx = html->stack->size();
   while (--stack_idx) {
      if (html->stack->getRef(stack_idx)->tag_idx == u_idx) {
         /* matching tag found */
         matched = 1;
         break;
      }
   }

   if (matched) {
      BUG_MSG("attempt to nest <%s> element inside <%s> -- closing <%s>\n",
              Tags[new_idx].name, Tags[u_idx].name, Tags[u_idx].name);
      Html_tag_cleanup_to_idx(html, stack_idx);
   } else {
      /* Note the space that separates the two concatenated literals */
      MSG_WARN("Inconsistent parser state, flag is SET but no '%s' element "
               "was found in the stack\n", Tags[u_idx].name);
   }

   html->InFlags &= ~(IN_BUTTON | IN_SELECT | IN_TEXTAREA);
}


/*
 * Some parsing routines.
 */

/*
 * Used by a_Html_parse_length
 *
 * Parse the number at the start of 'attr' and build a length out of it:
 *   "n%"  -> percentage (n / 100),
 *   "n*"  -> relative length,
 *   else  -> pixels.
 * If 'endptr' is not NULL, it's set to point just after the number and the
 * '%' or '*' suffix, when present.
 */
static CssLength Html_parse_length_or_multi_length (const char *attr,
                                                    char **endptr)
{
   char *tail;
   const double num = strtod (attr, &tail);
   CssLength result;

   if (*tail == '%') {
      ++tail;
      result = CSS_CREATE_LENGTH (num / 100, CSS_LENGTH_TYPE_PERCENTAGE);
   } else if (*tail == '*') {
      ++tail;
      result = CSS_CREATE_LENGTH (num, CSS_LENGTH_TYPE_RELATIVE);
   } else {
      /* Plain number. A "px" suffix is deliberately not consumed here,
       * as it seems not allowed by the HTML4.01 SPEC. */
      result = CSS_CREATE_LENGTH (num, CSS_LENGTH_TYPE_PX);
   }

   if (endptr != NULL)
      *endptr = tail;
   return result;
}


/*
 * Parse an attribute value as a length: a number of pixels or a percentage.
 *
 * Return value: the parsed length, or a length of type AUTO when
 *   - 'attr' is NULL,
 *   - 'attr' is a relative length ("n*"), or
 *   - the number (and its optional '%') is followed by something other
 *     than white space (a bug message is issued in this case).
 */
CssLength a_Html_parse_length (DilloHtml *html, const char *attr)
{
   CssLength l;
   char *end;

   /* strtod() must not be handed a NULL pointer */
   if (!attr)
      return CSS_CREATE_LENGTH(0.0, CSS_LENGTH_TYPE_AUTO);

   l = Html_parse_length_or_multi_length (attr, &end);
   if (CSS_LENGTH_TYPE (l) == CSS_LENGTH_TYPE_RELATIVE) {
      /* not allowed as &Length; */
      l = CSS_CREATE_LENGTH(0.0, CSS_LENGTH_TYPE_AUTO);
   } else if (*end && !isspace ((unsigned char)*end)) {
      /* allow only whitespaces (the cast avoids passing a negative
       * value to isspace() for bytes >= 0x80) */
      BUG_MSG("Garbage after length: %s\n", attr);
      l = CSS_CREATE_LENGTH(0.0, CSS_LENGTH_TYPE_AUTO);
   }

   _MSG("a_Html_parse_length: \"%s\" %d\n", attr, CSS_LENGTH_VALUE(l));
   return l;
}

/*
 * Parse a color attribute by means of a_Color_parse().
 * Return value: the color returned by a_Color_parse() for 'str' and
 * 'default_color'. A bug message is issued when it flags an error.
 */
int32_t a_Html_color_parse(DilloHtml *html, const char *str,
                           int32_t default_color)
{
   int parse_failed = 1;
   const int32_t parsed = a_Color_parse(str, default_color, &parse_failed);

   if (parse_failed) {
      BUG_MSG("color \"%s\" is not in \"#RRGGBB\" format\n", str);
   }
   return parsed;
}

/*
 * Check the value of a name-like attribute ('attrname' is only used for
 * the bug messages).
 *
 * For an HTML doctype with version >= 5, 'val' must not be empty and must
 * not contain spaces.
 * Otherwise 'val' must be composed of characters inside [A-Za-z0-9:_.-].
 * A bug message is also issued when it doesn't start with a letter, but
 * that alone doesn't change the return value.
 * Note: ID can't have entities, but this check is enough (no '&').
 *
 * Return value: 1 if OK, 0 otherwise.
 */
static int Html_check_name_val(DilloHtml *html, const char *val,
                               const char *attrname)
{
   if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f) {
      if (*val && !strchr(val, ' '))
         return 1;

      BUG_MSG("'%s' value must not be empty and must not contain spaces.\n",
              attrname);
      return 0;
   }

   /* Advance to the first character outside the allowed set (or to the
    * end of the string). isascii() guards the other ctype calls. */
   const char *bad = val;
   while (*bad && isascii(*bad) && (isalnum(*bad) || strchr(":_.-", *bad)))
      ++bad;

   if (*bad || !(isascii(val[0]) && isalpha(val[0]))) {
      BUG_MSG("'%s' value \"%s\" is not of the form "
              "[A-Za-z][A-Za-z0-9:_.-]*\n", attrname, val);
   }

   return *bad == '\0';
}

/*
 * Handle DOCTYPE declaration
 *
 * 'tag' and 'tagsize' delimit the whole declaration, starting at its '<'.
 * On return html->DocType and html->DocTypeVersion are set; an unknown
 * declaration yields DT_UNRECOGNIZED plus a bug message.
 *
 * Follows the convention that HTML 4.01
 * doctypes which include a full w3c DTD url are treated as
 * standards-compliant, but 4.01 without the url and HTML 4.0 and
 * earlier are not. XHTML doctypes are always standards-compliant
 * whether or not an url is present.
 *
 * Note: I'm not sure about this convention. The W3C validator
 * recognizes the "HTML Level" with or without the URL. The convention
 * comes from mozilla (see URLs below), but Dillo doesn't have the same
 * rendering modes, so it may be better to chose another behaviour. --Jcid
 *
 * http://www.mozilla.org/docs/web-developer/quirks/doctypes.html
 * http://lists.auriga.wearlab.de/pipermail/dillo-dev/2004-October/002300.html
 *
 * This is not a full DOCTYPE parser, just enough for what Dillo uses.
 */
static void Html_parse_doctype(DilloHtml *html, const char *tag, int tagsize)
{
   /* Start of an SGML-style declaration; the public identifier's opening
    * quote comes right after it. */
   static const char HTML_SGML_sig [] = "<!DOCTYPE HTML PUBLIC ";
   /* The whole HTML5 declaration (compared case-insensitively) */
   static const char HTML5_sig  [] = "<!DOCTYPE html>";
   static const char HTML20     [] = "-//IETF//DTD HTML";
   static const char HTML32     [] = "-//W3C//DTD HTML 3.2";
   static const char HTML40     [] = "-//W3C//DTD HTML 4.0";
   static const char HTML401    [] = "-//W3C//DTD HTML 4.01";
   static const char HTML401_url[] = "http://www.w3.org/TR/html4/";
   static const char XHTML1     [] = "-//W3C//DTD XHTML 1.0";
   static const char XHTML1_url [] = "http://www.w3.org/TR/xhtml1/DTD/";
   static const char XHTML11    [] = "-//W3C//DTD XHTML 1.1";
   static const char XHTML11_url[] = "http://www.w3.org/TR/xhtml11/DTD/";

   size_t i;
   int quote;
   char *p, *ntag = dStrndup(tag, tagsize);

   /* Tag sanitization: Collapse whitespace between tokens
    * and replace '\n' and '\r' with ' ' inside quoted strings.
    * It's done in place: 'p' reads, 'i' is the write index (i never gets
    * ahead of p). The casts keep bytes >= 0x80 from reaching isspace()
    * as negative values. */
   for (i = 0, p = ntag; *p; ++p) {
      if (isspace((unsigned char)*p)) {
         for (ntag[i++] = ' '; isspace((unsigned char)p[1]); ++p) ;
      } else if ((quote = *p) == '"' || *p == '\'') {
         /* copy up to (and including) the matching quote */
         for (ntag[i++] = *p++; (ntag[i] = *p) && ntag[i++] != quote; ++p) {
            if (*p == '\n' || *p == '\r')
               ntag[i - 1] = ' ';
            /* a CRLF pair becomes a single space */
            p += (p[0] == '\r' && p[1] == '\n') ? 1 : 0;
         }
      } else {
         ntag[i++] = *p;
      }
      /* unterminated quoted string: the inner loop hit the end */
      if (!*p)
         break;
   }
   ntag[i] = 0;

   _MSG("New: {%s}\n", ntag);

   if (html->DocType != DT_NONE)
      BUG_MSG("Multiple DOCTYPE declarations.\n");

   /* The default DT_NONE type is TagSoup */
   if (i > strlen(HTML_SGML_sig) && // avoid out of bounds reads!
       !dStrnAsciiCasecmp(ntag, HTML_SGML_sig, strlen(HTML_SGML_sig))) {
      /* +1 skips the quote that opens the public identifier */
      p = ntag + strlen(HTML_SGML_sig) + 1;
      /* Order matters: HTML40 is a prefix of HTML401, so a 4.01 doctype
       * lacking the DTD url falls through to the 4.0 branch below. */
      if (!strncmp(p, HTML401, strlen(HTML401)) &&
          dStriAsciiStr(p + strlen(HTML401), HTML401_url)) {
         html->DocType = DT_HTML;
         html->DocTypeVersion = 4.01f;
      } else if (!strncmp(p, XHTML1, strlen(XHTML1)) &&
                 dStriAsciiStr(p + strlen(XHTML1), XHTML1_url)) {
         html->DocType = DT_XHTML;
         html->DocTypeVersion = 1.0f;
      } else if (!strncmp(p, XHTML11, strlen(XHTML11)) &&
                 dStriAsciiStr(p + strlen(XHTML11), XHTML11_url)) {
         html->DocType = DT_XHTML;
         html->DocTypeVersion = 1.1f;
      } else if (!strncmp(p, HTML40, strlen(HTML40))) {
         html->DocType = DT_HTML;
         html->DocTypeVersion = 4.0f;
      } else if (!strncmp(p, HTML32, strlen(HTML32))) {
         html->DocType = DT_HTML;
         html->DocTypeVersion = 3.2f;
      } else if (!strncmp(p, HTML20, strlen(HTML20))) {
         html->DocType = DT_HTML;
         html->DocTypeVersion = 2.0f;
      }
   } else if (!dStrAsciiCasecmp(ntag, HTML5_sig)) {
      html->DocType = DT_HTML;
      html->DocTypeVersion = 5.0f;
   }
   if (html->DocType == DT_NONE) {
      html->DocType = DT_UNRECOGNIZED;
      BUG_MSG("DOCTYPE not recognized:\n%s.\n", ntag);
   }
   dFree(ntag);
}

/*
 * Handle open HTML element:
 * flag that we're inside HTML and count the open tags seen (the counter
 * saturates at UCHAR_MAX). A repeated HTML tag gets a bug message and
 * requests its own close.
 */
static void Html_tag_open_html(DilloHtml *html, const char *tag, int tagsize)
{
   /* IN_HTML stays set until the IN_EOF condition, which allows handling
    * pages with multiple or uneven HTML tags. */
   html->InFlags |= IN_HTML;

   if (html->Num_HTML < UCHAR_MAX)
      html->Num_HTML++;

   /* First HTML tag: nothing else to do */
   if (html->Num_HTML <= 1)
      return;

   BUG_MSG("HTML element was already open\n");
   html->ReqTagClose = true;
}

/*
 * Handle close HTML element.
 * No parser state is changed here; the only statement is a debug trace
 * of the Num_HTML counter.
 */
static void Html_tag_close_html(DilloHtml *html)
{
   _MSG("Html_tag_close_html: Num_HTML=%d\n", html->Num_HTML);
}

/*
 * Handle open HEAD element.
 * Only the first HEAD, found before BODY, sets IN_HEAD. Any other one
 * gets a bug message and requests its own close.
 */
static void Html_tag_open_head(DilloHtml *html, const char *tag, int tagsize)
{
   if (html->InFlags & IN_BODY) {
      /* too late: not even counted */
      BUG_MSG("HEAD element must go before the BODY section\n");
      html->ReqTagClose = true;
      return;
   }

   /* saturating counter */
   if (html->Num_HEAD < UCHAR_MAX)
      html->Num_HEAD++;

   const bool head_open = (html->InFlags & IN_HEAD) != 0;

   if (!head_open && html->Num_HEAD <= 1) {
      /* the well formed case */
      html->InFlags |= IN_HEAD;
      return;
   }

   if (head_open) {
      BUG_MSG("HEAD element was already open\n");
   } else {
      BUG_MSG("HEAD section already finished -- ignoring\n");
   }
   html->ReqTagClose = true;
}

/*
 * Handle close HEAD element
 * Note: HEAD is parsed once completely got.
 *
 * While IN_HEAD is set: with Num_HEAD above one, the counter is just
 * decremented; with Num_HEAD at one, the HEAD section is finished (IN_HEAD
 * is cleared) and the collected stylesheet URLs are loaded.
 */
static void Html_tag_close_head(DilloHtml *html)
{
   if (!(html->InFlags & IN_HEAD)) {
      /* not reached, see Html_tag_cleanup_at_close() */
      return;
   }

   if (html->Num_HEAD > 1) {
      --html->Num_HEAD;
      return;
   }
   if (html->Num_HEAD != 1)
      return;

   /* match for the well formed start of HEAD section */
   if (html->Num_TITLE == 0) {
      BUG_MSG("HEAD section lacks the TITLE element\n");
   }

   html->InFlags &= ~IN_HEAD;

   /* charset is already set, load remote stylesheets now */
   int n = 0;
   while (n < html->cssUrls->size()) {
      a_Html_load_stylesheet(html, html->cssUrls->get(n));
      n++;
   }
}

/*
 * Handle open TITLE
 * Calls stash init, where the title string will be stored, and counts
 * the TITLE elements found inside HEAD (saturating at UCHAR_MAX).
 */
static void Html_tag_open_title(DilloHtml *html, const char *tag, int tagsize)
{
   /* The stash is always set up, so that the TITLE content can be ignored
    * when it's not valid, redundant or outside the HEAD section. */
   a_Html_stash_init(html);

   if (!(html->InFlags & IN_HEAD)) {
      BUG_MSG("TITLE element must be inside the HEAD section -- ignoring\n");
      return;
   }

   if (html->Num_TITLE < UCHAR_MAX)
      html->Num_TITLE++;

   if (html->Num_TITLE > 1) {
      BUG_MSG("A redundant TITLE element was found\n");
   }
}

/*
 * Handle close TITLE
 * For the first TITLE inside HEAD, hand the stashed string over as the
 * page title to the browser window and to the history.
 */
static void Html_tag_close_title(DilloHtml *html)
{
   const bool in_head = (html->InFlags & IN_HEAD) != 0;

   /* title is only valid inside HEAD, and only the first one counts */
   if (!in_head || html->Num_TITLE != 1)
      return;

   a_UIcmd_set_page_title(html->bw, html->Stash->str);
   a_History_set_title_by_url(html->page_url, html->Stash->str);
}

/*
 * Handle open SCRIPT
 * initializes stash, where the embedded code will be stored.
 * MODE_VERBATIM is used because MODE_STASH catches entities.
 *
 * The parse mode of the stack top is set after the stash init call.
 * NOTE(review): a_Html_stash_init() presumably sets the parse mode itself,
 * so the order of these two statements matters -- confirm before changing.
 */
static void Html_tag_open_script(DilloHtml *html, const char *tag, int tagsize)
{
   a_Html_stash_init(html);
   S_TOP(html)->parse_mode = DILLO_HTML_PARSE_MODE_VERBATIM;
}

/*
 * Handle close SCRIPT
 * Currently a no-op: the stashed script content is not used here.
 */
static void Html_tag_close_script(DilloHtml *html)
{
   /* eventually the stash will be sent to an interpreter for parsing */
}

/*
 * Handle open STYLE
 * Store contents in the stash where the style sheet interpreter can get it.
 */
static void Html_tag_open_style(DilloHtml *html, const char *tag, int tagsize)
{
   const char *attrbuf;

   html->loadCssFromStash = true;

   if (!(attrbuf = a_Html_get_attr(html, tag, tagsize, "type"))) {
      if (html->DocType != DT_HTML || html->DocTypeVersion <= 4.01f)
         BUG_MSG("type attribute is required for