diff options
author | Jorge Arellano Cid <jcid@dillo.org> | 2016-06-21 18:03:17 -0400 |
---|---|---|
committer | Jorge Arellano Cid <jcid@dillo.org> | 2016-06-21 18:03:17 -0400 |
commit | 482e759575dd46e0ad2d342e2f4df9251c34fc82 (patch) | |
tree | e2f657039d048f7650c8aadcd64cb9523a8521d5 /src | |
parent | 1530ff9fc54277046cbe081f8bd1a9928e89e982 (diff) |
New design for the nesting-cleanup process within the parser.
* Forbidden nesting now is handled by cleanup_at_open.
(it previously used an ad-hoc function hooked into the process)
Much safer and more versatile now. [1],[2],[4]
* Heuristic cleanup at close is no longer used.
cleanup_at_close is now based on block/inline element/container semantics,
and also considers special nesting rules expressed in the DTD for HTML-4.01.
Note: this design is easy to tailor for HTML5. [2]
* Bug reporting changed and is now more centralized in cleanup_to_idx.
* The bug meter gives more accurate and concise messages. [4]
* Page rendering improved as now the cleanup process strives to produce
a correct tree out of Tag Soup, before feeding it to Dw. [4]
* Better handling/recovery from Tag Soup (even in the worst cases). [5]
* The w3c_plus_heuristics=FALSE mode was removed (not necessary anymore)
* Elements with optional close also follow Firefox de facto rules.
* Special case logic is now isolated in helper functions.
A few examples:
[1] http://dillo.org/test/cross-nesting-simple.html
[2] http://slashdot.org/
[3] http://apod.nasa.gov/apod/ap160604.html
[4] http://www.mypetchicken.com:80/catalog/Day-Old-Baby-Chicks/Olive-Eggers-p1478.aspx
[5] http://dillo.org/test/sd3.html
Diffstat (limited to 'src')
-rw-r--r-- | src/html.cc | 295 |
1 files changed, 143 insertions, 152 deletions
diff --git a/src/html.cc b/src/html.cc index dc6d5b78..0945d6f9 100644 --- a/src/html.cc +++ b/src/html.cc @@ -104,6 +104,7 @@ static bool Html_load_image(BrowserWindow *bw, DilloUrl *url, const DilloUrl *requester, DilloImage *image); static void Html_callback(int Op, CacheClient_t *Client); static void Html_tag_cleanup_at_close(DilloHtml *html, int TagIdx); +int a_Html_tag_index(const char *tag); /*----------------------------------------------------------------------------- * Local Data @@ -120,6 +121,26 @@ typedef struct { } TagInfo; extern const TagInfo Tags[]; +/* Some element indexes required in scattered places */ +static int + i_A = a_Html_tag_index("a"), + i_BODY = a_Html_tag_index("body"), + i_BUTTON = a_Html_tag_index("button"), + i_DD = a_Html_tag_index("dd"), + i_DT = a_Html_tag_index("dt"), + i_HTML = a_Html_tag_index("html"), + i_HR = a_Html_tag_index("hr"), + i_LI = a_Html_tag_index("li"), + i_OPTGROUP = a_Html_tag_index("optgroup"), + i_OPTION = a_Html_tag_index("option"), + i_P = a_Html_tag_index("p"), + i_SELECT = a_Html_tag_index("select"), + i_TEXTAREA = a_Html_tag_index("textarea"), + i_TD = a_Html_tag_index("td"), + i_TR = a_Html_tag_index("tr"), + i_TH = a_Html_tag_index("th"); + + /*----------------------------------------------------------------------------- *----------------------------------------------------------------------------- * Main Code @@ -1353,21 +1374,66 @@ static void Html_real_pop_tag(DilloHtml *html) } /* + * Check nesting and cross-nesting between BUTTON, SELECT, TEXTAREA and A. + * The cleanup process will close any of them before opening another. + * This is not an HTML SPEC restriction , but it avoids lots of trouble + * inside dillo (concurrent inputs), and makes almost no sense to have. + * return: index of the open element, -1 if none. 
+ */ +static inline int Html_forbids_cross_nesting(const int InFlags, + const int new_idx) +{ + int f = InFlags, ni = new_idx, oi = -1; + if (f & (IN_A | IN_BUTTON | IN_SELECT | IN_TEXTAREA) && + (ni == i_A || ni == i_BUTTON || ni == i_SELECT || ni == i_TEXTAREA)) + oi = (f & IN_A ? i_A : f & IN_BUTTON ? i_BUTTON : f & IN_SELECT ? + i_SELECT : f & IN_TEXTAREA ? i_TEXTAREA : 0); + return oi; +} + +/* * Cleanup the stack to a given index. + * + * 's_idx' stack index to clean up to. + * 'new_idx' is the tag index that triggered the cleanup. + * 'fi' forbidden tag index. -1 if allowed (most of the time). + * 'op' cleanup operation. {'o' = open, 'c' = close}. */ -static void Html_tag_cleanup_to_idx(DilloHtml *html, int idx) +static void Html_tag_cleanup_to_idx(DilloHtml *html, int s_idx, + int new_idx, int fi, char op) { - static int i_BODY = a_Html_tag_index("body"); - int s_sz; - while ((s_sz = html->stack->size()) > idx) { + int s_top, ni = new_idx; + while ((s_top = html->stack->size() - 1) >= s_idx) { int toptag_idx = S_TOP(html)->tag_idx; TagInfo toptag = Tags[toptag_idx]; - if (s_sz > idx + 1 && toptag.EndTag != 'O') - BUG_MSG("Bad nesting - forcing close of open tag: <%s>.",toptag.name); - _MSG("Close: %s sz=%d idx=%d\n", toptag.name, s_sz, idx); + + if (fi >= 0) { + // forbidden nesting + if (toptag_idx != fi) + BUG_MSG(" Nesting cleanup - forcing close of open tag: <%s>.", + toptag.name); + } else if (s_top == s_idx && op == 'c') { + // target tag, no bug when closing. + } else if (toptag.EndTag == 'O') { + // optional close, that's OK + } else if ((!(toptag.Flags & 4) && + (Tags[ni].Flags & 4 || !(Tags[ni].Flags & 1))) || + (Tags[ni].Flags & 1 && !(toptag.Flags & 2))) { + // block {element, container} in non block container or + // inline element in non inline container + BUG_MSG((op == 'o') ? + "Bad nesting: <%s> can't contain <%s>. -- closing <%s>." : + "<%s> must have been closed before </%s>. 
-- closing <%s>.", + toptag.name, Tags[ni].name, toptag.name); + } else { + BUG_MSG( + "<%s> should have been closed before </%s>. -- closing <%s>.", + toptag.name, Tags[ni].name, toptag.name); + } + _MSG("op(%c): %s s_top=%d s_idx=%d\n", op, toptag.name, s_top, s_idx); if (toptag_idx == i_BODY && !((html->InFlags & IN_EOF) || html->ReqTagClose)) { - (idx == 1 ? html->PrevWasHtmlClose : html->PrevWasBodyClose) = true; + (s_idx == 1 ? html->PrevWasHtmlClose : html->PrevWasBodyClose) = true; break; // only pop {BODY,HTML} upon EOF or redundancy } if (toptag.close) @@ -1377,27 +1443,19 @@ static void Html_tag_cleanup_to_idx(DilloHtml *html, int idx) } /* - * Default close function for tags. - * (conditional cleanup of the stack) + * Conditional cleanup of the stack, called before closing any tag. + * * There are several ways of doing it. Considering the HTML 4.01 spec * which defines optional close tags, and the will to deliver useful diagnose * messages for bad-formed HTML, it'll go as follows: - * 1.- Search the stack for the first tag that requires a close tag. - * 2.- If it matches, clean all the optional-close tags in between. - * 3.- Cleanup the matching tag. (on error, give a warning message) * - * If 'w3c_mode' is NOT enabled: - * 1.- Search the stack for a matching tag based on tag level. + * 1.- Search the stack for a matching tag by closing elements that: + * have optional close | are inline in a block container | force closing. * 2.- If it exists, clean all the tags in between. * 3.- Cleanup the matching tag. 
(on error, give a warning message) */ static void Html_tag_cleanup_at_close(DilloHtml *html, int new_idx) { - static int i_A = a_Html_tag_index("a"), - i_BUTTON = a_Html_tag_index("button"), - i_SELECT = a_Html_tag_index("select"), - i_TEXTAREA = a_Html_tag_index("textarea"); - int w3c_mode = !prefs.w3c_plus_heuristics; int stack_idx, tag_idx, matched = 0, expected = 0; TagInfo new_tag = Tags[new_idx]; @@ -1418,7 +1476,11 @@ static void Html_tag_cleanup_at_close(DilloHtml *html, int new_idx) (new_idx == i_TEXTAREA && html->InFlags & IN_TEXTAREA)) { /* Let these elements close anything left open inside them */ continue; - } else if (w3c_mode || Tags[tag_idx].TagLevel >= new_tag.TagLevel) { + } else if (Tags[new_idx].Flags & 4 && // Block container + Tags[stack_idx].Flags & 3) { // Inline element or container + /* Let a block container close inline elements left open inside it. */ + continue; + } else { /* this is the tag that should have been closed */ expected = 1; break; @@ -1426,7 +1488,7 @@ static void Html_tag_cleanup_at_close(DilloHtml *html, int new_idx) } if (matched) { - Html_tag_cleanup_to_idx(html, stack_idx); + Html_tag_cleanup_to_idx(html, stack_idx, new_idx, -1, 'c'); } else if (expected) { BUG_MSG("Unexpected closing tag: </%s> -- expected </%s>.", new_tag.name, Tags[tag_idx].name); @@ -1435,51 +1497,6 @@ static void Html_tag_cleanup_at_close(DilloHtml *html, int new_idx) } } -/* - * Avoid nesting and inter-nesting of BUTTON, SELECT, TEXTAREA and A, - * by closing them before opening another. - * This is not an HTML SPEC restriction , but it avoids lots of trouble - * inside dillo (concurrent inputs), and makes almost no sense to have. 
- */ -static void Html_tag_cleanup_nested_elements(DilloHtml *html, int new_idx) -{ - static int i_A = a_Html_tag_index("a"), - i_BUTTON = a_Html_tag_index("button"), - i_SELECT = a_Html_tag_index("select"), - i_TEXTAREA = a_Html_tag_index("textarea"); - int stack_idx, u_idx, matched = 0; - - dReturn_if_fail(html->InFlags & (IN_BUTTON |IN_SELECT |IN_TEXTAREA |IN_A)); - dReturn_if_fail(new_idx == i_BUTTON || new_idx == i_SELECT || - new_idx == i_TEXTAREA || new_idx == i_A); - - /* Get the unclosed tag index */ - u_idx = (html->InFlags & IN_BUTTON) ? i_BUTTON : - (html->InFlags & IN_SELECT) ? i_SELECT : - (html->InFlags & IN_TEXTAREA) ? i_TEXTAREA : i_A; - - /* Look for it inside the stack */ - stack_idx = html->stack->size(); - while (--stack_idx) { - if (html->stack->getRef(stack_idx)->tag_idx == u_idx) { - /* matching tag found */ - matched = 1; - break; - } - } - - if (matched) { - BUG_MSG("Attempt to nest <%s> element inside <%s> -- closing <%s>.", - Tags[new_idx].name, Tags[u_idx].name, Tags[u_idx].name); - Html_tag_cleanup_to_idx(html, stack_idx); - } else { - MSG_WARN("Inconsistent parser state, flag is SET but no '%s' element" - "was found in the stack\n", Tags[u_idx].name); - } - - html->InFlags &= ~(IN_BUTTON | IN_SELECT | IN_TEXTAREA | IN_A); -} - /* * Some parsing routines. @@ -1877,7 +1894,6 @@ static void Html_tag_open_body(DilloHtml *html, const char *tag, int tagsize) { const char *attrbuf; int32_t color; - int tag_index_a = a_Html_tag_index ("a"); style::Color *bgColor; style::StyleImage *bgImage; style::BackgroundRepeat bgRepeat; @@ -1955,14 +1971,14 @@ static void Html_tag_open_body(DilloHtml *html, const char *tag, int tagsize) * On reload style including color for visited links is computed properly * according to CSS. 
*/ - html->startElement (tag_index_a); + html->startElement (i_A); html->styleEngine->setPseudoVisited (); if (html->non_css_visited_color != -1) { html->styleEngine->setNonCssHint (CSS_PROPERTY_COLOR, CSS_TYPE_COLOR, html->non_css_visited_color); } html->visited_color = html->style ()->color->getColor (); - html->styleEngine->endElement (tag_index_a); + html->styleEngine->endElement (i_A); if (prefs.contrast_visited_color) { /* get a color that has a "safe distance" from text, link and bg */ @@ -3547,7 +3563,7 @@ static void Html_tag_content_wbr(DilloHtml *html, const char *tag, int tagsize) */ const TagInfo Tags[] = { - {"a", B8(01111),'R',2, Html_tag_open_a, NULL, Html_tag_close_a}, + {"a", B8(01011),'R',2, Html_tag_open_a, NULL, Html_tag_close_a}, {"abbr", B8(01011),'R',2, Html_tag_open_abbr, NULL, NULL}, /* acronym 010101 -- obsolete in HTML5 */ {"address", B8(01110),'R',2,Html_tag_open_default, NULL, Html_tag_close_par}, @@ -3713,38 +3729,26 @@ int a_Html_tag_index(const char *tag) } /* - * For elements with optional close, check whether is time to close. + * For elements with optional close, check whether is time to close, + * by also following Firefox's de facto rules. + * Called at open time. + * * Return value: (1: Close, 0: Don't close) * --tuned for speed. 
*/ -static int Html_needs_optional_close(int old_idx, int cur_idx) +static int Html_triggers_optional_close(int old_idx, int cur_idx) { - static int i_P = -1, i_LI, i_TD, i_TR, i_TH, i_DD, i_DT, i_OPTION; - // i_THEAD, i_TFOOT, i_COLGROUP; - - if (i_P == -1) { - /* initialize the indexes of elements with optional close */ - i_P = a_Html_tag_index("p"), - i_LI = a_Html_tag_index("li"), - i_TD = a_Html_tag_index("td"), - i_TR = a_Html_tag_index("tr"), - i_TH = a_Html_tag_index("th"), - i_DD = a_Html_tag_index("dd"), - i_DT = a_Html_tag_index("dt"), - i_OPTION = a_Html_tag_index("option"); - // i_THEAD = a_Html_tag_index("thead"); - // i_TFOOT = a_Html_tag_index("tfoot"); - // i_COLGROUP = a_Html_tag_index("colgroup"); - } - + int Flags = Tags[cur_idx].Flags; if (old_idx == i_P || old_idx == i_DT) { - /* P and DT are closed by block elements */ - return (!(Tags[cur_idx].Flags & 1)); + /* P and DT are closed by block elements (i.e. non inline)*/ + return (!(Flags & 1)); } else if (old_idx == i_LI) { - /* LI closes LI */ + /* LI closes LI + * Note: non-flow should also close it, but FF does not. */ return (cur_idx == i_LI); } else if (old_idx == i_TD || old_idx == i_TH) { - /* TD and TH are closed by TD, TH and TR */ + /* TD and TH are closed by: TD, TH and TR. + * Note: non-flow should also close it, but FF does not. */ return (cur_idx == i_TD || cur_idx == i_TH || cur_idx == i_TR); } else if (old_idx == i_TR) { /* TR closes TR */ @@ -3752,69 +3756,62 @@ static int Html_needs_optional_close(int old_idx, int cur_idx) } else if (old_idx == i_DD) { /* DD is closed by DD and DT */ return (cur_idx == i_DD || cur_idx == i_DT); + } else if (old_idx == i_OPTGROUP) { + /* i_OPTGROUP can only contain OPTION */ + return (cur_idx != i_OPTION); } else if (old_idx == i_OPTION) { return 1; // OPTION always needs close } - /* HTML, HEAD, BODY are handled by Html_test_section(), not here. */ - /* TODO: TBODY is pending */ + /* Don't close HTML, HEAD and BODY. 
They're handled by Html_test_section(). + * TODO: TBODY is pending */ return 0; } /* - * Conditional cleanup of the stack (at open time). - * - This helps catching block elements inside inline containers (a BUG). - * - It also closes elements with "optional" close tag. - * - * This function is called when opening a block element or <OPTION>. + * Conditional cleanup of the stack (at open time). Handles: + * - Forbidden cross nesting (a BUG). + * - Block elements inside non block containers (a BUG). + * - Elements with "optional" close tag (OK). * - * It searches the stack closing open inline containers, and closing - * elements with optional close tag when necessary. - * - * Note: OPTION is the only non-block element with an optional close. + * This function is called before opening/pushing a new tag into the stack. + * 'ni' is the new tag's index in Tags[]. */ -static void Html_stack_cleanup_at_open(DilloHtml *html, int new_idx) +static void Html_stack_cleanup_at_open(DilloHtml *html, int ni) { - /* We know that the element we're about to push is a block element. - * (except for OPTION, which is an empty inline, so is closed anyway) - * Notes: - * Its 'tag' is not yet pushed into the stack, - * 'new_idx' is its index inside Tags[]. - */ - if (!html->TagSoup) return; - while (html->stack->size() > 1) { - int oldtag_idx = S_TOP(html)->tag_idx; - - if (Tags[oldtag_idx].EndTag == 'O') { // Element with optional close - if (!Html_needs_optional_close(oldtag_idx, new_idx)) - break; - } else if (Tags[oldtag_idx].Flags & 4) { // Block container - break; - } - - /* we have an inline (or empty) container... */ - if (Tags[oldtag_idx].EndTag == 'R') { - BUG_MSG("<%s> is not allowed to contain <%s>. 
-- closing <%s>.", - Tags[oldtag_idx].name, Tags[new_idx].name, - Tags[oldtag_idx].name); + int s_top = html->stack->size() - 1, s_idx; + int fi = Html_forbids_cross_nesting(html->InFlags, ni); + for (s_idx = s_top; s_idx > 0; --s_idx) { + int ti = html->stack->getRef(s_idx)->tag_idx; + + if (fi >= 0) { + // forbidden cross nesting found + if (ti != fi) + continue; // don't allow, close + --s_idx; + BUG_MSG("Forbidden nesting: <%s> can't contain <%s>. -- closing " + "<%s>.", Tags[fi].name, Tags[ni].name, Tags[fi].name); + + } else if ((html->InFlags & IN_PRE) && ni == i_HR) { + break; // allow Apache's bad HTML directory listings... + + } else if (Tags[ti].EndTag == 'O') { // Element with optional close + if (Html_triggers_optional_close(ti, ni)) + continue; // close + } else if (!(Tags[ni].Flags & 1) && !(Tags[ti].Flags & 4)) { + // Block element over a NON block container + continue; // close } - /* Workaround for Apache and its bad HTML directory listings... */ - if ((html->InFlags & IN_PRE) && - strcmp(Tags[new_idx].name, "hr") == 0) - break; - /* Avoid OPTION closing SELECT */ - if ((html->InFlags & IN_SELECT) && - strcmp(Tags[new_idx].name,"option") == 0) - break; - - /* This call closes the top tag only. 
*/ - Html_tag_cleanup_at_close(html, oldtag_idx); + break; } + + if (s_idx < s_top) + Html_tag_cleanup_to_idx(html, s_idx + 1, ni, fi, 'o'); } /* @@ -4003,8 +4000,7 @@ static void Html_display_listitem(DilloHtml *html) */ static void Html_process_tag(DilloHtml *html, char *tag, int tagsize) { - static int i_HTML = a_Html_tag_index("html"); - int ci, ni; /* current and new tag indexes */ + int ti, ni; /* stack tag index and new tag index */ char *start = tag + 1; /* discard the '<' */ int IsCloseTag = (*start == '/'); @@ -4035,25 +4031,20 @@ static void Html_process_tag(DilloHtml *html, char *tag, int tagsize) Html_test_section(html, ni, IsCloseTag); /* Tag processing */ - ci = S_TOP(html)->tag_idx; + ti = S_TOP(html)->tag_idx; switch (IsCloseTag) { case 0: /* Open function */ - /* Cleanup when opening a block element, or - * when openning over an element with optional close */ - if (!(Tags[ni].Flags & 1) || (ci != -1 && Tags[ci].EndTag == 'O')) + /* Cleanup before opening a new tag */ + if (ti != -1) Html_stack_cleanup_at_open(html, ni); - /* TODO: this is only raising a warning, take some defined action. - * Note: apache uses IMG inside PRE (we could use its "alt"). */ + /* TODO: this is only raising a warning, but allows the element. + * Note: Apache uses IMG inside PRE. */ if ((html->InFlags & IN_PRE) && Html_tag_pre_excludes(html, ni)) BUG_MSG("<pre> is not allowed to contain <%s>.", Tags[ni].name); - /* Make sure these elements don't nest each other */ - if (html->InFlags & (IN_BUTTON | IN_SELECT | IN_TEXTAREA | IN_A)) - Html_tag_cleanup_nested_elements(html, ni); - /* Push the tag into the stack */ Html_push_tag(html, ni); |