Magazyn Amiga 13

home *** CD-ROM | disk | FTP | other *** search

/ Magazyn Amiga 13 / MA_Cover_13.bin / source / c / tidy26jul99 / parser.c < prev next >

Wrap

C/C++ Source or Header | 1999-12-08 | 81.5 KB | 3,029 lines

/*
  parser.c - HTML Parser

  (c) 1998 (W3C) MIT, INRIA, Keio University
  See tidy.c for the copyright notice.
*/

#include "platform.h"   /* platform independent stuff */
#include "html.h"       /* to pull in definition of nodes */

int SeenBodyEndTag;  /* could be moved into lexer structure */

/*
  Append node as the new last child of element.

  Sets node->parent and node->prev and updates element->content /
  element->last.  node->next is not touched, so the caller must pass
  a node whose next link is already null.
*/
void InsertNode(Node *element, Node *node)
{
    node->parent = element;
    node->prev = element->last;

    if (element->last != null)
        element->last->next = node;
    else
        element->content = node;

    element->last = node;
}

/*
  Link node into the sibling list immediately before element.

  If element was the first child of its parent, the parent's content
  pointer is moved to node.  A null parent is tolerated.
*/
void InsertNodeBeforeElement(Node *element, Node *node)
{
    Node *parent;

    parent = element->parent;
    node->parent = parent;
    node->next = element;
    node->prev = element->prev;
    element->prev = node;

    if (node->prev)
        node->prev->next = node;

    if (parent != null && parent->content == element)
        parent->content = node;
}

/*
  Link node into the sibling list immediately after element.

  If element was the last child of its parent, the parent's last
  pointer is moved to node (node->next is left as passed in).
  Otherwise node takes over element's old successor, and that
  successor's prev link is pointed back at node so that the list
  stays consistent in both directions.  A null parent is tolerated.
*/
void InsertNodeAfterElement(Node *element, Node *node)
{
    Node *parent;

    parent = element->parent;
    node->parent = parent;

    if (parent != null && parent->last == element)
        parent->last = node;
    else
    {
        node->next = element->next;

        /* keep the backward link of the old successor in step */
        if (node->next != null)
            node->next->prev = node;
    }

    element->next = node;
    node->prev = element;
}

/*
  Unlink element from its parent and siblings and release it with
  FreeNode().  The lexer argument is not used here.
*/
void DiscardElement(Lexer *lexer, Node *element)
{
    Node *parent;

    parent = element->parent;

    if (parent != null)
    {
        if (parent->last == element)
            parent->last = element->prev;

        if (parent->content == element)
            parent->content = element->next;
    }

    if (element->prev)
        element->prev->next = element->next;

    if (element->next)
        element->next->prev = element->prev;

    /* NOTE(review): presumably cleared so FreeNode() does not walk on
       into the following siblings - confirm against FreeNode() */
    element->next = null;
    FreeNode(element);
}

/*
  Discard element if it has no content, subject to these exceptions:
    - an <a> element that carries attributes is kept
    - non-text nodes with no tag, <layer>, and elements whose content
      model includes CM_ROW are kept

  A TRIM_EMPTY_ELEMENT warning is reported for everything discarded
  except text nodes.
*/
void TrimEmptyElement(Lexer *lexer, Node *element)
{
    if (element->content == null &&
        (element->tag != tag_a || element->attributes == null))
    {
        if (element->type == TextNode ||
            (element->tag != null &&
             element->tag != tag_layer &&
             !(element->tag->model & CM_ROW)))
        {
            if (element->type != TextNode)
                ReportWarning(lexer, element, null, TRIM_EMPTY_ELEMENT);

            DiscardElement(lexer, element);
        }
    }
}

/*
  If last child of element is a text node
  then trim trailing white space character.
*/
/*
  TrimSpace: strip trailing ' ' and non-breaking space (160) bytes from
  the text node `last` by lowering last->end.  When the text node's
  parent is <td> or <th>, a non-breaking space that is the only
  remaining character is kept.  A text node that ends up empty is
  handed to TrimEmptyElement().  Does nothing if `last` is null, is not
  a text node, or is already empty.
*/
void TrimSpace(Lexer *lexer, Node *last)
{
    unsigned char c;

    if (last != null && last->type == TextNode && last->end > last->start) {
        while (last->end > last->start) {
            c = (unsigned char)lexer->lexbuf[last->end - 1];

            if (c == 160) /* non breaking space */
            {
                if (last->parent->tag == tag_td || last->parent->tag == tag_th) {
                    /* keep a lone nbsp inside a table cell */
                    if (last->end > last->start + 1)
                        last->end -= 1;
                    else
                        break;
                }
                else
                    last->end -= 1;
            }
            else if (c == ' ')
                last->end -= 1;
            else
                break;
        }

        /* sanity check: the loop above should never step past start */
        if (last->end < last->start)
            tidy_out(lexer->errout, "TrimSpace: screwed up text node\n");

        /* if empty string then delete from parse tree */
        if (last->start == last->end)
            TrimEmptyElement(lexer, last);
    }
}

/*
  This maps
       <em>hello </em><strong>world</strong>
  to
       <em>hello</em> <strong>world</strong>

  If last child of element is a text node
  then trim trailing white space character
  moving it to after element's end tag.

  Only a single trailing ' ' or nbsp (160) is removed.  When the text
  node's parent is an inline element, lexer->insertspace is set so the
  space can be re-emitted later.
*/
void TrimTrailingSpace(Lexer *lexer, Node *last)
{
    unsigned char c;

    if (last != null && last->type == TextNode && last->end > last->start) {
        c = (unsigned char)lexer->lexbuf[last->end - 1];

        if (c == ' ' || c == 160) {
            last->end -= 1;

            if (last->parent->tag->model & CM_INLINE)
                lexer->insertspace = yes;
        }

        /* if empty string then delete from parse tree */
        if (last->start == last->end)
            TrimEmptyElement(lexer, last);
    }
}

/*
  This maps
       <p>hello<em> world</em>
  to
       <p>hello <em>world</em>

  Trims initial space, by moving it before the
  start tag, or if this element is the first in
  parent's content, then by discarding the space

  `element` is the element being parsed and `text` the token about to
  become its first child.
*/
void TrimInitialSpace(Lexer *lexer, Node *element, Node *text)
{
    Node *prev, *node;

    if (text->type == TextNode && lexer->lexbuf[text->start] == ' ') {
        if (element->tag->model & CM_INLINE &&
            element->parent->content != element) {
            prev = element->prev;

            if (prev->type == TextNode) {
                /* extend the preceding text node by one space unless it
                   already ends in one */
                if (lexer->lexbuf[prev->end - 1] != ' ')
                    lexer->lexbuf[(prev->end)++] = ' ';

                ++(element->start);
            }
            else /* create new node */
            {
                /* NOTE(review): node->type is not set here; presumably
                   NewNode() defaults to a text node - confirm */
                node = NewNode();
                node->start = (element->start)++;
                node->end = element->start;
                lexer->lexbuf[node->start] = ' ';
                node->prev = prev;
                prev->next = node;
                node->next = element;
                element->prev = node;
                node->parent = element->parent;
            }
        }

        /* discard the space in current node */
        ++(text->start);
    }
}

/*
  Returns yes if any ancestor of element (walking the parent links)
  has the given tag, otherwise no.  element itself is not tested.
*/
Bool DescendantOf(Node *element, Dict *tag)
{
    Node *parent;

    for (parent = element->parent; parent != null; parent = parent->parent) {
        if (parent->tag == tag)
            return yes;
    }

    return no;
}

/*
  Dispatch to the content parser registered for node's tag.

  Empty elements (CM_EMPTY) only reset lexer->waswhite.  For elements
  that are not inline, lexer->insertspace is cleared first.  Nothing is
  parsed when the tag has no parser or the node is a StartEndTag.
*/
void ParseTag(Lexer *lexer, Node *node, uint mode)
{
    if (node->tag->model & CM_EMPTY) {
        lexer->waswhite = no;
        return;
    }
    else if (!(node->tag->model & CM_INLINE))
        lexer->insertspace = no;

    if (node->tag->parser == null || node->type == StartEndTag)
        return;

    (*node->tag->parser)(lexer, node, mode);
}

/*
  the doctype has been found after other tags,
  and needs moving to before the html element
*/
void InsertDocType(Lexer *lexer, Node *element, Node *doctype)
{
    ReportWarning(lexer, element, doctype, DOCTYPE_AFTER_TAGS);

    /* climb to the <html> element; assumes one exists among the
       ancestors of element (or is element itself) */
    while (element->tag != tag_html)
        element = element->parent;

    InsertNodeBeforeElement(element, doctype);
}

/*
  Relocate a start tag that belongs in <head> but was found inside
  `element`: climb to <html>, append node to the <head> child if one is
  found, and parse its content.  Any other kind of token is reported
  and freed.
*/
void MoveToHead(Lexer *lexer, Node *element, Node *node)
{
    Node *head;

    if (node->type == StartTag || node->type == StartEndTag) {
        ReportWarning(lexer, element, node, TAG_NOT_ALLOWED_IN);

        while (element->tag != tag_html)
            element = element->parent;

        for (head = element->content; head; head = head->next) {
            if (head->tag == tag_head) {
                InsertNode(head, node);
                break;
            }
        }

        /* NOTE(review): if no <head> child was found the node is parsed
           without having been inserted anywhere - confirm intended */
        if (node->tag->parser)
            ParseTag(lexer, node, IgnoreWhitespace);
    }
    else {
        ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }
}

/*
  element is node created by the lexer
  upon seeing the start tag, or by the
  parser when the start tag is inferred

  ParseBlock reads tokens until the element's end tag (explicit or
  inferred) and appends the accepted children to element.  The incoming
  `mode` argument is overwritten with IgnoreWhitespace before the loop.
  On every exit path the trailing space of the last child is trimmed
  and an empty element is discarded via TrimEmptyElement().
*/
void ParseBlock(Lexer *lexer, Node *element, uint mode)
{
    Node *node, *parent;
    Bool checkstack;
    uint istackbase;   /* only assigned/used when element is CM_OBJECT */

    checkstack = yes;

    if (element->tag->model & CM_EMPTY)
        return;

    if (element->tag == tag_form && DescendantOf(element, tag_form))
        ReportWarning(lexer, element, null, ILLEGAL_NESTING);

    /*
     InlineDup() asks the lexer to insert inline emphasis tags
     currently pushed on the istack, but take care to avoid
     propagating inline emphasis inside OBJECT or APPLET.
     For these elements a fresh inline stack context is created
     and disposed of upon reaching the end of the element.
     They thus behave like table cells in this respect.
    */
    if (element->tag->model & CM_OBJECT) {
        istackbase = lexer->istackbase;
        lexer->istackbase = lexer->istacksize;
    }

    if (!(element->tag->model & CM_MIXED))
        InlineDup(lexer, null);

    mode = IgnoreWhitespace;

    while ((node = GetToken(lexer, mode /*MixedContent*/)) != null) {
        /* end tag for this element */
        if (node->type == EndTag &&
            (node->tag == element->tag || element->was == node->tag)) {
            FreeNode(node);

            if (element->tag->model & CM_OBJECT) {
                /* pop inline stack */
                while (lexer->istacksize > lexer->istackbase)
                    PopInline(lexer, null);
                lexer->istackbase = istackbase;
            }

            TrimSpace(lexer, element->last);
            TrimEmptyElement(lexer, element);
            return;
        }

        /* <html>, <head> and <body> tokens are dropped here; only the
           start-tag forms are reported */
        if (node->tag == tag_html ||
            node->tag == tag_head ||
            node->tag == tag_body) {
            if (node->type == StartTag || node->type == StartEndTag)
                ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);

            FreeNode(node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag) {
            for (parent = element->parent;
                 parent != null; parent = parent->parent) {
                if (node->tag == parent->tag) {
                    if (!(element->tag->model & CM_OPT))
                        ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE);

                    UngetToken(lexer);

                    if (element->tag->model & CM_OBJECT) {
                        /* pop inline stack */
                        while (lexer->istacksize > lexer->istackbase)
                            PopInline(lexer, null);
                        lexer->istackbase = istackbase;
                    }

                    TrimSpace(lexer, element->last);
                    TrimEmptyElement(lexer, element);
                    return;
                }
            }
        }

        /* mixed content model permits text */
        if (node->type == TextNode) {
            if (checkstack) {
                checkstack = no;

                if (!(element->tag->model & CM_MIXED)) {
                    if (InlineDup(lexer, node) > 0)
                        continue;
                }
            }

            InsertNode(element, node);
            mode = MixedContent;
            continue;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag) {
            InsertNode(element, node);
            continue;
        }

        /* allow PARAM elements? */
        if (node->tag == tag_param) {
            if ((element->tag->model & CM_PARAM) && node->type == StartTag) {
                InsertNode(element, node);
                continue;
            }

            /* otherwise discard it */
            ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /* allow AREA elements? */
        if (node->tag == tag_area) {
            if ((element->tag == tag_map) &&
                (node->type == StartTag || node->type == StartEndTag)) {
                InsertNode(element, node);
                continue;
            }

            /* otherwise discard it */
            ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /* ignore unknown start/end tags */
        if (node->tag == null) {
            ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /*
          Allow CM_INLINE elements here.

          Allow CM_BLOCK elements here unless
          lexer->excludeBlocks is yes.

          LI and DD are special cased.

          Otherwise infer end tag for this element.
        */
        if (!(node->tag->model & CM_INLINE)) {
            if (node->type != StartTag && node->type != StartEndTag) {
                /* NOTE(review): token is dropped without FreeNode(),
                   unlike the other discard paths - possible leak */
                ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
                continue;
            }

            if (element->tag == tag_td || element->tag == tag_th) {
                /* if parent is a table cell, avoid inferring the end of the cell */

                if (node->tag->model & CM_HEAD) {
                    MoveToHead(lexer, element, node);
                    continue;
                }

                if (node->tag->model & CM_LIST) {
                    UngetToken(lexer);
                    node = InferredTag(lexer, "ul");
                    lexer->excludeBlocks = yes;
                }
                else if (node->tag->model & CM_DEFLIST) {
                    UngetToken(lexer);
                    node = InferredTag(lexer, "dl");
                    lexer->excludeBlocks = yes;
                }

                /* infer end of current table cell */
                if (!(node->tag->model & CM_BLOCK)) {
                    UngetToken(lexer);
                    TrimSpace(lexer, element->last);
                    TrimEmptyElement(lexer, element);
                    return;
                }
            }
            else if (node->tag->model & CM_BLOCK) {
                if (lexer->excludeBlocks) {
                    if (!(element->tag->model & CM_OPT))
                        ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE);

                    UngetToken(lexer);

                    /* NOTE(review): unlike the other CM_OBJECT exits,
                       the inline stack is not popped before the base is
                       restored - confirm intended */
                    if (element->tag->model & CM_OBJECT)
                        lexer->istackbase = istackbase;

                    TrimSpace(lexer, element->last);
                    TrimEmptyElement(lexer, element);
                    return;
                }
            }
            else /* things like list items */
            {
                if (!(element->tag->model & CM_OPT))
                    ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE);

                if (node->tag->model & CM_HEAD) {
                    MoveToHead(lexer, element, node);
                    continue;
                }

                UngetToken(lexer);

                if (node->tag->model & CM_LIST) {
                    if (element->parent->tag == tag_ul ||
                        element->parent->tag == tag_ol) {
                        TrimSpace(lexer, element->last);
                        TrimEmptyElement(lexer, element);
                        return;
                    }

                    node = InferredTag(lexer, "ul");
                }
                else if (node->tag->model & CM_DEFLIST) {
                    if (element->parent->tag == tag_dl) {
                        TrimSpace(lexer, element->last);
                        TrimEmptyElement(lexer, element);
                        return;
                    }

                    node = InferredTag(lexer, "dl");
                }
                else if (node->tag->model & CM_TABLE || node->tag->model & CM_ROW) {
                    node = InferredTag(lexer, "table");
                }
                else if (element->tag->model & CM_OBJECT) {
                    /* pop inline stack */
                    while (lexer->istacksize > lexer->istackbase)
                        PopInline(lexer, null);
                    lexer->istackbase = istackbase;
                    TrimSpace(lexer, element->last);
                    TrimEmptyElement(lexer, element);
                    return;
                }
                else {
                    TrimSpace(lexer, element->last);
                    TrimEmptyElement(lexer, element);
                    return;
                }
            }
        }

        /* parse known element */
        if (node->type == StartTag || node->type == StartEndTag) {
            if (node->tag->model & CM_INLINE) {
                if (checkstack && !node->implicit) {
                    checkstack = no;

                    if (InlineDup(lexer, node) > 0)
                        continue;
                }

                mode = MixedContent;
            }
            else {
                checkstack = yes;
                mode = IgnoreWhitespace;
            }

            /* trim white space before <br> */
            if (node->tag == tag_br)
                TrimSpace(lexer, element->last);

            InsertNode(element, node);

            if (node->implicit)
                ReportWarning(lexer, element, node, INSERTING_TAG);

            ParseTag(lexer, node, IgnoreWhitespace /*MixedContent*/);
            continue;
        }

        /* discard unexpected tags */
        if (node->type == EndTag)
            PopInline(lexer, node);  /* if inline end tag */

        ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    /* end of input reached (node is null here) */
    if (!(element->tag->model & CM_OPT))
        ReportWarning(lexer, element, node, MISSING_ENDTAG_FOR);

    if (element->tag->model & CM_OBJECT) {
        /* pop inline stack */
        while (lexer->istacksize > lexer->istackbase)
            PopInline(lexer, null);
        lexer->istackbase = istackbase;
    }

    TrimSpace(lexer, element->last);
    TrimEmptyElement(lexer, element);
}

/*
  Parse the content of an inline element (also used for some block
  level elements such as headings, see below).  An <a> without
  attributes is discarded immediately.  Tokens are read in
  MixedContent mode unless the caller passed Preformatted.
*/
void ParseInline(Lexer *lexer, Node *element, uint mode)
{
    Node *node, *parent;

    if (element->tag->model & CM_EMPTY)
        return;

    if (element->tag == tag_a) {
        if (element->attributes == null) {
            ReportWarning(lexer, element->parent, element, DISCARDING_UNEXPECTED);
            DiscardElement(lexer, element);
            return;
        }
    }

    /*
     ParseInline is used for some block level elements like H1 to H6
     For such elements we need to insert inline emphasis tags currently
     on the inline stack. For Inline elements, we normally push them
     onto the inline stack provided they aren't implicit or OBJECT/APPLET.
     This test is carried out in PushInline and PopInline, see istack.c
    */
    if (element->tag->model & CM_BLOCK)
        InlineDup(lexer, null);
    else if (element->tag->model & CM_INLINE)
        PushInline(lexer, element);

    if (element->tag == tag_nobr)
        lexer->badLayout |= USING_NOBR;
    else if (element->tag == tag_font)
        lexer->badLayout |= USING_FONT;

    /* Inline elements may or may not be within a preformatted element */
    if (mode != Preformatted)
        mode = MixedContent;

    while ((node = GetToken(lexer, mode)) != null) {
        /* end tag for current element */
        if (node->tag == element->tag && node->type == EndTag) {
            if (element->tag->model & CM_INLINE)
                PopInline(lexer, node);

            FreeNode(node);
            TrimTrailingSpace(lexer, element->last);

            /*
              if a font element wraps an anchor and nothing else
              then move the font element inside the anchor since
              otherwise it won't alter the anchor text color
            */
            if (element->tag == tag_font &&
                element->content && element->content == element->last) {
                Node *child = element->content;

                if (child->tag == tag_a) {
                    /* the anchor takes the font element's place ... */
                    child->parent = element->parent;
                    child->next = element->next;
                    child->prev = element->prev;

                    if (child->prev)
                        child->prev->next = child;
                    else
                        child->parent->content = child;

                    if (child->next)
                        child->next->prev = child;
                    else
                        child->parent->last = child;

                    /* ... and the font element becomes its only child,
                       adopting the anchor's former content list */
                    /* NOTE(review): element->next is left unchanged and
                       the adopted children keep parent == child -
                       confirm intended */
                    element->prev = element->last = null;
                    element->parent = child;
                    element->content = child->content;
                    child->content = child->last = element;
                }
            }

            TrimEmptyElement(lexer, element);
            return;
        }

        if (node->type == TextNode) {
            /* only called for 1st child */
            if (element->content == null)
                TrimInitialSpace(lexer, element, node);

            if (node->start >= node->end) {
                FreeNode(node);
                continue;
            }

            InsertNode(element, node);
            continue;
        }

        /* mixed content model so allow text */
        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag) {
            InsertNode(element, node);
            continue;
        }

        /* deal with HTML tags */
        if (node->tag == tag_html) {
            if (node->type == StartTag || node->type == StartEndTag) {
                ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
                FreeNode(node);
                continue;
            }

            /* otherwise infer end of inline element */
            UngetToken(lexer);
            TrimTrailingSpace(lexer, element->last);
            TrimEmptyElement(lexer, element);
            return;
        }

        /* ignore unknown and PARAM tags */
        if (node->tag == null || node->tag == tag_param) {
            ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /* allow any inline end tag to end current element */
        if (node->type == EndTag &&
            node->tag->model & CM_INLINE &&
            !(node->tag->model & CM_OBJECT) &&
            element->tag->model & CM_INLINE) {
            PopInline(lexer, element);

            /* a stray </a> is pushed back for an enclosing anchor;
               any other mismatched end tag is consumed */
            if (node->tag == tag_a && node->tag != element->tag) {
                ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE);
                UngetToken(lexer);
            }
            else {
                ReportWarning(lexer, element, node, NON_MATCHING_ENDTAG);
                FreeNode(node);
            }

            TrimTrailingSpace(lexer, element->last);
            TrimEmptyElement(lexer, element);
            return;
        }

        /* allow any header tag to end current header */
        if (node->tag->model & CM_HEADING && element->tag->model & CM_HEADING) {
            ReportWarning(lexer, element, node, NON_MATCHING_ENDTAG);
            FreeNode(node);
            TrimTrailingSpace(lexer, element->last);
            TrimEmptyElement(lexer, element);
            return;
        }

        /*
           an <A> tag ends any open <A> element
           but <A href=...> is mapped to </A><A href=...>
        */
        if (node->tag == tag_a && !node->implicit && IsPushed(lexer, node)) {
            /* coerce <a> to </a> unless it has some attributes */
            if (node->attributes == null) {
                node->type = EndTag;
                ReportWarning(lexer, element, node, FORCED_END_ANCHOR);
                PopInline(lexer, node);
                UngetToken(lexer);
                continue;
            }

            UngetToken(lexer);
            ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE);
            PopInline(lexer, element);
            TrimTrailingSpace(lexer, element->last);
            TrimEmptyElement(lexer, element);
            return;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag) {
            for (parent = element->parent;
                 parent != null; parent = parent->parent) {
                if (node->tag == parent->tag) {
                    if (!(element->tag->model & CM_OPT) && !element->implicit)
                        ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE);

                    if (element->tag == tag_a)
                        PopInline(lexer, element);

                    UngetToken(lexer);
                    TrimTrailingSpace(lexer, element->last);
                    TrimEmptyElement(lexer, element);
                    return;
                }
            }
        }

        if (node->tag == tag_hr && (element->tag->model & CM_HEADING)) {
            ReportWarning(lexer, element, node, TAG_NOT_ALLOWED_IN);

            /*
              if the HR is the 1st thing in the heading
              then simply insert it before the heading element
            */
            if (element->content == null) {
                parent = element->parent;

                if (parent->content == element) {
                    parent->content = node;
                    node->prev = null;
                }
                else /* element isn't first node in parent's content */
                {
                    element->prev->next = node;
                    node->prev = element->prev;
                }

                node->parent = parent;
                node->next = element;
                element->prev = node;
                continue;
            }

            /*
              otherwise close the heading, insert the HR and
              then continue with a new heading element
            */
            /* NOTE(review): this assumes the heading is currently the
               last child of its parent (parent->last is overwritten and
               any previous element->next is lost) - confirm */
            element->next = node;
            node->prev = element;
            node->parent = element->parent;
            TrimSpace(lexer, element->last);
            element = CloneNode(lexer, element);
            element->prev = node;
            element->parent = node->parent;
            node->next = element;
            node->parent->last = element;
            continue;
        }

        /* block level tags end this element */
        if (!(node->tag->model & CM_INLINE)) {
            if (node->type != StartTag) {
                /* NOTE(review): token is dropped without FreeNode() -
                   possible leak */
                ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
                continue;
            }

            if (!(element->tag->model & CM_OPT))
                ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE);

            if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK)) {
                MoveToHead(lexer, element, node);
                continue;
            }

            /*
               prevent anchors from propagating into block tags
               except for headings h1 to h6
            */
            if (element->tag == tag_a) {
                if (node->tag && !(node->tag->model & CM_HEADING))
                    PopInline(lexer, element);
                else if (!(element->content)) {
                    DiscardElement(lexer, element);
                    UngetToken(lexer);
                    return;
                }
            }

            UngetToken(lexer);
            TrimTrailingSpace(lexer, element->last);
            TrimEmptyElement(lexer, element);
            return;
        }

        /* parse inline element */
        if (node->type == StartTag || node->type == StartEndTag) {
            if (node->implicit)
                ReportWarning(lexer, element, node, INSERTING_TAG);

            /* trim white space before <br> */
            if (node->tag == tag_br)
                TrimSpace(lexer, element->last);

            InsertNode(element, node);
            ParseTag(lexer, node, mode);
            continue;
        }

        /* discard unexpected tags */
        ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    /* end of input reached (node is null here) */
    if (!(element->tag->model & CM_OPT))
        ReportWarning(lexer, element, node, MISSING_ENDTAG_FOR);

    TrimEmptyElement(lexer, element);
}

/*
  Parse the content of a definition list.  Children are expected to be
  <dt> or <dd>; text and other block/inline content gets an inferred
  <dd> wrapped around it.  The `mode` argument is not used.
*/
void ParseDefList(Lexer *lexer, Node *list, uint mode)
{
    Node *node, *parent;

    if (list->tag->model & CM_EMPTY)
        return;

    lexer->insert = null;  /* defer implicit inline start tags */

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null) {
        if (node->tag == list->tag && node->type == EndTag) {
            FreeNode(node);
            TrimEmptyElement(lexer, list);
            return;
        }

        /* deal with comments */
        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag) {
            InsertNode(list, node);
            continue;
        }

        /* text directly inside the list: push it back and wrap it in
           an inferred <dd> */
        if (node->type == TextNode) {
            UngetToken(lexer);
            node = InferredTag(lexer, "dd");
            ReportWarning(lexer, list, node, MISSING_STARTTAG);
        }

        if (node->tag == null) {
            ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag) {
            if (node->tag == tag_form) {
                /* NOTE(review): token is dropped without FreeNode() -
                   possible leak */
                lexer->badForm = yes;
                ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED);
                continue;
            }

            for (parent = list->parent;
                 parent != null; parent = parent->parent) {
                if (node->tag == parent->tag) {
                    ReportWarning(lexer, list, node, MISSING_ENDTAG_BEFORE);
                    UngetToken(lexer);
                    TrimEmptyElement(lexer, list);
                    return;
                }
            }
        }

        if (!(node->tag == tag_dt || node->tag == tag_dd)) {
            UngetToken(lexer);

            if (!(node->tag->model & (CM_BLOCK | CM_INLINE))) {
                ReportWarning(lexer, list, node, TAG_NOT_ALLOWED_IN);
                TrimEmptyElement(lexer, list);
                return;
            }

            /* if DD appeared directly in BODY then exclude blocks */
            if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks) {
                TrimEmptyElement(lexer, list);
                return;
            }

            node = InferredTag(lexer, "dd");
            ReportWarning(lexer, list, node, MISSING_STARTTAG);
        }

        if (node->type == EndTag) {
            /* NOTE(review): token is dropped without FreeNode() -
               possible leak */
            ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED);
            continue;
        }

        /* node should be <DT> or <DD>*/
        InsertNode(list, node);
        ParseTag(lexer, node, IgnoreWhitespace);
    }

    /* end of input reached (node is null here) */
    ReportWarning(lexer, list, node, MISSING_ENDTAG_FOR);
    TrimEmptyElement(lexer, list);
}

/*
  Parse the content of a list element.  Children are expected to be
  <li>.  `first` stays yes until the first item has been inserted; text
  or non-list content seen before that turns the list into a
  <blockquote>, which is then parsed with ParseBlock().
*/
void ParseList(Lexer *lexer, Node *list, uint mode)
{
    Node *node, *parent;
    Bool first = yes;

    if (list->tag->model & CM_EMPTY)
        return;

    lexer->insert = null;  /* defer implicit inline start tags */

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null) {
        if (node->tag == list->tag && node->type == EndTag) {
            FreeNode(node);

            /*
              a bit of a hack, this code is made more complex
              by the fact that ReportWarning takes 2 nodes
              rather than a node and a string
            */
            if (MakeClean && (list->tag->model & CM_OBSOLETE)) {
                /* create a ul node for the error message */
                node = CloneNode(lexer, list);
                MemFree(node->element);
                node->element = wstrdup("ul");
                node->tag = tag_ul;
                ReportWarning(lexer, list, node, OBSOLETE_ELEMENT);
                node->parent = node->prev = node->next = null;
                FreeNode(node);  /* only used for error message */

                /* and coerce the obsolete list element to ul */
                MemFree(list->element);
                list->element = wstrdup("ul");
                list->tag = tag_ul;
            }

            TrimEmptyElement(lexer, list);
            return;
        }

        if (node->type == TextNode) {
            UngetToken(lexer);

            /*
              the illegal form <ul>some text</ul> is sometimes used
              to get an indent; map it to
              <blockquote class="indent">some text</blockquote>
            */
            if (first) {
                ReportWarning(lexer, list, node, TAG_NOT_ALLOWED_IN);
                MemFree(list->element);
                list->element = wstrdup("blockquote");
                list->was = list->tag;   /* lets ParseBlock accept the old end tag */
                list->tag = tag_blockquote;
                list->implicit = yes;
                ParseBlock(lexer, list, mode);
                return;
            }

            node = InferredTag(lexer, "li");
            ReportWarning(lexer, list, node, MISSING_STARTTAG);
        }

        /* deal with comments */
        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag) {
            InsertNode(list, node);
            continue;
        }

        if (node->tag == null) {
            ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag) {
            if (node->tag == tag_form) {
                /* NOTE(review): token is dropped without FreeNode() -
                   possible leak */
                lexer->badForm = yes;
                ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED);
                continue;
            }

            for (parent = list->parent;
                 parent != null; parent = parent->parent) {
                if (node->tag == parent->tag) {
                    ReportWarning(lexer, list, node, MISSING_ENDTAG_BEFORE);
                    UngetToken(lexer);
                    TrimEmptyElement(lexer, list);
                    return;
                }
            }
        }

        if (!(node->tag == tag_li)) {
            /*
              if node is <ul> or <ol> and we have at least one <li>
              then parse node and append to the previous <li>
            */
            if (node->type != EndTag &&
                (node->tag->parser == ParseList ||
                 node->tag->parser == ParseDefList)) {
                Node *blockquote;

                /* no <li> yet: hoist the nested list into an inferred
                   <blockquote> placed before this list */
                if (first) {
                    ReportWarning(lexer, list, node, TAG_NOT_ALLOWED_IN);
                    blockquote = InferredTag(lexer, "blockquote");
                    InsertNodeBeforeElement(list, blockquote);
                    InsertNode(blockquote, node);
                    ParseTag(lexer, node, mode);
                    continue;
                }

                ReportWarning(lexer, list, node, TAG_NOT_ALLOWED_IN);
                InsertNode(list->last, node);
                ParseTag(lexer, node, IgnoreWhitespace);
                continue;
            }

            UngetToken(lexer);

            if (!(node->tag->model & (CM_BLOCK | CM_INLINE))) {
                ReportWarning(lexer, list, node, TAG_NOT_ALLOWED_IN);
                TrimEmptyElement(lexer, list);
                return;
            }

            /* if LI appeared directly in BODY then exclude blocks */
            if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks) {
                TrimEmptyElement(lexer, list);
                return;
            }

            if (node->type == EndTag) {
                /* NOTE(review): the token was pushed back by the
                   UngetToken() above, so this `continue` presumably
                   re-reads the same end tag - confirm this cannot loop;
                   do NOT add FreeNode() here while the token is pushed
                   back */
                ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED);
                continue;
            }

            /*
              the illegal form <ul>some text</ul> is sometimes used
              to get an indent; map it to
              <blockquote class="indent">some text</blockquote>
            */
            if (first) {
                ReportWarning(lexer, list, node, TAG_NOT_ALLOWED_IN);
                MemFree(list->element);
                list->element = wstrdup("blockquote");
                list->was = list->tag;
                list->tag = tag_blockquote;
                list->implicit = yes;
                ParseBlock(lexer, list, mode);
                return;
            }

            node = InferredTag(lexer, "li");
            ReportWarning(lexer, list, node, MISSING_STARTTAG);
        }

        if (node->type == EndTag) {
            /* NOTE(review): token is dropped without FreeNode() -
               possible leak */
            ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED);
            continue;
        }

        /* node should be <LI> */
        InsertNode(list,node);
        ParseTag(lexer, node, IgnoreWhitespace);
        first = no;
    }

    /* end of input reached (node is null here) */
    ReportWarning(lexer, list, node, MISSING_ENDTAG_FOR);
    TrimEmptyElement(lexer, list);
}

/*
  unexpected content in table row is moved to just before
  the table in accordance with Netscape and IE. This code
  assumes that node hasn't been inserted into the row.
*/ void MoveBeforeTable(Node *row, Node *node) { Node *table; /* first find the table element */ for (table = row->parent; table; table = table->parent) { if (table->tag == tag_table) { node->prev = table->prev; node->next = table; table->prev = node; node->parent = table->parent; if (node->prev) node->prev->next = node; break; } } } void ParseRow(Lexer *lexer, Node *row, uint mode) { Node *node, *parent; Bool exclude_state; if (row->tag->model & CM_EMPTY) return; while ((node = GetToken(lexer, IgnoreWhitespace)) != null) { if (node->tag == row->tag) { if (node->type == EndTag) { FreeNode(node); TrimEmptyElement(lexer, row); return; } UngetToken(lexer); TrimEmptyElement(lexer, row); return; } /* if this is the end tag for an ancestor element then infer end tag for this element */ if (node->type == EndTag) { if (node->tag == tag_form) { lexer->badForm = yes; ReportWarning(lexer, row, node, DISCARDING_UNEXPECTED); continue; } if (node->tag == tag_td || node->tag == tag_th) { ReportWarning(lexer, row, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } for (parent = row->parent; parent != null; parent = parent->parent) { if (node->tag == parent->tag) { UngetToken(lexer); TrimEmptyElement(lexer, row); return; } } } /* deal with comments */ if (node->type == CommentTag || node->type == ProcInsTag || node->type == AspTag) { InsertNode(row, node); continue; } /* discard unknown tags */ if (node->tag == null && node->type != TextNode) { ReportWarning(lexer, row, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* discard unexpected <table> element */ if (node->tag == tag_table) { ReportWarning(lexer, row, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* THEAD, TFOOT or TBODY */ if (node->tag && (node->tag->model & CM_ROWGRP)) { UngetToken(lexer); TrimEmptyElement(lexer, row); return; } if (node->type == EndTag) { ReportWarning(lexer, row, node, DISCARDING_UNEXPECTED); continue; } if (!(node->tag == tag_td || node->tag == tag_th)) { 
MoveBeforeTable(row, node); ReportWarning(lexer, row, node, TAG_NOT_ALLOWED_IN); if (node->type != TextNode) ParseTag(lexer, node, IgnoreWhitespace); continue; #if 0 /* previous code inferred either </tr> or <td> */ if (!(node->tag->model & (CM_BLOCK | CM_INLINE))) { ReportWarning(lexer, row, node, MISSING_ENDTAG_BEFORE); TrimEmptyElement(lexer, row); return; } node = InferredTag(lexer, "td"); ReportWarning(lexer, row, node, MISSING_STARTTAG); #endif } /* node should be <TD> or <TH> */ InsertNode(row, node); exclude_state = lexer->excludeBlocks; lexer->excludeBlocks = no; ParseTag(lexer, node, IgnoreWhitespace); lexer->excludeBlocks = exclude_state; /* pop inline stack */ while (lexer->istacksize > lexer->istackbase) PopInline(lexer, null); } TrimEmptyElement(lexer, row); } void ParseRowGroup(Lexer *lexer, Node *rowgroup, uint mode) { Node *node, *parent; if (rowgroup->tag->model & CM_EMPTY) return; while ((node = GetToken(lexer, IgnoreWhitespace)) != null) { if (node->tag == rowgroup->tag) { if (node->type == EndTag) { TrimEmptyElement(lexer, rowgroup); FreeNode(node); return; } UngetToken(lexer); return; } /* if </table> infer end tag */ if (node->tag == tag_table && node->type == EndTag) { UngetToken(lexer); TrimEmptyElement(lexer, rowgroup); return; } /* discard unknown tags */ if (node->tag == null && node->type != TextNode) { ReportWarning(lexer, rowgroup, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* if TD or TH or text or inline or block then infer <TR> */ if (node->type == StartTag && (node->tag == tag_td || node->tag == tag_th || (node->tag->model & (CM_BLOCK | CM_INLINE))) || node->type == TextNode) { UngetToken(lexer); node = InferredTag(lexer, "tr"); ReportWarning(lexer, rowgroup, node, MISSING_STARTTAG); } if (node->type == CommentTag || node->type == ProcInsTag || node->type == AspTag) { InsertNode(rowgroup, node); continue; } /* if this is the end tag for ancestor element then infer end tag for this element */ if (node->type == 
EndTag) { if (node->tag == tag_form) { lexer->badForm = yes; ReportWarning(lexer, rowgroup, node, DISCARDING_UNEXPECTED); continue; } if (node->tag == tag_tr || node->tag == tag_td || node->tag == tag_th) { ReportWarning(lexer, rowgroup, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } for (parent = rowgroup->parent; parent != null; parent = parent->parent) { if (node->tag == parent->tag) { UngetToken(lexer); TrimEmptyElement(lexer, rowgroup); return; } } } /* if THEAD, TFOOT or TBODY then implied end tag */ if (node->tag->model & CM_ROWGRP) { if (node->type != EndTag) UngetToken(lexer); TrimEmptyElement(lexer, rowgroup); return; } if (node->type == EndTag) { ReportWarning(lexer, rowgroup, node, DISCARDING_UNEXPECTED); continue; } if (!(node->tag == tag_tr)) { node = InferredTag(lexer, "tr"); ReportWarning(lexer, rowgroup, node, MISSING_STARTTAG); UngetToken(lexer); } /* node should be <TR> */ InsertNode(rowgroup, node); ParseTag(lexer, node, IgnoreWhitespace); } TrimEmptyElement(lexer, rowgroup); }

/*
  Parse the content of a COLGROUP element.

  lexer    - token source and warning sink
  colgroup - element whose children are being collected
  mode     - unused; tokens are always read with IgnoreWhitespace

  Only COL start tags (plus comments, processing instructions and ASP
  blocks) are accepted as children.  Text, any other known tag, or the
  end tag of an ancestor is pushed back and ends the element.
*/
void ParseColGroup(Lexer *lexer, Node *colgroup, uint mode)
{
    Node *node, *parent;

    if (colgroup->tag->model & CM_EMPTY)
        return;

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        if (node->tag == colgroup->tag && node->type == EndTag)
        {
            FreeNode(node);
            return;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if (node->tag == tag_form)
            {
                lexer->badForm = yes;
                ReportWarning(lexer, colgroup, node, DISCARDING_UNEXPECTED);
                FreeNode(node);  /* token is dropped, so release it */
                continue;
            }

            for (parent = colgroup->parent; parent != null; parent = parent->parent)
            {
                if (node->tag == parent->tag)
                {
                    UngetToken(lexer);
                    return;
                }
            }
        }

        if (node->type == TextNode)
        {
            UngetToken(lexer);
            return;
        }

        /* deal with comments */
        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(colgroup, node);
            continue;
        }

        /* discard unknown tags */
        if (node->tag == null)
        {
            ReportWarning(lexer, colgroup, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        if (node->tag != tag_col)
        {
            UngetToken(lexer);
            return;
        }

        if (node->type == EndTag)
        {
            ReportWarning(lexer, colgroup, node, DISCARDING_UNEXPECTED);
            FreeNode(node);  /* token is dropped, so release it */
            continue;
        }

        /* node should be <COL> */
        InsertNode(colgroup, node);
        ParseTag(lexer, node, IgnoreWhitespace);
    }
}

/*
  Parse the content of a TABLE element.

  lexer - token source and warning sink
  table - element whose children are being collected
  mode  - unused; tokens are always read with IgnoreWhitespace

  lexer->istackbase is set to lexer->istacksize for the duration of the
  table and restored on every exit path.  Cells, text and block/inline
  elements found directly in the table cause a <TR> to be inferred.
*/
void ParseTableTag(Lexer *lexer, Node *table, uint mode)
{
    Node *node, *parent;
    uint istackbase;

    istackbase = lexer->istackbase;
    lexer->istackbase = lexer->istacksize;

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        if (node->tag == table->tag && node->type == EndTag)
        {
            FreeNode(node);
            lexer->istackbase = istackbase;
            TrimEmptyElement(lexer, table);
            return;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(table, node);
            continue;
        }

        /* discard unknown tags */
        if (node->tag == null && node->type != TextNode)
        {
            ReportWarning(lexer, table, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /* if TD or TH or text or inline or block then infer <TR> */
        if (node->type != EndTag)
        {
            if (node->tag == tag_td ||
                node->tag == tag_th ||
                node->type == TextNode ||
                (node->tag->model & (CM_BLOCK | CM_INLINE)))
            {
                UngetToken(lexer);
                node = InferredTag(lexer, "tr");
                ReportWarning(lexer, table, node, MISSING_STARTTAG);
            }
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if (node->tag == tag_form)
            {
                lexer->badForm = yes;
                ReportWarning(lexer, table, node, DISCARDING_UNEXPECTED);
                FreeNode(node);  /* token is dropped, so release it */
                continue;
            }

            if (node->tag && node->tag->model & (CM_TABLE | CM_ROW))
            {
                ReportWarning(lexer, table, node, DISCARDING_UNEXPECTED);
                FreeNode(node);
                continue;
            }

            for (parent = table->parent; parent != null; parent = parent->parent)
            {
                if (node->tag == parent->tag)
                {
                    ReportWarning(lexer, table, node, MISSING_ENDTAG_BEFORE);
                    UngetToken(lexer);
                    lexer->istackbase = istackbase;
                    TrimEmptyElement(lexer, table);
                    return;
                }
            }
        }

        if (!(node->tag->model & CM_TABLE))
        {
            UngetToken(lexer);
            ReportWarning(lexer, table, node, TAG_NOT_ALLOWED_IN);
            lexer->istackbase = istackbase;
            TrimEmptyElement(lexer, table);
            return;
        }

        if (node->type == StartTag || node->type == StartEndTag)
        {
            InsertNode(table, node);
            ParseTag(lexer, node, IgnoreWhitespace);
            continue;
        }

        /* discard unexpected text nodes and end tags */
        ReportWarning(lexer, table, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    /* ran out of input: node is null here */
    ReportWarning(lexer, table, node, MISSING_ENDTAG_FOR);
    TrimEmptyElement(lexer, table);
    lexer->istackbase = istackbase;
}

/*
  Parse the content of a PRE element (or an obsolete element that is
  coerced to PRE).

  lexer - token source and warning sink
  pre   - element whose children are being collected
  mode  - unused; tokens are always read in Preformatted mode

  A non-inline start tag ends the current <pre>: the new element is
  placed after it, followed by a fresh inferred <pre> which then
  becomes the element being filled.

  NOTE(review): TrimSpace is called both with `pre` and with
  `pre->last`; its definition is outside this section, so the calls
  are reproduced exactly as found.
*/
void ParsePre(Lexer *lexer, Node *pre, uint mode)
{
    Node *node, *parent;

    if (pre->tag->model & CM_EMPTY)
        return;

    /*
      a bit of a hack, this code is made more complex by the fact
      that ReportWarning takes 2 nodes rather than a node and a string
    */
    if (pre->tag->model & CM_OBSOLETE)
    {
        /* create a pre node for the error message */
        node = CloneNode(lexer, pre);
        MemFree(node->element);
        node->element = wstrdup("pre");
        node->tag = tag_pre;
        ReportWarning(lexer, pre, node, OBSOLETE_ELEMENT);

        /* unlink the clone before freeing it; only used for error message */
        node->parent = node->prev = node->next = null;
        FreeNode(node);

        /* and coerce the obsolete element to pre */
        MemFree(pre->element);
        pre->element = wstrdup("pre");
        pre->tag = tag_pre;
    }

    InlineDup(lexer, null); /* tell lexer to insert inlines if needed */

    while ((node = GetToken(lexer, Preformatted)) != null)
    {
        if (node->tag == pre->tag && node->type == EndTag)
        {
            FreeNode(node);
            TrimSpace(lexer, pre);
            TrimEmptyElement(lexer, pre);
            return;
        }

        if (node->tag == tag_html)
        {
            if (node->type == StartTag || node->type == StartEndTag)
                ReportWarning(lexer, pre, node, DISCARDING_UNEXPECTED);

            FreeNode(node);
            continue;
        }

        if (node->type == TextNode)
        {
            /* if first child, check for initial newline */
            if (pre->content == null)
            {
                if (lexer->lexbuf[node->start] == '\n')
                    ++(node->start);

                if (node->start >= node->end)
                {
                    FreeNode(node);
                    continue;
                }
            }

            InsertNode(pre, node);
            continue;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(pre, node);
            continue;
        }

        /* discard unknown and PARAM tags */
        if (node->tag == null || node->tag == tag_param)
        {
            ReportWarning(lexer, pre, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        if (node->tag == tag_p && node->type == StartTag)
        {
            ReportWarning(lexer, pre, node, USING_BR_INPLACE_OF);

            /* trim white space before <p> in <pre> */
            TrimSpace(lexer, pre->last);

            /* coerce <p> to <br> */
            node->tag = tag_br;
            MemFree(node->element);
            node->element = wstrdup("br");
            InsertNode(pre, node);
            continue;
        }

        if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
        {
            MoveToHead(lexer, pre, node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if (node->tag == tag_form)
            {
                lexer->badForm = yes;
                ReportWarning(lexer, pre, node, DISCARDING_UNEXPECTED);
                FreeNode(node);  /* token is dropped, so release it */
                continue;
            }

            for (parent = pre->parent; parent != null; parent = parent->parent)
            {
                if (node->tag == parent->tag)
                {
                    ReportWarning(lexer, pre, node, MISSING_ENDTAG_BEFORE);
                    UngetToken(lexer);
                    TrimSpace(lexer, pre);
                    TrimEmptyElement(lexer, pre);
                    return;
                }
            }
        }

        /* what about head content, HEAD, BODY tags etc? */
        if (!(node->tag->model & CM_INLINE))
        {
            if (node->type != StartTag)
            {
                ReportWarning(lexer, pre, node, DISCARDING_UNEXPECTED);
                FreeNode(node);  /* token is dropped, so release it */
                continue;
            }

            ReportWarning(lexer, pre, node, MISSING_ENDTAG_BEFORE);
            lexer->excludeBlocks = yes;

            /* check if we need to infer a container */
            if (node->tag->model & CM_LIST)
            {
                UngetToken(lexer);
                node = InferredTag(lexer, "ul");
            }
            else if (node->tag->model & CM_DEFLIST)
            {
                UngetToken(lexer);
                node = InferredTag(lexer, "dl");
            }
            else if (node->tag->model & CM_TABLE)
            {
                UngetToken(lexer);
                node = InferredTag(lexer, "table");
            }

            /* place the block after this pre and continue in a new pre */
            InsertNodeAfterElement(pre, node);
            pre = InferredTag(lexer, "pre");
            InsertNodeAfterElement(node, pre);
            ParseTag(lexer, node, IgnoreWhitespace);
            lexer->excludeBlocks = no;
            continue;
        }

        if (node->type == StartTag || node->type == StartEndTag)
        {
            /* trim white space before <br> */
            if (node->tag == tag_br)
                TrimSpace(lexer, pre->last);

            InsertNode(pre, node);
            ParseTag(lexer, node, Preformatted);
            continue;
        }

        /* discard unexpected tags */
        ReportWarning(lexer, pre, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    /* ran out of input: node is null here */
    ReportWarning(lexer, pre, node, MISSING_ENDTAG_FOR);
    TrimEmptyElement(lexer, pre);
}

/*
  Parse the content of an OPTGROUP element.

  lexer - token source and warning sink
  field - the optgroup element
  mode  - unused

  Accepts OPTION and (with a CANT_BE_NESTED warning) OPTGROUP start
  tags; every other tag is reported and discarded.
*/
void ParseOptGroup(Lexer *lexer, Node *field, uint mode)
{
    Node *node;

    lexer->insert = null;  /* defer implicit inline start tags */

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        if (node->tag == field->tag && node->type == EndTag)
        {
            FreeNode(node);
            TrimSpace(lexer, field->last);
            return;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(field, node);
            continue;
        }

        if (node->type == StartTag &&
            (node->tag == tag_option || node->tag == tag_optgroup))
        {
            if (node->tag == tag_optgroup)
                ReportWarning(lexer, field, node, CANT_BE_NESTED);

            InsertNode(field, node);
            ParseTag(lexer, node, MixedContent);
            continue;
        }

        /* discard unexpected tags */
        ReportWarning(lexer, field, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }
}

/*
  Parse the content of a SELECT element.

  lexer - token source and warning sink
  field - the select element
  mode  - unused

  Accepts OPTION and OPTGROUP start tags; every other tag is reported
  and discarded.  Reaching end of input reports MISSING_ENDTAG_FOR.
*/
void ParseSelect(Lexer *lexer, Node *field, uint mode)
{
    Node *node;

    lexer->insert = null;  /* defer implicit inline start tags */

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        if (node->tag == field->tag && node->type == EndTag)
        {
            FreeNode(node);
            TrimSpace(lexer, field->last);
            return;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(field, node);
            continue;
        }

        if (node->type == StartTag &&
            (node->tag == tag_option || node->tag == tag_optgroup))
        {
            InsertNode(field, node);
            ParseTag(lexer, node, IgnoreWhitespace);
            continue;
        }

        /* discard unexpected tags */
        ReportWarning(lexer, field, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    /* ran out of input: node is null here */
    ReportWarning(lexer, field, node, MISSING_ENDTAG_FOR);
}

/*
  Parse an element whose content is text only.

  lexer - token source and warning sink
  field - element being filled
  mode  - unused; tokens are read in Preformatted mode

  FONT tags are discarded; any other tag is pushed back and ends the
  element.  Missing-end-tag warnings are suppressed when the element's
  model has CM_OPT set.
*/
void ParseText(Lexer *lexer, Node *field, uint mode)
{
    Node *node;

    lexer->insert = null;  /* defer implicit inline start tags */

    while ((node = GetToken(lexer, Preformatted)) != null)
    {
        if (node->tag == field->tag && node->type == EndTag)
        {
            FreeNode(node);
            TrimSpace(lexer, field->last);
            return;
        }

        if (node->type == TextNode ||
            node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(field, node);
            continue;
        }

        if (node->tag == tag_font)
        {
            ReportWarning(lexer, field, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /* terminate element on other tags */
        if (!(field->tag->model & CM_OPT))
            ReportWarning(lexer, field, node, MISSING_ENDTAG_BEFORE);

        UngetToken(lexer);
        TrimSpace(lexer, field->last);
        return;
    }

    /* ran out of input: node is null here */
    if (!(field->tag->model & CM_OPT))
        ReportWarning(lexer, field, node, MISSING_ENDTAG_FOR);
}

/*
  Parse the content of a TITLE element.

  lexer - token source and warning sink
  title - the title element
  mode  - unused; tokens are read in MixedContent mode

  Leading space is trimmed from the first text node, empty text nodes
  are dropped, and any known tag other than </title> is pushed back
  with a MISSING_ENDTAG_BEFORE warning.
*/
void ParseTitle(Lexer *lexer, Node *title, uint mode)
{
    Node *node;

    while ((node = GetToken(lexer, MixedContent)) != null)
    {
        if (node->tag == title->tag && node->type == EndTag)
        {
            FreeNode(node);
            TrimSpace(lexer, title->last);
            return;
        }

        if (node->type == TextNode)
        {
            /* only called for 1st child */
            if (title->content == null)
                TrimInitialSpace(lexer, title, node);

            if (node->start >= node->end)
            {
                FreeNode(node);
                continue;
            }

            InsertNode(title, node);
            continue;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(title, node);
            continue;
        }

        /* discard unknown tags */
        if (node->tag == null)
        {
            ReportWarning(lexer, title, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /* pushback unexpected tokens */
        ReportWarning(lexer, title, node, MISSING_ENDTAG_BEFORE);
        UngetToken(lexer);
        TrimSpace(lexer, title->last);
        return;
    }

    /* ran out of input: node is null here */
    ReportWarning(lexer, title, node, MISSING_ENDTAG_FOR);
}

/*
  Parse the content of a SCRIPT element: whatever GetCDATA returns is
  appended as the element's single child.

  This isn't quite right for CDATA content as it recognises
  tags within the content and parses them accordingly.
  This will unfortunately screw up scripts which include
  < + letter,  < + !, < + ?  or  < + / + letter
*/
void ParseScript(Lexer *lexer, Node *script, uint mode)
{
    Node *node;

    node = GetCDATA(lexer, script);

    if (node)
        InsertNode(script, node);
}

/*
  Report whether a node looks like a JavaScript script element.

  Returns yes when the node has no attributes at all, or when a
  "language" or "type" attribute (name compared case-insensitively)
  has a value for which wsubstr(value, "javascript") is true;
  otherwise no.
*/
Bool IsJavaScript(Node *node)
{
    AttVal *attr;

    if (node->attributes == null)
        return yes;

    for (attr = node->attributes; attr; attr = attr->next)
    {
        if ((wstrcasecmp(attr->attribute, "language") == 0 ||
             wstrcasecmp(attr->attribute, "type") == 0) &&
            wsubstr(attr->value, "javascript"))
            return yes;
    }

    return no;
}

/*
  Parse the content of the HEAD element.

  lexer - token source and warning sink
  head  - the head element
  mode  - unused; tokens are always read with IgnoreWhitespace

  Stops (pushing the token back) at the first text node or the first
  known tag whose model lacks CM_HEAD.  If no TITLE was seen, a warning
  is issued and an inferred <title> is appended.
*/
void ParseHead(Lexer *lexer, Node *head, uint mode)
{
    Node *node;
    Bool HasTitle = no;

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        if (node->tag == head->tag && node->type == EndTag)
        {
            FreeNode(node);
            break;
        }

        if (node->type == TextNode)
        {
            UngetToken(lexer);
            break;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(head, node);
            continue;
        }

        if (node->type == DocTypeTag)
        {
            InsertDocType(lexer, head, node);
            continue;
        }

        /* discard unknown tags */
        if (node->tag == null)
        {
            ReportWarning(lexer, head, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        if (!(node->tag->model & CM_HEAD))
        {
            UngetToken(lexer);
            break;
        }

        if (node->type == StartTag || node->type == StartEndTag)
        {
            if (node->tag == tag_title)
                HasTitle = yes;
            else if (node->tag == tag_noscript)
                ReportWarning(lexer, head, node, TAG_NOT_ALLOWED_IN);

            InsertNode(head, node);
            ParseTag(lexer, node, IgnoreWhitespace);
            continue;
        }

        /* discard unexpected text nodes and end tags */
        ReportWarning(lexer, head, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    if (!HasTitle)
    {
        ReportWarning(lexer, head, null, MISSING_TITLE_ELEMENT);
        InsertNode(head, InferredTag(lexer, "title"));
    }
}

/*
  Parse the content of the BODY element.

  lexer - token source and warning sink
  body  - the body element
  mode  - ignored on entry; reset to IgnoreWhitespace and then switched
          between IgnoreWhitespace and MixedContent as content is seen

  Parsing continues past </body> (recording it in SeenBodyEndTag and
  warning once about following content) unless the body's parent is
  NOFRAMES.  Returns early, with the token pushed back, for a tag that
  is neither block, inline, html-, head-, list-, deflist-, table- nor
  field-level.
*/
void ParseBody(Lexer *lexer, Node *body, uint mode)
{
    Node *node;
    Bool checkstack, iswhitenode;

    mode = IgnoreWhitespace;
    checkstack = yes;

    while ((node = GetToken(lexer, mode)) != null)
    {
        if (node->tag == body->tag && node->type == EndTag)
        {
            TrimSpace(lexer, body->last);
            FreeNode(node);
            SeenBodyEndTag = 1;
            mode = IgnoreWhitespace;

            if (body->parent->tag == tag_noframes)
                break;

            continue;
        }

        if (node->tag == tag_html)
        {
            if (node->type == StartTag || node->type == StartEndTag)
                ReportWarning(lexer, body, node, DISCARDING_UNEXPECTED);

            FreeNode(node);
            continue;
        }

        /* a text node holding at most a single space */
        iswhitenode = no;

        if (node->type == TextNode &&
            node->end <= node->start + 1 &&
            lexer->lexbuf[node->start] == ' ')
            iswhitenode = yes;

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(body, node);
            continue;
        }

        /* warn once about real content following </body> */
        if (SeenBodyEndTag == 1 && !iswhitenode)
        {
            ++SeenBodyEndTag;
            ReportWarning(lexer, body, node, CONTENT_AFTER_BODY);
        }

        /* mixed content model permits text */
        if (node->type == TextNode)
        {
            if (iswhitenode && mode == IgnoreWhitespace)
            {
                FreeNode(node);
                continue;
            }

            if (checkstack)
            {
                checkstack = no;

                if (InlineDup(lexer, node) > 0)
                    continue;
            }

            InsertNode(body, node);
            mode = MixedContent;
            continue;
        }

        if (node->type == DocTypeTag)
        {
            InsertDocType(lexer, body, node);
            continue;
        }

        /* discard unknown and PARAM tags */
        if (node->tag == null || node->tag == tag_param)
        {
            ReportWarning(lexer, body, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /*
          Netscape allows LI and DD directly in BODY
          We infer UL or DL respectively and use this
          Bool to exclude block-level elements so as
          to match Netscape's observed behaviour.
        */
        lexer->excludeBlocks = no;

        if (!(node->tag->model & CM_BLOCK) &&
            !(node->tag->model & CM_INLINE))
        {
            /*
              avoid this error message being issued twice
              (the mask test must be parenthesised: unary ! binds
              tighter than &, so "!model & CM_HEAD" was always 0)
            */
            if (!(node->tag->model & CM_HEAD))
                ReportWarning(lexer, body, node, TAG_NOT_ALLOWED_IN);

            if (node->tag->model & CM_HTML)
            {
                /* copy body attributes if current body was inferred */
                if (node->tag == tag_body && body->implicit &&
                    body->attributes == null)
                {
                    body->attributes = node->attributes;
                    node->attributes = null;
                }

                FreeNode(node);
                continue;
            }

            if (node->tag->model & CM_HEAD)
            {
                MoveToHead(lexer, body, node);
                continue;
            }

            if (node->tag->model & CM_LIST)
            {
                UngetToken(lexer);
                node = InferredTag(lexer, "ul");
                lexer->excludeBlocks = yes;
            }
            else if (node->tag->model & CM_DEFLIST)
            {
                UngetToken(lexer);
                node = InferredTag(lexer, "dl");
                lexer->excludeBlocks = yes;
            }
            else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
            {
                UngetToken(lexer);
                node = InferredTag(lexer, "table");
                lexer->excludeBlocks = yes;
            }
            else
            {
                /* same precedence fix as above */
                if (!(node->tag->model & (CM_ROW | CM_FIELD)))
                {
                    UngetToken(lexer);
                    return;
                }

                /* ignore </td> </th> <option> etc. */
                FreeNode(node);
                continue;
            }
        }

        if (node->type == StartTag || node->type == StartEndTag)
        {
            if ((node->tag->model & CM_INLINE) && !(node->tag->model & CM_MIXED))
            {
                if (checkstack && !node->implicit)
                {
                    checkstack = no;

                    if (InlineDup(lexer, node) > 0)
                        continue;
                }

                mode = MixedContent;
            }
            else
            {
                checkstack = yes;
                mode = IgnoreWhitespace;
            }

            if (node->implicit)
                ReportWarning(lexer, body, node, INSERTING_TAG);

            InsertNode(body, node);
            ParseTag(lexer, node, mode);
            continue;
        }
        else if (node->type == EndTag)
            PopInline(lexer, node);   /* if inline end tag */

        /* discard unexpected end tags */
        ReportWarning(lexer, body, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }
}

/*
  Parse the content of a NOFRAMES element.

  lexer    - token source and warning sink
  noframes - the noframes element
  mode     - ignored on entry; tokens are read with IgnoreWhitespace

  Sets USING_NOFRAMES in lexer->badAccess.  A <body> start tag is
  parsed as a child; any other start tag or text is pushed back and
  wrapped in an inferred <body>.  Remaining tokens (end tags and the
  like) are reported and discarded.
*/
void ParseNoFrames(Lexer *lexer, Node *noframes, uint mode)
{
    Node *node;

    lexer->badAccess |= USING_NOFRAMES;
    mode = IgnoreWhitespace;

    while ((node = GetToken(lexer, mode)) != null)
    {
        if (node->tag == noframes->tag && node->type == EndTag)
        {
            FreeNode(node);
            TrimSpace(lexer, noframes->last);
            return;
        }

        if (node->tag == tag_html)
        {
            if (node->type == StartTag || node->type == StartEndTag)
                ReportWarning(lexer, noframes, node, DISCARDING_UNEXPECTED);

            FreeNode(node);
            continue;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(noframes, node);
            continue;
        }

        if (node->tag == tag_body && node->type == StartTag)
        {
            InsertNode(noframes, node);
            ParseTag(lexer, node, IgnoreWhitespace);
            continue;
        }

        /* other content: let an inferred body deal with it */
        if (node->type == StartTag ||
            node->type == StartEndTag ||
            node->type == TextNode)
        {
            UngetToken(lexer);
            node = InferredTag(lexer, "body");
            ReportWarning(lexer, noframes, node, INSERTING_TAG);
            InsertNode(noframes, node);
            ParseTag(lexer, node, IgnoreWhitespace);
            continue;
        }

        /* discard unexpected end tags */
        ReportWarning(lexer, noframes, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    /* ran out of input: node is null here */
    ReportWarning(lexer, noframes, node, MISSING_ENDTAG_FOR);
}

/*
  Parse the content of a FRAMESET element.

  lexer    - token source and warning sink
  frameset - the frameset element
  mode     - unused; tokens are always read with IgnoreWhitespace

  Sets USING_FRAMES in lexer->badAccess.  Only start tags whose model
  has CM_FRAMES are accepted; a BODY tag is pushed back and replaced
  by an inferred <noframes>.
*/
void ParseFrameSet(Lexer *lexer, Node *frameset, uint mode)
{
    Node *node;

    lexer->badAccess |= USING_FRAMES;

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        if (node->tag == frameset->tag && node->type == EndTag)
        {
            FreeNode(node);
            TrimSpace(lexer, frameset->last);
            return;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(frameset, node);
            continue;
        }

        if (node->tag == null)
        {
            ReportWarning(lexer, frameset, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        if (node->tag == tag_body)
        {
            UngetToken(lexer);
            node = InferredTag(lexer, "noframes");
            ReportWarning(lexer, frameset, node, INSERTING_TAG);
        }

        if (node->type == StartTag && node->tag->model & CM_FRAMES)
        {
            InsertNode(frameset, node);
            lexer->excludeBlocks = no;
            ParseTag(lexer, node, MixedContent);
            continue;
        }

        /* discard unexpected tags */
        ReportWarning(lexer, frameset, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    /* ran out of input: node is null here */
    ReportWarning(lexer, frameset, node, MISSING_ENDTAG_FOR);
}

/*
  Parse the content of the HTML element: a HEAD (inferred if absent)
  followed by either a BODY or FRAMESET/NOFRAMES content.

  lexer - token source and warning sink
  html  - the html element
  mode  - passed through to ParseHead and ParseTag

  Resets the XmlTags and SeenBodyEndTag globals.  In a frameset
  document, body content is redirected into a single NOFRAMES element
  inside the first frameset; otherwise NOFRAMES is discarded in favour
  of an inferred BODY.
*/
void ParseHTML(Lexer *lexer, Node *html, uint mode)
{
    Node *node, *head;
    Node *frameset = null;
    Node *noframes = null;

    XmlTags = no;
    SeenBodyEndTag = 0;

    /* locate or infer the head element */
    for (;;)
    {
        node = GetToken(lexer, IgnoreWhitespace);

        if (node == null)
        {
            node = InferredTag(lexer, "head");
            break;
        }

        if (node->tag == tag_head)
            break;

        if (node->tag == html->tag && node->type == EndTag)
        {
            ReportWarning(lexer, html, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(html, node);
            continue;
        }

        UngetToken(lexer);
        node = InferredTag(lexer, "head");
        break;
    }

    head = node;
    InsertNode(html, head);
    ParseHead(lexer, head, mode);

    for (;;)
    {
        node = GetToken(lexer, IgnoreWhitespace);

        if (node == null)
        {
            /*
              End of input.  For a non-frameset document create an
              empty body and attach it to the tree; previously the
              inferred node was created but never inserted.
            */
            if (frameset == null)
                InsertNode(html, InferredTag(lexer, "body"));

            return;
        }

        /* robustly handle html tags */
        if (node->tag == html->tag)
        {
            if (node->type != StartTag && frameset == null)
                ReportWarning(lexer, html, node, DISCARDING_UNEXPECTED);

            FreeNode(node);
            continue;
        }

        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag)
        {
            InsertNode(html, node);
            continue;
        }

        /* if frameset document coerce <body> to <noframes> */
        if (node->tag == tag_body)
        {
            if (node->type != StartTag)
            {
                ReportWarning(lexer, html, node, DISCARDING_UNEXPECTED);
                FreeNode(node);
                continue;
            }

            if (frameset != null)
            {
                UngetToken(lexer);

                if (noframes == null)
                {
                    noframes = InferredTag(lexer, "noframes");
                    InsertNode(frameset, noframes);
                    ReportWarning(lexer, html, noframes, INSERTING_TAG);
                }

                ParseTag(lexer, noframes, mode);
                continue;
            }

            break;  /* to parse body */
        }

        /* flag an error if we see more than one frameset */
        if (node->tag == tag_frameset)
        {
            if (node->type != StartTag)
            {
                ReportWarning(lexer, html, node, DISCARDING_UNEXPECTED);
                FreeNode(node);
                continue;
            }

            if (frameset != null)
                ReportError(lexer, html, node, DUPLICATE_FRAMESET);
            else
                frameset = node;

            InsertNode(html, node);
            ParseTag(lexer, node, mode);

            /*
              see if it includes a noframes element so
              that we can merge subsequent noframes elements
            */
            for (node = frameset->content; node; node = node->next)
            {
                if (node->tag == tag_noframes)
                    noframes = node;
            }

            continue;
        }

        /* if not a frameset document coerce <noframes> to <body> */
        if (node->tag == tag_noframes)
        {
            if (node->type != StartTag)
            {
                ReportWarning(lexer, html, node, DISCARDING_UNEXPECTED);
                FreeNode(node);
                continue;
            }

            if (frameset == null)
            {
                ReportWarning(lexer, html, node, DISCARDING_UNEXPECTED);
                FreeNode(node);
                node = InferredTag(lexer, "body");
                break;
            }

            if (noframes == null)
            {
                noframes = node;
                InsertNode(frameset, noframes);
            }
            else
                FreeNode(node);

            ParseTag(lexer, noframes, mode);
            continue;
        }

        UngetToken(lexer);

        /* insert other content into noframes element */
        if (frameset)
        {
            if (noframes == null)
            {
                noframes = InferredTag(lexer, "noframes");
                InsertNode(frameset, noframes);
            }
            else
                ReportWarning(lexer, html, node, NOFRAMES_CONTENT);

            ParseTag(lexer, noframes, mode);
            continue;
        }

        node = InferredTag(lexer, "body");
        break;
    }

    /* node must be body */
    InsertNode(html, node);
    ParseTag(lexer, node, mode);
}

/*
  HTML is the top level element.

  Builds and returns a new root node for an HTML document.  Comments,
  processing instructions, ASP blocks and doctypes that precede the
  html element become children of the root; stray end tags are
  discarded.  The first other token starts the html element (inferred
  unless it is an <html> start tag), which is parsed by ParseHTML.
*/
Node *ParseDocument(Lexer *lexer)
{
    Node *node, *document, *html;

    document = NewNode();
    document->type = RootNode;

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        if (node->type == CommentTag ||
            node->type == ProcInsTag ||
            node->type == AspTag ||
            node->type == DocTypeTag)
        {
            InsertNode(document, node);
            continue;
        }

        if (node->type == EndTag)
        {
            /*
              No enclosing element to report against.  The original
              passed the node-type constant RootNode in this pointer
              slot; pass null explicitly, as ParseXMLDocument does.
            */
            ReportWarning(lexer, null, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        if (node->type != StartTag || node->tag != tag_html)
        {
            UngetToken(lexer);
            html = InferredTag(lexer, "html");
        }
        else
            html = node;

        InsertNode(document, html);
        ParseHTML(lexer, html, no);
        break;
    }

    return document;
}

/*
  Decide whether white space inside an XML element must be preserved.

  An explicit xml:space attribute wins: yes only for the value
  "preserve".  Without it, elements named pre, script, style or
  xsl:text (case-insensitive) preserve white space.
*/
Bool XMLPreserveWhiteSpace(Node *element)
{
    AttVal *attribute;

    /* search attributes for xml:space */
    for (attribute = element->attributes; attribute; attribute = attribute->next)
    {
        if (wstrcmp(attribute->attribute, "xml:space") == 0)
        {
            if (wstrcmp(attribute->value, "preserve") == 0)
                return yes;

            return no;
        }
    }

    /* kludge for html docs without explicit xml:space attribute */
    if (wstrcasecmp(element->element, "pre") == 0 ||
        wstrcasecmp(element->element, "script") == 0 ||
        wstrcasecmp(element->element, "style") == 0)
        return yes;

    /* kludge for XSL docs */
    if (wstrcasecmp(element->element, "xsl:text") == 0)
        return yes;

    return no;
}

/*
  XML documents

  Recursively parse the content of a generic XML element until its
  matching end tag (element names compared case-sensitively).

  lexer   - token source and warning sink
  element - element being filled
  mode    - white space mode; forced to Preformatted when
            XMLPreserveWhiteSpace(element) is true

  Unless in Preformatted mode, one leading space is trimmed from a
  first-child text node and one trailing space from a last-child text
  node; a text node emptied this way is discarded.
*/
void ParseXMLElement(Lexer *lexer, Node *element, uint mode)
{
    Node *node;

    /* Jeff Young's kludge for XSL docs */
    if (wstrcasecmp(element->element, "xsl:text") == 0)
        return;

    /* if node is pre or has xml:space="preserve" then do so */
    if (XMLPreserveWhiteSpace(element))
        mode = Preformatted;

    while ((node = GetToken(lexer, mode)) != null)
    {
        if (node->type == EndTag &&
            wstrcmp(node->element, element->element) == 0)
        {
            FreeNode(node);
            break;
        }

        /* discard unexpected end tags */
        if (node->type == EndTag)
        {
            ReportWarning(lexer, element, node, UNEXPECTED_ENDTAG);
            FreeNode(node);
            continue;
        }

        /* parse content on seeing start tag */
        if (node->type == StartTag)
            ParseXMLElement(lexer, node, mode);

        InsertNode(element, node);
    }

    /*
      if first child is text then trim initial space and
      delete text node if it is empty.
    */
    node = element->content;

    if (node && node->type == TextNode && mode != Preformatted)
    {
        if (lexer->lexbuf[node->start] == ' ')
        {
            node->start++;

            if (node->start >= node->end)
                DiscardElement(lexer, node);
        }
    }

    /*
      if last child is text then trim final space and
      delete the text node if it is empty
    */
    node = element->last;

    if (node && node->type == TextNode && mode != Preformatted)
    {
        if (lexer->lexbuf[node->end - 1] == ' ')
        {
            node->end--;

            if (node->start >= node->end)
                DiscardElement(lexer, node);
        }
    }
}

/*
  Build and return a new root node for a generic XML document.

  Sets the XmlTags global.  Stray end tags are discarded; every other
  token becomes a child of the root, with start tags parsed
  recursively by ParseXMLElement.  Afterwards the doctype is dropped
  when doctype_mode is doctype_omit, and FixXMLPI is called when
  XmlPi is set.
*/
Node *ParseXMLDocument(Lexer *lexer)
{
    Node *node, *document;

    document = NewNode();
    document->type = RootNode;
    XmlTags = yes;

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        /* discard unexpected end tags */
        if (node->type == EndTag)
        {
            ReportWarning(lexer, null, node, UNEXPECTED_ENDTAG);
            FreeNode(node);
            continue;
        }

        /* if start tag then parse element's content */
        if (node->type == StartTag)
            ParseXMLElement(lexer, node, IgnoreWhitespace);

        /*
          Use InsertNode so that parent, prev and document->last are
          maintained; the hand-rolled linking only set next/content,
          and DiscardElement dereferences element->parent.
        */
        InsertNode(document, node);
    }

    if (doctype_mode == doctype_omit)
        DiscardDocType(document);

    /* ensure presence of initial <?XML version="1.0"?> */
    if (XmlPi)
        FixXMLPI(lexer, document);

    return document;
}