home *** CD-ROM | disk | FTP | other *** search
- /* html.h
-
- (c) 1998 (W3C) MIT, INRIA, Keio University
- See tidy.c for the copyright notice.
- */
-
- /* indentation modes */
-
- #define NO_INDENT 0
- #define BLOCK_INDENT 1
- #define SMART_INDENT 2
-
- /* character encodings */
-
- #define RAW 0
- #define ASCII 1
- #define LATIN1 2
- #define UTF8 3
- #define ISO2022 4
-
- /* states for ISO 2022
-
- A document in an ISO-2022 based encoding uses ESC sequences called
- "designators" to switch character sets. The designators defined and
- used in ISO-2022-JP are:
-
- "ESC" + "(" + ? for ISO646 variants
-
- "ESC" + "$" + ? and
- "ESC" + "$" + "(" + ? for multibyte character sets
- */
- #define FSM_ASCII 0
- #define FSM_ESC 1
- #define FSM_ESCD 2
- #define FSM_ESCDP 3
- #define FSM_ESCP 4
- #define FSM_NONASCII 5
-
- /* lexer char types */
-
- #define digit 1
- #define letter 2
- #define namechar 4
- #define white 8
- #define newline 16
- #define lowercase 32
- #define uppercase 64
-
- /* lexer GetToken states */
-
- #define LEX_CONTENT 0
- #define LEX_GT 1
- #define LEX_ENDTAG 2
- #define LEX_STARTTAG 3
- #define LEX_COMMENT 4
- #define LEX_DOCTYPE 5
- #define LEX_PROCINSTR 6
- #define LEX_ENDCOMMENT 7
- #define LEX_ASP 8
-
- /* content model shortcut encoding */
-
- #define CM_UNKNOWN 0
- #define CM_EMPTY (1 << 0)
- #define CM_HTML (1 << 1)
- #define CM_HEAD (1 << 2)
- #define CM_BLOCK (1 << 3)
- #define CM_INLINE (1 << 4)
- #define CM_LIST (1 << 5)
- #define CM_DEFLIST (1 << 6)
- #define CM_TABLE (1 << 7)
- #define CM_ROWGRP (1 << 8)
- #define CM_ROW (1 << 9)
- #define CM_FIELD (1 << 10)
- #define CM_OBJECT (1 << 11)
- #define CM_PARAM (1 << 12)
- #define CM_FRAMES (1 << 13)
- #define CM_HEADING (1 << 14)
- #define CM_OPT (1 << 15)
- #define CM_IMG (1 << 16)
- #define CM_MIXED (1 << 17)
- #define CM_NO_INDENT (1 << 18)
- #define CM_OBSOLETE (1 << 19)
- #define CM_NEW (1 << 20)
-
- /*
- Linked list of class names and styles.
-
- The lexer keeps the head of such a list in its "styles" member,
- which is described there as being used for cleaning up
- presentation markup.
- */
- struct _style
- {
- char *tag; /* presumably the element name the style applies to -- TODO confirm */
- char *tag_class; /* presumably the class name paired with that element -- TODO confirm */
- char *properties; /* presumably the style property text for the class -- TODO confirm */
- struct _style *next; /* next entry in the list */
- };
-
- typedef struct _style Style;
-
- /*
- Linked list of style properties.
-
- Each node holds one name/value pair and a link to the next node.
- */
- struct _styleprop
- {
- char *name; /* property name */
- char *value; /* property value */
- struct _styleprop *next; /* next property in the list */
- };
-
- typedef struct _styleprop StyleProp;
-
- /* mode controlling treatment of doctype; the configuration variable
- doctype_mode declared later in this header has this type */
- typedef enum
- {
- doctype_omit, /* presumably: emit no doctype -- TODO confirm */
- doctype_auto, /* presumably: choose the doctype from the content -- TODO confirm */
- doctype_strict, /* presumably: use the strict doctype -- TODO confirm */
- doctype_loose, /* presumably: use the loose doctype -- TODO confirm */
- doctype_user /* presumably: use the user specified doctype_str -- TODO confirm */
- } DocTypeMode;
-
- /*
- Attribute/Value linked list node.
-
- The "attributes" members of struct _node and struct _inode
- point at nodes of this type.
- */
-
- struct _attval
- {
- struct _attval *next; /* next attribute in the list */
- struct _attribute *dict; /* presumably the matching attribute dictionary entry -- TODO confirm */
- struct _node *asp; /* NOTE(review): a node, presumably of type AspTag given the name -- confirm */
- int delim; /* presumably the quote character that delimited the value -- TODO confirm */
- char *attribute; /* attribute name */
- char *value; /* attribute value */
- };
-
- typedef struct _attval AttVal;
-
- /*
- node->type is one of these values
- */
- #define RootNode 0
- #define DocTypeTag 1
- #define CommentTag 2
- #define ProcInsTag 3
- #define TextNode 4
- #define StartTag 5
- #define EndTag 6
- #define StartEndTag 7
- #define AspTag 8
-
- struct _node /* parse tree node; "type" holds one of RootNode ... AspTag */
- {
- struct _node *parent; /* presumably the enclosing node -- TODO confirm */
- struct _node *prev; /* presumably the previous sibling -- TODO confirm */
- struct _node *next; /* presumably the next sibling -- TODO confirm */
- struct _node *last; /* presumably the last child -- TODO confirm */
- uint start; /* start of span onto text array */
- uint end; /* end of span onto text array */
- uint type; /* TextNode, StartTag, EndTag etc. */
- Bool implicit; /* true if inferred */
- struct _tagdict *was; /* old tag when it was changed */
- struct _tagdict *tag; /* tag's dictionary definition */
- char *element; /* name (null for text nodes) */
- struct _attval *attributes; /* head of the Attribute/Value list */
- struct _node *content; /* presumably the first child -- TODO confirm */
- };
-
- typedef struct _node Node;
-
- /*
-
- If the document uses just HTML 2.0 tags and attributes describe it as HTML 2.0.
- Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. If there are proprietary
- tags and attributes then describe it as HTML Proprietary. If it includes the
- xml-lang or xmlns attributes but is otherwise HTML 2.0, 3.2 or 4.0 then describe
- it as one of the flavors of Voyager (strict, loose or frameset).
- */
-
- #define VERS_UNKNOWN 0
-
- #define VERS_HTML20 1
- #define VERS_HTML32 2
- #define VERS_HTML40_STRICT 4
- #define VERS_HTML40_LOOSE 8
- #define VERS_FRAMES 16
- #define VERS_XML 32
-
- #define VERS_NETSCAPE 64
- #define VERS_MICROSOFT 128
- #define VERS_SUN 256
-
- #define VERS_MALFORMED 512
-
- #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMES)
- #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMES)
- #define VERS_LOOSE (VERS_HTML32|VERS_HTML40_LOOSE|VERS_FRAMES)
- #define VERS_IFRAMES (VERS_HTML40_LOOSE|VERS_FRAMES)
- #define VERS_FROM32 (VERS_HTML40_STRICT|VERS_LOOSE)
- #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
-
- #define VERS_EVERYTHING (VERS_ALL|VERS_PROPRIETARY)
-
- /*
- Mosaic handles inlines via a separate stack from other elements.
- We duplicate this to recover from inline markup errors such as:
-
- <i>italic text
- <p>more italic text</b> normal text
-
- which for compatibility with Mosaic is mapped to:
-
- <i>italic text</i>
- <p><i>more italic text</i> normal text
-
- Note that any inline end tag pops the effect of the current
- inline start tag, so that </b> pops <i> in the above example.
- */
-
- struct _inode
- {
- struct _inode *next; /* presumably the next stack entry -- TODO confirm */
- struct _tagdict *tag; /* tag's dictionary definition */
- char *element; /* name (null for text nodes) */
- struct _attval *attributes; /* head of the Attribute/Value list */
- };
-
- typedef struct _inode IStack;
- typedef struct _lexer Lexer;
-
- /* tidy.c */
- #define EndOfStream EOF
-
- /* non-raw input is cleaned up */
- typedef struct
- {
- int state; /* FSM for ISO2022 */
- Bool pushed; /* presumably set while a character given to UngetChar is pending -- TODO confirm */
- int c; /* presumably the pushed back character -- TODO confirm */
- int tabs; /* NOTE(review): looks related to tab expansion (see tabsize) -- confirm */
- int lastcol; /* presumably the column before the last character read -- TODO confirm */
- int curcol; /* presumably the current column -- TODO confirm */
- int curline; /* presumably the current line -- TODO confirm */
- int encoding; /* presumably one of RAW, ASCII, LATIN1, UTF8, ISO2022 -- TODO confirm */
- FILE *file; /* stdio stream; presumably the fp given to OpenInput -- TODO confirm */
- Lexer *lexer; /* needed for error reporting */
- } StreamIn;
-
- StreamIn *OpenInput(FILE *fp);
- int ReadChar(StreamIn *in);
- void UngetChar(int c, StreamIn *in);
-
-
- /*
- The following are private to the lexer.
- Use NewLexer(in), where in is a StreamIn, to create a lexer, and
- FreeLexer(lexer) to free it.
- */
-
- struct _lexer
- {
- StreamIn *in; /* file stream */
- FILE *errout; /* error output stream */
- uint badAccess; /* for accessibility errors */
- uint badLayout; /* for bad style errors */
- uint badChars; /* for bad char encodings */
- uint badForm; /* for mismatched/mispositioned form tags */
- uint warnings; /* count of warnings in this document */
- uint errors; /* count of errors */
- uint lines; /* lines seen */
- uint columns; /* at start of current token */
- Bool waswhite; /* used to collapse contiguous white space */
- Bool pushed; /* true after token has been pushed back */
- Bool insertspace; /* when space is moved after end tag */
- Bool excludeBlocks; /* Netscape compatibility */
- Bool isvoyager; /* true if xmlns attribute on html element */
- uint versions; /* bit vector of HTML versions */
- int doctype; /* version as given by doctype (if any) */
- Bool bad_doctype; /* e.g. if html or PUBLIC is missing */
- uint txtstart; /* start of current node */
- uint txtend; /* end of current node */
- uint state; /* state of lexer's finite state machine */
- struct _node *token; /* NOTE(review): presumably the current token, kept so UngetToken can push it back -- confirm */
-
- /*
- lexer character buffer
-
- parse tree nodes span onto this buffer
- which contains the concatenated text
- contents of all of the elements.
-
- lexsize must be reset for each file.
- */
- char *lexbuf; /* char buffer */
- uint lexlength; /* allocated */
- uint lexsize; /* used */
-
- /* Inline stack for compatibility with Mosaic */
- Node *inode; /* for deferring text node */
- IStack *insert; /* for inferring inline tags */
- IStack *istack; /* presumably the stack storage sized by istacklength/istacksize -- TODO confirm */
- uint istacklength; /* allocated */
- uint istacksize; /* used */
- uint istackbase; /* start of frame */
-
- Style *styles; /* used for cleaning up presentation markup */
- };
-
- typedef void (Parser)(Lexer *lexer, Node *node, uint mode);
- typedef void (CheckAttribs)(Lexer *lexer, Node *node);
-
- /* declaration for methods that check attribute values */
- typedef void (AttrCheck)(Lexer *lexer, Node *node, AttVal *attval);
-
- struct _attribute /* attribute definition; the "dict" member of struct _attval points at one */
- {
- struct _attribute *next; /* presumably the next entry in a lookup chain -- TODO confirm */
- char *name; /* attribute name */
- Bool nowrap; /* NOTE(review): presumably tells the printer not to wrap the value -- confirm */
- unsigned versions; /* presumably a bit vector of VERS_* flags -- TODO confirm */
- AttrCheck *attrchk; /* method that checks the attribute value */
- };
-
- typedef struct _attribute Attribute;
-
- /* well known attributes */
- extern Attribute *attr_href;
- extern Attribute *attr_src;
- extern Attribute *attr_id;
- extern Attribute *attr_name;
- extern Attribute *attr_summary;
- extern Attribute *attr_alt;
- extern Attribute *attr_longdesc;
- extern Attribute *attr_title;
-
- /*
- Tag dictionary node.
-
- The "tag" and "was" members of struct _node and the "tag" member
- of struct _inode point at entries of this type, as do the extern
- tag_* variables declared in this header.
- */
-
- struct _tagdict
- {
- struct _tagdict *next; /* presumably the next entry in a lookup chain -- TODO confirm */
- char *name; /* tag name */
- uint versions; /* presumably a bit vector of VERS_* flags -- TODO confirm */
- uint model; /* presumably a combination of the CM_* content model flags -- TODO confirm */
- Parser *parser; /* Parser method for the tag */
- CheckAttribs *chkattrs; /* attribute checking method for the tag */
- };
-
- typedef struct _tagdict Dict;
-
- /* modes for GetToken() */
- #define IgnoreWhitespace 0
- #define MixedContent 1
- #define Preformatted 2
- #define IgnoreMarkup 3
-
- void FatalError(char *msg);
-
- Node *GetToken(Lexer *lexer, uint mode);
-
- /* one level unget only */
- void UngetToken(Lexer *lexer);
-
- /* create lexer for a file stream */
- Lexer *NewLexer(StreamIn *in);
-
- /* delete lexer */
- void FreeLexer(Lexer *lexer);
-
- Bool EndOfInput(Lexer *lexer);
-
- /* used for script or style */
- Node *GetCDATA(Lexer *lexer, Node *container);
-
- /* use this to create node for inferred start tag */
- Node *InferredTag(Lexer *lexer, char *name);
-
- /* Parser calls this to create RootNode */
- Node *NewNode(void);
-
- void FreeAttrs(Node *node);
-
- /* use this to free parse tree node and all its children */
- void FreeNode(Node *node);
-
- /* used to clone heading nodes when split by an <HR> */
- Node *CloneNode(Lexer *lexer, Node *element);
-
- /* lexer char map - must be initialized */
- void InitMap(void);
-
- void AddCharToLexer(Lexer *lexer, uint c);
- void AddStringLiteral(Lexer *lexer, char *str);
- Node *TextToken(Lexer *lexer);
-
- /* used by pretty printer for tag names */
- char FoldCase(char c, Bool tocaps);
-
- Bool IsLetter(uint c);
- Bool IsDigit(uint c);
- Bool IsWhite(uint c);
-
- /* used to fixup doctype to match contents */
- Node *FindDocType(Node *root);
- void DiscardDocType(Node *root);
- Bool FixDocType(Lexer *lexer, Node *node);
- char *HTMLVersionName(Lexer *lexer);
- Bool FixXMLPI(Lexer *lexer, Node *root);
- Bool SetXHTMLDocType(Lexer *lexer, Node *root);
-
- /* used to detect faulty attributes */
- Bool IsValidAttrName( char *attr);
-
- /* parser.c */
- Node *ParseDocument(Lexer *lexer);
- Node *ParseXMLDocument(Lexer *lexer);
- Bool XMLPreserveWhiteSpace(Node *element);
- void DiscardElement(Lexer *lexer, Node *element);
- Bool IsJavaScript(Node *node);
-
- /* attrs.c */
- void InitAttrs(void);
- void FreeAttrTable(void);
- Attribute *FindAttribute(AttVal *attval);
- Attribute *CheckAttribute(Lexer *lexer, Node *node, AttVal *attval);
- Bool IsUrl(char *attrname);
- Bool IsScript(char *attrname);
-
- /* istack.c */
- void PushInline(Lexer *lexer, Node *node);
- void PopInline(Lexer *lexer, Node *node);
- Bool IsPushed(Lexer *lexer, Node *node);
- int InlineDup(Lexer *lexer, Node *node);
- Node *InsertedToken(Lexer *lexer);
- AttVal *DupAttrs(AttVal *attrs);
- void InsertNode(Node *element, Node *node);
-
- /* clean.c */
- void FreeStyles(Lexer *lexer);
- void CleanTree(Lexer *lexer, Node *node);
- void EmFromI(Node *node);
-
- /* entities.c */
- void InitEntities(void);
- void FreeEntities(void);
- uint EntityCode(char *name);
- char *EntityName(uint n);
-
- /* tags.c */
- void DefineEmptyTag(char *name);
- void DefineInlineTag(char *name);
- void DefineBlockTag(char *name);
-
- Bool FindTag(Node *node);
- void InitTags(void);
- void FreeTags(void);
- int HTMLVersion(Lexer *lexer);
-
- /* localize.c -- used for all message text */
-
- void NeedsAuthorIntervention(FILE *errout);
- void MissingBody(FILE *errout);
- void ReportNumberOfSlides(FILE *errout, int count);
- void GeneralInfo(FILE *errout);
- void HelloMessage(FILE *errout, char *date, char *filename);
- void ReportVersion(FILE *errout, char *filename, char *vers);
- void ReportNumWarnings(FILE *errout, Lexer *lexer);
-
- /* pprint.c */
- void FreePrintBuf(void);
- void PPrintTree(Out *out, uint mode, uint indent,
- Lexer *lexer, Node *node);
- void PPrintXMLTree(Out *fout, uint mode, uint indent,
- Lexer *lexer, Node *node);
- void PFlushLine(Out *out, uint indent);
- void PCondFlushLine(Out *out, uint indent);
- void PPrintAsp(Out *fout, uint indent, Lexer *lexer, Node *node);
-
- /* tidy.c */
- #define EndOfStream EOF /* NOTE(review): the same definition appears earlier in this header */
-
- void *MemAlloc(uint size);
- void *MemRealloc(void *mem, uint newsize);
- void MemFree(void *mem);
- int ReadChar(StreamIn *in); /* NOTE(review): repeats the declaration that follows StreamIn */
- void UngetChar(int c, StreamIn *in); /* NOTE(review): repeats the declaration that follows StreamIn */
-
- /* string functions */
- char *wstrdup(char *str);
- char *wstrndup(char *str, int len);
- void wstrncpy(char *s1, char *s2, int size);
- int wstrcmp(char *s1, char *s2);
- int wstrcasecmp(char *s1, char *s2);
- int wstrncmp(char *s1, char *s2, int n);
- int wstrncasecmp(char *s1, char *s2, int n);
- int wstrlen(char *str);
- Bool wsubstr(char *s1, char *s2);
- void ClearMemory(void *, uint size);
-
- #define uprintf fprintf
-
- /* error codes for entities */
-
- #define MISSING_SEMICOLON 1
- #define UNKNOWN_ENTITY 2
- #define UNESCAPED_AMPERSAND 3
-
- /* error codes for element messages */
-
- #define MISSING_ENDTAG_FOR 1
- #define MISSING_ENDTAG_BEFORE 2
- #define DISCARDING_UNEXPECTED 3
- #define NON_MATCHING_ENDTAG 4
- #define TAG_NOT_ALLOWED_IN 5
- #define MISSING_STARTTAG 6
- #define UNEXPECTED_ENDTAG 7
- #define USING_BR_INPLACE_OF 8
- #define INSERTING_TAG 9
- #define SUSPECTED_MISSING_QUOTE 10
- #define MISSING_TITLE_ELEMENT 11
- #define DUPLICATE_FRAMESET 12
- #define CANT_BE_NESTED 13
- #define OBSOLETE_ELEMENT 14
- #define PROPRIETARY_ELEMENT 15
- #define UNKNOWN_ELEMENT 16
- #define TRIM_EMPTY_ELEMENT 17
- #define FORCED_END_ANCHOR 18
- #define ILLEGAL_NESTING 19
- #define NOFRAMES_CONTENT 20
- #define CONTENT_AFTER_BODY 21
- #define INCONSISTENT_VERSION 22
- #define BAD_COMMENT 23
- #define BAD_CDATA_CONTENT 24
- #define INCONSISTENT_NAMESPACE 25
- #define DOCTYPE_AFTER_TAGS 26
- #define MALFORMED_DOCTYPE 27
- #define UNEXPECTED_END_OF_FILE 28
-
- /* error codes used for attribute messages */
-
- #define UNKNOWN_ATTRIBUTE 1
- #define MISSING_ATTRIBUTE 2
- #define MISSING_ATTR_VALUE 3
- #define BAD_ATTRIBUTE_VALUE 4
- #define UNEXPECTED_GT 5
- #define PROPRIETARY_ATTR_VALUE 6
- #define MISSING_IMAGEMAP 7
- #define XML_ATTRIBUTE_VALUE 8
- #define UNEXPECTED_QUOTEMARK 9
-
- /* accessibility flaws */
-
- #define MISSING_IMAGE_ALT 1
- #define MISSING_LINK_ALT 2
- #define MISSING_SUMMARY 4
- #define MISSING_IMAGE_MAP 8
- #define USING_FRAMES 16
- #define USING_NOFRAMES 32
-
- /* presentation flaws */
-
- #define USING_SPACER 1
- #define USING_LAYER 2
- #define USING_NOBR 4
- #define USING_FONT 8
-
- /* character encoding errors */
- #define WINDOWS_CHARS 1
- #define NON_ASCII 2
- #define FOUND_UTF16 4
-
- void HelpText(FILE *errout, char *prog);
- void GeneralInfo(FILE *errout); /* NOTE(review): also declared in the localize.c group earlier in this header */
- void UnknownOption(FILE *errout, char c);
- void UnknownFile(FILE *errout, char *program, char *file);
- void ErrorSummary(Lexer *lexer);
- void ReportEncodingError(Lexer *lexer, uint code, uint c); /* code: presumably a character encoding error value (WINDOWS_CHARS etc.) -- TODO confirm */
- void ReportEntityError(Lexer *lexer, uint code, char *entity, int c); /* code: presumably one of the entity error codes -- TODO confirm */
- void ReportAttrError(Lexer *lexer, Node *node, char *attr, uint code); /* code: presumably one of the attribute message codes -- TODO confirm */
- void ReportWarning(Lexer *lexer, Node *element, Node *node, uint code); /* code: presumably one of the element message codes -- TODO confirm */
- void ReportError(Lexer *lexer, Node *element, Node *node, uint code); /* code: presumably one of the element message codes -- TODO confirm */
-
- /* slide maker functions */
- Node *FindBody(Node *node);
-
- /* counts number of h1 children belonging to node */
- int CountSlides(Node *node);
- void PPrintSlide(Out *fout, uint mode, uint indent, Lexer *lexer);
- void CreateSlides(Lexer *lexer, Node *root);
-
- /* config parameters, see config.c for defaults */
-
- void InitConfig(void);
- void FreeConfig(void);
- void ParseConfigFile(char *file);
- void AdjustConfig(void);
-
- extern uint spaces; /* default indentation */
- extern uint wraplen; /* default wrap margin */
- extern int CharEncoding;
- extern int tabsize;
-
- extern DocTypeMode doctype_mode; /* see doctype property */
- extern char *doctype_str; /* user specified doctype */
- extern char *slide_style; /* style sheet for slides */
-
- extern char *errfile; /* file name to write errors to */
- extern Bool writeback; /* if true then output tidied markup */
-
- extern Bool OnlyErrors; /* if true normal output is suppressed */
- extern Bool ShowWarnings; /* errors are always shown */
- extern Bool IndentContent;
- extern Bool SmartIndent;
- extern Bool HideEndTags;
- extern Bool XmlTags;
- extern Bool XmlOut;
- extern Bool xHTML;
- extern Bool XmlPi; /* add <?xml?> */
- extern Bool XmlPIs; /* assume PIs end with ?> as per XML */
- extern Bool RawOut;
- extern Bool UpperCaseTags;
- extern Bool UpperCaseAttrs;
- extern Bool MakeClean;
- extern Bool LogicalEmphasis;
- extern Bool DropFontTags;
- extern Bool BurstSlides;
- extern Bool BreakBeforeBR;
- extern Bool NumEntities;
- extern Bool QuoteMarks;
- extern Bool QuoteNbsp;
- extern Bool QuoteAmpersand;
- extern Bool WrapScriptlets;
- extern Bool WrapAsp;
- extern Bool FixBackslash;
- extern Bool IndentAttributes;
-
- /* Parser methods for tags */
-
- Parser ParseHTML;
- Parser ParseHead;
- Parser ParseTitle;
- Parser ParseScript;
- Parser ParseFrameSet;
- Parser ParseNoFrames;
- Parser ParseBody;
- Parser ParsePre;
- Parser ParseList;
- Parser ParseLI;
- Parser ParseDefList;
- Parser ParseBlock;
- Parser ParseInline;
- Parser ParseTableTag;
- Parser ParseColGroup;
- Parser ParseRowGroup;
- Parser ParseRow;
- Parser ParseSelect;
- Parser ParseOptGroup;
- Parser ParseText;
- Parser ParseObject;
- Parser ParseMap;
-
- /* Attribute checking methods */
-
- CheckAttribs CheckIMG;
- CheckAttribs CheckAREA;
- CheckAttribs CheckTABLE;
- CheckAttribs CheckCaption;
- CheckAttribs CheckSCRIPT;
- CheckAttribs CheckHTML;
-
- /* used to control printing of null attributes */
- Bool IsBoolAttribute(AttVal *attval);
-
- extern Dict *tag_html;
- extern Dict *tag_head;
- extern Dict *tag_body;
- extern Dict *tag_frameset;
- extern Dict *tag_noframes;
- extern Dict *tag_title;
- extern Dict *tag_hr;
- extern Dict *tag_pre;
- extern Dict *tag_listing;
- extern Dict *tag_h1;
- extern Dict *tag_h2;
- extern Dict *tag_p;
- extern Dict *tag_ul;
- extern Dict *tag_ol;
- extern Dict *tag_dir;
- extern Dict *tag_li;
- extern Dict *tag_dt;
- extern Dict *tag_dd;
- extern Dict *tag_dl;
- extern Dict *tag_td;
- extern Dict *tag_th;
- extern Dict *tag_tr;
- extern Dict *tag_col;
- extern Dict *tag_br;
- extern Dict *tag_a;
- extern Dict *tag_link;
- extern Dict *tag_b;
- extern Dict *tag_i;
- extern Dict *tag_strong;
- extern Dict *tag_em;
- extern Dict *tag_param;
- extern Dict *tag_option;
- extern Dict *tag_optgroup;
- extern Dict *tag_map;
- extern Dict *tag_area;
- extern Dict *tag_nobr;
- extern Dict *tag_wbr;
- extern Dict *tag_layer;
- extern Dict *tag_center;
- extern Dict *tag_spacer;
- extern Dict *tag_font;
- extern Dict *tag_style;
- extern Dict *tag_script;
- extern Dict *tag_noscript;
- extern Dict *tag_table;
- extern Dict *tag_caption;
- extern Dict *tag_form;
- extern Dict *tag_blockquote;
- extern Dict *tag_div;
- extern Dict *tag_span;
-