Module HTMLparser from libxml2
Action against software patents | Gnome2 Logo | W3C Logo | Red Hat Logo
Made with Libxml2 Logo

Module HTMLparser from libxml2

API Menu
API Indexes
Related links

This module implements an HTML 4.0 non-verifying parser with an API compatible with that of the XML parser. It should be able to parse "real world" HTML, even if severely broken from a specification point of view.

Table of Contents

#define htmlDefaultSubelement
#define htmlElementAllowedHereDesc
#define htmlRequiredAttrs
Typedef xmlDocPtr htmlDocPtr
Structure htmlElemDesc
struct _htmlElemDesc
Typedef htmlElemDesc * htmlElemDescPtr
Structure htmlEntityDesc
struct _htmlEntityDesc
Typedef htmlEntityDesc * htmlEntityDescPtr
Typedef xmlNodePtr htmlNodePtr
Typedef xmlParserCtxt htmlParserCtxt
Typedef xmlParserCtxtPtr htmlParserCtxtPtr
Typedef xmlParserInput htmlParserInput
Typedef xmlParserInputPtr htmlParserInputPtr
Typedef xmlParserNodeInfo htmlParserNodeInfo
Enum htmlParserOption
Typedef xmlSAXHandler htmlSAXHandler
Typedef xmlSAXHandlerPtr htmlSAXHandlerPtr
Enum htmlStatus
int	UTF8ToHtml			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)
htmlStatus	htmlAttrAllowed		(const htmlElemDesc * elt, 
const xmlChar * attr,
int legacy)
int	htmlAutoCloseTag		(htmlDocPtr doc, 
const xmlChar * name,
htmlNodePtr elem)
htmlParserCtxtPtr	htmlCreateMemoryParserCtxt	(const char * buffer, 
int size)
htmlParserCtxtPtr	htmlCreatePushParserCtxt	(htmlSAXHandlerPtr sax, 
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc)
htmlDocPtr	htmlCtxtReadDoc		(htmlParserCtxtPtr ctxt, 
const xmlChar * cur,
const char * URL,
const char * encoding,
int options)
htmlDocPtr	htmlCtxtReadFd		(htmlParserCtxtPtr ctxt, 
int fd,
const char * URL,
const char * encoding,
int options)
htmlDocPtr	htmlCtxtReadFile	(htmlParserCtxtPtr ctxt, 
const char * filename,
const char * encoding,
int options)
htmlDocPtr	htmlCtxtReadIO		(htmlParserCtxtPtr ctxt, 
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
htmlDocPtr	htmlCtxtReadMemory	(htmlParserCtxtPtr ctxt, 
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)
void	htmlCtxtReset			(htmlParserCtxtPtr ctxt)
int	htmlCtxtUseOptions		(htmlParserCtxtPtr ctxt, 
int options)
int	htmlElementAllowedHere		(const htmlElemDesc * parent, 
const xmlChar * elt)
htmlStatus	htmlElementStatusHere	(const htmlElemDesc * parent, 
const htmlElemDesc * elt)
int	htmlEncodeEntities		(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar)
const htmlEntityDesc *	htmlEntityLookup	(const xmlChar * name)
const htmlEntityDesc *	htmlEntityValueLookup	(unsigned int value)
void	htmlFreeParserCtxt		(htmlParserCtxtPtr ctxt)
int	htmlHandleOmittedElem		(int val)
int	htmlIsAutoClosed		(htmlDocPtr doc, 
htmlNodePtr elem)
int	htmlIsScriptAttribute		(const xmlChar * name)
htmlParserCtxtPtr	htmlNewParserCtxt	(void)
htmlStatus	htmlNodeStatus		(const htmlNodePtr node, 
int legacy)
int	htmlParseCharRef		(htmlParserCtxtPtr ctxt)
int	htmlParseChunk			(htmlParserCtxtPtr ctxt, 
const char * chunk,
int size,
int terminate)
htmlDocPtr	htmlParseDoc		(xmlChar * cur, 
const char * encoding)
int	htmlParseDocument		(htmlParserCtxtPtr ctxt)
void	htmlParseElement		(htmlParserCtxtPtr ctxt)
const htmlEntityDesc *	htmlParseEntityRef	(htmlParserCtxtPtr ctxt, 
const xmlChar ** str)
htmlDocPtr	htmlParseFile		(const char * filename, 
const char * encoding)
htmlDocPtr	htmlReadDoc		(const xmlChar * cur, 
const char * URL,
const char * encoding,
int options)
htmlDocPtr	htmlReadFd		(int fd, 
const char * URL,
const char * encoding,
int options)
htmlDocPtr	htmlReadFile		(const char * filename, 
const char * encoding,
int options)
htmlDocPtr	htmlReadIO		(xmlInputReadCallback ioread, 
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
htmlDocPtr	htmlReadMemory		(const char * buffer, 
int size,
const char * URL,
const char * encoding,
int options)
htmlDocPtr	htmlSAXParseDoc		(xmlChar * cur, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
htmlDocPtr	htmlSAXParseFile	(const char * filename, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
const htmlElemDesc *	htmlTagLookup	(const xmlChar * tag)

Description

Macro: htmlDefaultSubelement

#define htmlDefaultSubelement

Returns the default subelement for this element

Macro: htmlElementAllowedHereDesc

#define htmlElementAllowedHereDesc

Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise.

Macro: htmlRequiredAttrs

#define htmlRequiredAttrs

Returns the attributes required for the specified element.

Structure htmlElemDesc

Structure htmlElemDesc
struct _htmlElemDesc {
    const char * name : The tag name
    char startTag : Whether the start tag can be implied
    char endTag : Whether the end tag can be implied
    char saveEndTag : Whether the end tag should be saved
    char empty : Is this an empty element?
    char depr : Is this a deprecated element?
    char dtd : 1: only in Loose DTD, 2: only in Frameset DTD
    char isinline : is this a block (0) or inline (1) element
    const char * desc : the description
    (NRK Jan. 2003: new fields encapsulating HTML structure)
    const char ** subelts : allowed sub-elements of this element
    const char * defaultsubelt : subelement for suggested auto-repair if necessary, or NULL
    const char ** attrs_opt : Optional Attributes
    const char ** attrs_depr : Additional deprecated attributes
    const char ** attrs_req : Required attributes
}

Structure htmlEntityDesc

Structure htmlEntityDesc
struct _htmlEntityDesc {
    unsigned int value : the UNICODE value for the character
    const char * name : The entity name
    const char * desc : the description
}

Enum htmlParserOption

Enum htmlParserOption {
    HTML_PARSE_RECOVER = 1 : Relaxed parsing
    HTML_PARSE_NODEFDTD = 4 : do not default a doctype if not found
    HTML_PARSE_NOERROR = 32 : suppress error reports
    HTML_PARSE_NOWARNING = 64 : suppress warning reports
    HTML_PARSE_PEDANTIC = 128 : pedantic error reporting
    HTML_PARSE_NOBLANKS = 256 : remove blank nodes
    HTML_PARSE_NONET = 2048 : Forbid network access
    HTML_PARSE_NOIMPLIED = 8192 : Do not add implied html/body... elements
    HTML_PARSE_COMPACT = 65536 : compact small text nodes
    HTML_PARSE_IGNORE_ENC = 2097152 : ignore internal document encoding hint
}

Enum htmlStatus

Enum htmlStatus {
    HTML_NA = 0 : something we don't check at all
    HTML_INVALID = 1
    HTML_DEPRECATED = 2
    HTML_VALID = 4
    HTML_REQUIRED = 12 : VALID bit set so ( & HTML_VALID ) is TRUE
}

Function: UTF8ToHtml

int	UTF8ToHtml			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

out:a pointer to an array of bytes to store the result
outlen:the length of @out
in:a pointer to an array of UTF-8 chars
inlen:the length of @in
Returns:0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is non-negative, else unpredictable. The value of @outlen after return is the number of octets produced.

Function: htmlAttrAllowed

htmlStatus	htmlAttrAllowed		(const htmlElemDesc * elt, 
const xmlChar * attr,
int legacy)

Checks whether an attribute is valid for an element. Has full knowledge of Required and Deprecated attributes.

elt:HTML element
attr:HTML attribute
legacy:whether to allow deprecated attributes
Returns:one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID

Function: htmlAutoCloseTag

int	htmlAutoCloseTag		(htmlDocPtr doc, 
const xmlChar * name,
htmlNodePtr elem)

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.

doc:the HTML document
name:The tag name
elem:the HTML element
Returns:1 if autoclose, 0 otherwise

Function: htmlCreateMemoryParserCtxt

htmlParserCtxtPtr	htmlCreateMemoryParserCtxt	(const char * buffer, 
int size)

Create a parser context for an HTML in-memory document.

buffer:a pointer to a char array
size:the size of the array
Returns:the new parser context or NULL

Function: htmlCreatePushParserCtxt

htmlParserCtxtPtr	htmlCreatePushParserCtxt	(htmlSAXHandlerPtr sax, 
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc)

Create a parser context for using the HTML parser in push mode. The value of @filename is used for fetching external entities and for error/warning reports.

sax:a SAX handler
user_data:The user data returned on SAX callbacks
chunk:a pointer to an array of chars
size:number of chars in the array
filename:an optional file name or URI
enc:an optional encoding
Returns:the new parser context or NULL

Function: htmlCtxtReadDoc

htmlDocPtr	htmlCtxtReadDoc		(htmlParserCtxtPtr ctxt, 
const xmlChar * cur,
const char * URL,
const char * encoding,
int options)

Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.

ctxt:an HTML parser context
cur:a pointer to a zero terminated string
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlCtxtReadFd

htmlDocPtr	htmlCtxtReadFd		(htmlParserCtxtPtr ctxt, 
int fd,
const char * URL,
const char * encoding,
int options)

Parse an HTML document from a file descriptor and build a tree. This reuses the existing @ctxt parser context.

ctxt:an HTML parser context
fd:an open file descriptor
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlCtxtReadFile

htmlDocPtr	htmlCtxtReadFile	(htmlParserCtxtPtr ctxt, 
const char * filename,
const char * encoding,
int options)

Parse an HTML file from the filesystem or the network. This reuses the existing @ctxt parser context.

ctxt:an HTML parser context
filename:a file or URL
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlCtxtReadIO

htmlDocPtr	htmlCtxtReadIO		(htmlParserCtxtPtr ctxt, 
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)

parse an HTML document from I/O functions and source and build a tree. This reuses the existing @ctxt parser context

ctxt:an HTML parser context
ioread:an I/O read function
ioclose:an I/O close function
ioctx:an I/O handler
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlCtxtReadMemory

htmlDocPtr	htmlCtxtReadMemory	(htmlParserCtxtPtr ctxt, 
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)

Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.

ctxt:an HTML parser context
buffer:a pointer to a char array
size:the size of the array
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlCtxtReset

void	htmlCtxtReset			(htmlParserCtxtPtr ctxt)

Reset a parser context

ctxt:an HTML parser context

Function: htmlCtxtUseOptions

int	htmlCtxtUseOptions		(htmlParserCtxtPtr ctxt, 
int options)

Applies the options to the parser context

ctxt:an HTML parser context
options:a combination of htmlParserOption(s)
Returns:0 in case of success, the set of unknown or unimplemented options in case of error.

Function: htmlElementAllowedHere

int	htmlElementAllowedHere		(const htmlElemDesc * parent, 
const xmlChar * elt)

Checks whether an HTML element may be a direct child of a parent element. Note - doesn't check for deprecated elements

parent:HTML parent element
elt:HTML element
Returns:1 if allowed; 0 otherwise.

Function: htmlElementStatusHere

htmlStatus	htmlElementStatusHere	(const htmlElemDesc * parent, 
const htmlElemDesc * elt)

Checks whether an HTML element may be a direct child of a parent element and, if so, whether it is valid or deprecated.

parent:HTML parent element
elt:HTML element
Returns:one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID

Function: htmlEncodeEntities

int	htmlEncodeEntities		(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar)

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

out:a pointer to an array of bytes to store the result
outlen:the length of @out
in:a pointer to an array of UTF-8 chars
inlen:the length of @in
quoteChar:the quote character to escape (' or ") or zero.
Returns:0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is non-negative, else unpredictable. The value of @outlen after return is the number of octets produced.

Function: htmlEntityLookup

const htmlEntityDesc *	htmlEntityLookup	(const xmlChar * name)

Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.

name:the entity name
Returns:the associated htmlEntityDescPtr if found, NULL otherwise.

Function: htmlEntityValueLookup

const htmlEntityDesc *	htmlEntityValueLookup	(unsigned int value)

Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.

value:the entity's unicode value
Returns:the associated htmlEntityDescPtr if found, NULL otherwise.

Function: htmlFreeParserCtxt

void	htmlFreeParserCtxt		(htmlParserCtxtPtr ctxt)

Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.

ctxt:an HTML parser context

Function: htmlHandleOmittedElem

int	htmlHandleOmittedElem		(int val)

Set and return the previous value for handling HTML omitted tags.

val:int 0 or 1
Returns:the previous value: 0 for no handling, 1 for auto insertion.

Function: htmlIsAutoClosed

int	htmlIsAutoClosed		(htmlDocPtr doc, 
htmlNodePtr elem)

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if a tag is autoclosed by one of its children.

doc:the HTML document
elem:the HTML element
Returns:1 if autoclosed, 0 otherwise

Function: htmlIsScriptAttribute

int	htmlIsScriptAttribute		(const xmlChar * name)

Check if an attribute is of content type Script

name:an attribute name
Returns:1 if the attribute is a script, 0 otherwise

Function: htmlNewParserCtxt

htmlParserCtxtPtr	htmlNewParserCtxt	(void)

Allocate and initialize a new parser context.

Returns:the htmlParserCtxtPtr or NULL in case of allocation error

Function: htmlNodeStatus

htmlStatus	htmlNodeStatus		(const htmlNodePtr node, 
int legacy)

Checks whether the tree node is valid. Experimental (the author only uses the HTML enhancements in a SAX parser)

node:an htmlNodePtr in a tree
legacy:whether to allow deprecated elements (YES is faster here for Element nodes)
Returns:for Element nodes, a return from htmlElementAllowedHere (if legacy allowed) or htmlElementStatusHere (otherwise); for Attribute nodes, a return from htmlAttrAllowed; for other nodes, HTML_NA (no checks performed)

Function: htmlParseCharRef

int	htmlParseCharRef		(htmlParserCtxtPtr ctxt)

parse Reference declarations [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'

ctxt:an HTML parser context
Returns:the value parsed (as an int)

Function: htmlParseChunk

int	htmlParseChunk			(htmlParserCtxtPtr ctxt, 
const char * chunk,
int size,
int terminate)

Parse a Chunk of memory

ctxt:an HTML parser context
chunk:a char array
size:the size in bytes of the chunk
terminate:last chunk indicator
Returns:zero if no error, the xmlParserErrors otherwise.

Function: htmlParseDoc

htmlDocPtr	htmlParseDoc		(xmlChar * cur, 
const char * encoding)

parse an HTML in-memory document and build a tree.

cur:a pointer to an array of xmlChar
encoding:a free form C string describing the HTML document encoding, or NULL
Returns:the resulting document tree

Function: htmlParseDocument

int	htmlParseDocument		(htmlParserCtxtPtr ctxt)

parse an HTML document (and build a tree if using the standard SAX interface).

ctxt:an HTML parser context
Returns:0 on success, -1 in case of error. The parser context is augmented as a result of the parsing.

Function: htmlParseElement

void	htmlParseElement		(htmlParserCtxtPtr ctxt)

Parse an HTML element; this is highly recursive. This is kept for compatibility with previous code versions. [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue

ctxt:an HTML parser context

Function: htmlParseEntityRef

const htmlEntityDesc *	htmlParseEntityRef	(htmlParserCtxtPtr ctxt, 
const xmlChar ** str)

Parse an HTML entity reference. [68] EntityRef ::= '&' Name ';'

ctxt:an HTML parser context
str:location to store the entity name
Returns:the associated htmlEntityDescPtr if found, or NULL otherwise. If non-NULL, *str will have to be freed by the caller.

Function: htmlParseFile

htmlDocPtr	htmlParseFile		(const char * filename, 
const char * encoding)

parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.

filename:the filename
encoding:a free form C string describing the HTML document encoding, or NULL
Returns:the resulting document tree

Function: htmlReadDoc

htmlDocPtr	htmlReadDoc		(const xmlChar * cur, 
const char * URL,
const char * encoding,
int options)

Parse an HTML in-memory document and build a tree.

cur:a pointer to a zero terminated string
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlReadFd

htmlDocPtr	htmlReadFd		(int fd, 
const char * URL,
const char * encoding,
int options)

Parse an HTML document from a file descriptor and build a tree.

fd:an open file descriptor
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlReadFile

htmlDocPtr	htmlReadFile		(const char * filename, 
const char * encoding,
int options)

Parse an HTML file from the filesystem or the network.

filename:a file or URL
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlReadIO

htmlDocPtr	htmlReadIO		(xmlInputReadCallback ioread, 
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)

parse an HTML document from I/O functions and source and build a tree.

ioread:an I/O read function
ioclose:an I/O close function
ioctx:an I/O handler
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlReadMemory

htmlDocPtr	htmlReadMemory		(const char * buffer, 
int size,
const char * URL,
const char * encoding,
int options)

Parse an HTML in-memory document and build a tree.

buffer:a pointer to a char array
size:the size of the array
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree

Function: htmlSAXParseDoc

htmlDocPtr	htmlSAXParseDoc		(xmlChar * cur, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)

Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.

cur:a pointer to an array of xmlChar
encoding:a free form C string describing the HTML document encoding, or NULL
sax:the SAX handler block
userData:if using SAX, this pointer will be provided on callbacks.
Returns:the resulting document tree, or NULL if a custom SAX handler that does not build a tree was supplied (sax is not NULL) or the document could not be parsed.

Function: htmlSAXParseFile

htmlDocPtr	htmlSAXParseFile	(const char * filename, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)

Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fall back to the default DOM tree building routines.

filename:the filename
encoding:a free form C string describing the HTML document encoding, or NULL
sax:the SAX handler block
userData:if using SAX, this pointer will be provided on callbacks.
Returns:the resulting document tree, or NULL if a custom SAX handler that does not build a tree was supplied (sax is not NULL) or the document could not be parsed.

Function: htmlTagLookup

const htmlElemDesc *	htmlTagLookup	(const xmlChar * tag)

Lookup the HTML tag in the ElementTable

tag:The tag name in lowercase
Returns:the related htmlElemDescPtr or NULL if not found.

Daniel Veillard