.\" Automatically generated by Pod::Man 2.27 (Pod::Simple 3.28) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' . ds C` . ds C' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .\" .\" Avoid warning from groff about undefined register 'F'. .de IX .. .nr rF 0 .if \n(.g .if rF .nr rF 1 .if (\n(rF:(\n(.g==0)) \{ . if \nF \{ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . if !\nF==2 \{ . nr % 0 . nr F 2 . \} . \} .\} .rr rF .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . 
ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "HTML::TagParser 3" .TH HTML::TagParser 3 "2012-05-03" "perl v5.16.3" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" HTML::TagParser \- Yet another HTML document parser with DOM\-like methods .SH "SYNOPSIS" .IX Header "SYNOPSIS" Parse a \s-1HTML\s0 file and find its element's value. 
.PP .Vb 3 \& my $html = HTML::TagParser\->new( "index\-j.html" ); \& my $elem = $html\->getElementsByTagName( "title" ); \& print "<title>", $elem\->innerText(), "</title>\en" if ref $elem; .Ve .PP Parse a \s-1HTML\s0 source and find its first <form action="">
attribute's value and find all input elements belonging to this form. .PP .Vb 6 \& my $src = \*(Aq<html><form action="hoge.cgi">...</form></html>\*(Aq; \& my $html = HTML::TagParser\->new( $src ); \& my $elem = $html\->getElementsByTagName( "form" ); \& print "<form action=\e"", $elem\->
getAttribute("action"), "\e">\en" if ref $elem; \& my @first_inputs = $elem\->subTree()\->getElementsByTagName( "input" ); \& my $form = $first_inputs[0]\->parentNode(); .Ve .PP Fetch a \s-1HTML\s0 file via \s-1HTTP,\s0 and display all of its elements and attributes. .PP .Vb 10 \& my $url = \*(Aqhttp://www.kawa.net/xp/index\-e.html\*(Aq; \& my $html = HTML::TagParser\->new( $url ); \& my @list = $html\->getElementsByTagName( "a" ); \& foreach my $elem ( @list ) { \& my $tagname = $elem\->tagName; \& my $attr = $elem\->attributes; \& my $text = $elem\->innerText; \& print "<$tagname"; \& foreach my $key ( sort keys %$attr ) { \& print " $key=\e"$attr\->{$key}\e""; \& } \& if ( $text eq "" ) { \& print " />\en"; \& } else { \& print ">$text</$tagname>\en"; \& } \& } .Ve .SH "DESCRIPTION" .IX Header "DESCRIPTION" HTML::TagParser is a pure Perl module which parses \s-1HTML/XHTML\s0 files. This module provides some methods like the \s-1DOM\s0 interface. This module is not strict about \s-1XHTML\s0 format because many \s-1HTML\s0 pages are not strict. You know, many pages use
<br> elements instead of <br/> and have <p>
elements which are not closed. .SH "METHODS" .IX Header "METHODS" .ie n .SS "$html = HTML::TagParser\->\fInew()\fP;" .el .SS "\f(CW$html\fP = HTML::TagParser\->\fInew()\fP;" .IX Subsection "$html = HTML::TagParser->new();" This method constructs an empty instance of the \f(CW\*(C`HTML::TagParser\*(C'\fR class. .ie n .SS "$html = HTML::TagParser\->new( $url );" .el .SS "\f(CW$html\fP = HTML::TagParser\->new( \f(CW$url\fP );" .IX Subsection "$html = HTML::TagParser->new( $url );" If \fInew()\fR is called with a \s-1URL,\s0 this method fetches a \s-1HTML\s0 file from a remote web server, parses it, and returns its instance. The URI::Fetch module is required to fetch a file. .ie n .SS "$html = HTML::TagParser\->new( $file );" .el .SS "\f(CW$html\fP = HTML::TagParser\->new( \f(CW$file\fP );" .IX Subsection "$html = HTML::TagParser->new( $file );" If \fInew()\fR is called with a filename, this method parses a local \s-1HTML\s0 file and returns its instance. .ie n .SS "$html = HTML::TagParser\->new( ""...snip..."" );" .el .SS "\f(CW$html\fP = HTML::TagParser\->new( ``...snip...'' );" .IX Subsection "$html = HTML::TagParser->new( ...snip... );" If \fInew()\fR is called with a string of \s-1HTML\s0 source code, this method parses it and returns its instance. .ie n .SS "$html\->fetch( $url, %param );" .el .SS "\f(CW$html\fP\->fetch( \f(CW$url\fP, \f(CW%param\fP );" .IX Subsection "$html->fetch( $url, %param );" This method fetches a \s-1HTML\s0 file from a remote web server and parses it. The second argument is optional parameters for the URI::Fetch module. .ie n .SS "$html\->open( $file );" .el .SS "\f(CW$html\fP\->open( \f(CW$file\fP );" .IX Subsection "$html->open( $file );" This method parses a local \s-1HTML\s0 file. .ie n .SS "$html\->parse( $source );" .el .SS "\f(CW$html\fP\->parse( \f(CW$source\fP );" .IX Subsection "$html->parse( $source );" This method parses a string of \s-1HTML\s0 source code. 
.ie n .SS "$elem = $html\->getElementById( $id );" .el .SS "\f(CW$elem\fP = \f(CW$html\fP\->getElementById( \f(CW$id\fP );" .IX Subsection "$elem = $html->getElementById( $id );" This method returns the element whose id attribute is \f(CW$id\fR. .ie n .SS "@elem = $html\->getElementsByName( $name );" .el .SS "\f(CW@elem\fP = \f(CW$html\fP\->getElementsByName( \f(CW$name\fP );" .IX Subsection "@elem = $html->getElementsByName( $name );" This method returns an array of elements whose name attribute is \f(CW$name\fR. In scalar context, only the first element is returned. .ie n .SS "@elem = $html\->getElementsByTagName( $tagname );" .el .SS "\f(CW@elem\fP = \f(CW$html\fP\->getElementsByTagName( \f(CW$tagname\fP );" .IX Subsection "@elem = $html->getElementsByTagName( $tagname );" This method returns an array of elements whose tagName is \f(CW$tagname\fR. In scalar context, only the first element is returned. .ie n .SS "@elem = $html\->getElementsByClassName( $class );" .el .SS "\f(CW@elem\fP = \f(CW$html\fP\->getElementsByClassName( \f(CW$class\fP );" .IX Subsection "@elem = $html->getElementsByClassName( $class );" This method returns an array of elements whose className is \f(CW$class\fR. In scalar context, only the first element is returned. .ie n .SS "@elem = $html\->getElementsByAttribute( $attrname, $value );" .el .SS "\f(CW@elem\fP = \f(CW$html\fP\->getElementsByAttribute( \f(CW$attrname\fP, \f(CW$value\fP );" .IX Subsection "@elem = $html->getElementsByAttribute( $attrname, $value );" This method returns an array of elements whose \f(CW$attrname\fR attribute's value is \f(CW$value\fR. In scalar context, only the first element is returned. .SH "HTML::TagParser::Element SUBCLASS" .IX Header "HTML::TagParser::Element SUBCLASS" .ie n .SS "$tagname = $elem\->\fItagName()\fP;" .el .SS "\f(CW$tagname\fP = \f(CW$elem\fP\->\fItagName()\fP;" .IX Subsection "$tagname = $elem->tagName();" This method returns \f(CW$elem\fR's tagName. 
.ie n .SS "$text = $elem\->\fIid()\fP;" .el .SS "\f(CW$text\fP = \f(CW$elem\fP\->\fIid()\fP;" .IX Subsection "$text = $elem->id();" This method returns \f(CW$elem\fR's id attribute. .ie n .SS "$text = $elem\->\fIinnerText()\fP;" .el .SS "\f(CW$text\fP = \f(CW$elem\fP\->\fIinnerText()\fP;" .IX Subsection "$text = $elem->innerText();" This method returns \f(CW$elem\fR's innerText without tags. .ie n .SS "$subhtml = $elem\->\fIsubTree()\fP;" .el .SS "\f(CW$subhtml\fP = \f(CW$elem\fP\->\fIsubTree()\fP;" .IX Subsection "$subhtml = $elem->subTree();" This method returns a new object of class HTML::TagParser, with all the elements that are in the \s-1DOM\s0 hierarchy under \f(CW$elem\fR. .ie n .SS "$elem = $elem\->\fInextSibling()\fP;" .el .SS "\f(CW$elem\fP = \f(CW$elem\fP\->\fInextSibling()\fP;" .IX Subsection "$elem = $elem->nextSibling();" This method returns the next sibling within the same parent. It returns undef when called on a closing tag or on the lastChild node of a parentNode. .ie n .SS "$elem = $elem\->\fIpreviousSibling()\fP;" .el .SS "\f(CW$elem\fP = \f(CW$elem\fP\->\fIpreviousSibling()\fP;" .IX Subsection "$elem = $elem->previousSibling();" This method returns the previous sibling within the same parent. It returns undef when called on the firstChild node of a parentNode. .ie n .SS "$child_elem = $elem\->\fIfirstChild()\fP;" .el .SS "\f(CW$child_elem\fP = \f(CW$elem\fP\->\fIfirstChild()\fP;" .IX Subsection "$child_elem = $elem->firstChild();" This method returns the first child node of \f(CW$elem\fR. It returns undef when called on a closing tag element or on a non-container or empty container element. .ie n .SS "$child_elems = $elem\->\fIchildNodes()\fP;" .el .SS "\f(CW$child_elems\fP = \f(CW$elem\fP\->\fIchildNodes()\fP;" .IX Subsection "$child_elems = $elem->childNodes();" This method creates an array of all child nodes of \f(CW$elem\fR and returns the array by reference. It returns an empty array-ref [] whenever \fIfirstChild()\fR would return undef. 
.ie n .SS "$child_elem = $elem\->\fIlastChild()\fP;" .el .SS "\f(CW$child_elem\fP = \f(CW$elem\fP\->\fIlastChild()\fP;" .IX Subsection "$child_elem = $elem->lastChild();" This method returns the last child node of \f(CW$elem\fR. It returns undef whenever \fIfirstChild()\fR would return undef. .ie n .SS "$parent = $elem\->\fIparentNode()\fP;" .el .SS "\f(CW$parent\fP = \f(CW$elem\fP\->\fIparentNode()\fP;" .IX Subsection "$parent = $elem->parentNode();" This method returns the parent node of \f(CW$elem\fR. It returns undef when called on root nodes. .ie n .SS "$attr = $elem\->\fIattributes()\fP;" .el .SS "\f(CW$attr\fP = \f(CW$elem\fP\->\fIattributes()\fP;" .IX Subsection "$attr = $elem->attributes();" This method returns a hash reference containing all of \f(CW$elem\fR's attributes. .ie n .SS "$value = $elem\->getAttribute( $key );" .el .SS "\f(CW$value\fP = \f(CW$elem\fP\->getAttribute( \f(CW$key\fP );" .IX Subsection "$value = $elem->getAttribute( $key );" This method returns the value of \f(CW$elem\fR's attribute whose name is \f(CW$key\fR. .SH "BUGS" .IX Header "BUGS" The parser in this module is simple. Methods innerText and subTree may be fooled by nested tags or embedded JavaScript code. .PP The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results. The most expensive ones are \fIlastChild()\fR and \fIpreviousSibling()\fR. \&\fIparentNode()\fR is also expensive, but only once, because its result is cached. .PP The \s-1DOM\s0 tree is read-only, as this is just a parser. .SH "INTERNATIONALIZATION" .IX Header "INTERNATIONALIZATION" This module natively understands the character encoding used in a document by parsing its meta element. 
.PP .Vb 1 \& <meta http\-equiv="Content\-Type" content="text/html; charset=Shift_JIS"> .Ve .PP The parsed document's encoding is converted to this class's fixed internal encoding \*(L"\s-1UTF\-8\*(R".\s0 .SH "AUTHORS AND CONTRIBUTORS" .IX Header "AUTHORS AND CONTRIBUTORS" .Vb 4 \& drry [drry] \& Juergen Weigert [jnw] \& Yusuke Kawasaki [kawasaki] [kawanet] \& Tim Wilde [twilde] .Ve .SH "COPYRIGHT AND LICENSE" .IX Header "COPYRIGHT AND LICENSE" The following copyright notice applies to all the files provided in this distribution, including binary files, unless explicitly noted otherwise. .PP Copyright 2006\-2012 Yusuke Kawasaki .PP This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.