Dear all, I am a perl beginner and I am suggested to parse HTML by using
other codes but not re-inventing the wheel.
The following code is from Internet Search but what i find is a lot of
subroutines. When I fed it with an HTM file, nothing is generated or printed
out. Would anybody tell me where all the TD elements it store?
-----------------------------------
# HTML parser
# Jim Davis, July 15 1994
# This is an HTML parser not an SGML parser. It does not parse a DTD,
# The DTD is implicit in the code, and specific to HTML.
# The processing of the HTML can be customized by the user by
# 1) Defining routines to be called for various tags (see Begin and End
arrays)
# 2) Defining routines html_content and html_whitespace
# This is not a validating parser. It does not check the content model
# eg you can use DT outside a DL and it won't know. It is too liberal in
# what tags are allowed to minimize what other tags.
# Bugs - can't parse the prolog or whatever you call it
# <!DOCTYPE HTML [
# <!entity % HTML.Minimal "INCLUDE">
# <!-- Include standard HTML DTD -->
# <!ENTITY % html PUBLIC "-//connolly hal.com//DTD WWW HTML 1.8//EN">
# %html;
# ]>
# modified 3 Aug to add a bunch of HTML 2.0 tags
# modified 3 Sept to print HTML stack to STDERR not STDOUT, to add new
# routines html_begin_doc and html_end_doc for application specific cleanup
# and to break parse_html into two pieces.
# modified 30 Sept 94. parse_attributes now handles tag attributes that
# don't have values. thanks to Bill Simpson-Young
<>
# for the code.
# modified 17 Apr 95 to support FORMS tags.
# modified 8 Dec 95 by Daniel LaLiberte to centralize STDERR output
# so it may be switched off more easily.
$debug = 0;
$whitespace_significant = 0;
# global variables:
# $line_buffer is line buffer
# $line_count is input line number.
$line_buffer = "";
$line_count = 0;
sub printErr {
# All errors should be printed through here, so they may be turned off.
print STDERR @_ if $debug;
}
sub parse_html {
local ($file) = @_;
open (HTML, $file) || die "Could not open $file: $!\nStopped";
&parse_html_stream ();
close (HTML);}
# Global input HTML is the handle to the stream of HTML
sub parse_html_stream {
local ($token, $new);
## initialization
@stack=();
$line_count = 0;
$line_buffer = "";
## application specific initialization
&html_begin_doc();
main:
while (1) {
# if whitespace does not matter, trim any leading space.
if (! $whitespace_significant) {
$line_buffer =~ s/^\s+//;}
# now dispatch on the type of token
if ($line_buffer =~ /^(\s+)/) {
$token = $1;
$line_buffer = $';
&html_whitespace ($token);}
# This will lose if there is more than one comment on the line!
elsif ($line_buffer =~ /^(\<!--.*-->)/) {
$token = $1;
$line_buffer = $';
&html_comment ($token);}
elsif ($line_buffer =~ /^(\<![^-][^\>]*\>)/) {
$token = $1;
$line_buffer = $';
&html_comment ($token);}
elsif ($line_buffer =~ /^(\<\/[^\>]*\>)/) {
$token = $1;
$line_buffer = $';
&html_etag ($token);}
elsif ($line_buffer =~ /^(\<[^!\/][^\>]*\>)/) {
$token = $1;
$line_buffer = $';
&html_tag ($token);}
elsif ($line_buffer =~ /^([^\s<]+)/) {
$token = $1;
$line_buffer = $';
$token = &substitute_entities($token); # not enough anyway.
&html_content ($token); }
else {
# No valid token in buffer. Maybe it's empty, or maybe there's an
# incomplete tag. So get some more data.
$new = <HTML>;
if (! defined ($new)) {last main;}
# if we're trying to find a match for a tag, then get rid of embedded
newline
# this is, I think, a kludge
if ($line_buffer =~ /^\</ && $line_buffer =~ /\n$/) {
chop $line_buffer;
$line_buffer .= " ";}
$line_buffer .= $new;
$line_count++;}
}
## cleanup
&html_end_doc();
if ($#stack > -1) {
&printErr ("Stack not empty at end of document\n");
&print_html_stack();}
}
sub html_tag {
local ($tag) = @_;
local ($element) = &tag_element ($tag);
local (%attributes) = &tag_attributes ($tag);
# the tag might minimize (be an implicit end) for the previous tag
local ($prev_element);
while (&Minimizes(&stack_top_element(), $element)) {
$prev_element = &stack_pop_element ();
if ($debug) {
&printErr ("MINIMIZING $prev_element with $element on $line_count\n");}
&html_end ($prev_element, 0);}
push (@stack, $tag);
&html_begin ($element, $tag, *attributes);
if (&Empty($element)) {
pop(@stack);
&html_end ($element, 0);}
}
sub html_etag {
local ($tag) = @_;
local ($element) = &tag_element ($tag);
# pop stack until find matching tag. This is probably a bad idea,
# or at least too general.
local ( $prev_element) = &stack_pop_element();
until ($prev_element eq $element) {
if ($debug) {
&printErr ("MINIMIZING $prev_element with /$element on $line_count
\n");}
&html_end ($prev_element, 0);
if ($#stack == -1) {
&printErr ("No match found for /$element. You will lose\n");
last;}
$prev_element = &stack_pop_element();}
&html_end ($element, 1);
}
# For each element, the names of elements which minimize it.
# This is of course totally HTML dependent and probably I have it wrong too
$Minimize{"DT"} = "DT

D";
$Minimize{"DD"} = "DT";
$Minimize{"LI"} = "LI";
$Minimize{"P"} = "P

T:LI:H1:H2:H3:H4:BLOCKQUOTE:UL:OL

L";
# Does element E2 minimize E1?
sub Minimizes {
local ($e1, $e2) = @_;
local ($value) = 0;
foreach $elt (split (":", $Minimize{$e1})) {
if ($elt eq $e2) {$value = 1;}}
$value;}
$Empty{"BASE"} = 1;
$Empty{"BR"} = 1;
$Empty{"HR"} = 1;
$Empty{"IMG"} = 1;
$Empty{"ISINDEX"} = 1;
$Empty{"LINK"} = 1;
$Empty{"META"} = 1;
$Empty{"NEXTID"} = 1;
$Empty{"INPUT"} = 1;
# Empty tags have no content and hence no end tags
sub Empty {
local ($element) = @_;
$Empty{$element};}
sub print_html_stack {
&printErr ("\n ==\n");
foreach $elt (reverse @stack) {&printErr (" $elt\n");}
&printErr (" ==========\n");}
# The element on top of stack, if any.
sub stack_top_element {
if ($#stack >= 0) { &tag_element ($stack[$#stack]);}}
sub stack_pop_element {
&tag_element (pop (@stack));}
# The element from the tag, normalized.
sub tag_element {
local ($tag) = @_;
$tag =~ /<\/?([^\s>]+)/;
local ($element) = $1;
$element =~ tr/a-z/A-Z/;
$element;}
# associative array of the attributes of a tag.
sub tag_attributes {
local ($tag) = @_;
$tag =~ /^<[A-Za-z]+ +(.*)>$/;
&parse_attributes($1);}
# string should be something like
# KEY="value" KEY2="longer value" KEY3="tags o doom"
# output is an associative array (like a lisp property list)
# attributes names are not case sensitive, do I downcase them
# Maybe (probably) I should substitute for entities when parsing attributes.
sub parse_attributes {
local ($string) = @_;
local (%attributes);
local ($name, $val);
get: while (1) {
if ($string =~ /^ *([A-Za-z]+)\s*=\s*\"([^\"]*)\"/) {
$name = $1;
$val = $2;
$string = $';
$name =~ tr/A-Z/a-z/;
$attributes{$name} = $val; }
elsif ($string =~ /^ *([A-Za-z]+)\s*=\s*(\S*)/) {
$name = $1;
$val = $2;
$string = $';
$name =~ tr/A-Z/a-z/;
$attributes{$name} = $val;}
elsif ($string =~ /^ *([A-Za-z]+)/) {
$name = $1;
$val = "";
$string = $';
$name =~ tr/A-Z/a-z/;
$attributes{$name} = $val;}
else {last;}}
%attributes;}
sub substitute_entities {
local ($string) = @_;
$string =~ s/</</g;
$string =~ s/>/>/g;
$string =~ s/"/\"/g;
$string =~ s/ / /g;
# Other entities.
$string =~ s/&/&/g; # Do this last.
$string;}
@HTML_elements = (
"A",
"ADDRESS",
"B",
"BASE",
"BLINK", # Netscape addition

"BLOCKQUOTE",
"BODY",
"BR",
"CITE",
"CENTER", # Netscape addition

"CODE",
"DD",
"DIR",
"DFN",
"DL",
"DT",
"EM",
"FORM",
"H1", "H2", "H3", "H4", "H5", "H6",
"HEAD",
"HR",
"HTML",
"I",
"ISINDEX",
"IMG",
"INPUT",
"KBD",
"LI",
"LINK",
"MENU",
"META",
"NEXTID",
"OL",
"OPTION",
"P",
"PRE",
"SAMP",
"SELECT",
"STRIKE",
"STRONG",
"TITLE",
"TEXTAREA",
"TT",
"UL",
"VAR",
);
sub define_element {
local ($element) = @_;
$Begin{$element} = "Noop";
$End{$element} = "Noop";}
foreach $element (@HTML_elements) {&define_element($element);}
# do nothing
sub Noop {
local ($element, $xxx) = @_;}
# called when a tag begins. Dispatches using Begin
sub html_begin {
local ($element, $tag, *attributes) = @_;
local ($routine) = $Begin{$element};
if ($routine eq "") {
&printErr ("Unknown HTML element $element ($tag) on line $line_count\n");}
else {eval "&$routine;"}}
# called when a tag ends. Explicit is 0 if tag end is because of
minimization
# not that you should care.
sub html_end {
local ($element, $explicit) = @_;
local ($routine) = $End{$element};
if ($routine eq "") {
&printErr ("Unknown HTML element \"$element\" (END $explicit) on line
$line_count\n");}
else {eval "&$routine(\"$element\", $explicit)";}}
sub html_content {
local ($word) = @_;
}
sub html_whitespace {
local ($whitespace) = @_;}
sub html_comment {
local ($tag) = @_;}
# redefine these for application-specific initialization and cleanup
sub html_begin_doc {}
sub html_end_doc {}
# return a "true value" when loaded by perl.
1;