home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Acorn User 10
/
AU_CD10.iso
/
Updates
/
Perl
/
Perl_Libs
/
site_perl
/
HTML
/
dtd2pm.pl
< prev
next >
Wrap
Perl Script
|
1996-09-17
|
6KB
|
244 lines
#!/local/bin/perl -w
# This is a crude hack to parse the HTML DTDs in order to generate
# a HTML parser in perl. The output is a perl structure that describe
# the same releationships as the DTD. This script in known to work
# on the HTML 3.2 of Tuesday 23-Apr-96, but it might need some
# tweaks if that DTD use more sophisticated SGML features.
#
# Author: Gisle Aas
#
# $Id: dtd2pm.pl,v 1.3 1996/09/17 12:40:27 aas Exp $
#
# Disclaimer: I am not an SGML expert and don't really understand how
# to read those damn DTDs.
$VERBOSE = 0;
undef($/);
$DTD = "HTML32.dtd";
open(DTD, $DTD) or die "Can't open $DTD: $!";
$_ = <DTD>;
close(DTD);
$| = 1;
($intro) = /<!\s*--(.*?)--\s*>/s;
#print $_;
while (s/^\s*<!//) {
if (s/^(\[.*?\])>//s) { # ignore <![......]> constructs
#print "Skip: <!$1>\n";
next;
}
s/^([^>]*)>//;
$c = $1;
while (1) {
$c =~ s/--.*?--//gs; # remove comments
if ($c =~ /--/) {
# we really did read to little
s/([^>]*)>//;
$c .= $1
} else {
last;
}
}
$c =~ s/^\s+//;
$c =~ s/\s+$//;
next unless length $c; # only comments
#$c =~ s/\s+/ /g;
print "C: $c\n" if $VERBOSE;
while (s/^\s*([^\s<]+)//) {
print "T: $1\n" if $VERBOSE;
}
if ($c =~ /^ENTITY\s+(%\s*)?(\S+)\s+(.*)/is) {
my($percent, $key, $val) = ($1, lc($2), $3);
if ($percent) {
$key = "%$key";
} else {
$key = "&$key";
$val =~ s/CDATA\s+//;
}
$val =~ s/^"//s;
$val =~ s/"$//s;
$val =~ s/(%[\w\.\-]+);?/$entity{lc $1} || $1/eg;
$entity{$key} = $val;
#print "E: $key => $val\n";
} else {
# Expand entities
$c =~ s/(%[\w\.\-]+);?/$entity{lc $1} || $1/eg;
#print "C: $c\n"
if ($c =~ /^ELEMENT\s+\(([^\)]+)\)\s+([-O])\s+([-O])\s+(.*)/is) {
my($elems, $start, $stop, $content) = (lc $1, $2, $3, lc $4);
for ($elems, $content) {
s/\s+//g;
}
$content =~ s/(\#pcdata)\b/\U$1/g;
for $elem (split(/\|/, $elems)) {
$element{$elem} = [$start, $stop, $content];
}
} elsif ($c =~ /^ELEMENT\s+(\S+)\s+([-O])\s+([-O])\s+(.*)/is) {
my($elem, $start, $stop, $content) = (lc $1, $2, $3, lc $4);
$content =~ s/\s+//g;
$content =~ s/(\#pcdata)\b/\U$1/g;
$element{$elem} = [$start, $stop, $content];
} elsif ($c =~ s/^ATTLIST\s+\(([^\)]+)\)\s+//) {
my $elems = lc $1;
$elems =~ s/\s+//g;
my $attrs = parse_attrs($c);
for $elem (split(/\|/, $elems)) {
$attr{$elem} = $attrs;
}
} elsif ($c =~ s/^ATTLIST\s+(\S+)\s+//) {
$attr{lc $1} = parse_attrs($c);
} else {
print STDERR "?: $c\n";
}
}
}
# is there anything left?
s/^\s+//;
print STDERR "?: ", substr($_, 0, 200), "\n" if length $_;
# At this point, we have initialized the %element, %attr arrays.
# Their content is as described here:
#
# %element = ( tag => [ $start, $end, $content ],
# ...
# );
#
# %attr = ( tag => {
# attr => [ $values, $default ],
# ...
# },
# ...
# );
#
# The %entity hash is also available, but should not be of much use
# now.
# Dump result to stdout so that it is useful to a perl program.
print "##### Do not edit!! Auto-generated from $DTD\n\n";
print "package HTML::DTD; # <!DOCTYPE HTML PUBLIC \"$entity{'%html.version'}\">\n\n";
$intro =~ s/^[ \t]*/\# /gm;
print "$intro\n\n";
my @all_tags = sort keys %element;
my @empty = ();
my @optional_end_tag = ();
my @optional_start_tag = ();
for (@all_tags) {
push(@empty, $_) if $element{$_}[2] eq 'empty';
push(@optional_end_tag, $_) if $element{$_}[2] ne 'empty' and
$element{$_}[1] ne '-';
push(@optional_start_tag, $_) if $element{$_}[0] ne '-';
}
print "\@all_tags = qw(@all_tags);\n";
print "\@empty = qw(@empty);\n";
print "\@optional_end_tag = qw(@optional_end_tag);\n";
print "\@optional_start_tag = qw(@optional_start_tag);\n";
print <<'EOT';
# The %elem hash is indexed by lowercase tag identifiers. Each element is
# an anonymouse hash with the following values:
#
# 'content': Describes the content that can be present within this
# element. This value is missing if the element should
# always be empty.
#
# 'optend': True if the end tag for this element is optional
#
# 'attr': A hash that describes the attributes of this element.
# Each element in this hash is a anonymous array with
# two values: allowed values; default value
#
EOT
print "\n%elem = (\n";
@boolean_attr = ();
for (@all_tags) {
my $e = $_;
$e = "'$e'" if $e eq 'tr' || $e eq 'link' || $e eq 'sub'; # these are perl keywords
printf "%-4s => {\n", $e;
print "\t content => '$element{$_}[2]',\n" if $element{$_}[2] ne 'empty';
print "\t optend => 1,\n" if $element{$_}[1] ne '-';
if (exists $attr{$_}) {
print "\t attr => {\n";
for $a (sort keys %{$attr{$_}}) {
my @a = @{$attr{$_}{$a}};
print "\t\t\t$a => [", join(",", map {qq("$_")} @a), "],\n";
push(@boolean_attr, "$_\t=> '$a'") if $a eq $attr{$_}{$a}[0];
}
print "\t\t },\n";
}
print "\t},\n";
}
print ");\n";
print "\n\n\%boolean_attr = (\n";
for (@boolean_attr) {
print " $_,\n";
}
print ");\n";
print "\n1;\n";
exit;
#-----------------------------------------------------------------------
sub parse_attrs # Parse the <!ATTLIST elem ...> content
{
my $a = shift;
my %a = ();
#print "---$a---\n";
while ($a =~ /\S/) {
$a =~ s/^\s*(\S+)\s*//;
my $key = $1;
my ($val, $default);
if ($a =~ s/^\(([^\)]+)\)//) {
$val = $1;
$val =~ s/\s+//g;
} elsif ($a =~ s/^(\S+)//) {
$val = $1;
} else {
die "Missing values";
}
$val = lc($val) unless $val =~ /^[A-Z]+$/;
$val =~ s/^"(.*)"$/$1/;
$a =~ s/^\s+//;
if ($a =~ s/^(\#FIXED\s+\'[^\']+\')//) {
$default = $1;
} elsif ($a =~ s/^(\S+)//) {
$default = $1;
} else {
die "Missing default";
}
$default = lc($default) unless $default =~ /^[\#\"]/;
$default =~ s/^"(.*)"$/$1/;
$a{$key} = [$val, $default];
}
\%a;
}