#!/usr/local/bin/perl ############################################################################# # # $RCSfile: xmlvalid.pl.in,v $ # $Date: 1999/06/19 23:08:03 $ # $Source: /home/richard/Xml/RCS/xmlvalid.pl.in,v $ # $Revision: 1.27 $ # $Author: richard $ # ############################################################################# # # Xmlvalid.pl is a simple CGI interface to Xmlparse. To install it, # you'll need to do some hand editing: # # 0) Alter the strings output below to reflect your local setup; # e.g., take out the author's name, and take out references to # Brown University # # 1) Make sure you have PERL 5.x installed - and also the PERL # CGI library, CGI.pw (if you don't have CGI.pw, get it from # a nearby CTAN site) # # 2) Then install xmlparse on the machine that houses your # webserver; make sure it is working, and that its ancillary # script, get_uri, is installed correctly # # 3) Set the $sgml_catalog variable in xmlvalid.pl to wherever you # keep your SGML catalog file. If you have more than one catalog # file, separate them with a colon. # # 4) Copy xmlvalid.pl to one of your webserver's CGI directories # (making sure it is readable and executable by the webserver) # # 5) Edit xmlvalid.shtml to taste, then copy it to some spot in # your webserver's document hierarchy, making sure that its # ACTION line points to where xmlvalid.pl has been copied in # step (3) above. Note that if you want to edit xmlvalid.shtml, # you'll need to make it writable. You may also want to change # its extension from .shtml to .html, depending on whether your # site uses server-side includes. # # 6) Fill out the xmlparse.shtml form via your webserver and # submit it to verify that it is working # # If you find problems, check all paths (both the ACTION path in # xmlvalid.shtml, and the PERL and executable paths in xmlvalid.pl. # # Note 1: Do not edit xmlvalid.pl.in unless you fully understand the # implications of altering an autoconf template file. Edit the file # generated from it, xmlvalid.pl, instead. # # Note 2: This file uses both xmlparse and a simple utility program # installed along with it called xml2utf. Both must be available # (which should be the case if xmlparse was correctly installed). # ############################################################################# # use CGI; use POSIX; my $DEBUG = 0; # my $DEBUG = 1; MAIN: { local $query; local $child_pid; local $tmpfilename; local $header_printed = 0; my $prefix = "/opt/local"; my $path = "${prefix}/bin/xmlparse"; my $utfpath = "${prefix}/bin/xml2utf"; my $get_uri = "${prefix}/bin/get_uri"; # Set CATALOG location by hand, if need be my $sgml_catalog = "/usr/local/lib/sgml/CATALOG"; my ($filename, $argfilename, $msg, $status, $result, @results); my %errorlines; # Catch signals # $SIG{'INT'} = \&Interrupted; $SIG{'HUP'} = \&Interrupted; $SIG{'QUIT'} = \&Interrupted; $SIG{'TERM'} = \&Interrupted; $SIG{'PIPE'} = 'IGNORE'; # Make sure nobody else can read/write/execute any files we create umask (077); # Initialize CGI variables # print STDERR "$0: initializing query object\n" if $DEBUG; $query = new CGI (); print STDERR "$0: done initializing query object\n" if $DEBUG; # Set filename to the name of the file or URI supplied by the user # (will be "" if the user just pasted text into the text field). # Also, place the contents of that file/URL/text in $tmpfilename. # $tmpfilename = POSIX::tmpnam (); $filename = WhatFilename ($get_uri); ($filename =~ m,^((http|ftp)://|urn:),i) ? ($argfilename = $filename) : ($argfilename = $tmpfilename); # Now have xmlparse validate $argfilename. Xmlparse's syntax # works as follows: # # -f forces relaxed namespace checking # -l sets the debugging level # -E limits the number of error messages # -n limits system identifiers to URIs (so the user can't get at # local files) # -C sets the SGML catalog file # -s sends system warnings and errors to syslog (so only parsing # errors go to user) # my $debug = $DEBUG ? "-l 5" : ""; my $relax = ($query->param ("force")) ? "-f" : ""; my $cmd = "$path $relax $debug -E 200 -n -C $sgml_catalog -s $argfilename 2>&1 |"; print STDERR "$0: executing $cmd\n" if $DEBUG; if (! ($child_pid = open (XMLPARSE, $cmd))) { AbortCgi ("Can't execute $path."); } # If xmlparse isn't done in seven minutes, kill it. # $SIG{'ALRM'} = \&TimedOut; alarm (420); @results = ; close (XMLPARSE); $status = $? / 256; alarm (0); $SIG{'ALRM'} = 'DEFAULT'; # Print title, general information for the user PrintHeader ("Validation Results for " . ($filename ? $filename : "[user-supplied text]")); # Now format error/warning messages (if there were any) all # nicely. # if ($status) { # # Okay, non-zero exit status indicates some sort of error # my $n = 200; if (@results) { # # There were error messages. Print them out. Print out # warnings too, if the user hasn't suppressed them # ($query->param ("nowarn")) ? print "

A list of error messages follows\n" : print "

A list of error and warning messages follows\n"; print "along with (if needed, and if supplied) a line-numbered dump\n", "of the original document up to the last erroneous line.\n

\n"; print $query->hr (); ($query->param ("nowarn")) ? print $query->h2 ("Errors:") : print $query->h2 ("Errors, Warnings:"); print "\n
\n"; foreach $result (@results) { # Chop off trailing newline, if there is one $result =~ s/\n$//s; # # If "nowarn" is set, don't display warning messages # (which start "warning (000):", where 000 is a series # of digits) # next if $result =~ /^warning\s+\(\d+\):/ and $query->param ("nowarn"); # # If this message mentions the file we just validated # (some messages will mention other files), then print # the messages along with anchors pointing to the text # that triggered the message (the text is printed out # later on). # if ($result =~ /, $argfilename, /) { $filename ? ($result =~ s/, $argfilename, /, $filename, /) : ($result =~ s/, $argfilename, /, [user-supplied text], /); if ($result =~ m/^\s*(.*), (.*), line (\d+)([^:]*): (.*)$/s) { $n = $3; $n or $n = 1; # Record n in the list of lines with errors or warnings; # Add an anchor to the first message for a given line no. $errorlines{$n}++ ? print "
line $n, $2:\n" : print "
line $n, $2:\n"; print "
", entityize_red ("$1$4"), ($5 ? ": " : ""); print "", entityize_quote ("$5"), "\n", # "", entityize ("$5"), "\n"; "\n"; } else { print "
", entityize ($result), "\n"; } } else { # # Okay, this is either a "too many errors" message # or a message about a problem in an external file # (e.g., one found in a SYSTEM identifier). Don't # insert fancy anchors. # $filename ? ($result =~ s/$argfilename/$filename/) : ($result =~ s/$argfilename/[user-supplied text]/); if ($result =~ m/^\s*(.*), (.*), line (\d+)([^:]*): (.*)$/s) { print "
line $3, $2:\n
", entityize_red ("$1$4"), ($5 ? ": " : ""); print "", entityize_quote ("$5"), "\n", # "", entityize ("$5"), "\n"; "\n"; } else { print "
", entityize ($result), "\n"; } } } print "
\n"; print "

Xmlparse exit status = $status.

\n"; print $query->hr (); if (scalar keys %errorlines) { # # Done printing error/warning messages (if any). If # any were issued for the user-supplied file/URL/text, # print out the original text for the user to look # over. # # translate to UTF-8 from UTF-16 (or UTF-8, for that matter) open (TMPFILE, "$utfpath $tmpfilename |") or AbortCgi ("Can't execute $utfpath."); $result = ""; while () { $result .= $_; } close (TMPFILE); print $query->h2 ("Original Document:"); print "\n

"; # Make sure file is split up into constituent lines $result =~ s/\r\n/\n/gs; $result =~ s/\r/\n/gs; # Work around an apparent bug in PERL do $result =~ s/\n(\n+)$/\n $1/s while $result =~ /\n\n$/s; @results = split '\n', $result; my $i = 0; foreach $result (@results) { if (++$i > $n) { # There's more; but we don't need to print it print "etc.
\n"; last; } else { if ($errorlines{$i}) { # If this line had an error on it, then print a backlink print "line $i:  ", entityize ($result), "
\n"; } else { # ...otherwise, just print the line with no backlink print "line $i:  ", entityize ($result), "
\n"; } } } if ($i++ < $n) { # If we have more errors, but no more results, then we're at EOF print "end-of-file
\n"; } print "

"; print $query->hr (); } } else { # Hmmm; no output other than the error exit status print $query->hr (); print $query->p ("System error validating $filename."); print "

Xmlparse exit status = $status.

\n"; print $query->hr (); } } else { # # Okay, the document validated just fine. # if (! @results) { print $query->hr (); } else { if ($query->param ("nowarn")) { print $query->p ("[Warning messages suppressed at user option]"); print $query->hr (); } else { print $query->p ("A list of warning messages follows:"); print $query->hr (); print "\n
\n"; foreach $result (@results) { # Chop off trailing newline, if there is one $result =~ s/\n$//s; # Print out any warning messages generated if ($result =~ /, $argfilename, /) { $filename ? ($result =~ s/, $argfilename, /, $filename, /) : ($result =~ s/, $argfilename, /, [user-supplied text], /); } if ($result =~ m/^\s*(.*), (.*), line (\d+)([^:]*): (.*)$/s) { print "
line $3, $2:\n", "
$1$4", ($5 ? ": " : ""); print "", entityize_quote ("$5"), "\n", # "", entityize ("$5"), "\n"; "\n"; } else { $filename ? ($result =~ s/$argfilename/$filename/) : ($result =~ s/$argfilename/[user-supplied text]/); print "
", entityize ($result), "\n"; } } print "
\n"; print $query->hr (); } } # Tell the user what's up print $query->p ("Document validates OK."); print $query->hr (); } SignOff (); exit (0); } # # WhatFilename # # If the user supplied a filename or URL, returns that. Otherwise # returns an empty string (e.g., if the user pasted an anonymous # document into the text field). Fills $tmpfilename (local) with # the contents of the file, URL, or user-supplied anonymous text. # sub WhatFilename { my $get_uri = $_[0]; my ($text, $filename); if (! sysopen TMPFILE, $tmpfilename, O_CREAT | O_EXECL | O_WRONLY, 0600) { AbortCgi ("Can't open temporary file."); } # Which form did the user fill out, the one for uploading # and validating a file, or the one for checking user-supplied # text? # if ($query->param ('uploaded_file')) { # $text contains the name of the file with the uploaded text; # get at its contents via <$text>; use it straight up to get # its name # $text = $query->param ('uploaded_file'); # User wants to validate a local file. print TMPFILE <$text>; # Though not needed, strictly, for files, do it for security $filename = escape_uri ("$text"); print STDERR "$0: got uploaded file name: $filename\n" if $DEBUG; } else { if ($query->param ('URI')) { # Do we have a valid, non-local URI? (file: is not OK) # $text = $query->param ('URI'); if (not $text =~ m,^\s*((http|ftp)://|urn:),i) { AbortCgi ("Non-http/ftp/urn prefix for URI, $text."); } # We gotta URI - see just below. goto GOTTA_URI; } else { if ($query->param ('text')) { $text = $query->param ('text'); if ($text =~ m,^\s*((http|ftp)://|urn:),i) { GOTTA_URI: $text =~ s/\s+$//m; # Expand partial hostnames and convert aliases to true hostnames. # THIS CODE WAS ORIGINALLY INCLUDED TO HELP AVOID RECURSION. BUT # RECURSION IS AVOIDED OTHERWISE, AND SOMETIMES THIS CODE BREAKS # ON WEBSERVERS USING NAME-BASED VIRTUAL HOSTING. # $text = canonicalize_hostname_in_uri ($text); # # We have a URI; if get_uri can't resolve it in five # minutes, kill this script # $SIG{'ALRM'} = \&TimedOutGettingURI; alarm (300); $status = system ("$get_uri", "$text", "$tmpfilename"); alarm (0); $SIG{'ALRM'} = 'DEFAULT'; AbortCgi ("Can't resolve URI, $text.") if ($status); # If nothing else, do this for security reasons $filename = escape_uri ("$text"); print STDERR "$0: got user-supplied URI, $filename\n" if $DEBUG; } else { # User wants to validate text. print TMPFILE $query->param ('text'); $filename = ""; print STDERR "$0: got user-supplied XML text\n" if $DEBUG; } } else { print STDERR "$0: bummer, no filename, URI, or text supplied\n" if $DEBUG; AbortCgi ("No filename or text supplied."); } } } close (TMPFILE); # Check to be sure there's data # if (! -f $tmpfilename or -z $tmpfilename) { AbortCgi ("Empty or nonexistent data file."); } # Name of file or URL to print out along with error/warning # messages (If the text field was used, $filename will be # empty, "".) return $filename; } # # PrintHeader # # Print out a nice DOCTYPE decl, title, tag... # sub PrintHeader { my $title = $_[0]; if (! $header_printed) { # MIME header print $query->header (-type=>'text/html; charset=UNICODE-1-1-UTF-8', -'Content-language'=>'en'); # HTML header print $query->start_html (-title=>$title, -author=>'Richard_Goerwitz@Brown.EDU', -meta=>{'keywords'=>'XML, validate, xmlparse, document', 'description'=>'Xmlparse validation results.'}, -COLOR=>'black', -BGCOLOR=>'#F8F6F6', -LINK=>'blue', -VLINK=>'purple', -ALINK=>'red'); print $query->h1 ($title); # Don't print this twice $header_printed = 1; } } # # AbortCgi: # # Send error message to user and stderr, then quit. # sub AbortCgi { my @msg = @_; my ($txt, $ct, $msg); if (! $header_printed) { # MIME header print $query->header (-type=>'text/html; charset=UNICODE-1-1-UTF-8', -'Content-language'=>'en'); # HTML header print $query->start_html (-title=>'Validation Script Error', -author=>'Richard_Goerwitz@Brown.EDU', -COLOR=>'black', -BGCOLOR=>'#F8F6F6', -LINK=>'blue', -VLINK=>'purple', -ALINK=>'red'); # General title print $query->h1 ('Validation Script Error'); # Don't print this twice $header_printed = 1; } # Error message... print $query->hr (); if (! @msg) { print "

Error: script &script_name() encountered a fatal error

\n"; } else { foreach $txt (@msg) { print "

$txt

\n"; } } print $query->hr (); SignOff (); close (STDOUT); $ct = ctime (time ()); $ct =~ s/\s+$//; $msg = join '', @msg; $msg =~ s/\s+$//; print STDERR "[", $ct, "] $msg\n"; close (STDERR); exit 1; } # # Interrupted: # # This routine is called if the CGI script is killed by a signal (as # may happen if the system goes down, or the Web server kills it). # sub Interrupted { my $signame = shift; if ($child_pid) { kill ('HUP', $child_pid); sleep (2); kill ('TERM', $child_pid); } AbortCgi ("$0: Script interrupted (signal = $signame)."); } # # TimedOutGettingURI: # # This routine gets called if get_uri times out. The normal # default is five minutes. See below. # sub TimedOutGettingURI { my $signame = shift; # Don't bother killing anything; rely (perhaps unwisely) on our # exiting to sighup the child. # AbortCgi ("$0: Timed out fetching URI."); } # # TimedOut: # # This routine gets called if the xmlparse times out. The normal # default is five minutes. See below. # sub TimedOut { my $signame = shift; kill ('HUP', $child_pid); sleep (2); kill ('TERM', $child_pid); AbortCgi ("$0: Conversion process timed out."); } # # SignOff # # Print version, author information; prepare to exit. # sub SignOff { print "

Generated by xmlparse
\n", " Richard_Goerwitz\@Brown.EDU\n", "

\n"; print $query->end_html (); unlink ($tmpfilename); } # # entityize # # Replace <, >, ", & with built-in entity refs. # sub entityize { my $html = $_[0]; $html =~ s/&/&/gs; $html =~ s/\t/ /gs; $html =~ s/([\0-\31])/"\&#". ord($1) . ";"/ges; $html =~ s/\s/ /gs; $html =~ s/\"/"/gs; $html =~ s/>/>/gs; $html =~ s/, ", & with built-in entity refs. Add # quotes around string if it contains leading or trailing # whitespace. # sub entityize_quote { my $html = $_[0]; $html =~ s/&/&/g; $html =~ s/([\0-\31\n])/"\&#". ord($1) . ";"/ges; if ($html =~ /^\s/ or $html =~ /\s$/) { # map leading space to   $html =~ s/^\s/ /; # map trailing space to   $html =~ s/\s$/ /; # put string inside quotation marks $html = '"' . $html . '"'; } $html =~ s/\"/"/g; $html =~ s/>/>/g; $html =~ s/error<\/FONT>/; return $html; } # # escape_uri # # Hexify non-reserved characters in URI. # sub escape_uri { my($toencode) = @_; $toencode=~s/([^a-zA-Z0-9_\-.&=#?:\/%])/uc sprintf("%%%02x", ord($1))/eg; return $toencode; } # # canonicalize_hostname_in_uri # sub canonicalize_hostname_in_uri { my $uri = $_[0]; my ($prefix, $hostname, $rest, @hostent); if ($uri =~ m,^\s*((http|ftp)://)([\w\-\.]*)([:/]?.*)$,) { $prefix = $1; $hostname = $3; $rest = $4; print STDERR "$0: prefix = $prefix; hostname = $hostname; rest = $rest\n" if $DEBUG; $hostname =~ s/\.+/\./g; $hostname =~ s/\.$//; if (@hostent = gethostbyname ($hostname)) { print STDERR "$0: expanded $url to $prefix . $hostent[0] . $rest\n" if $DEBUG; return $prefix . $hostent[0] . $rest; } } return $uri; }