[30299] in Perl-Users-Digest

home help back first fref pref prev next nref lref last post

Perl-Users Digest, Issue: 1542 Volume: 11

daemon@ATHENA.MIT.EDU (Perl-Users Digest)
Tue May 13 21:14:18 2008

Date: Tue, 13 May 2008 18:14:10 -0700 (PDT)
From: Perl-Users Digest <Perl-Users-Request@ruby.OCE.ORST.EDU>
To: Perl-Users@ruby.OCE.ORST.EDU (Perl-Users Digest)

Perl-Users Digest           Tue, 13 May 2008     Volume: 11 Number: 1542

Today's topics:
        RXParse 1.3 sln@netherlands.co
        Digest Administrivia (Last modified: 6 Apr 01) (Perl-Users-Digest Admin)

----------------------------------------------------------------------

Date: Tue, 13 May 2008 16:54:36 -0700
From: sln@netherlands.co
Subject: RXParse 1.3
Message-Id: <tbak24tgbul0rrs37gup62r7aeo40v6mq0@4ax.com>

RXParse 1.3 parse/edit/filter module
Webmasters please adhere to the copyright notice at the top of the RXParse.pl header.

Version 1.3 changes:

- Added methods:
    line(),
    column(),
    original_string(),
    setMatchThrottle(),
    Errors(),
    RXProcessor()

- Completely new code for line/column and markup string and a performance throttling
- Testing hooks for writing xml for modifications on XP2,3,4,5 engines
- Staged for XP2,3 engines


Notes:

* Looking for testers and feedback, email me.
* Looking for donations to keep this effort going, email me.
* Please note that there is a sample usage 'pl' file at the bottom. For now, just use this as a guide. Documentation will arrive soon.

Thanks
robic0(at)adelphia.net


- RXParse.pm 

#################################################################
# AUTHOR: robic0, copyright (c) 2006-2008
# Reproduction or publication of contents,
# or distribution in a comercial product, is
# strictly prohibited without the authors concent.
# robic0(at)adelphia.net
#################################################################
# XML/Xhtml - RXParse parse/edit/filter module
my $VERSION = 1.3;
# ------------------------------------------------------
# Compliant with w3c XML: 1.1 
# Resources:
#   Extensible Markup Language (XML) 1.1
#   W3C Recommendation 04 February 2004,
#   15 April 2004
#   http://www.w3.org/TR/xml11
#################################################################
$|=1;
package RXParse;
use strict;
use warnings;
use Carp;
use vars qw(@ISA);
@ISA = qw();


#==========================
# RXParse package globals
#==========================
my (
  %Dflth,
  %ErrMsg,
  $Nstrt,$Nchar,$Name,
  @UC_Nstart,@UC_Nchar,
  $RxParseXP1,
  $RxAttr,
  $RxAttr_DL1,
  $RxAttr_DL2,
  $RxAttr_RM,
  $RxPi,
  $RxENTITY,
  %dflt_general_ent_subst,
  %dflt_parameter_ent_subst
);
my $parsinitflg = 0;

if (!$parsinitflg) {
	InitParser();
	$parsinitflg = 1;
}

#========================
# RXParse user methods
#========================
sub new
{
	my ($class, @args) = @_;
	my $self  = {
		'debug'          => 0,
		'ignore_errors'  => 0,
		'parse_errors'   => 0,
		'proctype'       => 'NONE',
		'line'           => 0,
		'col'            => 0,
		'match_throttle' => 400,
	};
	Cleanup($self);
	setDfltHandlers($self);
	return bless ($self, $class);
}

sub original_content
{
	my $self = shift;
	if (defined $self->{'refcontent'} &&
		ref($self->{'refcontent'}) eq 'SCALAR') {
		return ${$self->{'refcontent'}};
	}
	return "";
}

sub original_string
{
	my $self = shift;
	if (defined $self->{'refparsln'} &&
		ref($self->{'refparsln'}) eq 'SCALAR')
	{
		my $count = $self->{'EVT_end_pos'} - $self->{'EVT_begin_pos'};
		return '' if (!($count > 0));

		return substr(${$self->{'refparsln'}}, $self->{'EVT_begin_pos'}, $self->{'EVT_end_pos'}-$self->{'EVT_begin_pos'});
		
		# (alternate, this is too slow but shouldn't be)
		#else {
		#	my $savpos = pos(${$self->{'refparsln'}});
		#	pos(${$self->{'refparsln'}}) = $self->{'EVT_begin_pos'};
		#
		#	while (${$self->{'refparsln'}} =~ /(.{1,$count})/gs) {
		#		pos(${$self->{'refparsln'}}) = $savpos;
		#		return $1;
		#	}
		#}
	}
	return '';
}

sub setMatchThrottle
{
	# This is the number of non-error matches before updating the line/column position.
	# 400 is the default, 0 disables. This value should be algorithmically adjusted based
	# on file or buffer size, or users knowledge.
	# On clean xml this should be set to 0 for the best performance.
	# Automatic updates will be made on errors, or calls to get line/column, which resets the counter.
	# --------------------------------------------------------------------
	my ($self, $throttle) = @_;
	return if ($throttle = undef);
	$self->{'match_throttle'} = $throttle;
}

sub line
{
	my $self = shift;
	updateLineColumn($self);
	return $self->{'line'};
}

sub column
{
	my $self = shift;
	updateLineColumn($self);
	return $self->{'col'};
}


sub setMode
{
	my ($self, @args) = @_;

	if (scalar(@args)) {
	    while (my ($name, $val) = splice (@args, 0, 2)) {
		$name =~ s/^\s+//s; $name =~ s/\s+$//s;

		if (lc($name) eq 'debug') {
			$self->{'debug'} = 0;
			$self->{'debug'} = 1 if (defined $val && $val);
		}
		elsif (lc($name) eq 'ignore_errors') {
			$self->{'ignore_errors'} = 0;
			$self->{'ignore_errors'} = 1 if (defined $val && $val);
		}
		# add more here
	    }
	}
}

sub stopParse
{
	my $self = shift;
	$self->{'kill_parse'} = 1;
}

sub Errors
{
	my $self = shift;
	return $self->{'parse_errors'};
}

sub RXProcessor
{
	my $self = shift;
	return $self->{'proctype'};
}

sub setDfltHandlers
{
	my ($self, $name) = @_;
	if (defined $name) {
		$name =~ s/^\s+//s; $name =~ s/\s+$//s;
		my $hname = "h".lc($name);
		if (exists $Dflth{$hname}) {
			$self->{$hname} = $Dflth{$hname};
		}
	} else {
		foreach my $key (keys %Dflth) {
			$self->{$key} = $Dflth{$key};

		}
	}	
}

sub setHandlers
{
	my ($self, @args) = @_;
	my %oldh = ();
	if (scalar(@args)) {
	    while (my ($name, $val) = splice (@args, 0, 2)) {
		$name =~ s/^\s+//s; $name =~ s/\s+$//s;
		my $hname = "h".lc($name);
		if (exists $self->{$hname}) {
			$oldh{$name} = $self->{$hname};
			if (ref($val) eq 'CODE') {
				$self->{$hname} = $val;
			} else {
				# fatal error if not a CODE ref
				throwX($self, 'FATAL', '32', $name);
			}
		}
	    }
	}
	return %oldh;
}

sub parse  ## XP1 processor
{
	my ($self, $data, @args) = @_;
	$self->{'parse_errors'} = 0;
	if ($self->{'InParse'}) {
		# fatal error if already in parse
		throwX($self, 'FATAL', '30');
	}
	unless (defined $data) {
		# fatal error if data source not defined
		throwX($self, 'FATAL', '31');
	}
	$self->{'InParse'} = 1;
	$self->{'line'} = 0;
	$self->{'col'} = 0;
	$self->{'LC_crlf_pos'} = 0;
	$self->{'BUFFERED'} = 0;

	$self->{'proctype'} = 'XP1';
	if (ref($data) eq 'SCALAR') {
		# call parse handler
		$self->{'hparsestart'}($self, "SCALAR ref");
		XP1 ($self, 1, $data);
	}
	elsif (ref(\$data) eq 'SCALAR') {
		# call parse handler
		$self->{'hparsestart'}($self, "SCALAR string");
		XP1 ($self, 1, \$data);
	} else {
		if (ref($data) ne 'GLOB' && ref(\$data) ne 'GLOB') {
			# data source not string or filehandle, nor reference to one
			throwX($self, 'FATAL', '33');
		}
		# call new_parse handler
		$self->{'hparsestart'}($self, "GLOB ref or filehandle");
		XP1 ($self, 0, $data);
	}

	Cleanup($self);
	return $self->{'parse_errors'};

}

#==========================
# RXParse non-user methods
#==========================

my $copyxml = 0; # copy xml; set to 1 to copy (testing placeholder for XP2,3,4,5 engines)

sub Cleanup
{
	my $self = shift;
	InitEntities($self);
	$self->{'refcontent'} = undef;
	$self->{'refparsln'} = undef;
	$self->{'InParse'} = 0;
	$self->{'kill_parse'} = 0;
}

sub InitEntities
{
	my $self = shift;
	# initial compiled regexp
	$self->{'Entities'} = "(?:amp)|(?:gt)|(?:lt)|(?:apos)|(?:quot)|(?:#(?:([0-9]+)|(x[0-9a-fA-F]+)))";
	#                                                                  (  4      4|5             5)
	$self->{'RxEntConv'} = qr/(.*?)(&|%)($self->{'Entities'});/s;
	#                         1   12   23                   3
	# initial entity hash
	$self->{'general_ent_subst'} = {%dflt_general_ent_subst}; 
	$self->{'parameter_ent_subst'} = {%dflt_parameter_ent_subst};
	$self->{'ring_ent_subst'} = {};
}

sub updateLineColumn
{
	my $self = shift;

	if (defined $self->{'refparsln'} &&
		ref($self->{'refparsln'}) eq 'SCALAR')
	{
		my $beginpos = 0;
		my $endpos   = $self->{'LC_end_pos'};

		if ($self->{'BUFFERED'}) {
			$beginpos    = $self->{'LC_begin_pos'};
		}
		my $save_pos = pos(${$self->{'refparsln'}});
		pos(${$self->{'refparsln'}}) = $beginpos;

		while (${$self->{'refparsln'}} =~ /\n/g)
		{
			last if (pos(${$self->{'refparsln'}}) > $endpos);
			if ($self->{'BUFFERED'}) {
				$self->{'line'}++;
			}
			$self->{'LC_crlf_pos'} = pos(${$self->{'refparsln'}});
		}
		pos(${$self->{'refparsln'}}) = $save_pos;

		$self->{'col'} = $endpos-$self->{'LC_crlf_pos'};
		$self->{'LC_begin_pos'} = $endpos;
	}
	$self->{'LC_mtcount'} = 0;
}

sub XP1   # XP1 processor, parse only, non-edit
{
	my ($self, $BUFFERED, $rpl_mk) = @_;
	my ($markup_file);
	my $parse_ln = '';
	my $dyna_ln  = '';
	my $ref_parse_ln = \$parse_ln;
	my $ref_dyna_ln  = \$dyna_ln;
	if ($BUFFERED) {
		$ref_parse_ln = $rpl_mk;
		$ref_dyna_ln  = \$dyna_ln;
	} else {
		# assume its a ref to a global or global itself
		$markup_file = $rpl_mk;
		$ref_dyna_ln = $ref_parse_ln;
	#	seek ($markup_file, 0, 0);
	}
	my $complete_comment = 0;
	my $complete_cdata   = 0;
	my @Tags = ();
	my $havroot = 0;
	my $last_content_pos = 0;
	my $data_slug = 0;
	my $content = '';
	my $altcontent = undef;

	my $slug_pos = 0;
	my $begin_pos = 0;

	$self->{'refcontent'} = \$content;
	$self->{'BUFFERED'} = $BUFFERED;
	$self->{'refparsln'} = $ref_parse_ln;
	$self->{'EVT_begin_pos'} = 0;
	$self->{'EVT_end_pos'}   = 0;

	$self->{'LC_crlf_pos'} = 0;
	$self->{'LC_begin_pos'} = 0;
	$self->{'LC_end_pos'}   = 0;
	$self->{'LC_mtcount'} = 0;

	if ($copyxml) {
		## test write a copy
		open (MYDATA, '>', 'ittcopy.txt') or die "ittcopy.txt";
	}

	while (!$data_slug)
	{
		# stream processing (if not buffered)
		if (!$BUFFERED) {
			if (!($_ = <$markup_file>)) {
				# just parse what we have
				$data_slug = 1;
				  # boundry check for runnaway
				  #if (($complete_comment+$complete_cdata) > 0) {}
			} else {
				$self->{'line'}++;
				$$ref_parse_ln .= $_;

				## buffer if needing comment/cdata closure
				next if ($complete_comment && !/-->/);
				next if ($complete_cdata   && !/\]\]>/);

				## reset comment/cdata flags
				$complete_comment = 0;
				$complete_cdata   = 0;

				## flag serialized comments/cdata buffering
				if (/(<!--)|(<!\[CDATA\[)/)
				{
					if (defined $1) { # complete comment
						if ($$ref_parse_ln !~ /<!--.*?-->/s) {
							$complete_comment = 1;
							next;
						}
					}
					elsif (defined $2) { # complete cdata
						if ($$ref_parse_ln !~ /<!\[CDATA\[.*?\]\]>/s) {
							$complete_cdata = 1;
							next;
						}
					}
				}
				## buffer until '>' or eof
				next if (!/>/);
			}
		} else {
			$self->{'line'} = 1;
			$data_slug = 1;
		}

		$slug_pos = 0;
		$begin_pos = 0;
		$self->{'LC_mtcount'} = 0;

		## REGEX Parsing loop
		while (!$self->{'kill_parse'} && $$ref_parse_ln =~ /$RxParseXP1/g)
		{
			## handle contents
			if (defined $15) {
				$content .= $15;
				$last_content_pos = pos($$ref_parse_ln);
				next;
			}
			## valid content here ... can be taken off
			print "-"x20,"\n" if ($self->{'debug'});
			if (length ($content)) {

				# update line/column after match_throttle (content)
				# --------------------------------------------------
				$self->{'LC_end_pos'} = $last_content_pos;
				if ($self->{'match_throttle'} > 0 && $self->{'LC_mtcount'}++ > $self->{'match_throttle'}) {
					updateLineColumn($self);
				}
				# save the positions that triggered the event
				# -------------------------------------------
				$self->{'EVT_begin_pos'} = $begin_pos;
				$self->{'EVT_end_pos'}   = $last_content_pos;
				  #** print original_string($self)."\n";

				$begin_pos = $last_content_pos;



				## check reserved characters in content
				if ($content =~ /[<>]/) {
					#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
					## mark-up characters in content
					throwX($self, 'OVR', '01', $content);
				}
				if (!scalar(@Tags)) {
					#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
					if ($content =~ /[^\s]/s) {
						## content at root level
						throwX($self, 'OVR', '02', $content);
					}
				}
				# substitute special xml characters, then call content handler with $content
				# ------------------------------------------------------
				# $content has to be a constant if xml reserved chars
				# are found, copy altered string to pass to handler
				# otherwise pass original $content
				# ------------------------------------------------------
				if (defined ($altcontent = convertEntities ($self, \$content))) {
					$self->{'hchar'}($self, $$altcontent);
				} else {
					$self->{'hchar'}($self, $content);
				}
				#print "15    $content\n" if ($self->{'debug'});
				#print "-"x20,"\n" if ($self->{'debug'});
				$content = '';
			}

			# update line/column after match_throttle (markup)
			# -------------------------------------------------
			$self->{'LC_end_pos'}   = pos($$ref_parse_ln);
			if ($self->{'match_throttle'} > 0 && $self->{'LC_mtcount'}++ > $self->{'match_throttle'}) {
				updateLineColumn($self);
			}
			# save the positions that triggered the event
			# -------------------------------------------
			$self->{'EVT_begin_pos'} = $begin_pos;
			$self->{'EVT_end_pos'}   = pos($$ref_parse_ln);
			  #** print original_string($self)."\n";

			$begin_pos = pos($$ref_parse_ln);


			#if ($show_pos && $debug) {
			#	my $rr = pos($$ref_parse_ln);
			#	print "$rr  ";
			#}

			## <tag> or </tag> or <tag/>
			if (defined $2)
			{
				my ($l1,$l3) = (length($1),length($3));
				if (($l1+$l3)==0) { ## <tag>
					if (!scalar(@Tags) && $havroot) {
						## new root node <tag>
						throwX($self, 'OVR', '03', $2);
					}
					push @Tags,$2;
					$havroot = 1;
					# call start tag handler with $2
					$self->{'hstart'}($self, $2);
				}
				elsif ($l1==1 && $l3==0) { ## </tag>
					my $pval = pop @Tags;
					if (!defined $pval) {
						## missing start tag </tag>
						throwX($self, 'OVR', '04', $2);
					}
					elsif ($2 ne $pval) {
						## expected closing tag </tag>
						throwX($self, 'OVR', '05', $pval);
					}
					# call end tag handler with $2
					$self->{'hend'}($self, $2);
				}
				elsif ($l1==0 && $l3==1) { ## <tag/>
					if (!scalar(@Tags) && $havroot) {
						## new root node <tag/>
						throwX($self, 'OVR', '06', $2);
					}
					$havroot = 1; # first and only <root/>
					# call start tag handler, then end tag handler, with $2
					$self->{'hstart'}($self, $2);
					$self->{'hend'}($self, $2);
				} else {
					## <//node//> errors
					## hard error, just report
					throwX($self, 'HARD', '07', "$1$2$3");
				}
				#print "2  TAG:  $1$2$3\n" if ($self->{'debug'});
			}
			## <tag attrib/> or <tag attrib>
			elsif (defined $5)
			{
				my $l7 = length($7);

				## attributes
				my $attref  = getAttrARRAY($self, $6);

				if (ref($attref)) {
					if ($l7==0) { ## <tag attrib>
						if (!scalar(@Tags) && $havroot) {
							## new root node
							throwX($self, 'OVR', '03', $5);
						}
						push @Tags,$5;
						$havroot = 1;
						# call start tag handler with $5 and attributes @{$attref}
						$self->{'hstart'}($self, $5, @{$attref});
					}
					elsif ($l7==1) {  ## <tag attrib/>
						if (!scalar(@Tags) && $havroot) {
							## new root node
							throwX($self, 'OVR', '06', $7);
						}
						$havroot = 1; # first and only <root attrib/>
						# call start tag handler with $5 and attributes @{$attref}, then end tag handler with $5
						$self->{'hstart'}($self, $5, @{$attref});
						$self->{'hend'}($self, $5);
					} else {
						## syntax error
						## hard error, just report
						throwX($self, 'HARD', '07', "$5$6$7");
					}
				} else {
					## missing or extra token
					## hard error, just report
					throwX($self, 'HARD', '08', $attref);
				}
				#if ($self->{'debug'}) {
				#	print "5,6  TAG:  $5  Attr:  $6$7\n" ;
				#}
			}
			## XMLDECL or PI (processing instruction)
			elsif (defined $8)
			{
				my $pi = $8;

				# XML declaration ?
				if ($pi =~ /^xml(.*?)$/) {
					my $attref  = getAttrARRAY($self, $1);

					if (ref($attref)) {
						#if (!scalar(@{$attref})) {
						#	## missing xmldecl parameters
						#	throwX($self, 'OVR', '15', $pi);
						#}
						my ($version,$encoding,$standalone);
						while (my ($name,$val) = splice (@{$attref}, 0, 2)) {
							if ('version' eq lc($name) && !defined $version) {
								if ($val !~ /^[0-9]\.[0-9]+$/) {
									## invalid version character data in xmldecl
									throwX($self, 'OVR', '16', "$name = '$val'");
								}
								$version = $val;
							} elsif ('encoding' eq lc($name) && !defined $encoding) {
								if ($val !~ /^[A-Za-z][\w\.-]*$/) {
									## invalid encoding character data in xmldecl
									throwX($self, 'OVR', '17', "$name = '$val'");
								}
								$encoding = $val;
							} elsif ('standalone' eq lc($name) && !defined $standalone) {
								if ($val !~ /^(?:yes)|(?:no)$/) {
									## invalid standalone character data in xmldecl
									throwX($self, 'OVR', '18', "$name = '$val'");
								}
								$standalone = ($val eq 'yes' ? 1 : 0);
							} else {
								## unknown xmldecl parameter
								throwX($self, 'OVR', '19', "$name = '$val'");
							}
						}
						if (!defined $version) {
							# missing version in xmldecl
							## hard error, just report
							throwX($self, 'HARD', '20', $pi);
						} else {
							# call xmldecl handler
							$self->{'hxmldecl'}($self, $version, $encoding, $standalone);
						}

					} else {
						## missing or extra token in xmldecl
						## hard error, just report
						throwX($self, 'HARD', '14', $attref);
					}
				}

				# PI - processing instruction
				elsif ($pi =~ /$RxPi/) {
					# call pi handler
					$self->{'hproc'}($self, $1, $2);
				} else {
					# unknown PI data
					throwX($self, 'HARD', '21', $pi);
				}
				#print "8  VERSION:  $8\n" if ($self->{'debug'});
			}

			## META - html (not used)
			#elsif (defined $4) {
			#	# If doctype is HTML then META is not closed
			#	# parse meta data, call handler
			#	$self->{'hmeta'}($self, $4);
			#	#print "4  META:  $4\n" if ($self->{'debug'});
			#}

			## DOCTYPE
			elsif (defined $9) {
				# parse doctype, call handler
				$self->{'hdoctype'}($self, $9);
				#print "9  DOCTYPE:  $9\n" if ($self->{'debug'});
			}
			## CDATA
			elsif (defined $10) {
				if (!scalar(@Tags)) {
					## CDATA content at root
					throwX($self, 'OVR', '09', $10);
				}
				# call cdata handler
				$self->{'hcdata'}($self, $10);
				#print "10  CDATA:  $10\n" if ($self->{'debug'});
			}
			## COMMENT
			elsif (defined $11) {
				# call comment handler
				$self->{'hcomment'}($self, $11);
				#print "11  COMMENT:  $11\n" if ($self->{'debug'});
			}
			## ENTITY
			elsif (defined $13) {
				# parse entity, call handler
				my ($entdata, $entdata_added, $entname) = ($13, undef, '');
				if ($entdata =~ /$RxENTITY/) {
					if (defined $1) {
						# general entity replacement
						$entdata_added = addEntity($self, 0, $1, $3);
						$entname = "&$1";
					} else {
						# parameter entity replacement
						$entdata_added = addEntity($self, 1, $2, $3);
						$entname = "&$2";
					}
				}
				else {
					# unknown ENTITY data
					#
				}
				if (defined $entdata_added) {
					$self->{'hentity'}($self, $entname, $$entdata_added);
				} else {
					$self->{'hentity'}($self, $entname, $entdata);
				}
				#print "13  ENTITY:  $13\n" if ($self->{'debug'});
			}
			## ATTLIST - noninternal
			elsif (defined $12) {
				# parse attlist, call handler
				$self->{'hattlist'}($self, $12);
				#print "12  ATTLIST:  $12\n" if ($self->{'debug'});
			}
			## ELMENT - noninternal
			elsif (defined $14) {
				# parse element, call handler
				$self->{'helement'}($self, $14);
				#print "14  ELEMENT:  $14\n" if ($self->{'debug'});
			}

			#######################################################################################
			# Slug Position - for writing (testing placeholder for processor engines xp2,3,4,5)
			if ($copyxml) {
				print MYDATA substr($$ref_parse_ln, $slug_pos, pos($$ref_parse_ln)-$slug_pos);
			}
			$slug_pos  = pos($$ref_parse_ln);
			#######################################################################################

		}
		$$ref_dyna_ln = $content;
		$content = '';
	}

	if ($copyxml) { # testing placeholder
		close MYDATA;
	}

	if ($self->{'kill_parse'}) {
		$self->{'refparsln'} = undef;
		$self->{'refcontent'} = undef;
		return 1;
	}
	if (!$havroot) {
		# not valid xml
		throwX($self, 'OVR', '10');
	}
	if (scalar(@Tags)) {
		my $str = '';
		while (defined (my $etag = pop @Tags)) {
			$str .= ", /$etag";
		}
		$str =~ s/^, +//;
		# missing end tag
		throwX($self, 'OVR', '11', $str);
	}
	if ($$ref_dyna_ln =~ /[^\s]/s) {
		$self->{'refparsln'} = $ref_dyna_ln;
		$self->{'EVT_begin_pos'} = 0;
		$self->{'EVT_end_pos'}   = length($$ref_dyna_ln);
		if ($$ref_dyna_ln =~ /[<>]/) {
			# mark-up characters in content
			throwX($self, 'OVR', '12', $$ref_dyna_ln);
		} else {
			# content at root level (end)
			throwX($self, 'OVR', '13', $$ref_dyna_ln);
		}
	}
	$self->{'refcontent'} = undef;
	$self->{'refparsln'} = undef;
	return 1;
}

sub getAttrARRAY
{
	my ($self, $attrstr) = @_;
	my $aref = [];
	my ($alt_attval, $attval, $rx);
	
	while ($attrstr =~ s/$RxAttr//) {
		push @{$aref},$1;
		if ($2 eq "'") {
			$rx = \$RxAttr_DL1;
		} else {
			$rx = \$RxAttr_DL2;
		}
		if ($attrstr =~ s/$$rx//) {
			if (defined $1) {
				push @{$aref},$1;
				next;
			}
			$attval = $2;
			if (defined ($alt_attval = convertEntities ($self, \$attval))) {
				push @{$aref},$$alt_attval;
				next;
			}
			push @{$aref},$attval;
			next;
		}
		# bad, return that part of string which is in error
		return $attrstr;
	}
	if ($attrstr=~/$RxAttr_RM/) {
		$attrstr =~ s/^\s+//s; $attrstr =~ s/\s+$//s;
		# bad, return that part of string which is in error
		return $attrstr if (length($attrstr));
	}
	# normal, return a ref .. all was good
	return $aref;
}

sub convertEntities
{
	my ($self, $str_ref, $opts) = @_;
	my $alt_str = '';
	my $res = 0;
	my ($entchr);

	# Usage info:
	# Option bitmask: 1=char reference, 2=general reference, 4=parameter reference
	# Default option is char and general references (&)
	# Ignore Parameter references (%) in Attvalue and Content
	# Process PE's in DTD and Entity decls

	$opts = 3 unless defined $opts;

	while ($$str_ref =~ /$self->{'RxEntConv'}/gc)
	{
		# Unicode character reference
		if (defined $4)	{
			# decimal
			if (($opts & 1) && defined ($entchr = getEntityUchar($self, $4))) {
				$alt_str .= "$1$entchr";
				$res = 1;
			} else {
				$alt_str .= "$1$2#$4;";
			}
		} elsif (defined $5) {
			# hex
			if (($opts & 1) && length($5) < 9 && defined ($entchr = getEntityUchar($self, hex($5)))) {
				$alt_str .= "$1$entchr";
				$res = 1;
			} else {
				$alt_str .= "$1$2#$5;";
			}
		}
		else {
			# General reference
			if ($2 eq '&') {
				if (($opts & 2) && exists $self->{'general_ent_subst'}->{$3}) {
					$alt_str .= $1;

					# expand general references,
					# bypass if seen in the recursion ring
					# ----
					if (defined $self->{'ring_ent_subst'}->{$3}) {
						$alt_str .= "$1$2$3;";
					} else {
						# recurse expansion
						# ----
						my ($entname, $alt_entval) = ($3, undef);
						my $entval = $self->{'general_ent_subst'}->{$entname};
						$self->{'ring_ent_subst'}->{$entname} = 1;
	
						if (defined ($alt_entval = convertEntities ($self, \$entval, 2))) {
							$alt_str .= $$alt_entval;
						} else {
							$alt_str .= $self->{'general_ent_subst'}->{$entname};
						}
						$self->{'ring_ent_subst'}->{$entname} = undef;
						$res = 1;
					}
				} else {
					$alt_str .= "$1$2$3;";
				}
			} else {
			# Parameter reference
				if (($opts & 4) && exists $self->{'parameter_ent_subst'}->{$3}) {
					$alt_str .= "$1$self->{'parameter_ent_subst'}->{$3}";
					$res = 1;
				} else {
					$alt_str .= "$1$2$3;";
				}
			}
		}
	}
	if ($res) {
		$alt_str .= substr $$str_ref, pos($$str_ref);
		return \$alt_str;
	}
	return undef;
}

sub getEntityUchar
{
	my ($self, $code) = @_;
	if (($code >= 0x01 && $code <= 0xD7FF) ||
	    ($code >= 0xE000 && $code <= 0xFFFD) ||
	    ($code >= 0x10000 && $code <= 0x10FFFF)) {
		return chr($code);
	}
	return undef;
}

sub addEntity
{
	my ($self, $peflag, $entname, $entval) = @_;

	# Non-normalized, internal entities only
	# (no external defs yet, ie:SYSTEM/PUBLIC/NDATA)
	return undef unless
	($entval =~ s/^\s*'([^']*?)'\s*$/$1/s || $entval =~ s/^\s*"([^"]*?)"\s*$/$1/s);

	# Replacement text: convert parameter and character references only
	my ($alt_entval);
	if (defined ($alt_entval = convertEntities ($self, \$entval, 5))) {
		$entval = $$alt_entval;
	}
	my $enttype = 'general_ent_subst';
	$enttype = 'parameter_ent_subst' if ($peflag);

	if (exists $self->{'$enttype'}->{$entname}) {
		# warn, pre-existing ent name
		return undef;
	}
	$self->{$enttype}->{$entname} = $entval;
	$self->{'Entities'} .= "|(?:$entname)";
	# recompile regexp
	$self->{'RxEntConv'} = qr/(.*?)(&|%)($self->{'Entities'});/s;
	return \$entval;
}


# default handlers
# ------------------

sub dflt_parsestart {my ($self, $parm)  = @_;print "RXParse $VERSION\nnew parse _: $parm\n" if ($self->{'debug'});}

#sub dflt_start {
#	my ($self, $el, @attr) = @_;
#	if ($self->{'debug'}) {
#		print "start _:  $el\n"; 
#		while (my ($name,$val) = splice (@attr, 0,2)) {
#			print " "x12,"$name = $val\n";
#		}
#	}
#}
# or 
sub dflt_start {
	my ($self, $el, %atts) = @_;
	if ($self->{'debug'}) {
		print "start _:  $el\n";
		foreach my $name (keys %atts) {
			print " "x12,"$name = $atts{$name}\n";
		}
	}
}
sub dflt_char {
	my ($self, $str) = @_;
	if ($self->{'debug'}) {
		print "char  _:  $str\n";
		print "-"x20,"\n";
	}
}
sub dflt_end     {my ($self, $el)  = @_;print "end   _: /$el\n" if ($self->{'debug'});}
sub dflt_cdata   {my ($self, $str) = @_;print "cdata _:  $str\n" if ($self->{'debug'});}
sub dflt_comment {my ($self, $str) = @_;print "comnt _:  $str\n" if ($self->{'debug'});}
sub dflt_meta    {my ($self, $str) = @_;print "meta  _:  $str\n" if ($self->{'debug'});}
sub dflt_attlist {my ($self, $parm) = @_;print "attlist_h _:  $parm\n" if ($self->{'debug'});}
sub dflt_doctype {my ($self, $parm) = @_;print "doctype_h _:  $parm\n" if ($self->{'debug'});}
sub dflt_element {my ($self, $parm) = @_;print "element_h _:  $parm\n" if ($self->{'debug'});}

sub dflt_entity  {
	my ($self, $entname, $entval) = @_;
	if ($self->{'debug'}) {
		print "entity_h  _:  $entname = $entval\n";
	}
}

sub dflt_xmldecl {
	my ($self, $version, $encoding, $standalone) = @_;

	if ($self->{'debug'}) {
		print "xmldecl_h _:  version = $version\n" if (defined $encoding);
		print " "x14,"encoding = $encoding\n" if (defined $encoding);
		print " "x14,"standalone = $standalone\n" if (defined $standalone);
	}
}
sub dflt_proc {
	my ($self, $target, $data) = @_;

	if ($self->{'debug'}) {
		print "proc_h    _:  target = $target\n";
		print " "x14,"data = $data\n";
	}
}
sub dflt_error {my ($self, $errlvl, $errno, $estr, $estr_basic) = @_;print "$estr\n" if ($self->{'debug'});}


# ======================
# RXParse global init
# ======================
sub InitParser
{
  %Dflth = (
   'hparsestart' => \&dflt_parsestart,
   'hstart' => \&dflt_start,
   'hend'   => \&dflt_end,
   'hchar'  => \&dflt_char,
   'hcdata' => \&dflt_cdata,
   'hcomment' => \&dflt_comment,
   'hmeta'    => \&dflt_meta,
   'hattlist' => \&dflt_attlist,
   'hentity'  => \&dflt_entity,
   'hdoctype' => \&dflt_doctype,
   'helement' => \&dflt_element,
   'hxmldecl' => \&dflt_xmldecl,
   'hproc'    => \&dflt_proc,
   'herror' => \&dflt_error,
  );
  @UC_Nstart = (
    "\\x{C0}-\\x{D6}",
    "\\x{D8}-\\x{F6}",
    "\\x{F8}-\\x{2FF}",
    "\\x{370}-\\x{37D}",
    "\\x{37F}-\\x{1FFF}",
    "\\x{200C}-\\x{200D}",
    "\\x{2070}-\\x{218F}",
    "\\x{2C00}-\\x{2FEF}",
    "\\x{3001}-\\x{D7FF}",
    "\\x{F900}-\\x{FDCF}",
    "\\x{FDF0}-\\x{FFFD}",
    "\\x{10000}-\\x{EFFFF}",
  ); 
  @UC_Nchar = (
    "\\x{B7}",
    "\\x{0300}-\\x{036F}",
    "\\x{203F}-\\x{2040}",
  );
  $Nstrt = "[A-Za-z_:".join ('',@UC_Nstart)."]";
  $Nchar = "[-\\w:\\.".join ('',@UC_Nchar).join ('',@UC_Nstart)."]";
  $Name  = "(?:$Nstrt$Nchar*?)";
  #die "$Name\n";

  $RxParseXP1 =
qr/(?:<(?:(?:(\/*)($Name)\s*(\/*))|(?:META(.*?))|(?:($Name)((?:\s+$Name\s*=\s*["'][^<]*['"])+)\s*(\/*))|(?:\?(.*?)\?)|(?:!(?:(?:DOCTYPE(.*?))|(?:\[CDATA\[(.*?)\]\])|(?:--(.*?[^-])--)|(?:ATTLIST(.*?))|(?:ENTITY(.*?))|(?:ELEMENT(.*?)))))>)|(.+?)/s;
  #                (  <(  (  1   12     2   3   3)|(     *4  *4)|(  5     56(                              ) 6   7   7)|(    8   8  )|(  !(  (         9   9)|(           0   0    )|(    1       1  )|(
2   2)|(        3   3)|(         4   4))))>)|5   5

  $RxAttr = qr/^\s+($Name)\s*=\s*("|')/;

  $RxAttr_DL1 = qr/^(?:([^'&]*?)|([^']*?))'/;
  $RxAttr_DL2 = qr/^(?:([^"&]*?)|([^"]*?))"/;
  $RxAttr_RM = qr/[^\s\n]+/;
  $RxPi = qr/^($Name)\s+(.*?)$/s;

  # Keyword processing, only for standalone external DTD
  #[52]    AttlistDecl    ::=    '<!ATTLIST' S Name AttDef* S? '>' 
  #[53]    AttDef    ::=    S Name S AttType S DefaultDecl
  #[45]    elementdecl    ::=    '<!ELEMENT' S Name S contentspec S? '>' [VC: Unique Element Type Declaration] 
  #[46]    contentspec    ::=    'EMPTY' | 'ANY' | Mixed | children  
 

  $RxENTITY = qr/^\s+(?:($Name)|(?:%\s+($Name)))\s+(.*?)$/s;
  #                  (  1     1|(      2     2))   3   3
  %dflt_general_ent_subst = (
    'amp' =>'&',
    'gt'  =>'>',
    'lt'  =>'<',
    'apos'=>"'",
    'quot'=>"\""
  );
  %dflt_parameter_ent_subst = ();

  %ErrMsg = (
    '01' => "\"mark-up or reserved characters in content (line %s, col %s), malformed element? '%s'\", \$line, \$col, \$datastr",
    '02' => "\"content at root level (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
    '03' => "\"element wants to be new root node (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
    '04' => "\"missing start tag for '/%s' (line %s, col %s)\", \$datastr, \$line, \$col",
    '05' => "\"expected closing tag '/%s' (line %s, col %s)\",  \$datastr, \$line, \$col",
    '06' => "\"element wants to be new root node (line %s, col %s): '%s/'\", \$line, \$col, \$datastr",
    '07' => "\"tag syntax '%s' (line %s, col %s)\",  \$datastr, \$line, \$col",
    '08' => "\"invalid, missing or extra tokens in attribute asignment (line %s, col %s): %s\", \$line, \$col, \$datastr",
    '09' => "\"CDATA content at root level (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
    '10' => "\"not a valid xml document\"",
    '11' => "\"missing end tag '%s'\", \$datastr",
    '12' => "\"mark-up or reserved characters in content (end), malformed element? '%s'\", \$datastr",
    '13' => "\"content at root level (end): '%s'\", \$datastr",
    '14' => "\"invalid, missing or extra tokens in xmldecl asignment (line %s, col %s): %s\", \$line, \$col, \$datastr",
    '15' => "\"missing xmldecl parameters (line %s, col %s): %s\", \$line, \$col, \$datastr",
    '16' => "\"invalid 'version' character data in xmldecl (line %s, col %s): %s\", \$line, \$col, \$datastr",
    '17' => "\"invalid 'encoding' character data in xmldecl (line %s, col %s): %s\", \$line, \$col, \$datastr",
    '18' => "\"invalid 'standalone' character data in xmldecl (line %s, col %s): %s\", \$line, \$col, \$datastr",
    '19' => "\"unknown xmldecl parameter (line %s, col %s): %s\", \$line, \$col, \$datastr",
    '20' => "\"missing xmldecl 'version' parameter (line %s, col %s): %s\", \$line, \$col, \$datastr",
    '21' => "\"unknown or missing processing instruction parameters  (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
    '30' => "\"already in parse\"",
    '31' => "\"data source not defined\"",
    '32' => "\"handler '%s' is not a CODE reference\", \$datastr",
    '33' => "\"data source not string or filehandle, nor reference to one\"",
  );
}

sub throwX
{
	my ($self, $errlvl, $errno, $datastr) = @_;
	my ($estr, $estr_basic) = ('','');

	updateLineColumn($self);
	my ($line, $col) =  ($self->{'line'}, $self->{'col'});

	die "No such error message ($errno)\n" if (!exists $ErrMsg{$errno});

	my $ctmpl = "\$estr_basic = sprintf ($ErrMsg{$errno});";
	eval $ctmpl;
	$estr = "rp_error_$errno, $estr_basic";
	$self->{'parse_errors'}++;

	# call error handler
	$self->{'herror'}($self, $errlvl, $errno, $estr, $estr_basic);

	if ($errlvl eq 'FATAL') {
		Cleanup($self);
		if ($copyxml) { # testing placeholder
			close MYDATA;
		}
		croak $estr."\n";
	}
	elsif (!$self->{'ignore_errors'} && ($errlvl eq 'HARD' || $errlvl eq 'OVR')) {
		Cleanup($self);
		if ($copyxml) { # testing placeholder
			close MYDATA;
		}
		croak $estr."\n";
	}
}



1;

__END__



- Rptest.pl

#######################
# Useage examples
#######################

use strict;
use warnings;

use RXParse;

use Benchmark ':hireswallclock';

my $t0 = new Benchmark;

my $p = new RXParse();


# example user handler
if (0) {
	$p->setHandlers('error' => \&eh);
	sub eh
	{
		my ($p, $errlvl, $errno, $estr, $estr_basic) = @_;
		print "$errlvl, $errno, $estr\n";
#		print STDERR "$errlvl, $errno, $estr\n";
#		print STDERR "$estr\n";
	}
}

#my $fname = "DotNetConfig.xsd";
#my $fname = "scriptCS.snippet";
#my $fname = "SampleInfo.XML";

my $fname = "test.txt";

my $parse_errors = 0;

# BUFFERED
if (1) {
	open DATA, $fname or die "can't open $fname...";
	my $parse_ln = "";
#	$p->setMode( 'debug'=> 1);
#	$p->setMode( 'ignore_errors'=> 1 );
	$p->setMode( 'debug'=> 1, 'ignore_errors'=> 1 );
	$parse_ln = join ('', <DATA>);
	$parse_errors = $p->parse(\$parse_ln);
	#$parse_errors = $p->parse($parse_ln);
	close DATA;
}
# STREAM
else {
	open DATA, $fname or die "can't open $fname...";
#	$p->setMode( 'debug'=> 1);
#	$p->setMode( 'ignore_errors'=> 1 );
	$p->setMode( 'debug'=> 1, 'ignore_errors'=> 1 );
	$parse_errors = $p->parse(*DATA);
	close DATA;
	#open my $fref, "config.html" or die "can't open config.html...";
	#$parse_errors = $p->parse($fref, 1);
	#close $fref;
}

print STDERR "Parse errors = $parse_errors\n";

my $t1 = new Benchmark;
my $td = timediff($t1, $t0);
print STDERR "the code took:",timestr($td),"\n";
exit;


__END__

use strict;
use warnings;

use RXParse;


my $parse_ln = '
<html>
  <head>
    <title>$100.00 Dollars</title>
  </head>
  <body>
    <img src="foo.img" alt="$100.00 USD">
    <div>size: 50 x 50</div>
    <div>discount: $75.25</div>
  </body>
</html>
';



my $p = RXParse->new();
#my $p = new RXParse();
$p->setDebugMode(1);
$p->parse(\$parse_ln);






------------------------------

Date: 6 Apr 2001 21:33:47 GMT (Last modified)
From: Perl-Users-Request@ruby.oce.orst.edu (Perl-Users-Digest Admin) 
Subject: Digest Administrivia (Last modified: 6 Apr 01)
Message-Id: <null>


Administrivia:

#The Perl-Users Digest is a retransmission of the USENET newsgroup
#comp.lang.perl.misc.  For subscription or unsubscription requests, send
#the single line:
#
#	subscribe perl-users
#or:
#	unsubscribe perl-users
#
#to almanac@ruby.oce.orst.edu.  

NOTE: due to the current flood of worm email banging on ruby, the smtp
server on ruby has been shut off until further notice. 

To submit articles to comp.lang.perl.announce, send your article to
clpa@perl.com.

#To request back copies (available for a week or so), send your request
#to almanac@ruby.oce.orst.edu with the command "send perl-users x.y",
#where x is the volume number and y is the issue number.

#For other requests pertaining to the digest, send mail to
#perl-users-request@ruby.oce.orst.edu. Do not waste your time or mine
#sending perl questions to the -request address, I don't have time to
#answer them even if I did know the answer.


------------------------------
End of Perl-Users Digest V11 Issue 1542
***************************************


home help back first fref pref prev next nref lref last post