[30299] in Perl-Users-Digest
Perl-Users Digest, Issue: 1542 Volume: 11
daemon@ATHENA.MIT.EDU (Perl-Users Digest)
Tue May 13 21:14:18 2008
Date: Tue, 13 May 2008 18:14:10 -0700 (PDT)
From: Perl-Users Digest <Perl-Users-Request@ruby.OCE.ORST.EDU>
To: Perl-Users@ruby.OCE.ORST.EDU (Perl-Users Digest)
Perl-Users Digest Tue, 13 May 2008 Volume: 11 Number: 1542
Today's topics:
RXParse 1.3 sln@netherlands.co
Digest Administrivia (Last modified: 6 Apr 01) (Perl-Users-Digest Admin)
----------------------------------------------------------------------
Date: Tue, 13 May 2008 16:54:36 -0700
From: sln@netherlands.co
Subject: RXParse 1.3
Message-Id: <tbak24tgbul0rrs37gup62r7aeo40v6mq0@4ax.com>
RXParse 1.3 parse/edit/filter module
Webmasters please adhere to the copyright notice at the top of the RXParse.pl header.
Version 1.3 changes:
- Added methods:
line(),
column(),
original_string(),
setMatchThrottle(),
Errors(),
RXProcessor()
- Completely new code for line/column and markup string and a performance throttling
- Testing hooks for writing xml for modifications on XP2,3,4,5 engines
- Staged for XP2,3 engines
Notes:
* Looking for testers and feedback, email me.
* Looking for donations to keep this effort going, email me.
* Please note that there is a sample usage 'pl' file at the bottom. For now, just use this as a guide. Documentation will arrive soon.
Thanks
robic0(at)adelphia.net
- RXParse.pm
#################################################################
# AUTHOR: robic0, copyright (c) 2006-2008
# Reproduction or publication of contents,
# or distribution in a comercial product, is
# strictly prohibited without the authors concent.
# robic0(at)adelphia.net
#################################################################
# XML/Xhtml - RXParse parse/edit/filter module
my $VERSION = 1.3;
# ------------------------------------------------------
# Compliant with w3c XML: 1.1
# Resources:
# Extensible Markup Language (XML) 1.1
# W3C Recommendation 04 February 2004,
# 15 April 2004
# http://www.w3.org/TR/xml11
#################################################################
$|=1;
package RXParse;
use strict;
use warnings;
use Carp;
use vars qw(@ISA);
@ISA = qw();
#==========================
# RXParse package globals
#==========================
my (
%Dflth,
%ErrMsg,
$Nstrt,$Nchar,$Name,
@UC_Nstart,@UC_Nchar,
$RxParseXP1,
$RxAttr,
$RxAttr_DL1,
$RxAttr_DL2,
$RxAttr_RM,
$RxPi,
$RxENTITY,
%dflt_general_ent_subst,
%dflt_parameter_ent_subst
);
my $parsinitflg = 0;
if (!$parsinitflg) {
InitParser();
$parsinitflg = 1;
}
#========================
# RXParse user methods
#========================
sub new
{
my ($class, @args) = @_;
my $self = {
'debug' => 0,
'ignore_errors' => 0,
'parse_errors' => 0,
'proctype' => 'NONE',
'line' => 0,
'col' => 0,
'match_throttle' => 400,
};
Cleanup($self);
setDfltHandlers($self);
return bless ($self, $class);
}
sub original_content
{
my $self = shift;
if (defined $self->{'refcontent'} &&
ref($self->{'refcontent'}) eq 'SCALAR') {
return ${$self->{'refcontent'}};
}
return "";
}
sub original_string
{
my $self = shift;
if (defined $self->{'refparsln'} &&
ref($self->{'refparsln'}) eq 'SCALAR')
{
my $count = $self->{'EVT_end_pos'} - $self->{'EVT_begin_pos'};
return '' if (!($count > 0));
return substr(${$self->{'refparsln'}}, $self->{'EVT_begin_pos'}, $self->{'EVT_end_pos'}-$self->{'EVT_begin_pos'});
# (alternate, this is too slow but shouldn't be)
#else {
# my $savpos = pos(${$self->{'refparsln'}});
# pos(${$self->{'refparsln'}}) = $self->{'EVT_begin_pos'};
#
# while (${$self->{'refparsln'}} =~ /(.{1,$count})/gs) {
# pos(${$self->{'refparsln'}}) = $savpos;
# return $1;
# }
#}
}
return '';
}
sub setMatchThrottle
{
# This is the number of non-error matches before updating the line/column position.
# 400 is the default, 0 disables. This value should be algorithmically adjusted based
# on file or buffer size, or users knowledge.
# On clean xml this should be set to 0 for the best performance.
# Automatic updates will be made on errors, or calls to get line/column, which resets the counter.
# --------------------------------------------------------------------
my ($self, $throttle) = @_;
return if ($throttle = undef);
$self->{'match_throttle'} = $throttle;
}
sub line
{
my $self = shift;
updateLineColumn($self);
return $self->{'line'};
}
sub column
{
my $self = shift;
updateLineColumn($self);
return $self->{'col'};
}
sub setMode
{
my ($self, @args) = @_;
if (scalar(@args)) {
while (my ($name, $val) = splice (@args, 0, 2)) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;
if (lc($name) eq 'debug') {
$self->{'debug'} = 0;
$self->{'debug'} = 1 if (defined $val && $val);
}
elsif (lc($name) eq 'ignore_errors') {
$self->{'ignore_errors'} = 0;
$self->{'ignore_errors'} = 1 if (defined $val && $val);
}
# add more here
}
}
}
sub stopParse
{
my $self = shift;
$self->{'kill_parse'} = 1;
}
sub Errors
{
my $self = shift;
return $self->{'parse_errors'};
}
sub RXProcessor
{
my $self = shift;
return $self->{'proctype'};
}
sub setDfltHandlers
{
my ($self, $name) = @_;
if (defined $name) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;
my $hname = "h".lc($name);
if (exists $Dflth{$hname}) {
$self->{$hname} = $Dflth{$hname};
}
} else {
foreach my $key (keys %Dflth) {
$self->{$key} = $Dflth{$key};
}
}
}
sub setHandlers
{
my ($self, @args) = @_;
my %oldh = ();
if (scalar(@args)) {
while (my ($name, $val) = splice (@args, 0, 2)) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;
my $hname = "h".lc($name);
if (exists $self->{$hname}) {
$oldh{$name} = $self->{$hname};
if (ref($val) eq 'CODE') {
$self->{$hname} = $val;
} else {
# fatal error if not a CODE ref
throwX($self, 'FATAL', '32', $name);
}
}
}
}
return %oldh;
}
sub parse ## XP1 processor
{
my ($self, $data, @args) = @_;
$self->{'parse_errors'} = 0;
if ($self->{'InParse'}) {
# fatal error if already in parse
throwX($self, 'FATAL', '30');
}
unless (defined $data) {
# fatal error if data source not defined
throwX($self, 'FATAL', '31');
}
$self->{'InParse'} = 1;
$self->{'line'} = 0;
$self->{'col'} = 0;
$self->{'LC_crlf_pos'} = 0;
$self->{'BUFFERED'} = 0;
$self->{'proctype'} = 'XP1';
if (ref($data) eq 'SCALAR') {
# call parse handler
$self->{'hparsestart'}($self, "SCALAR ref");
XP1 ($self, 1, $data);
}
elsif (ref(\$data) eq 'SCALAR') {
# call parse handler
$self->{'hparsestart'}($self, "SCALAR string");
XP1 ($self, 1, \$data);
} else {
if (ref($data) ne 'GLOB' && ref(\$data) ne 'GLOB') {
# data source not string or filehandle, nor reference to one
throwX($self, 'FATAL', '33');
}
# call new_parse handler
$self->{'hparsestart'}($self, "GLOB ref or filehandle");
XP1 ($self, 0, $data);
}
Cleanup($self);
return $self->{'parse_errors'};
}
#==========================
# RXParse non-user methods
#==========================
my $copyxml = 0; # copy xml; set to 1 to copy (testing placeholder for XP2,3,4,5 engines)
sub Cleanup
{
my $self = shift;
InitEntities($self);
$self->{'refcontent'} = undef;
$self->{'refparsln'} = undef;
$self->{'InParse'} = 0;
$self->{'kill_parse'} = 0;
}
sub InitEntities
{
my $self = shift;
# initial compiled regexp
$self->{'Entities'} = "(?:amp)|(?:gt)|(?:lt)|(?:apos)|(?:quot)|(?:#(?:([0-9]+)|(x[0-9a-fA-F]+)))";
# ( 4 4|5 5)
$self->{'RxEntConv'} = qr/(.*?)(&|%)($self->{'Entities'});/s;
# 1 12 23 3
# initial entity hash
$self->{'general_ent_subst'} = {%dflt_general_ent_subst};
$self->{'parameter_ent_subst'} = {%dflt_parameter_ent_subst};
$self->{'ring_ent_subst'} = {};
}
sub updateLineColumn
{
my $self = shift;
if (defined $self->{'refparsln'} &&
ref($self->{'refparsln'}) eq 'SCALAR')
{
my $beginpos = 0;
my $endpos = $self->{'LC_end_pos'};
if ($self->{'BUFFERED'}) {
$beginpos = $self->{'LC_begin_pos'};
}
my $save_pos = pos(${$self->{'refparsln'}});
pos(${$self->{'refparsln'}}) = $beginpos;
while (${$self->{'refparsln'}} =~ /\n/g)
{
last if (pos(${$self->{'refparsln'}}) > $endpos);
if ($self->{'BUFFERED'}) {
$self->{'line'}++;
}
$self->{'LC_crlf_pos'} = pos(${$self->{'refparsln'}});
}
pos(${$self->{'refparsln'}}) = $save_pos;
$self->{'col'} = $endpos-$self->{'LC_crlf_pos'};
$self->{'LC_begin_pos'} = $endpos;
}
$self->{'LC_mtcount'} = 0;
}
sub XP1 # XP1 processor, parse only, non-edit
{
my ($self, $BUFFERED, $rpl_mk) = @_;
my ($markup_file);
my $parse_ln = '';
my $dyna_ln = '';
my $ref_parse_ln = \$parse_ln;
my $ref_dyna_ln = \$dyna_ln;
if ($BUFFERED) {
$ref_parse_ln = $rpl_mk;
$ref_dyna_ln = \$dyna_ln;
} else {
# assume its a ref to a global or global itself
$markup_file = $rpl_mk;
$ref_dyna_ln = $ref_parse_ln;
# seek ($markup_file, 0, 0);
}
my $complete_comment = 0;
my $complete_cdata = 0;
my @Tags = ();
my $havroot = 0;
my $last_content_pos = 0;
my $data_slug = 0;
my $content = '';
my $altcontent = undef;
my $slug_pos = 0;
my $begin_pos = 0;
$self->{'refcontent'} = \$content;
$self->{'BUFFERED'} = $BUFFERED;
$self->{'refparsln'} = $ref_parse_ln;
$self->{'EVT_begin_pos'} = 0;
$self->{'EVT_end_pos'} = 0;
$self->{'LC_crlf_pos'} = 0;
$self->{'LC_begin_pos'} = 0;
$self->{'LC_end_pos'} = 0;
$self->{'LC_mtcount'} = 0;
if ($copyxml) {
## test write a copy
open (MYDATA, '>', 'ittcopy.txt') or die "ittcopy.txt";
}
while (!$data_slug)
{
# stream processing (if not buffered)
if (!$BUFFERED) {
if (!($_ = <$markup_file>)) {
# just parse what we have
$data_slug = 1;
# boundry check for runnaway
#if (($complete_comment+$complete_cdata) > 0) {}
} else {
$self->{'line'}++;
$$ref_parse_ln .= $_;
## buffer if needing comment/cdata closure
next if ($complete_comment && !/-->/);
next if ($complete_cdata && !/\]\]>/);
## reset comment/cdata flags
$complete_comment = 0;
$complete_cdata = 0;
## flag serialized comments/cdata buffering
if (/(<!--)|(<!\[CDATA\[)/)
{
if (defined $1) { # complete comment
if ($$ref_parse_ln !~ /<!--.*?-->/s) {
$complete_comment = 1;
next;
}
}
elsif (defined $2) { # complete cdata
if ($$ref_parse_ln !~ /<!\[CDATA\[.*?\]\]>/s) {
$complete_cdata = 1;
next;
}
}
}
## buffer until '>' or eof
next if (!/>/);
}
} else {
$self->{'line'} = 1;
$data_slug = 1;
}
$slug_pos = 0;
$begin_pos = 0;
$self->{'LC_mtcount'} = 0;
## REGEX Parsing loop
while (!$self->{'kill_parse'} && $$ref_parse_ln =~ /$RxParseXP1/g)
{
## handle contents
if (defined $15) {
$content .= $15;
$last_content_pos = pos($$ref_parse_ln);
next;
}
## valid content here ... can be taken off
print "-"x20,"\n" if ($self->{'debug'});
if (length ($content)) {
# update line/column after match_throttle (content)
# --------------------------------------------------
$self->{'LC_end_pos'} = $last_content_pos;
if ($self->{'match_throttle'} > 0 && $self->{'LC_mtcount'}++ > $self->{'match_throttle'}) {
updateLineColumn($self);
}
# save the positions that triggered the event
# -------------------------------------------
$self->{'EVT_begin_pos'} = $begin_pos;
$self->{'EVT_end_pos'} = $last_content_pos;
#** print original_string($self)."\n";
$begin_pos = $last_content_pos;
## check reserved characters in content
if ($content =~ /[<>]/) {
#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
## mark-up characters in content
throwX($self, 'OVR', '01', $content);
}
if (!scalar(@Tags)) {
#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
if ($content =~ /[^\s]/s) {
## content at root level
throwX($self, 'OVR', '02', $content);
}
}
# substitute special xml characters, then call content handler with $content
# ------------------------------------------------------
# $content has to be a constant if xml reserved chars
# are found, copy altered string to pass to handler
# otherwise pass original $content
# ------------------------------------------------------
if (defined ($altcontent = convertEntities ($self, \$content))) {
$self->{'hchar'}($self, $$altcontent);
} else {
$self->{'hchar'}($self, $content);
}
#print "15 $content\n" if ($self->{'debug'});
#print "-"x20,"\n" if ($self->{'debug'});
$content = '';
}
# update line/column after match_throttle (markup)
# -------------------------------------------------
$self->{'LC_end_pos'} = pos($$ref_parse_ln);
if ($self->{'match_throttle'} > 0 && $self->{'LC_mtcount'}++ > $self->{'match_throttle'}) {
updateLineColumn($self);
}
# save the positions that triggered the event
# -------------------------------------------
$self->{'EVT_begin_pos'} = $begin_pos;
$self->{'EVT_end_pos'} = pos($$ref_parse_ln);
#** print original_string($self)."\n";
$begin_pos = pos($$ref_parse_ln);
#if ($show_pos && $debug) {
# my $rr = pos($$ref_parse_ln);
# print "$rr ";
#}
## <tag> or </tag> or <tag/>
if (defined $2)
{
my ($l1,$l3) = (length($1),length($3));
if (($l1+$l3)==0) { ## <tag>
if (!scalar(@Tags) && $havroot) {
## new root node <tag>
throwX($self, 'OVR', '03', $2);
}
push @Tags,$2;
$havroot = 1;
# call start tag handler with $2
$self->{'hstart'}($self, $2);
}
elsif ($l1==1 && $l3==0) { ## </tag>
my $pval = pop @Tags;
if (!defined $pval) {
## missing start tag </tag>
throwX($self, 'OVR', '04', $2);
}
elsif ($2 ne $pval) {
## expected closing tag </tag>
throwX($self, 'OVR', '05', $pval);
}
# call end tag handler with $2
$self->{'hend'}($self, $2);
}
elsif ($l1==0 && $l3==1) { ## <tag/>
if (!scalar(@Tags) && $havroot) {
## new root node <tag/>
throwX($self, 'OVR', '06', $2);
}
$havroot = 1; # first and only <root/>
# call start tag handler, then end tag handler, with $2
$self->{'hstart'}($self, $2);
$self->{'hend'}($self, $2);
} else {
## <//node//> errors
## hard error, just report
throwX($self, 'HARD', '07', "$1$2$3");
}
#print "2 TAG: $1$2$3\n" if ($self->{'debug'});
}
## <tag attrib/> or <tag attrib>
elsif (defined $5)
{
my $l7 = length($7);
## attributes
my $attref = getAttrARRAY($self, $6);
if (ref($attref)) {
if ($l7==0) { ## <tag attrib>
if (!scalar(@Tags) && $havroot) {
## new root node
throwX($self, 'OVR', '03', $5);
}
push @Tags,$5;
$havroot = 1;
# call start tag handler with $5 and attributes @{$attref}
$self->{'hstart'}($self, $5, @{$attref});
}
elsif ($l7==1) { ## <tag attrib/>
if (!scalar(@Tags) && $havroot) {
## new root node
throwX($self, 'OVR', '06', $7);
}
$havroot = 1; # first and only <root attrib/>
# call start tag handler with $5 and attributes @{$attref}, then end tag handler with $5
$self->{'hstart'}($self, $5, @{$attref});
$self->{'hend'}($self, $5);
} else {
## syntax error
## hard error, just report
throwX($self, 'HARD', '07', "$5$6$7");
}
} else {
## missing or extra token
## hard error, just report
throwX($self, 'HARD', '08', $attref);
}
#if ($self->{'debug'}) {
# print "5,6 TAG: $5 Attr: $6$7\n" ;
#}
}
## XMLDECL or PI (processing instruction)
elsif (defined $8)
{
my $pi = $8;
# XML declaration ?
if ($pi =~ /^xml(.*?)$/) {
my $attref = getAttrARRAY($self, $1);
if (ref($attref)) {
#if (!scalar(@{$attref})) {
# ## missing xmldecl parameters
# throwX($self, 'OVR', '15', $pi);
#}
my ($version,$encoding,$standalone);
while (my ($name,$val) = splice (@{$attref}, 0, 2)) {
if ('version' eq lc($name) && !defined $version) {
if ($val !~ /^[0-9]\.[0-9]+$/) {
## invalid version character data in xmldecl
throwX($self, 'OVR', '16', "$name = '$val'");
}
$version = $val;
} elsif ('encoding' eq lc($name) && !defined $encoding) {
if ($val !~ /^[A-Za-z][\w\.-]*$/) {
## invalid encoding character data in xmldecl
throwX($self, 'OVR', '17', "$name = '$val'");
}
$encoding = $val;
} elsif ('standalone' eq lc($name) && !defined $standalone) {
if ($val !~ /^(?:yes)|(?:no)$/) {
## invalid standalone character data in xmldecl
throwX($self, 'OVR', '18', "$name = '$val'");
}
$standalone = ($val eq 'yes' ? 1 : 0);
} else {
## unknown xmldecl parameter
throwX($self, 'OVR', '19', "$name = '$val'");
}
}
if (!defined $version) {
# missing version in xmldecl
## hard error, just report
throwX($self, 'HARD', '20', $pi);
} else {
# call xmldecl handler
$self->{'hxmldecl'}($self, $version, $encoding, $standalone);
}
} else {
## missing or extra token in xmldecl
## hard error, just report
throwX($self, 'HARD', '14', $attref);
}
}
# PI - processing instruction
elsif ($pi =~ /$RxPi/) {
# call pi handler
$self->{'hproc'}($self, $1, $2);
} else {
# unknown PI data
throwX($self, 'HARD', '21', $pi);
}
#print "8 VERSION: $8\n" if ($self->{'debug'});
}
## META - html (not used)
#elsif (defined $4) {
# # If doctype is HTML then META is not closed
# # parse meta data, call handler
# $self->{'hmeta'}($self, $4);
# #print "4 META: $4\n" if ($self->{'debug'});
#}
## DOCTYPE
elsif (defined $9) {
# parse doctype, call handler
$self->{'hdoctype'}($self, $9);
#print "9 DOCTYPE: $9\n" if ($self->{'debug'});
}
## CDATA
elsif (defined $10) {
if (!scalar(@Tags)) {
## CDATA content at root
throwX($self, 'OVR', '09', $10);
}
# call cdata handler
$self->{'hcdata'}($self, $10);
#print "10 CDATA: $10\n" if ($self->{'debug'});
}
## COMMENT
elsif (defined $11) {
# call comment handler
$self->{'hcomment'}($self, $11);
#print "11 COMMENT: $11\n" if ($self->{'debug'});
}
## ENTITY
elsif (defined $13) {
# parse entity, call handler
my ($entdata, $entdata_added, $entname) = ($13, undef, '');
if ($entdata =~ /$RxENTITY/) {
if (defined $1) {
# general entity replacement
$entdata_added = addEntity($self, 0, $1, $3);
$entname = "&$1";
} else {
# parameter entity replacement
$entdata_added = addEntity($self, 1, $2, $3);
$entname = "&$2";
}
}
else {
# unknown ENTITY data
#
}
if (defined $entdata_added) {
$self->{'hentity'}($self, $entname, $$entdata_added);
} else {
$self->{'hentity'}($self, $entname, $entdata);
}
#print "13 ENTITY: $13\n" if ($self->{'debug'});
}
## ATTLIST - noninternal
elsif (defined $12) {
# parse attlist, call handler
$self->{'hattlist'}($self, $12);
#print "12 ATTLIST: $12\n" if ($self->{'debug'});
}
## ELMENT - noninternal
elsif (defined $14) {
# parse element, call handler
$self->{'helement'}($self, $14);
#print "14 ELEMENT: $14\n" if ($self->{'debug'});
}
#######################################################################################
# Slug Position - for writing (testing placeholder for processor engines xp2,3,4,5)
if ($copyxml) {
print MYDATA substr($$ref_parse_ln, $slug_pos, pos($$ref_parse_ln)-$slug_pos);
}
$slug_pos = pos($$ref_parse_ln);
#######################################################################################
}
$$ref_dyna_ln = $content;
$content = '';
}
if ($copyxml) { # testing placeholder
close MYDATA;
}
if ($self->{'kill_parse'}) {
$self->{'refparsln'} = undef;
$self->{'refcontent'} = undef;
return 1;
}
if (!$havroot) {
# not valid xml
throwX($self, 'OVR', '10');
}
if (scalar(@Tags)) {
my $str = '';
while (defined (my $etag = pop @Tags)) {
$str .= ", /$etag";
}
$str =~ s/^, +//;
# missing end tag
throwX($self, 'OVR', '11', $str);
}
if ($$ref_dyna_ln =~ /[^\s]/s) {
$self->{'refparsln'} = $ref_dyna_ln;
$self->{'EVT_begin_pos'} = 0;
$self->{'EVT_end_pos'} = length($$ref_dyna_ln);
if ($$ref_dyna_ln =~ /[<>]/) {
# mark-up characters in content
throwX($self, 'OVR', '12', $$ref_dyna_ln);
} else {
# content at root level (end)
throwX($self, 'OVR', '13', $$ref_dyna_ln);
}
}
$self->{'refcontent'} = undef;
$self->{'refparsln'} = undef;
return 1;
}
sub getAttrARRAY
{
my ($self, $attrstr) = @_;
my $aref = [];
my ($alt_attval, $attval, $rx);
while ($attrstr =~ s/$RxAttr//) {
push @{$aref},$1;
if ($2 eq "'") {
$rx = \$RxAttr_DL1;
} else {
$rx = \$RxAttr_DL2;
}
if ($attrstr =~ s/$$rx//) {
if (defined $1) {
push @{$aref},$1;
next;
}
$attval = $2;
if (defined ($alt_attval = convertEntities ($self, \$attval))) {
push @{$aref},$$alt_attval;
next;
}
push @{$aref},$attval;
next;
}
# bad, return that part of string which is in error
return $attrstr;
}
if ($attrstr=~/$RxAttr_RM/) {
$attrstr =~ s/^\s+//s; $attrstr =~ s/\s+$//s;
# bad, return that part of string which is in error
return $attrstr if (length($attrstr));
}
# normal, return a ref .. all was good
return $aref;
}
sub convertEntities
{
my ($self, $str_ref, $opts) = @_;
my $alt_str = '';
my $res = 0;
my ($entchr);
# Usage info:
# Option bitmask: 1=char reference, 2=general reference, 4=parameter reference
# Default option is char and general references (&)
# Ignore Parameter references (%) in Attvalue and Content
# Process PE's in DTD and Entity decls
$opts = 3 unless defined $opts;
while ($$str_ref =~ /$self->{'RxEntConv'}/gc)
{
# Unicode character reference
if (defined $4) {
# decimal
if (($opts & 1) && defined ($entchr = getEntityUchar($self, $4))) {
$alt_str .= "$1$entchr";
$res = 1;
} else {
$alt_str .= "$1$2#$4;";
}
} elsif (defined $5) {
# hex
if (($opts & 1) && length($5) < 9 && defined ($entchr = getEntityUchar($self, hex($5)))) {
$alt_str .= "$1$entchr";
$res = 1;
} else {
$alt_str .= "$1$2#$5;";
}
}
else {
# General reference
if ($2 eq '&') {
if (($opts & 2) && exists $self->{'general_ent_subst'}->{$3}) {
$alt_str .= $1;
# expand general references,
# bypass if seen in the recursion ring
# ----
if (defined $self->{'ring_ent_subst'}->{$3}) {
$alt_str .= "$1$2$3;";
} else {
# recurse expansion
# ----
my ($entname, $alt_entval) = ($3, undef);
my $entval = $self->{'general_ent_subst'}->{$entname};
$self->{'ring_ent_subst'}->{$entname} = 1;
if (defined ($alt_entval = convertEntities ($self, \$entval, 2))) {
$alt_str .= $$alt_entval;
} else {
$alt_str .= $self->{'general_ent_subst'}->{$entname};
}
$self->{'ring_ent_subst'}->{$entname} = undef;
$res = 1;
}
} else {
$alt_str .= "$1$2$3;";
}
} else {
# Parameter reference
if (($opts & 4) && exists $self->{'parameter_ent_subst'}->{$3}) {
$alt_str .= "$1$self->{'parameter_ent_subst'}->{$3}";
$res = 1;
} else {
$alt_str .= "$1$2$3;";
}
}
}
}
if ($res) {
$alt_str .= substr $$str_ref, pos($$str_ref);
return \$alt_str;
}
return undef;
}
sub getEntityUchar
{
my ($self, $code) = @_;
if (($code >= 0x01 && $code <= 0xD7FF) ||
($code >= 0xE000 && $code <= 0xFFFD) ||
($code >= 0x10000 && $code <= 0x10FFFF)) {
return chr($code);
}
return undef;
}
sub addEntity
{
my ($self, $peflag, $entname, $entval) = @_;
# Non-normalized, internal entities only
# (no external defs yet, ie:SYSTEM/PUBLIC/NDATA)
return undef unless
($entval =~ s/^\s*'([^']*?)'\s*$/$1/s || $entval =~ s/^\s*"([^"]*?)"\s*$/$1/s);
# Replacement text: convert parameter and character references only
my ($alt_entval);
if (defined ($alt_entval = convertEntities ($self, \$entval, 5))) {
$entval = $$alt_entval;
}
my $enttype = 'general_ent_subst';
$enttype = 'parameter_ent_subst' if ($peflag);
if (exists $self->{'$enttype'}->{$entname}) {
# warn, pre-existing ent name
return undef;
}
$self->{$enttype}->{$entname} = $entval;
$self->{'Entities'} .= "|(?:$entname)";
# recompile regexp
$self->{'RxEntConv'} = qr/(.*?)(&|%)($self->{'Entities'});/s;
return \$entval;
}
# default handlers
# ------------------
sub dflt_parsestart {my ($self, $parm) = @_;print "RXParse $VERSION\nnew parse _: $parm\n" if ($self->{'debug'});}
#sub dflt_start {
# my ($self, $el, @attr) = @_;
# if ($self->{'debug'}) {
# print "start _: $el\n";
# while (my ($name,$val) = splice (@attr, 0,2)) {
# print " "x12,"$name = $val\n";
# }
# }
#}
# or
sub dflt_start {
my ($self, $el, %atts) = @_;
if ($self->{'debug'}) {
print "start _: $el\n";
foreach my $name (keys %atts) {
print " "x12,"$name = $atts{$name}\n";
}
}
}
sub dflt_char {
my ($self, $str) = @_;
if ($self->{'debug'}) {
print "char _: $str\n";
print "-"x20,"\n";
}
}
sub dflt_end {my ($self, $el) = @_;print "end _: /$el\n" if ($self->{'debug'});}
sub dflt_cdata {my ($self, $str) = @_;print "cdata _: $str\n" if ($self->{'debug'});}
sub dflt_comment {my ($self, $str) = @_;print "comnt _: $str\n" if ($self->{'debug'});}
sub dflt_meta {my ($self, $str) = @_;print "meta _: $str\n" if ($self->{'debug'});}
sub dflt_attlist {my ($self, $parm) = @_;print "attlist_h _: $parm\n" if ($self->{'debug'});}
sub dflt_doctype {my ($self, $parm) = @_;print "doctype_h _: $parm\n" if ($self->{'debug'});}
sub dflt_element {my ($self, $parm) = @_;print "element_h _: $parm\n" if ($self->{'debug'});}
sub dflt_entity {
my ($self, $entname, $entval) = @_;
if ($self->{'debug'}) {
print "entity_h _: $entname = $entval\n";
}
}
sub dflt_xmldecl {
my ($self, $version, $encoding, $standalone) = @_;
if ($self->{'debug'}) {
print "xmldecl_h _: version = $version\n" if (defined $encoding);
print " "x14,"encoding = $encoding\n" if (defined $encoding);
print " "x14,"standalone = $standalone\n" if (defined $standalone);
}
}
sub dflt_proc {
my ($self, $target, $data) = @_;
if ($self->{'debug'}) {
print "proc_h _: target = $target\n";
print " "x14,"data = $data\n";
}
}
sub dflt_error {my ($self, $errlvl, $errno, $estr, $estr_basic) = @_;print "$estr\n" if ($self->{'debug'});}
# ======================
# RXParse global init
# ======================
sub InitParser
{
%Dflth = (
'hparsestart' => \&dflt_parsestart,
'hstart' => \&dflt_start,
'hend' => \&dflt_end,
'hchar' => \&dflt_char,
'hcdata' => \&dflt_cdata,
'hcomment' => \&dflt_comment,
'hmeta' => \&dflt_meta,
'hattlist' => \&dflt_attlist,
'hentity' => \&dflt_entity,
'hdoctype' => \&dflt_doctype,
'helement' => \&dflt_element,
'hxmldecl' => \&dflt_xmldecl,
'hproc' => \&dflt_proc,
'herror' => \&dflt_error,
);
@UC_Nstart = (
"\\x{C0}-\\x{D6}",
"\\x{D8}-\\x{F6}",
"\\x{F8}-\\x{2FF}",
"\\x{370}-\\x{37D}",
"\\x{37F}-\\x{1FFF}",
"\\x{200C}-\\x{200D}",
"\\x{2070}-\\x{218F}",
"\\x{2C00}-\\x{2FEF}",
"\\x{3001}-\\x{D7FF}",
"\\x{F900}-\\x{FDCF}",
"\\x{FDF0}-\\x{FFFD}",
"\\x{10000}-\\x{EFFFF}",
);
@UC_Nchar = (
"\\x{B7}",
"\\x{0300}-\\x{036F}",
"\\x{203F}-\\x{2040}",
);
$Nstrt = "[A-Za-z_:".join ('',@UC_Nstart)."]";
$Nchar = "[-\\w:\\.".join ('',@UC_Nchar).join ('',@UC_Nstart)."]";
$Name = "(?:$Nstrt$Nchar*?)";
#die "$Name\n";
$RxParseXP1 =
qr/(?:<(?:(?:(\/*)($Name)\s*(\/*))|(?:META(.*?))|(?:($Name)((?:\s+$Name\s*=\s*["'][^<]*['"])+)\s*(\/*))|(?:\?(.*?)\?)|(?:!(?:(?:DOCTYPE(.*?))|(?:\[CDATA\[(.*?)\]\])|(?:--(.*?[^-])--)|(?:ATTLIST(.*?))|(?:ENTITY(.*?))|(?:ELEMENT(.*?)))))>)|(.+?)/s;
# ( <( ( 1 12 2 3 3)|( *4 *4)|( 5 56( ) 6 7 7)|( 8 8 )|( !( ( 9 9)|( 0 0 )|( 1 1 )|(
2 2)|( 3 3)|( 4 4))))>)|5 5
$RxAttr = qr/^\s+($Name)\s*=\s*("|')/;
$RxAttr_DL1 = qr/^(?:([^'&]*?)|([^']*?))'/;
$RxAttr_DL2 = qr/^(?:([^"&]*?)|([^"]*?))"/;
$RxAttr_RM = qr/[^\s\n]+/;
$RxPi = qr/^($Name)\s+(.*?)$/s;
# Keyword processing, only for standalone external DTD
#[52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
#[53] AttDef ::= S Name S AttType S DefaultDecl
#[45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' [VC: Unique Element Type Declaration]
#[46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
$RxENTITY = qr/^\s+(?:($Name)|(?:%\s+($Name)))\s+(.*?)$/s;
# ( 1 1|( 2 2)) 3 3
%dflt_general_ent_subst = (
'amp' =>'&',
'gt' =>'>',
'lt' =>'<',
'apos'=>"'",
'quot'=>"\""
);
%dflt_parameter_ent_subst = ();
%ErrMsg = (
'01' => "\"mark-up or reserved characters in content (line %s, col %s), malformed element? '%s'\", \$line, \$col, \$datastr",
'02' => "\"content at root level (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
'03' => "\"element wants to be new root node (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
'04' => "\"missing start tag for '/%s' (line %s, col %s)\", \$datastr, \$line, \$col",
'05' => "\"expected closing tag '/%s' (line %s, col %s)\", \$datastr, \$line, \$col",
'06' => "\"element wants to be new root node (line %s, col %s): '%s/'\", \$line, \$col, \$datastr",
'07' => "\"tag syntax '%s' (line %s, col %s)\", \$datastr, \$line, \$col",
'08' => "\"invalid, missing or extra tokens in attribute asignment (line %s, col %s): %s\", \$line, \$col, \$datastr",
'09' => "\"CDATA content at root level (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
'10' => "\"not a valid xml document\"",
'11' => "\"missing end tag '%s'\", \$datastr",
'12' => "\"mark-up or reserved characters in content (end), malformed element? '%s'\", \$datastr",
'13' => "\"content at root level (end): '%s'\", \$datastr",
'14' => "\"invalid, missing or extra tokens in xmldecl asignment (line %s, col %s): %s\", \$line, \$col, \$datastr",
'15' => "\"missing xmldecl parameters (line %s, col %s): %s\", \$line, \$col, \$datastr",
'16' => "\"invalid 'version' character data in xmldecl (line %s, col %s): %s\", \$line, \$col, \$datastr",
'17' => "\"invalid 'encoding' character data in xmldecl (line %s, col %s): %s\", \$line, \$col, \$datastr",
'18' => "\"invalid 'standalone' character data in xmldecl (line %s, col %s): %s\", \$line, \$col, \$datastr",
'19' => "\"unknown xmldecl parameter (line %s, col %s): %s\", \$line, \$col, \$datastr",
'20' => "\"missing xmldecl 'version' parameter (line %s, col %s): %s\", \$line, \$col, \$datastr",
'21' => "\"unknown or missing processing instruction parameters (line %s, col %s): '%s'\", \$line, \$col, \$datastr",
'30' => "\"already in parse\"",
'31' => "\"data source not defined\"",
'32' => "\"handler '%s' is not a CODE reference\", \$datastr",
'33' => "\"data source not string or filehandle, nor reference to one\"",
);
}
sub throwX
{
my ($self, $errlvl, $errno, $datastr) = @_;
my ($estr, $estr_basic) = ('','');
updateLineColumn($self);
my ($line, $col) = ($self->{'line'}, $self->{'col'});
die "No such error message ($errno)\n" if (!exists $ErrMsg{$errno});
my $ctmpl = "\$estr_basic = sprintf ($ErrMsg{$errno});";
eval $ctmpl;
$estr = "rp_error_$errno, $estr_basic";
$self->{'parse_errors'}++;
# call error handler
$self->{'herror'}($self, $errlvl, $errno, $estr, $estr_basic);
if ($errlvl eq 'FATAL') {
Cleanup($self);
if ($copyxml) { # testing placeholder
close MYDATA;
}
croak $estr."\n";
}
elsif (!$self->{'ignore_errors'} && ($errlvl eq 'HARD' || $errlvl eq 'OVR')) {
Cleanup($self);
if ($copyxml) { # testing placeholder
close MYDATA;
}
croak $estr."\n";
}
}
1;
__END__
- Rptest.pl
#######################
# Useage examples
#######################
use strict;
use warnings;
use RXParse;
use Benchmark ':hireswallclock';
my $t0 = new Benchmark;
my $p = new RXParse();
# example user handler
if (0) {
$p->setHandlers('error' => \&eh);
sub eh
{
my ($p, $errlvl, $errno, $estr, $estr_basic) = @_;
print "$errlvl, $errno, $estr\n";
# print STDERR "$errlvl, $errno, $estr\n";
# print STDERR "$estr\n";
}
}
#my $fname = "DotNetConfig.xsd";
#my $fname = "scriptCS.snippet";
#my $fname = "SampleInfo.XML";
my $fname = "test.txt";
my $parse_errors = 0;
# BUFFERED
if (1) {
open DATA, $fname or die "can't open $fname...";
my $parse_ln = "";
# $p->setMode( 'debug'=> 1);
# $p->setMode( 'ignore_errors'=> 1 );
$p->setMode( 'debug'=> 1, 'ignore_errors'=> 1 );
$parse_ln = join ('', <DATA>);
$parse_errors = $p->parse(\$parse_ln);
#$parse_errors = $p->parse($parse_ln);
close DATA;
}
# STREAM
else {
open DATA, $fname or die "can't open $fname...";
# $p->setMode( 'debug'=> 1);
# $p->setMode( 'ignore_errors'=> 1 );
$p->setMode( 'debug'=> 1, 'ignore_errors'=> 1 );
$parse_errors = $p->parse(*DATA);
close DATA;
#open my $fref, "config.html" or die "can't open config.html...";
#$parse_errors = $p->parse($fref, 1);
#close $fref;
}
print STDERR "Parse errors = $parse_errors\n";
my $t1 = new Benchmark;
my $td = timediff($t1, $t0);
print STDERR "the code took:",timestr($td),"\n";
exit;
__END__
use strict;
use warnings;
use RXParse;
my $parse_ln = '
<html>
<head>
<title>$100.00 Dollars</title>
</head>
<body>
<img src="foo.img" alt="$100.00 USD">
<div>size: 50 x 50</div>
<div>discount: $75.25</div>
</body>
</html>
';
my $p = RXParse->new();
#my $p = new RXParse();
$p->setDebugMode(1);
$p->parse(\$parse_ln);
------------------------------
Date: 6 Apr 2001 21:33:47 GMT (Last modified)
From: Perl-Users-Request@ruby.oce.orst.edu (Perl-Users-Digest Admin)
Subject: Digest Administrivia (Last modified: 6 Apr 01)
Message-Id: <null>
Administrivia:
#The Perl-Users Digest is a retransmission of the USENET newsgroup
#comp.lang.perl.misc. For subscription or unsubscription requests, send
#the single line:
#
# subscribe perl-users
#or:
# unsubscribe perl-users
#
#to almanac@ruby.oce.orst.edu.
NOTE: due to the current flood of worm email banging on ruby, the smtp
server on ruby has been shut off until further notice.
To submit articles to comp.lang.perl.announce, send your article to
clpa@perl.com.
#To request back copies (available for a week or so), send your request
#to almanac@ruby.oce.orst.edu with the command "send perl-users x.y",
#where x is the volume number and y is the issue number.
#For other requests pertaining to the digest, send mail to
#perl-users-request@ruby.oce.orst.edu. Do not waste your time or mine
#sending perl questions to the -request address, I don't have time to
#answer them even if I did know the answer.
------------------------------
End of Perl-Users Digest V11 Issue 1542
***************************************