[32339] in Perl-Users-Digest
Perl-Users Digest, Issue: 3606 Volume: 11
daemon@ATHENA.MIT.EDU (Perl-Users Digest)
Sun Feb 5 14:09:20 2012
Date: Sun, 5 Feb 2012 11:09:04 -0800 (PST)
From: Perl-Users Digest <Perl-Users-Request@ruby.OCE.ORST.EDU>
To: Perl-Users@ruby.OCE.ORST.EDU (Perl-Users Digest)
Perl-Users Digest Sun, 5 Feb 2012 Volume: 11 Number: 3606
Today's topics:
Loop in regex match (Seymour J.)
Digest Administrivia (Last modified: 6 Apr 01) (Perl-Users-Digest Admin)
----------------------------------------------------------------------
Date: Sun, 05 Feb 2012 11:13:17 -0500
From: Shmuel (Seymour J.) Metz <spamtrap@library.lspace.org.invalid>
Subject: Loop in regex match
Message-Id: <4f2eaa9d$10$fuzhry+tra$mr2ice@news.patriot.net>
I have an regex that appears to hang where I would expect it to fail
due to \d not matching a comma. If I remove the \d then the regex
correctly matches the partial string. the match that hangs is at line
452 of the script:
if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? \d )/xi)
The actual script is
extproc perl.exe -SW
# extproc F:\Perl\bin\perl -SW
#!/usr/bin/perl -W
use 5.010;
use Data::Dumper;
use Regexp::Common qw /net URI/;
use Socket;
use strict;
my $decOctetPat = qr/ \d |
[1-9] \d |
1 \d \d |
2 [0-4] \d |
25 [0-5]
/x;
my $IPv4addressPat = qr/ (?:$decOctetPat\.){3} $decOctetPat /x;
my $IPv6h16 = qr/[[:xdigit:]]{1,4}/;
my $IPv6ls32 = qr/ $IPv6h16 \: $IPv6h16 | $IPv4addressPat /x;
my $IPv6AddrPat = qr/ (?: (?: $IPv6h16 \: ){6} $IPv6ls32 ) |
(?: \:\: (?: $IPv6h16 \: ){5} $IPv6ls32 ) |
(?: (?: $IPv6h16 )? \:\: (?: $IPv6h16 \: ){4} $IPv6ls32 ) |
(?: (?: $IPv6h16 \: $IPv6h16 )? \:\: (?: $IPv6h16 \: ){3} $IPv6ls32 ) |
(?: (?: (?: $IPv6h16 \: ){2} $IPv6h16 )? \:\: (?: $IPv6h16 \: ){2} $IPv6ls32 ) |
(?: (?: (?: $IPv6h16 \: ){3} $IPv6h16 )? \:\: $IPv6h16 \: $IPv6ls32 ) |
(?: (?: (?: $IPv6h16 \: ){5} $IPv6h16 )? \:\: $IPv6ls32 ) |
(?: (?: (?: $IPv6h16 \: ){6} $IPv6h16 )? \:\: )
/x;
my $domainPat = qr/[[:alnum:]]+
[[:alnum:]-]*
(?:\. [[:alnum:]]+ [[:alnum:]-]*)*
/x;
my $addressLiteralPat = qr/\[
(?:$IPv4addressPat |
$IPv6AddrPat
)
\]
/x;
my $atextPat = qr"(?:[\w!#\$%&'*+/=?^`{|}~-]+)";
#y $FWS = qr/ (?:[ \t]*\15?\12)? [ \t]+ /x;
my $FWS = qr/ (?:[ \t]*\15?\n)? [ \t]+ /x;
my $atomPat = qr/$atextPat+/x;
my $ctext = '[\x21-\x27\x2A-\x5B\x5D-\x7E]';
my $quotedPairPat = qr/ \\ [\x20-\x7E] /x;
my $commentPat =qr/
\(
(?:$FWS?
(?:$ctext | $quotedPairPat | (?R))
)*
$FWS?
\)
/x;
my $CFWS = qr/
(?: (?:$FWS+ $commentPat)+ $FWS?) |
$FWS
/x;
my $dayPat = qr/$CFWS? (?<DAY>\d{1,2}) $CFWS?/x;
# per RFC 5322 case matters
my $day_of_weekPat = qr/
$CFWS
(?<DAY_OF_WEEK>
Mon |
Tue |
Wed |
Thu |
Fri |
Sat |
Sun
)
$CFWS?
/x;
my $dotStringPat = qr/$atextPat+ (?:\. $atextPat)*/x;
my $dtextPat = '[\x21-\x50\x54-\x7E]';
my $hourPat = qr/(?<HOUR>
(\d\d)
(?(?{$^N > 24})
(*FAIL)
)
)
/x;
my $idLeftPat = qr/$dotStringPat/;
my $LinkPat = qr/TCP | $atomPat/xi;
my $noFoldLiteralPat = qr/\[ $dtextPat* \]/x;
my $idRightPat = qr/$dotStringPat | $noFoldLiteralPat/x;
my $msgIdPat = qr/\s* \< $idLeftPat \@ $idRightPat \> \s*/x;
my $minutePat = qr/(?<MINUTE>
(\d\d)
(?(?{$^N > 59})
(*FAIL)
)
)
/x;
# per RFC 5322 case matters
my $monthPat = qr/
(?<MONTH>
Jan |
Feb |
Mat |
Apr |
May |
Jum |
Jul |
Aug |
Sep |
Oct |
Mov |
Dec
)
/x;
my $obs_zonePat = qr/
CDT
CST
EDT
EST
GMT
MDT
MST
PDT
PST
UT
[A-IK-Za-ik-z]
/x;
my $qtextPat = '[\x20-\x21\x23-\x5B\x5d-\x7E]';
my $QcontentPat = qr/$qtextPat | $quotedPairPat/x;
my $QuotedStringPat = qr/"$QcontentPat*"/;
my $rDNSstat =qr/
\s*
(?:\s* \( may \s be \s forged \) ) |
(?: \s* \( misconfigured \s sender \) ) |
(?: \s* RDNS \s failed)
/x;
# Non-5321 prefix to domain in TCP-INFO
my $RecLocalPat = '(?:(?:IDENT:)?[\w+-]+[\w\.+-]*@)?';
my $secondPat = qr/(?<SECOND>
(\d\d)
(?(?{$^N > 59})
(*FAIL)
)
)
/x;
# Malformed Received headers may have 'RDNS failed' after the IP address
# or a dotted quad without framing []
my $TCPinfoPat =qr/
(?<IP>$addressLiteralPat) (?:\s* RDNS \s failed)? |
(?<IP>$IPv4addressPat) |
(?:$RecLocalPat
(?<RDNS>$domainPat)
$FWS
(?<IP>$addressLiteralPat)
$rDNSstat*
)
/x;
my $time_of_day = qr/$hourPat : $minutePat (?: : $secondPat)?/x;
my $yearPat = qr/$FWS (?<YEAR>\d{2,4}) $FWS/x;
# RFC 5322 semantic constraint not applied in order to match malformed zones.
my $zonePat = qr/$FWS
(?<ZONE>
(?:
(?:
[+-]
\d\d\d\d
) |
$obs_zonePat
)
)
/x;
# RFC 5322 shows spaces in day and year, not here
my $datePat = qr/$dayPat $monthPat $yearPat/x;
# Received: FROM non-5321 tokens seen in the wild
my $Non5321DomainPat = qr/
\. |
$IPv4addressPat |
\d+
/x;
# Malformed Received headers may have a leading hyphen in a
# domain name, a period as a domain name or an address
# literal without TCPINFO. They may also have an IPv4
# address expressed as a hexadecimal, decimal or octal constant.
my $ExtendedDomainPat = qr/
(?:(?<HELO>-?$domainPat) \s (?<IP>$addressLiteralPat)) |
(?:(?<HELO>-?$domainPat) (?:$FWS \( $TCPinfoPat \))?) |
(?<IP>$addressLiteralPat) (?:$FWS \( $TCPinfoPat \))? |
$Non5321DomainPat (?:$FWS \( $TCPinfoPat \))?
/x;
my $localPartPat = qr/$dotStringPat | $QuotedStringPat/x;
my $MailboxPat = qr/(?<LOCAL_PART>$localPartPat) \@ (?<DOMAIN>$domainPat | $addressLiteralPat)/x;
my $protocolPat = qr/SMTP | ESMTP | $atomPat/xi;
# I don't expect to see source routing in the wild
my $RecPathPat = qr/
\<
(?:\@ $domainPat (?:, \@ $domainPat)* :)?
$MailboxPat
\>
/x;
# Can't use $RE{net}{domain} due to malformed domain names
my $RecHELOpat = "(?<HELO>(?:-?$domainPat)|" .
"\\.|" .
"(?:\\[$RE{net}{IPv4}\\])|" .
"$RE{net}{IPv4}|" .
"\\d+)";
# Road Runner Received: FROM
my $RRfromPat = qr/
(?<HELO>$RE{net}{IPv4})
\s+
\(
Forwarded-For:
\s
\[
(?<IP>$RE{net}{IPv4})
\]
\)
/x;
# QMAIL Received: FROM
my $QMfromPat = qr/(?<IP>$RE{net}{IPv4})
\s+
\(
\[
(?<RDNS>$domainPat)
\]
:
\d+
\s+
"
\w+
\s*
\[
(?<HELO>$domainPat)
\]
"
[^)]*
\)';
/x;
my $RecSrcPat = qr/$RecLocalPat
(?<RDNS>$domainPat)?
\s*
\[
(?<IP>$RE{net}{IPv4})
\]
\s*
(?:\(may\sbe\sforged\))?
\s*
(?:\(misconfigured\ssender\))?
\s*
(?:\s*RDNS\sfailed)?
/x;
# The RFC 5321 syntax for From-domain does not allow an address literal without
# TCP-info in parentheses, but Yahoo creates a Stamp in that format.
# Some software puts significant information in comments beyond the
# TCPINFO of the Extended-Domain.
my $RecFromPat = qr/^
FROM
$FWS
(?<FROM>
$ExtendedDomainPat |
(?:
(
\[
(?<IP>$IPv4addressPat)
\]
)
\s*
\(
HELO=$RecHELOpat
\)
) |
(?:(?<RDNS>$domainPat)
\s+
\(
\[
(?<IP>$IPv4addressPat)
\]
\s+
HELO=$RecHELOpat
\)
) |
(?:(?<RDNS>$domainPat)
\s+
\(
HELO
\s
$RecHELOpat
\)
\s+
\(
\[
(?<IP>$IPv4addressPat)
\]
\)
) |
$QMfromPat |
$RRfromPat
)
/xi;
# per RFC 5321 it's CFWS "BY" FWS Extended-Domain
# in the wild it's CFWS "BY" FWS Domain FWS '(' MTA ')'
my $RecByPat = qr!$CFWS
BY
$FWS
(?<BY1>
(?:$domainPat |
\[ $RE{net}{IPv4} \]
)
)
(?:
$FWS
\(
(?<BY2>[\s\w\./-]+)
\)
)?
!xi;
my $RecForPat = qr/$CFWS FOR $FWS (?: $RecPathPat | $MailboxPat)/xi;
my $RecIdPat = qr/$CFWS ID $FWS (?<ID>$atomPat | $msgIdPat)/xi;
my $RecViaPat = qr/$CFWS VIA $FWS (?<LINK>$LinkPat)/xi;
# m$ lookout violates RFC 5321 syntax
my $RecWithMS = qr/Microsoft \s+ (?:ESMTP|SMTP) (?:\s+ Server | SVC\(\d+(?:\.\d+)*\))/xi;
my $RecWithPat = qr/$CFWS
WITH $FWS
(?:
(?:ESMTP|SMTP) |
$RecWithMS |
NNFMP # Yahoo
)
(?:
$FWS
\(
SMTP
[\d\w\.-]*
\)
)?
/xi;
my $RecOptInfo = qr/
(?<VIA>$RecViaPat)?
(?<WITH>$RecWithPat)?
(?:$RecIdPat)?
(?<FOR>$RecForPat)?
/xi;
my $timePat = qr/$time_of_day $zonePat/x;
# RFC 5322 shows spaces in day and year, not here
my $date_timePat = qr/
(?: $day_of_weekPat [,])?
$datePat
$timePat
(?:$CFWS)?
/x;
my $RecPat = qr/^
$RecFromPat
$RecByPat
$RecOptInfo
$CFWS?
\;
# $date_timePat
$datePat
# $timePat
# $time_of_day
$hourPat
/x;
my @testheaders = (<<'EOF1',<<'EOF2',<<'EOF3',<<'EOF4',<<'EOF5',<<'EOF6',<<EOF7,<<EOF8);
from amethyst.nstc.com (majordomo@amethyst.nstc.com [207.166.196.179]) by mail.acm.org (8.8.5/8.7.5) with ESMTP id AAA44952 for <Shmuel@ACM.Org>; Wed, 6 Jan 1999 00:06:57 -0500
EOF1
(from majordomo@localhost)
by amethyst.nstc.com (8.9.1/8.9.1/nstc.com) id AAA16796
for freemail-outgoing; Wed, 6 Jan 1999 00:26:52 -0500
EOF2
from devel.nacs.net (IDENT:root@devel.nacs.net [207.166.192.85])
by amethyst.nstc.com (8.9.1/8.9.1/nstc.com) with ESMTP id AAA16789
for <freemail@nstc.com>; Wed, 6 Jan 1999 00:26:49 -0500
EOF3
from relay1.mnsinc.com (relay1.mnsinc.com [206.55.3.25])
by devel.nacs.net (8.8.7/8.8.8) with ESMTP id WAA10733
for <freemail@nstc.com>; Tue, 5 Jan 1999 22:44:35 -0500
EOF4
from U86 (u86.os2bbs.com [206.55.10.86])
by relay1.mnsinc.com (8.9.0/8.9.0) with SMTP id WAA29325
for <freemail@nstc.com>; Tue, 5 Jan 1999 22:39:39 -0500 (EST)
EOF5
from localhost (localhost [127.0.0.1])
by lincoln-at-leros.patriot.net (Postfix) with ESMTP id 12BBE55E73
for <marianne@patriot.net>; Fri, 27 Jan 2012 09:23:59 -0500 (EST)
EOF6
from mail.acm.org [199.222.69.4] by piglet.toward.com with ESMTP
(SMTPD32-4.06) id AF982F028E; Wed, 06 Jan 1999 00:07:36 EDT
EOF7
from mail.acm.org [199.222.69.4] by piglet.toward.com with ESMTP
(foo) id AF982F028E; Wed, 06 Jan 1999 00:07:36 EDT
EOF8
msg("\n\@testheaders has " . scalar @testheaders . " lines\n");
foreach (@testheaders) {
msg("\n\t --> $_\n");
my $RecTest = qr/
$RecFromPat
$RecByPat?
$RecOptInfo
$CFWS?
\;
(?: $day_of_weekPat [,])?
$datePat
$timePat
(?:$CFWS)?
/xi;
if (/$RecTest/) {
msg("\nMatched \$RecTest\n");
foreach my $key (sort keys %+) {
print STDERR "\$+{$key}=$+{$key}\n";
}
msg("\n");
foreach my $key (sort keys %-) {
print STDERR "\$-{$key}=",grep defined, @{$-{$key}},"\n";
}
} else {
msg("\nDid not match\$RecTest\n");
}
# if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? (?<DAY>\d{1,2}) )/xi) {
# if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? \d{1,2} )/xi) {
if (/($RecFromPat $RecByPat? $RecOptInfo \; $CFWS? \d )/xi) {
msg("\nMatched $1\n");
# msg("\nDumper(\%+):\n");
# msg(Dumper(%+),"\n");
# msg("\nDumper(\%-):\n");
# msg(Dumper(%-),"\n");
foreach my $key (sort keys %+) {
print STDERR "\$+{$key}=$+{$key}\n";
}
msg("\n");
foreach my $key (sort keys %-) {
print STDERR "\$-{$key}=",grep defined, @{$-{$key}},"\n";
}
} else {
msg("\nDid not match\n");
}
msg("\n");
}
sub msg {
print STDERR @_;
}
1;
__END__
--
Shmuel (Seymour J.) Metz, SysProg and JOAT <http://patriot.net/~shmuel>
Unsolicited bulk E-mail subject to legal action. I reserve the
right to publicly post or ridicule any abusive E-mail. Reply to
domain Patriot dot net user shmuel+news to contact me. Do not
reply to spamtrap@library.lspace.org
------------------------------
Date: 6 Apr 2001 21:33:47 GMT (Last modified)
From: Perl-Users-Request@ruby.oce.orst.edu (Perl-Users-Digest Admin)
Subject: Digest Administrivia (Last modified: 6 Apr 01)
Message-Id: <null>
Administrivia:
To submit articles to comp.lang.perl.announce, send your article to
clpa@perl.com.
Back issues are available via anonymous ftp from
ftp://cil-www.oce.orst.edu/pub/perl/old-digests.
#For other requests pertaining to the digest, send mail to
#perl-users-request@ruby.oce.orst.edu. Do not waste your time or mine
#sending perl questions to the -request address, I don't have time to
#answer them even if I did know the answer.
------------------------------
End of Perl-Users Digest V11 Issue 3606
***************************************