Nicholas Clark
2010-02-14 16:45:05 UTC
In perl.git, the branch blead has been updated
<http://perl5.git.perl.org/perl.git/commitdiff/d931b1bedc59d4625c59047dfda2b1bd83ff9f71?hp=6e3b7bfa2b063f4ce0c55f84474edb7d2c652387>
- Log -----------------------------------------------------------------
commit d931b1bedc59d4625c59047dfda2b1bd83ff9f71
Author: Nicholas Clark <***@ccl4.org>
Date: Sun Feb 14 12:31:44 2010 +0000
Convert Perl_sv_pos_u2b_proper() to Perl_sv_pos_u2b_flags().
Change from a value/return offset pointer to passing a Unicode offset, and
returning a byte offset. The optional length value/return pointer remains.
Add a flags argument, passed to SvPV_flags(). This allows the caller to
specify whether mg_get() should be called on sv.
M embed.fnc
M embed.h
M global.sym
M mg.c
M pp.c
M proto.h
M sv.c
commit 1c90055717b05b3f652bf543a46419001314c53e
Author: Nicholas Clark <***@ccl4.org>
Date: Sun Feb 14 16:04:35 2010 +0000
Remove a vestigial STRLEN case and convert a label to lowercase.
(Tweaking 777f7c561610dee6.)
M mg.c
M pp.c
commit 777f7c561610dee641c77666e5a4a0d9ac1d4230
Author: Eric Brine <***@adaelis.com>
Date: Thu Feb 11 20:28:29 2010 -0500
Removes 32-bit limit on substr arguments. The full range of IV and UV is available for the pos and len arguments, with safe conversion to STRLEN where it's smaller than an IV.
M embed.fnc
M embed.h
M global.sym
M mg.c
M pp.c
M proto.h
M sv.c
M t/re/substr.t
-----------------------------------------------------------------------
Summary of changes:
embed.fnc | 2 +
embed.h | 2 +
global.sym | 1 +
mg.c | 21 ++++----
pp.c | 145 +++++++++++++++++++++++++++++++++++++--------------------
proto.h | 5 ++
sv.c | 83 +++++++++++++++++++++++---------
t/re/substr.t | 42 ++++++++++++++++-
8 files changed, 215 insertions(+), 86 deletions(-)
diff --git a/embed.fnc b/embed.fnc
index 7463274..769481b 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1165,6 +1165,8 @@ ApdR |SV* |sv_newmortal
Apd |SV* |sv_newref |NULLOK SV *const sv
Ap |char* |sv_peek |NULLOK SV* sv
Apd |void |sv_pos_u2b |NULLOK SV *const sv|NN I32 *const offsetp|NULLOK I32 *const lenp
+Apd |STRLEN |sv_pos_u2b_flags|NN SV *const sv|STRLEN uoffset \
+ |NULLOK STRLEN *const lenp|U32 flags
Apd |void |sv_pos_b2u |NULLOK SV *const sv|NN I32 *const offsetp
Amdb |char* |sv_pvn_force |NN SV* sv|NULLOK STRLEN* lp
Apd |char* |sv_pvutf8n_force|NN SV *const sv|NULLOK STRLEN *const lp
diff --git a/embed.h b/embed.h
index 246106b..eda36a7 100644
--- a/embed.h
+++ b/embed.h
@@ -967,6 +967,7 @@
#define sv_newref Perl_sv_newref
#define sv_peek Perl_sv_peek
#define sv_pos_u2b Perl_sv_pos_u2b
+#define sv_pos_u2b_flags Perl_sv_pos_u2b_flags
#define sv_pos_b2u Perl_sv_pos_b2u
#define sv_pvutf8n_force Perl_sv_pvutf8n_force
#define sv_pvbyten_force Perl_sv_pvbyten_force
@@ -3371,6 +3372,7 @@
#define sv_newref(a) Perl_sv_newref(aTHX_ a)
#define sv_peek(a) Perl_sv_peek(aTHX_ a)
#define sv_pos_u2b(a,b,c) Perl_sv_pos_u2b(aTHX_ a,b,c)
+#define sv_pos_u2b_flags(a,b,c,d) Perl_sv_pos_u2b_flags(aTHX_ a,b,c,d)
#define sv_pos_b2u(a,b) Perl_sv_pos_b2u(aTHX_ a,b)
#define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b)
#define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b)
diff --git a/global.sym b/global.sym
index f0361df..a99548e 100644
--- a/global.sym
+++ b/global.sym
@@ -567,6 +567,7 @@ Perl_sv_newmortal
Perl_sv_newref
Perl_sv_peek
Perl_sv_pos_u2b
+Perl_sv_pos_u2b_flags
Perl_sv_pos_b2u
Perl_sv_pvn_force
Perl_sv_pvutf8n_force
diff --git a/mg.c b/mg.c
index b9a1464..cc01547 100644
--- a/mg.c
+++ b/mg.c
@@ -2008,19 +2008,19 @@ Perl_magic_getsubstr(pTHX_ SV *sv, MAGIC *mg)
STRLEN len;
SV * const lsv = LvTARG(sv);
const char * const tmps = SvPV_const(lsv,len);
- I32 offs = LvTARGOFF(sv);
- I32 rem = LvTARGLEN(sv);
+ STRLEN offs = LvTARGOFF(sv);
+ STRLEN rem = LvTARGLEN(sv);
PERL_ARGS_ASSERT_MAGIC_GETSUBSTR;
PERL_UNUSED_ARG(mg);
if (SvUTF8(lsv))
- sv_pos_u2b(lsv, &offs, &rem);
- if (offs > (I32)len)
+ offs = sv_pos_u2b_flags(lsv, offs, &rem, SV_CONST_RETURN);
+ if (offs > len)
offs = len;
- if (rem + offs > (I32)len)
+ if (rem > len - offs)
rem = len - offs;
- sv_setpvn(sv, tmps + offs, (STRLEN)rem);
+ sv_setpvn(sv, tmps + offs, rem);
if (SvUTF8(lsv))
SvUTF8_on(sv);
return 0;
@@ -2033,22 +2033,22 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
STRLEN len;
const char * const tmps = SvPV_const(sv, len);
SV * const lsv = LvTARG(sv);
- I32 lvoff = LvTARGOFF(sv);
- I32 lvlen = LvTARGLEN(sv);
+ STRLEN lvoff = LvTARGOFF(sv);
+ STRLEN lvlen = LvTARGLEN(sv);
PERL_ARGS_ASSERT_MAGIC_SETSUBSTR;
PERL_UNUSED_ARG(mg);
if (DO_UTF8(sv)) {
sv_utf8_upgrade(lsv);
- sv_pos_u2b(lsv, &lvoff, &lvlen);
+ lvoff = sv_pos_u2b_flags(lsv, lvoff, &lvlen, SV_CONST_RETURN);
sv_insert(lsv, lvoff, lvlen, tmps, len);
LvTARGLEN(sv) = sv_len_utf8(sv);
SvUTF8_on(lsv);
}
else if (lsv && SvUTF8(lsv)) {
const char *utf8;
- sv_pos_u2b(lsv, &lvoff, &lvlen);
+ lvoff = sv_pos_u2b_flags(lsv, lvoff, &lvlen, SV_CONST_RETURN);
LvTARGLEN(sv) = len;
utf8 = (char*)bytes_to_utf8((U8*)tmps, &len);
sv_insert(lsv, lvoff, lvlen, utf8, len);
@@ -2059,7 +2059,6 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
LvTARGLEN(sv) = len;
}
-
return 0;
}
diff --git a/pp.c b/pp.c
index 2f4703b..3e2ed48 100644
--- a/pp.c
+++ b/pp.c
@@ -3079,15 +3079,19 @@ PP(pp_substr)
{
dVAR; dSP; dTARGET;
SV *sv;
- I32 len = 0;
STRLEN curlen;
STRLEN utf8_curlen;
- I32 pos;
- I32 rem;
- I32 fail;
+ SV * pos_sv;
+ IV pos1_iv;
+ int pos1_is_uv;
+ IV pos2_iv;
+ int pos2_is_uv;
+ SV * len_sv;
+ IV len_iv = 0;
+ int len_is_uv = 1;
const I32 lvalue = PL_op->op_flags & OPf_MOD || LVRET;
const char *tmps;
- const I32 arybase = CopARYBASE_get(PL_curcop);
+ const IV arybase = CopARYBASE_get(PL_curcop);
SV *repl_sv = NULL;
const char *repl = NULL;
STRLEN repl_len;
@@ -3103,9 +3107,13 @@ PP(pp_substr)
repl = SvPV_const(repl_sv, repl_len);
repl_is_utf8 = DO_UTF8(repl_sv) && SvCUR(repl_sv);
}
- len = POPi;
+ len_sv = POPs;
+ len_iv = SvIV(len_sv);
+ len_is_uv = SvIOK_UV(len_sv);
}
- pos = POPi;
+ pos_sv = POPs;
+ pos1_iv = SvIV(pos_sv);
+ pos1_is_uv = SvIOK_UV(pos_sv);
sv = POPs;
PUTBACK;
if (repl_sv) {
@@ -3127,51 +3135,80 @@ PP(pp_substr)
else
utf8_curlen = 0;
- if (pos >= arybase) {
- pos -= arybase;
- rem = curlen-pos;
- fail = rem;
- if (num_args > 2) {
- if (len < 0) {
- rem += len;
- if (rem < 0)
- rem = 0;
- }
- else if (rem > len)
- rem = len;
+ if ( (pos1_is_uv && arybase < 0) || (pos1_iv >= arybase) ) { /* pos >= $[ */
+ UV pos1_uv = pos1_iv-arybase;
+ /* Overflow can occur when $[ < 0 */
+ if (arybase < 0 && pos1_uv < (UV)pos1_iv)
+ goto bound_fail;
+ pos1_iv = pos1_uv;
+ pos1_is_uv = 1;
+ }
+ else if (pos1_is_uv ? (UV)pos1_iv > 0 : pos1_iv > 0) {
+ goto bound_fail; /* $[=3; substr($_,2,...) */
+ }
+ else { /* pos < $[ */
+ if (pos1_iv == 0) { /* $[=1; substr($_,0,...) */
+ pos1_iv = curlen;
+ pos1_is_uv = 1;
+ } else {
+ if (curlen) {
+ pos1_is_uv = curlen-1 > ~(UV)pos1_iv;
+ pos1_iv += curlen;
+ }
}
}
- else {
- pos += curlen;
- if (num_args < 3)
- rem = curlen;
- else if (len >= 0) {
- rem = pos+len;
- if (rem > (I32)curlen)
- rem = curlen;
+ if (pos1_is_uv || pos1_iv > 0) {
+ if ((UV)pos1_iv > curlen)
+ goto bound_fail;
+ }
+
+ if (num_args > 2) {
+ if (!len_is_uv && len_iv < 0) {
+ pos2_iv = curlen + len_iv;
+ if (curlen)
+ pos2_is_uv = curlen-1 > ~(UV)len_iv;
+ else
+ pos2_is_uv = 0;
+ } else { /* len_iv >= 0 */
+ if (!pos1_is_uv && pos1_iv < 0) {
+ pos2_iv = pos1_iv + len_iv;
+ pos2_is_uv = (UV)len_iv > (UV)IV_MAX;
+ } else {
+ if ((UV)len_iv > curlen-(UV)pos1_iv)
+ pos2_iv = curlen;
+ else
+ pos2_iv = pos1_iv+len_iv;
+ pos2_is_uv = 1;
+ }
}
- else {
- rem = curlen+len;
- if (rem < pos)
- rem = pos;
- }
- if (pos < 0)
- pos = 0;
- fail = rem;
- rem -= pos;
- }
- if (fail < 0) {
- if (lvalue || repl)
- Perl_croak(aTHX_ "substr outside of string");
- Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string");
- RETPUSHUNDEF;
}
else {
- const I32 upos = pos;
- const I32 urem = rem;
- if (utf8_curlen)
- sv_pos_u2b(sv, (I32 *)&pos, (I32 *)&rem);
- tmps += pos;
+ pos2_iv = curlen;
+ pos2_is_uv = 1;
+ }
+
+ if (!pos2_is_uv && pos2_iv < 0) {
+ if (!pos1_is_uv && pos1_iv < 0)
+ goto bound_fail;
+ pos2_iv = 0;
+ }
+ else if (!pos1_is_uv && pos1_iv < 0)
+ pos1_iv = 0;
+
+ if ((UV)pos2_iv < (UV)pos1_iv)
+ pos2_iv = pos1_iv;
+ if ((UV)pos2_iv > curlen)
+ pos2_iv = curlen;
+
+ {
+ /* pos1_iv and pos2_iv both in 0..curlen, so the cast is safe */
+ const STRLEN pos = (STRLEN)( (UV)pos1_iv );
+ const STRLEN len = (STRLEN)( (UV)pos2_iv - (UV)pos1_iv );
+ STRLEN byte_len = len;
+ STRLEN byte_pos = utf8_curlen
+ ? sv_pos_u2b_flags(sv, pos, &byte_len, SV_CONST_RETURN) : pos;
+
+ tmps += byte_pos;
/* we either return a PV or an LV. If the TARG hasn't been used
* before, or is of that type, reuse it; otherwise use a mortal
* instead. Note that LVs can have an extended lifetime, so also
@@ -3185,7 +3222,7 @@ PP(pp_substr)
}
}
- sv_setpvn(TARG, tmps, rem);
+ sv_setpvn(TARG, tmps, byte_len);
#ifdef USE_LOCALE_COLLATE
sv_unmagic(TARG, PERL_MAGIC_collxfrm);
#endif
@@ -3202,7 +3239,7 @@ PP(pp_substr)
}
if (!SvOK(sv))
sv_setpvs(sv, "");
- sv_insert_flags(sv, pos, rem, repl, repl_len, 0);
+ sv_insert_flags(sv, byte_pos, byte_len, repl, repl_len, 0);
if (repl_is_utf8)
SvUTF8_on(sv);
SvREFCNT_dec(repl_sv_copy);
@@ -3232,13 +3269,19 @@ PP(pp_substr)
SvREFCNT_dec(LvTARG(TARG));
LvTARG(TARG) = SvREFCNT_inc_simple(sv);
}
- LvTARGOFF(TARG) = upos;
- LvTARGLEN(TARG) = urem;
+ LvTARGOFF(TARG) = pos;
+ LvTARGLEN(TARG) = len;
}
}
SPAGAIN;
PUSHs(TARG); /* avoid SvSETMAGIC here */
RETURN;
+
+bound_fail:
+ if (lvalue || repl)
+ Perl_croak(aTHX_ "substr outside of string");
+ Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string");
+ RETPUSHUNDEF;
}
PP(pp_vec)
diff --git a/proto.h b/proto.h
index 4a343be..01be297 100644
--- a/proto.h
+++ b/proto.h
@@ -3374,6 +3374,11 @@ PERL_CALLCONV void Perl_sv_pos_u2b(pTHX_ SV *const sv, I32 *const offsetp, I32 *
#define PERL_ARGS_ASSERT_SV_POS_U2B \
assert(offsetp)
+PERL_CALLCONV STRLEN Perl_sv_pos_u2b_flags(pTHX_ SV *const sv, STRLEN uoffset, STRLEN *const lenp, U32 flags)
+ __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_SV_POS_U2B_FLAGS \
+ assert(sv)
+
PERL_CALLCONV void Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_SV_POS_B2U \
diff --git a/sv.c b/sv.c
index 4ab41f6..57231ec 100644
--- a/sv.c
+++ b/sv.c
@@ -6240,62 +6240,99 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start
/*
-=for apidoc sv_pos_u2b
+=for apidoc sv_pos_u2b_flags
Converts the value pointed to by offsetp from a count of UTF-8 chars from
the start of the string, to a count of the equivalent number of bytes; if
lenp is non-zero, it does the same to lenp, but this time starting from
-the offset, rather than from the start of the string. Handles magic and
-type coercion.
+the offset, rather than from the start of the string. Handles type coercion.
+I<flags> is passed to C<SvPV_flags>, and usually should be
+C<SV_GMAGIC|SV_CONST_RETURN> to handle magic.
=cut
*/
/*
- * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
+ * sv_pos_u2b_flags() uses, like sv_pos_b2u(), the mg_ptr of the potential
* PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and
* byte offsets. See also the comments of S_utf8_mg_pos_cache_update().
*
*/
-void
-Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp)
+STRLEN
+Perl_sv_pos_u2b_flags(pTHX_ SV *const sv, STRLEN uoffset, STRLEN *const lenp,
+ U32 flags)
{
const U8 *start;
STRLEN len;
+ STRLEN boffset;
- PERL_ARGS_ASSERT_SV_POS_U2B;
-
- if (!sv)
- return;
+ PERL_ARGS_ASSERT_SV_POS_U2B_FLAGS;
- start = (U8*)SvPV_const(sv, len);
+ start = (U8*)SvPV_flags(sv, len, flags);
if (len) {
- STRLEN uoffset = (STRLEN) *offsetp;
const U8 * const send = start + len;
MAGIC *mg = NULL;
- const STRLEN boffset = sv_pos_u2b_cached(sv, &mg, start, send,
- uoffset, 0, 0);
-
- *offsetp = (I32) boffset;
+ boffset = sv_pos_u2b_cached(sv, &mg, start, send, uoffset, 0, 0);
if (lenp) {
/* Convert the relative offset to absolute. */
- const STRLEN uoffset2 = uoffset + (STRLEN) *lenp;
+ const STRLEN uoffset2 = uoffset + *lenp;
const STRLEN boffset2
= sv_pos_u2b_cached(sv, &mg, start, send, uoffset2,
uoffset, boffset) - boffset;
*lenp = boffset2;
}
- }
- else {
- *offsetp = 0;
- if (lenp)
- *lenp = 0;
+ } else {
+ if (lenp)
+ *lenp = 0;
+ boffset = 0;
}
- return;
+ return boffset;
+}
+
+/*
+=for apidoc sv_pos_u2b
+
+Converts the value pointed to by offsetp from a count of UTF-8 chars from
+the start of the string, to a count of the equivalent number of bytes; if
+lenp is non-zero, it does the same to lenp, but this time starting from
+the offset, rather than from the start of the string. Handles magic and
+type coercion.
+
+Use C<sv_pos_u2b_flags> in preference, which correctly handles strings longer
+than 2Gb.
+
+=cut
+*/
+
+/*
+ * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
+ * PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and
+ * byte offsets. See also the comments of S_utf8_mg_pos_cache_update().
+ *
+ */
+
+/* This function is subject to size and sign problems */
+
+void
+Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp)
+{
+ STRLEN uoffset = (STRLEN)*offsetp;
+
+ PERL_ARGS_ASSERT_SV_POS_U2B;
+
+ if (lenp) {
+ STRLEN ulen = (STRLEN)*lenp;
+ *offsetp = (I32)sv_pos_u2b_flags(sv, (STRLEN)*offsetp, &ulen,
+ SV_GMAGIC|SV_CONST_RETURN);
+ *lenp = (I32)ulen;
+ } else {
+ *offsetp = (I32)sv_pos_u2b_flags(sv, (STRLEN)*offsetp, NULL,
+ SV_GMAGIC|SV_CONST_RETURN);
+ }
}
/* Create and update the UTF8 magic offset cache, with the proffered utf8/
diff --git a/t/re/substr.t b/t/re/substr.t
index c3fa6e1..d0717ba 100644
--- a/t/re/substr.t
+++ b/t/re/substr.t
@@ -24,7 +24,7 @@ $SIG{__WARN__} = sub {
require './test.pl';
-plan(334);
+plan(360);
run_tests() unless caller;
@@ -201,6 +201,11 @@ is($w--, 1);
eval{substr($a,1) = "" ; }; # P=R=S Q
like($@, $FATAL_MSG);
+$b = substr($a,-7,-6) ; # warn # Q R P S
+is($w--, 1);
+eval{substr($a,-7,-6) = "" ; }; # Q R P S
+like($@, $FATAL_MSG);
+
my $a = 'zxcvbnm';
substr($a,2,0) = '';
is($a, 'zxcvbnm');
@@ -682,4 +687,39 @@ is($x, "\x{100}\x{200}\xFFb");
is(substr($a,1,1), 'b');
}
+# [perl #62646] offsets exceeding 32 bits on 64-bit system
+SKIP: {
+ skip("32-bit system", 24) unless ~0 > 0xffffffff;
+ my $a = "abc";
+ my $s;
+ my $r;
+
+ utf8::downgrade($a);
+ for (1..2) {
+ $w = 0;
+ $r = substr($a, 0xffffffff, 1);
+ is($r, undef);
+ is($w, 1);
+
+ $w = 0;
+ $r = substr($a, 0xffffffff+1, 1);
+ is($r, undef);
+ is($w, 1);
+
+ $w = 0;
+ ok( !eval { $r = substr($s=$a, 0xffffffff, 1, "_"); 1 } );
+ is($r, undef);
+ is($s, $a);
+ is($w, 0);
+
+ $w = 0;
+ ok( !eval { $r = substr($s=$a, 0xffffffff+1, 1, "_"); 1 } );
+ is($r, undef);
+ is($s, $a);
+ is($w, 0);
+
+ utf8::upgrade($a);
+ }
+}
+
}
--
Perl5 Master Repository
<http://perl5.git.perl.org/perl.git/commitdiff/d931b1bedc59d4625c59047dfda2b1bd83ff9f71?hp=6e3b7bfa2b063f4ce0c55f84474edb7d2c652387>
- Log -----------------------------------------------------------------
commit d931b1bedc59d4625c59047dfda2b1bd83ff9f71
Author: Nicholas Clark <***@ccl4.org>
Date: Sun Feb 14 12:31:44 2010 +0000
Convert Perl_sv_pos_u2b_proper() to Perl_sv_pos_u2b_flags().
Change from a value/return offset pointer to passing a Unicode offset, and
returning a byte offset. The optional length value/return pointer remains.
Add a flags argument, passed to SvPV_flags(). This allows the caller to
specify whether mg_get() should be called on sv.
M embed.fnc
M embed.h
M global.sym
M mg.c
M pp.c
M proto.h
M sv.c
commit 1c90055717b05b3f652bf543a46419001314c53e
Author: Nicholas Clark <***@ccl4.org>
Date: Sun Feb 14 16:04:35 2010 +0000
Remove a vestigial STRLEN case and convert a label to lowercase.
(Tweaking 777f7c561610dee6.)
M mg.c
M pp.c
commit 777f7c561610dee641c77666e5a4a0d9ac1d4230
Author: Eric Brine <***@adaelis.com>
Date: Thu Feb 11 20:28:29 2010 -0500
Removes 32-bit limit on substr arguments. The full range of IV and UV is available for the pos and len arguments, with safe conversion to STRLEN where it's smaller than an IV.
M embed.fnc
M embed.h
M global.sym
M mg.c
M pp.c
M proto.h
M sv.c
M t/re/substr.t
-----------------------------------------------------------------------
Summary of changes:
embed.fnc | 2 +
embed.h | 2 +
global.sym | 1 +
mg.c | 21 ++++----
pp.c | 145 +++++++++++++++++++++++++++++++++++++--------------------
proto.h | 5 ++
sv.c | 83 +++++++++++++++++++++++---------
t/re/substr.t | 42 ++++++++++++++++-
8 files changed, 215 insertions(+), 86 deletions(-)
diff --git a/embed.fnc b/embed.fnc
index 7463274..769481b 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1165,6 +1165,8 @@ ApdR |SV* |sv_newmortal
Apd |SV* |sv_newref |NULLOK SV *const sv
Ap |char* |sv_peek |NULLOK SV* sv
Apd |void |sv_pos_u2b |NULLOK SV *const sv|NN I32 *const offsetp|NULLOK I32 *const lenp
+Apd |STRLEN |sv_pos_u2b_flags|NN SV *const sv|STRLEN uoffset \
+ |NULLOK STRLEN *const lenp|U32 flags
Apd |void |sv_pos_b2u |NULLOK SV *const sv|NN I32 *const offsetp
Amdb |char* |sv_pvn_force |NN SV* sv|NULLOK STRLEN* lp
Apd |char* |sv_pvutf8n_force|NN SV *const sv|NULLOK STRLEN *const lp
diff --git a/embed.h b/embed.h
index 246106b..eda36a7 100644
--- a/embed.h
+++ b/embed.h
@@ -967,6 +967,7 @@
#define sv_newref Perl_sv_newref
#define sv_peek Perl_sv_peek
#define sv_pos_u2b Perl_sv_pos_u2b
+#define sv_pos_u2b_flags Perl_sv_pos_u2b_flags
#define sv_pos_b2u Perl_sv_pos_b2u
#define sv_pvutf8n_force Perl_sv_pvutf8n_force
#define sv_pvbyten_force Perl_sv_pvbyten_force
@@ -3371,6 +3372,7 @@
#define sv_newref(a) Perl_sv_newref(aTHX_ a)
#define sv_peek(a) Perl_sv_peek(aTHX_ a)
#define sv_pos_u2b(a,b,c) Perl_sv_pos_u2b(aTHX_ a,b,c)
+#define sv_pos_u2b_flags(a,b,c,d) Perl_sv_pos_u2b_flags(aTHX_ a,b,c,d)
#define sv_pos_b2u(a,b) Perl_sv_pos_b2u(aTHX_ a,b)
#define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b)
#define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b)
diff --git a/global.sym b/global.sym
index f0361df..a99548e 100644
--- a/global.sym
+++ b/global.sym
@@ -567,6 +567,7 @@ Perl_sv_newmortal
Perl_sv_newref
Perl_sv_peek
Perl_sv_pos_u2b
+Perl_sv_pos_u2b_flags
Perl_sv_pos_b2u
Perl_sv_pvn_force
Perl_sv_pvutf8n_force
diff --git a/mg.c b/mg.c
index b9a1464..cc01547 100644
--- a/mg.c
+++ b/mg.c
@@ -2008,19 +2008,19 @@ Perl_magic_getsubstr(pTHX_ SV *sv, MAGIC *mg)
STRLEN len;
SV * const lsv = LvTARG(sv);
const char * const tmps = SvPV_const(lsv,len);
- I32 offs = LvTARGOFF(sv);
- I32 rem = LvTARGLEN(sv);
+ STRLEN offs = LvTARGOFF(sv);
+ STRLEN rem = LvTARGLEN(sv);
PERL_ARGS_ASSERT_MAGIC_GETSUBSTR;
PERL_UNUSED_ARG(mg);
if (SvUTF8(lsv))
- sv_pos_u2b(lsv, &offs, &rem);
- if (offs > (I32)len)
+ offs = sv_pos_u2b_flags(lsv, offs, &rem, SV_CONST_RETURN);
+ if (offs > len)
offs = len;
- if (rem + offs > (I32)len)
+ if (rem > len - offs)
rem = len - offs;
- sv_setpvn(sv, tmps + offs, (STRLEN)rem);
+ sv_setpvn(sv, tmps + offs, rem);
if (SvUTF8(lsv))
SvUTF8_on(sv);
return 0;
@@ -2033,22 +2033,22 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
STRLEN len;
const char * const tmps = SvPV_const(sv, len);
SV * const lsv = LvTARG(sv);
- I32 lvoff = LvTARGOFF(sv);
- I32 lvlen = LvTARGLEN(sv);
+ STRLEN lvoff = LvTARGOFF(sv);
+ STRLEN lvlen = LvTARGLEN(sv);
PERL_ARGS_ASSERT_MAGIC_SETSUBSTR;
PERL_UNUSED_ARG(mg);
if (DO_UTF8(sv)) {
sv_utf8_upgrade(lsv);
- sv_pos_u2b(lsv, &lvoff, &lvlen);
+ lvoff = sv_pos_u2b_flags(lsv, lvoff, &lvlen, SV_CONST_RETURN);
sv_insert(lsv, lvoff, lvlen, tmps, len);
LvTARGLEN(sv) = sv_len_utf8(sv);
SvUTF8_on(lsv);
}
else if (lsv && SvUTF8(lsv)) {
const char *utf8;
- sv_pos_u2b(lsv, &lvoff, &lvlen);
+ lvoff = sv_pos_u2b_flags(lsv, lvoff, &lvlen, SV_CONST_RETURN);
LvTARGLEN(sv) = len;
utf8 = (char*)bytes_to_utf8((U8*)tmps, &len);
sv_insert(lsv, lvoff, lvlen, utf8, len);
@@ -2059,7 +2059,6 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
LvTARGLEN(sv) = len;
}
-
return 0;
}
diff --git a/pp.c b/pp.c
index 2f4703b..3e2ed48 100644
--- a/pp.c
+++ b/pp.c
@@ -3079,15 +3079,19 @@ PP(pp_substr)
{
dVAR; dSP; dTARGET;
SV *sv;
- I32 len = 0;
STRLEN curlen;
STRLEN utf8_curlen;
- I32 pos;
- I32 rem;
- I32 fail;
+ SV * pos_sv;
+ IV pos1_iv;
+ int pos1_is_uv;
+ IV pos2_iv;
+ int pos2_is_uv;
+ SV * len_sv;
+ IV len_iv = 0;
+ int len_is_uv = 1;
const I32 lvalue = PL_op->op_flags & OPf_MOD || LVRET;
const char *tmps;
- const I32 arybase = CopARYBASE_get(PL_curcop);
+ const IV arybase = CopARYBASE_get(PL_curcop);
SV *repl_sv = NULL;
const char *repl = NULL;
STRLEN repl_len;
@@ -3103,9 +3107,13 @@ PP(pp_substr)
repl = SvPV_const(repl_sv, repl_len);
repl_is_utf8 = DO_UTF8(repl_sv) && SvCUR(repl_sv);
}
- len = POPi;
+ len_sv = POPs;
+ len_iv = SvIV(len_sv);
+ len_is_uv = SvIOK_UV(len_sv);
}
- pos = POPi;
+ pos_sv = POPs;
+ pos1_iv = SvIV(pos_sv);
+ pos1_is_uv = SvIOK_UV(pos_sv);
sv = POPs;
PUTBACK;
if (repl_sv) {
@@ -3127,51 +3135,80 @@ PP(pp_substr)
else
utf8_curlen = 0;
- if (pos >= arybase) {
- pos -= arybase;
- rem = curlen-pos;
- fail = rem;
- if (num_args > 2) {
- if (len < 0) {
- rem += len;
- if (rem < 0)
- rem = 0;
- }
- else if (rem > len)
- rem = len;
+ if ( (pos1_is_uv && arybase < 0) || (pos1_iv >= arybase) ) { /* pos >= $[ */
+ UV pos1_uv = pos1_iv-arybase;
+ /* Overflow can occur when $[ < 0 */
+ if (arybase < 0 && pos1_uv < (UV)pos1_iv)
+ goto bound_fail;
+ pos1_iv = pos1_uv;
+ pos1_is_uv = 1;
+ }
+ else if (pos1_is_uv ? (UV)pos1_iv > 0 : pos1_iv > 0) {
+ goto bound_fail; /* $[=3; substr($_,2,...) */
+ }
+ else { /* pos < $[ */
+ if (pos1_iv == 0) { /* $[=1; substr($_,0,...) */
+ pos1_iv = curlen;
+ pos1_is_uv = 1;
+ } else {
+ if (curlen) {
+ pos1_is_uv = curlen-1 > ~(UV)pos1_iv;
+ pos1_iv += curlen;
+ }
}
}
- else {
- pos += curlen;
- if (num_args < 3)
- rem = curlen;
- else if (len >= 0) {
- rem = pos+len;
- if (rem > (I32)curlen)
- rem = curlen;
+ if (pos1_is_uv || pos1_iv > 0) {
+ if ((UV)pos1_iv > curlen)
+ goto bound_fail;
+ }
+
+ if (num_args > 2) {
+ if (!len_is_uv && len_iv < 0) {
+ pos2_iv = curlen + len_iv;
+ if (curlen)
+ pos2_is_uv = curlen-1 > ~(UV)len_iv;
+ else
+ pos2_is_uv = 0;
+ } else { /* len_iv >= 0 */
+ if (!pos1_is_uv && pos1_iv < 0) {
+ pos2_iv = pos1_iv + len_iv;
+ pos2_is_uv = (UV)len_iv > (UV)IV_MAX;
+ } else {
+ if ((UV)len_iv > curlen-(UV)pos1_iv)
+ pos2_iv = curlen;
+ else
+ pos2_iv = pos1_iv+len_iv;
+ pos2_is_uv = 1;
+ }
}
- else {
- rem = curlen+len;
- if (rem < pos)
- rem = pos;
- }
- if (pos < 0)
- pos = 0;
- fail = rem;
- rem -= pos;
- }
- if (fail < 0) {
- if (lvalue || repl)
- Perl_croak(aTHX_ "substr outside of string");
- Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string");
- RETPUSHUNDEF;
}
else {
- const I32 upos = pos;
- const I32 urem = rem;
- if (utf8_curlen)
- sv_pos_u2b(sv, (I32 *)&pos, (I32 *)&rem);
- tmps += pos;
+ pos2_iv = curlen;
+ pos2_is_uv = 1;
+ }
+
+ if (!pos2_is_uv && pos2_iv < 0) {
+ if (!pos1_is_uv && pos1_iv < 0)
+ goto bound_fail;
+ pos2_iv = 0;
+ }
+ else if (!pos1_is_uv && pos1_iv < 0)
+ pos1_iv = 0;
+
+ if ((UV)pos2_iv < (UV)pos1_iv)
+ pos2_iv = pos1_iv;
+ if ((UV)pos2_iv > curlen)
+ pos2_iv = curlen;
+
+ {
+ /* pos1_iv and pos2_iv both in 0..curlen, so the cast is safe */
+ const STRLEN pos = (STRLEN)( (UV)pos1_iv );
+ const STRLEN len = (STRLEN)( (UV)pos2_iv - (UV)pos1_iv );
+ STRLEN byte_len = len;
+ STRLEN byte_pos = utf8_curlen
+ ? sv_pos_u2b_flags(sv, pos, &byte_len, SV_CONST_RETURN) : pos;
+
+ tmps += byte_pos;
/* we either return a PV or an LV. If the TARG hasn't been used
* before, or is of that type, reuse it; otherwise use a mortal
* instead. Note that LVs can have an extended lifetime, so also
@@ -3185,7 +3222,7 @@ PP(pp_substr)
}
}
- sv_setpvn(TARG, tmps, rem);
+ sv_setpvn(TARG, tmps, byte_len);
#ifdef USE_LOCALE_COLLATE
sv_unmagic(TARG, PERL_MAGIC_collxfrm);
#endif
@@ -3202,7 +3239,7 @@ PP(pp_substr)
}
if (!SvOK(sv))
sv_setpvs(sv, "");
- sv_insert_flags(sv, pos, rem, repl, repl_len, 0);
+ sv_insert_flags(sv, byte_pos, byte_len, repl, repl_len, 0);
if (repl_is_utf8)
SvUTF8_on(sv);
SvREFCNT_dec(repl_sv_copy);
@@ -3232,13 +3269,19 @@ PP(pp_substr)
SvREFCNT_dec(LvTARG(TARG));
LvTARG(TARG) = SvREFCNT_inc_simple(sv);
}
- LvTARGOFF(TARG) = upos;
- LvTARGLEN(TARG) = urem;
+ LvTARGOFF(TARG) = pos;
+ LvTARGLEN(TARG) = len;
}
}
SPAGAIN;
PUSHs(TARG); /* avoid SvSETMAGIC here */
RETURN;
+
+bound_fail:
+ if (lvalue || repl)
+ Perl_croak(aTHX_ "substr outside of string");
+ Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string");
+ RETPUSHUNDEF;
}
PP(pp_vec)
diff --git a/proto.h b/proto.h
index 4a343be..01be297 100644
--- a/proto.h
+++ b/proto.h
@@ -3374,6 +3374,11 @@ PERL_CALLCONV void Perl_sv_pos_u2b(pTHX_ SV *const sv, I32 *const offsetp, I32 *
#define PERL_ARGS_ASSERT_SV_POS_U2B \
assert(offsetp)
+PERL_CALLCONV STRLEN Perl_sv_pos_u2b_flags(pTHX_ SV *const sv, STRLEN uoffset, STRLEN *const lenp, U32 flags)
+ __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_SV_POS_U2B_FLAGS \
+ assert(sv)
+
PERL_CALLCONV void Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_SV_POS_B2U \
diff --git a/sv.c b/sv.c
index 4ab41f6..57231ec 100644
--- a/sv.c
+++ b/sv.c
@@ -6240,62 +6240,99 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start
/*
-=for apidoc sv_pos_u2b
+=for apidoc sv_pos_u2b_flags
Converts the value pointed to by offsetp from a count of UTF-8 chars from
the start of the string, to a count of the equivalent number of bytes; if
lenp is non-zero, it does the same to lenp, but this time starting from
-the offset, rather than from the start of the string. Handles magic and
-type coercion.
+the offset, rather than from the start of the string. Handles type coercion.
+I<flags> is passed to C<SvPV_flags>, and usually should be
+C<SV_GMAGIC|SV_CONST_RETURN> to handle magic.
=cut
*/
/*
- * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
+ * sv_pos_u2b_flags() uses, like sv_pos_b2u(), the mg_ptr of the potential
* PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and
* byte offsets. See also the comments of S_utf8_mg_pos_cache_update().
*
*/
-void
-Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp)
+STRLEN
+Perl_sv_pos_u2b_flags(pTHX_ SV *const sv, STRLEN uoffset, STRLEN *const lenp,
+ U32 flags)
{
const U8 *start;
STRLEN len;
+ STRLEN boffset;
- PERL_ARGS_ASSERT_SV_POS_U2B;
-
- if (!sv)
- return;
+ PERL_ARGS_ASSERT_SV_POS_U2B_FLAGS;
- start = (U8*)SvPV_const(sv, len);
+ start = (U8*)SvPV_flags(sv, len, flags);
if (len) {
- STRLEN uoffset = (STRLEN) *offsetp;
const U8 * const send = start + len;
MAGIC *mg = NULL;
- const STRLEN boffset = sv_pos_u2b_cached(sv, &mg, start, send,
- uoffset, 0, 0);
-
- *offsetp = (I32) boffset;
+ boffset = sv_pos_u2b_cached(sv, &mg, start, send, uoffset, 0, 0);
if (lenp) {
/* Convert the relative offset to absolute. */
- const STRLEN uoffset2 = uoffset + (STRLEN) *lenp;
+ const STRLEN uoffset2 = uoffset + *lenp;
const STRLEN boffset2
= sv_pos_u2b_cached(sv, &mg, start, send, uoffset2,
uoffset, boffset) - boffset;
*lenp = boffset2;
}
- }
- else {
- *offsetp = 0;
- if (lenp)
- *lenp = 0;
+ } else {
+ if (lenp)
+ *lenp = 0;
+ boffset = 0;
}
- return;
+ return boffset;
+}
+
+/*
+=for apidoc sv_pos_u2b
+
+Converts the value pointed to by offsetp from a count of UTF-8 chars from
+the start of the string, to a count of the equivalent number of bytes; if
+lenp is non-zero, it does the same to lenp, but this time starting from
+the offset, rather than from the start of the string. Handles magic and
+type coercion.
+
+Use C<sv_pos_u2b_flags> in preference, which correctly handles strings longer
+than 2Gb.
+
+=cut
+*/
+
+/*
+ * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
+ * PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and
+ * byte offsets. See also the comments of S_utf8_mg_pos_cache_update().
+ *
+ */
+
+/* This function is subject to size and sign problems */
+
+void
+Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp)
+{
+ STRLEN uoffset = (STRLEN)*offsetp;
+
+ PERL_ARGS_ASSERT_SV_POS_U2B;
+
+ if (lenp) {
+ STRLEN ulen = (STRLEN)*lenp;
+ *offsetp = (I32)sv_pos_u2b_flags(sv, (STRLEN)*offsetp, &ulen,
+ SV_GMAGIC|SV_CONST_RETURN);
+ *lenp = (I32)ulen;
+ } else {
+ *offsetp = (I32)sv_pos_u2b_flags(sv, (STRLEN)*offsetp, NULL,
+ SV_GMAGIC|SV_CONST_RETURN);
+ }
}
/* Create and update the UTF8 magic offset cache, with the proffered utf8/
diff --git a/t/re/substr.t b/t/re/substr.t
index c3fa6e1..d0717ba 100644
--- a/t/re/substr.t
+++ b/t/re/substr.t
@@ -24,7 +24,7 @@ $SIG{__WARN__} = sub {
require './test.pl';
-plan(334);
+plan(360);
run_tests() unless caller;
@@ -201,6 +201,11 @@ is($w--, 1);
eval{substr($a,1) = "" ; }; # P=R=S Q
like($@, $FATAL_MSG);
+$b = substr($a,-7,-6) ; # warn # Q R P S
+is($w--, 1);
+eval{substr($a,-7,-6) = "" ; }; # Q R P S
+like($@, $FATAL_MSG);
+
my $a = 'zxcvbnm';
substr($a,2,0) = '';
is($a, 'zxcvbnm');
@@ -682,4 +687,39 @@ is($x, "\x{100}\x{200}\xFFb");
is(substr($a,1,1), 'b');
}
+# [perl #62646] offsets exceeding 32 bits on 64-bit system
+SKIP: {
+ skip("32-bit system", 24) unless ~0 > 0xffffffff;
+ my $a = "abc";
+ my $s;
+ my $r;
+
+ utf8::downgrade($a);
+ for (1..2) {
+ $w = 0;
+ $r = substr($a, 0xffffffff, 1);
+ is($r, undef);
+ is($w, 1);
+
+ $w = 0;
+ $r = substr($a, 0xffffffff+1, 1);
+ is($r, undef);
+ is($w, 1);
+
+ $w = 0;
+ ok( !eval { $r = substr($s=$a, 0xffffffff, 1, "_"); 1 } );
+ is($r, undef);
+ is($s, $a);
+ is($w, 0);
+
+ $w = 0;
+ ok( !eval { $r = substr($s=$a, 0xffffffff+1, 1, "_"); 1 } );
+ is($r, undef);
+ is($s, $a);
+ is($w, 0);
+
+ utf8::upgrade($a);
+ }
+}
+
}
--
Perl5 Master Repository