文字列処理の高速化?

ただの一例ですが、先週の金曜日に松本さんに見せたときに
50秒ぐらいかかったプログラムはその後の commit でおそくなり、
65秒ぐらいかかっています。プログラムは以下の通り
(以前 Wolfgang Na'dasi-Donner が ruby-core に出した
ものがベースになっています。)
特に金曜日の夕方あたり文字列処理や UTF-8 の処理を早くする
commit が色々ありましたので、なぜで遅くなるのかをちょっと
疑問に思っています。

宜しくお願いします。 Martin.

# -*- encoding: utf-8 -*-

def scan_s
s = “e$B>>K\9T90e(B e$B>>9>e(B e$BEg:,e(B” * 5000
i = 0
10000.upto(30000) { |i| i+=1 if s[i]==‘a’ }
puts i
end

scan_s

#-#-# Martin J. Du"rst, Assoc. Professor, Aoyama Gakuin University
#-#-# http://www.sw.it.aoyama.ac.jp mailto:[email protected]

成瀬です。

Martin D. wrote:

# -*- encoding: utf-8 -*-

def scan_s
s = “e$B>>K\9T90e(B e$B>>9>e(B e$BEg:,e(B” * 5000
i = 0
10000.upto(30000) { |i| i+=1 if s[i]==‘a’ }
puts i
end

scan_s

これは現在高速化が働かないケースになります。なので、高速化に伴う変更で増
えた分岐によって逆に遅くなってしまっているでしょう。(たぶん
use
search_nonascii かな)

一行足して、
def scan_s
s = “e$B>>K\9T90e(B e$B>>9>e(B e$BEg:,e(B” * 5000
s.valid_encoding?
i = 0
10000.upto(30000) { |i| i+=1 if s[i]==‘a’ }
puts i
end
とすると数秒で終わるようになると思います。で、このようなケースにおいて
validation
をどこかのタイミングでこっそりかけられるといいんですが、どう
するのがいいのかなぁと悩んでいます。

まつもと ゆきひろです

In message "Re: [ruby-dev:33838] 文字列処理の高速化 ?"
on Mon, 18 Feb 2008 16:10:34 +0900, Martin D.
[email protected] writes:

|ただの一例ですが、先週の金曜日に松本さんに見せたときに
|50秒ぐらいかかったプログラムはその後の commit でおそくなり、
|65秒ぐらいかかっています。プログラムは以下の通り
|(以前 Wolfgang Na'dasi-Donner が ruby-core に出した
|ものがベースになっています。)
|特に金曜日の夕方あたり文字列処理や UTF-8 の処理を早くする
|commit が色々ありましたので、なぜで遅くなるのかをちょっと
|疑問に思っています。

strlene$B$G$Oe(Bcoderangee$B$,@_Dj$5$l$J$$$+$i7k6Ie(BUTF-8e$B$N:GE,2=$,F/$+e(B
e$B$J$$$+$i$N$h$&$G$9!#BP:v$H$7$F$Oe(B

  • UTF-8e$B:GE,2=$re(BENC_CODERANGE_VALIDe$B$G$J$/$F$bF0$/$h$&$K$9$ke(B
    (e$BFq$7$=$&e(B)
  • strlene$B$Ge(Bcoderange_scan()e$BAjEv$rF1;~$K9T$&e(B

e$B$N$$$:$l$+$,9M$($i$l$^$9!#$H$j$"$($:!"8e<T$NBP:v$r<BAu$7$F$_e(B
e$B$?$H$3$m!"e(BMartine$B$5$s$NNc$G$Oe(B23.95se$B$,e(B2.89se$B$K$J$j$^$7$?!#e(B

e$B%Q%C%A$G$9!#$@$l$+$,%/%j!<%s%"%C%W$7$F$/$l$k$H$&$l$7$$!#e(B

diff --git a/string.c b/string.c
--- a/string.c
+++ b/string.c
@@ -619,10 +619,64 @@ rb_enc_strlen(const char *p, const char *e,
rb_encoding *enc)
return c;
}

+long
+rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int
*cr)
+{

  • long c;
  • const char *q;
  • int ret;
  • *cr = 0;
  • if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  •    return (e - p) / rb_enc_mbminlen(enc);
    
  • }
  • else if (rb_enc_asciicompat(enc)) {
  • *cr = ENC_CODERANGE_7BIT;
  •    c = 0;
    
  •    while (p < e) {
    
  •        if (ISASCII(*p)) {
    
  •            q = search_nonascii(p, e);
    
  •            if (!q) {
    
  •                return c + (e - p);
    
  • }
  •            c += q - p;
    
  •            p = q;
    
  •        }
    
  •        ret = rb_enc_precise_mbclen(p, e, enc);
    
  •        if (MBCLEN_CHARFOUND_P(ret)) {
    
  • if (*cr != ENC_CODERANGE_BROKEN)
  •    *cr = ENC_CODERANGE_VALID;
    
  • p += MBCLEN_CHARFOUND_LEN(ret);
  •        }
    
  •  else {
    
  • *cr = ENC_CODERANGE_BROKEN;
  • p++;
  •  }
    
  •        c++;
    
  •    }
    
  •    return c;
    
  • }
  • for (c=0; p<e; c++) {
  • ret = rb_enc_precise_mbclen(p, e, enc);
  • if (MBCLEN_CHARFOUND_P(ret)) {
  •  if (*cr != ENC_CODERANGE_BROKEN)
    
  • *cr = ENC_CODERANGE_VALID;
  •  p += MBCLEN_CHARFOUND_LEN(ret);
    
  • }
  • else {
  •  *cr = ENC_CODERANGE_BROKEN;
    
  •  p++;
    
  • }
  • }
  • return c;
    +}

static long
str_strlen(VALUE str, rb_encoding *enc)
{
const char *p, *e;

  • int n, cr;

    if (single_byte_optimizable(str)) return RSTRING_LEN(str);
    if (!enc) enc = STR_ENC_GET(str);
    @@ -661,7 +715,11 @@ str_strlen(VALUE str, rb_encoding *enc)
    return len;
    }
    #endif

  • return rb_enc_strlen(p, e, enc);
  • n = rb_enc_strlen_cr(p, e, enc, &cr);
  • if (cr) {
  •    ENC_CODERANGE_SET(str, cr);
    
  • }
  • return n;
    }

/*

In article [email protected],
“NARUSE, Yui” [email protected] writes:

validation をどこかのタイミングでこっそりかけられるといいんですが、どう
するのがいいのかなぁと悩んでいます。

まずは文字列リテラルじゃないですかねぇ。

成瀬です。

Yukihiro M. wrote:

strlene$B$G$Oe(Bcoderangee$B$,@_Dj$5$l$J$$$+$i7k6Ie(BUTF-8e$B$N:GE,2=$,F/$+e(B
e$B$J$$$+$i$N$h$&$G$9!#BP:v$H$7$F$Oe(B

  • UTF-8e$B:GE,2=$re(BENC_CODERANGE_VALIDe$B$G$J$/$F$bF0$/$h$&$K$9$ke(B
    (e$BFq$7$=$&e(B)

ワードごとにまとめて数えるという手法自体は可能だと思うのですが、
validation
を行いつつ数えるとなると数倍単位で遅くなってしまうでしょうね。

ワード単位で validation
ってのも考えてはいるのですが、早くとも10倍未満な
うえに 1
度しか行わないのでシステムに対するインパクトが低く、一方でかな
り複雑になりそうなので、素直にバイト単位で読んだ方がよさそうです。

  • strlene$B$Ge(Bcoderange_scan()e$BAjEv$rF1;~$K9T$&e(B

こちらに加えて、[ruby-dev:33851] の文字列リテラルや IO
coderange_scan
自体の最適化がありえるかなと思っています。

パッチです。だれかがクリーンアップしてくれるとうれしい。
とりあえず、

  • if (*cr != ENC_CODERANGE_BROKEN)
  •    *cr = ENC_CODERANGE_VALID;
    

という分岐をなくしてみました。

--- string.c (revision 15548)
+++ string.c (working copy)
@@ -619,10 +619,63 @@ rb_enc_strlen(const char *p, const char
return c;
}

+long
+rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int
*cr)
+{

  • long c;
  • const char *q;
  • int ret;
  • *cr = 0;
  • if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  •   return (e - p) / rb_enc_mbminlen(enc);
    
  • }
  • else if (rb_enc_asciicompat(enc)) {
  •   c = 0;
    
  •   while (p < e) {
    
  •       if (ISASCII(*p)) {
    
  •           q = search_nonascii(p, e);
    
  •           if (!q) {
    
  •               return c + (e - p);
    
  •           }
    
  •           c += q - p;
    
  •           p = q;
    
  •       }
    
  •       ret = rb_enc_precise_mbclen(p, e, enc);
    
  •       if (MBCLEN_CHARFOUND_P(ret)) {
    
  •           *cr |= ENC_CODERANGE_VALID;
    
  •           p += MBCLEN_CHARFOUND_LEN(ret);
    
  •       }
    
  •       else {
    
  •           *cr = ENC_CODERANGE_BROKEN;
    
  •           p++;
    
  •       }
    
  •       c++;
    
  •   }
    
  •   if (!*cr) *cr = ENC_CODERANGE_7BIT;
    
  •   return c;
    
  • }
  • for (c=0; p<e; c++) {
  •   ret = rb_enc_precise_mbclen(p, e, enc);
    
  •   if (MBCLEN_CHARFOUND_P(ret)) {
    
  •       *cr |= ENC_CODERANGE_VALID;
    
  •       p += MBCLEN_CHARFOUND_LEN(ret);
    
  •   }
    
  •   else {
    
  •       *cr = ENC_CODERANGE_BROKEN;
    
  •       p++;
    
  •   }
    
  • }
  • if (!*cr) *cr = ENC_CODERANGE_7BIT;
  • return c;
    +}

static long
str_strlen(VALUE str, rb_encoding *enc)
{
const char *p, *e;

  • int n, cr;

    if (single_byte_optimizable(str)) return RSTRING_LEN(str);
    if (!enc) enc = STR_ENC_GET(str);
    @@ -661,7 +714,11 @@ str_strlen(VALUE str, rb_encoding *enc)
    return len;
    }
    #endif

  • return rb_enc_strlen(p, e, enc);
  • n = rb_enc_strlen_cr(p, e, enc, &cr);
  • if (cr) {
  •    ENC_CODERANGE_SET(str, cr);
    
  • }
  • return n;
    }

/*

まつもと ゆきひろです

In message "Re: [ruby-dev:33860] Re: 文字列処理の高速化 ?"
on Tue, 19 Feb 2008 19:40:44 +0900, “NARUSE, Yui”
[email protected] writes:

|> * strlene$B$Ge(Bcoderange_scan()e$BAjEv$rF1;~$K9T$&e(B
|
|e$B$3$A$i$K2C$($F!“e(B[ruby-dev:33851] e$B$NJ8;zNs%j%F%i%k$de(B IO coderange_scan
|e$B<+BN$N:GE,2=$,$”$j$($k$+$J$H;W$C$F$$$^$9!#e(B
|
|> e$B%Q%C%A$G$9!#$@$l$+$,%/%j!<%s%“%C%W$7$F$/$l$k$H$&$l$7$$!#e(B
|e$B$H$j$”$($:!"e(B
|+ if (*cr != ENC_CODERANGE_BROKEN)
|+ *cr = ENC_CODERANGE_VALID;
|e$B$H$$$&J,4t$r$J$/$7$F$_$^$7$?!#e(B

e$B%3%_%C%H$7$F$/$@$5$$$^$;$s$+!#%j%F%i%k$Ne(Bcoderange_scane$B$O$=$Ne(B
e$B$&$A<j$r$D$1$?$$$G$9$M!#e(B