SSE2版 strlen

成瀬です。

サイボウズの光成さんの所に SSE2 を用いた strlen
が載っているのですが、
これを取り込みませんか。
http://labs.cybozu.co.jp/blog/mitsunari/

SSE2 は CPU 依存が・・・とお思いになるかもしれませんが、
本家の Intel では Pentium4 (2000年11月)
から搭載されていますし、
AMD でも Athlon64 から搭載されています。
また、AMD64 規格では SSE2
が必須であるため、今後の両者の CPU、
さらに (もし登場すれば) 他の AMD64 な CPU
でも恩恵を受けることができます。

許諾等はわたしがやります。

#include <stddef.h>   /* size_t */
#include <string.h>   /* strlen, used by the portable fallback */

/*
 * SSE2-accelerated strlen.
 *
 * The SSE2 path is selected at COMPILE time only: a binary built with SSE2
 * enabled executes SSE2 instructions unconditionally and will not run on a
 * CPU without SSE2.  No run-time CPU detection is performed here.
 *
 * __SSE2__ is defined by GCC/Clang when SSE2 code generation is enabled;
 * MSVC does not define it, so its own macros are checked as well.
 */
#if defined(__SSE2__) || defined(_M_X64) || \
    (defined(_M_IX86_FP) && _M_IX86_FP >= 2)

#include <emmintrin.h>   /* SSE2: __m128i, _mm_load_si128, _mm_cmpeq_epi8, ... */
#ifdef _MSC_VER
#include <intrin.h>      /* _BitScanForward */
#endif

/*
 * Return the index of the least significant set bit of `mask`.
 * `mask` must be non-zero; the result is undefined otherwise.
 */
static inline unsigned int
bsf(unsigned int mask)
{
#ifdef _MSC_VER
    unsigned long index;

    _BitScanForward(&index, mask);
    return (unsigned int)index;
#else
    return (unsigned int)__builtin_ctz(mask);
#endif
}

/*
 * Return the number of bytes in the NUL-terminated string `p`, not counting
 * the terminator (same contract as strlen; `p` must not be NULL).
 *
 * The string is scanned in 16-byte chunks using aligned loads only.  An
 * aligned 16-byte load never straddles a page boundary, so although bytes
 * outside the string (before its start and after its terminator, within the
 * same 16-byte chunk) may be read, no access can touch a page that the
 * string itself does not occupy.  Memory checkers may still report these
 * reads.
 */
size_t
rb_strlen(const char *p)
{
    const char *const top = p;
    const __m128i zero = _mm_setzero_si128();
    const size_t addr = (size_t)p;
    const size_t misalign = addr & 15;

    /* Head: handle the partial chunk in front of the first 16-byte boundary. */
    if (misalign > 0) {
        const __m128i chunk =
            _mm_load_si128((const __m128i *)(addr & ~(size_t)15));
        unsigned int mask =
            (unsigned int)_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zero));

        /* Discard matches in the bytes that precede the start of the string. */
        mask &= ~0U << misalign;
        if (mask) {
            return bsf(mask) - misalign;
        }
        p += 16 - misalign;
    }

    /* Body: p is 16-byte aligned from here on. */
    for (;;) {
        const __m128i chunk = _mm_load_si128((const __m128i *)p);
        const unsigned int mask =
            (unsigned int)_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zero));

        if (mask) {
            return (size_t)(p - top) + bsf(mask);
        }
        p += 16;
    }
}

#else

/* No SSE2 at compile time: defer to the C library. */
#define rb_strlen(s) strlen(s)

#endif

まつもと ゆきひろです

In message “Re: [ruby-dev:35054] SSE2e$BHGe(B strlen”
on Thu, 12 Jun 2008 01:44:14 +0900, “NARUSE, Yui”
[email protected] writes:

|サイボウズの光成さんの所に SSE2 を用いた strlen が載っているのですが、
|これを取り込みませんか。
|mitsunari@cybozu labs
|
|SSE2 は CPU 依存が・・・とお思いになるかもしれませんが、
|本家の Intel では Pentium4 (2000年11月) から搭載されていますし、
|AMD でも Athlon64 から搭載されています。
|また、AMD64 規格では SSE2 が必須であるため、今後の両者の CPU、
|さらに (もし登場すれば) 他の AMD64 な CPU でも恩恵を受けることができます。

e$B%P%$%J%j$N8_49@-$J$I!“<c435$$K$J$kE@$O$”$j$^$9$,!“<h$j9~$s$Ge(B
e$B$b$$$$$s$8$c$J$$$G$7$g$&$+!#BLL$@$C$?$iLa$;$P$$$$$o$1$@$7!#e(B
e$BIaDL$Ne(Bstrlen()e$B$O$”$^$jEP>l$7$J$$$N$G$I$N$/$i$$2~A1$5$l$k$+$Oe(B
e$BL$CN?t$G$9$,!#e(B

In message [email protected]
on Thu, 12 Jun 2008 01:44:14 +0900,
“NARUSE, Yui” [email protected] wrote:

SSE2 e$B$Oe(B CPU e$B0MB8$,!&!&!&$H$*;W$$$K$J$k$+$b$7$l$^$;$s$,!"e(B
e$BK\2H$Ne(B Intel e$B$G$Oe(B Pentium4 (2000e$BG/e(B11e$B7ne(B) e$B$+$iEk:$5$l$F$$$^$9$7!“e(B
AMD e$B$G$be(B Athlon64 e$B$+$iEk:$5$l$F$$$^$9!#e(B
e$B:G=E$K!”$3$&$$$&8E$$e(BCPUe$B$GF0$+$7$?$H$-$O$I$&$J$k$N$G$7$g$&$+!#e(B

cpu0 at mainbus0: (uniprocessor)
cpu0: Intel Celeron (686-class), 601.40 MHz, id 0x683
cpu0: features 383f9ff<FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,SEP,MTRR>
cpu0: features 383f9ff<PGE,MCA,CMOV,PAT,PSE36,MMX>
cpu0: features 383f9ff<FXSR,SSE>
cpu0: I-cache 16 KB 32B/line 4-way, D-cache 16 KB 32B/line 4-way
cpu0: L2 cache 128 KB 32B/line 4-way
cpu0: ITLB 32 4 KB entries 4-way, 2 4 MB entries fully associative
cpu0: DTLB 64 4 KB entries 4-way, 8 4 MB entries 4-way
cpu0: 8 page colors

e$B$"$H!"e(B(e$B$d$C$Q$j$A$g$C$H8E$$e(B)e$B>JEENO$Je(BCPUe$B$H$+!#e(B:-)

cpu0 at mainbus0: (uniprocessor)
cpu0: VIA C3 Samuel 2/Ezra (686-class), 533.38 MHz, id 0x673
cpu0: features 80803035<FPU,DE,TSC,MSR,MTRR>
cpu0: features 80803035<PGE,MMX>
cpu0: features 80803035<3DNOW>
cpu0: “VIA Samuel 2”
cpu0: I-cache 64 KB 32B/line 4-way, D-cache 64 KB 32B/line 4-way
cpu0: L2 cache 64 KB 32B/line 4-way
cpu0: ITLB 128 4 KB entries 8-way
cpu0: DTLB 128 4 KB entries 8-way
cpu0: 4 page colors

　ささだです．

Takahiro K. wrote:

e$B:G=E$K!"$3$&$$$&8E$$e(BCPUe$B$GF0$+$7$?$H$-$O$I$&$J$k$N$G$7$g$&$+!#e(B

　__SSE2__ を見るから大丈夫，という話じゃないでしょうか．

　それはともかく，評価データが見てみたいかも．

まつもと ゆきひろです

In message “Re: [ruby-dev:35060] Re: SSE2e$BHGe(B strlen”
on Thu, 12 Jun 2008 10:18:23 +0900, SASADA Koichi [email protected]
writes:

|Takahiro K. wrote:
|> e$B:G=E$K!"$3$&$$$&8E$$e(BCPUe$B$GF0$+$7$?$H$-$O$I$&$J$k$N$G$7$g$&$+!#e(B
|
|e$B!!e(B__SSE2__ e$B$r8+$k$+$iBg>fIW!$$H$$$&OC$8$c$J$$$G$7$g$&$+!%e(B

e$B$G$b%3%s%Q%$%k;~$N%A%‘%C%/$G$9$h$M!#?7$7$$e(BCPUe$B$N$"$k4D6-$G%3e(B
e$B%s%Q%$%k$7$?%P%$%J%j$O8E$$e(BCPUe$B$G;H$($J$$$H$$$&$3$H$K$J$k$s$Ge(B
e$B$7$g$&$+$M$’!#e(B

|e$B!!$=$l$O$H$b$+$/!$I>2A%G!<%?$,8+$F$_$?$$$+$b!%e(B

同意。

成瀬です。

確かにバイナリをそのまま移すとx86では動きませんね。
x86 では --with-sse
が指定された場合のみとかにした方がいいのかな。

検証データは別途用意します

08/06/12 e$B$Ke(B Takahiro K.[email protected]
e$B$5$s$O=q$-$^$7$?e(B:

西山和広です。

At Thu, 12 Jun 2008 10:44:08 +0900,
Yukihiro M. wrote:

|> e$B:G=E$K!"$3$&$$$&8E$$e(BCPUe$B$GF0$+$7$?$H$-$O$I$&$J$k$N$G$7$g$&$+!#e(B
|
|e$B!!e(B__SSE2__ e$B$r8+$k$+$iBg>fIW!$$H$$$&OC$8$c$J$$$G$7$g$&$+!%e(B

e$B$G$b%3%s%Q%$%k;~$N%A%‘%C%/$G$9$h$M!#?7$7$$e(BCPUe$B$N$"$k4D6-$G%3e(B
e$B%s%Q%$%k$7$?%P%$%J%j$O8E$$e(BCPUe$B$G;H$($J$$$H$$$&$3$H$K$J$k$s$Ge(B
e$B$7$g$&$+$M$’!#e(B

e$B$3$N>l9g$K;H$($k$N$+$I$&$+$O$o$+$j$^$;$s$,!“e(B
glibce$B$K$Oe(BHWCAPe$B$H$$$&;EAH$_$,$”$k$h$&$G$9!#e(B

http://gotom.jp/~gotom/pub/2003-07-debuan-03summer/debuan-2003-summer.txt

On Thu, 12 Jun 2008 11:08:23 +0900
In article [email protected]
[[ruby-dev:35066] Re: SSE2e$BHGe(B strlen]
Kazuhiro NISHIYAMA [email protected] wrote:

e$B$G$b%3%s%Q%$%k;~$N%A%‘%C%/$G$9$h$M!#?7$7$$e(BCPUe$B$N$"$k4D6-$G%3e(B
e$B%s%Q%$%k$7$?%P%$%J%j$O8E$$e(BCPUe$B$G;H$($J$$$H$$$&$3$H$K$J$k$s$Ge(B
e$B$7$g$&$+$M$’!#e(B

e$B$3$N>l9g$K;H$($k$N$+$I$&$+$O$o$+$j$^$;$s$,!“e(B
glibce$B$K$Oe(BHWCAPe$B$H$$$&;EAH$_$,$”$k$h$&$G$9!#e(B

http://gotom.jp/~gotom/pub/2003-07-debuan-03summer/debuan-2003-summer.txt

e$B8=:_$Ne(BLinuxe$B$G$O3N$+e(B sse2
e$B$NH=JL$O4{Dj$G%%s$K$J$C$F$$$k$N$G!";H$($k$H;We(B
e$B$$$^$9!#>e$NJ8=q$O!"e(BDebian e$B$Oe(B mmx e$B$K2C$($Fe(B cmov
e$B$NH=JL$r4{Dj$G%
%s$K$7$?e(B
e$B7P0^$+$H!#%Q%C%1!<%8%a%s%F%J$KH=CG$7$F$b$i$($P$h$$$N$G$O$J$$$G$7$g$&$+!#e(B

e$B$3$N$"$?$j$NOC!"e(BBINARY HACKS (ISBN 4873112885)
e$B$K:$C$F$$$^$7$?!#e(B

e$B$“$H$O$b$&!”%i%s%?%$%`$Ge(Bsse2e$B$NB8H]$rH=CG$7$F!"F0E*$K@Z$jBX$($k$+$G$9$M!#e(B

　ささだです．

Yukihiro M. wrote:

|Takahiro K. wrote:
|> e$B:G=E$K!"$3$&$$$&8E$$e(BCPUe$B$GF0$+$7$?$H$-$O$I$&$J$k$N$G$7$g$&$+!#e(B
|
|e$B!!e(B__SSE2__ e$B$r8+$k$+$iBg>fIW!$$H$$$&OC$8$c$J$$$G$7$g$&$+!%e(B

e$B$G$b%3%s%Q%$%k;~$N%A%’%C%/$G$9$h$M!#?7$7$$e(BCPUe$B$N$"$k4D6-$G%3e(B
e$B%s%Q%$%k$7$?%P%$%J%j$O8E$$e(BCPUe$B$G;H$($J$$$H$$$&$3$H$K$J$k$s$Ge(B
e$B$7$g$&$+$M$’!#e(B

e$B!!$J$k$[$I!$$=$s$JLdBj$,!%e(B

In message [email protected]
on Thu, 12 Jun 2008 11:42:47 +0900,
Tietew [email protected] wrote:

e$B$G$b%3%s%Q%$%k;~$N%A%‘%C%/$G$9$h$M!#?7$7$$e(BCPUe$B$N$"$k4D6-$G%3e(B
e$B%s%Q%$%k$7$?%P%$%J%j$O8E$$e(BCPUe$B$G;H$($J$$$H$$$&$3$H$K$J$k$s$Ge(B
e$B$7$g$&$+$M$’!#e(B
e$B$3$N$h$&$J8E$$e(BCPUe$B$N%^%7%s$G$O!"9bB.$J%^%7%s$G:n$C$?%P%$%J%j!&%Q%C%1!<e(B
e$B%8$r;}$C$F$C$F;H$&$H$$$&%1!<%9$bB?$$$+$H;W$$$^$9!#e(B

e$B8=:_$Ne(BLinuxe$B$G$O3N$+e(B sse2 e$B$NH=JL$O4{Dj$G%%s$K$J$C$F$$$k$N$G!";H$($k$H;We(B
e$B$$$^$9!#>e$NJ8=q$O!"e(BDebian e$B$Oe(B mmx e$B$K2C$($Fe(B cmov e$B$NH=JL$r4{Dj$G%
%s$K$7$?e(B
e$B7P0^$+$H!#%Q%C%1!<%8%a%s%F%J$KH=CG$7$F$b$i$($P$h$$$N$G$O$J$$$G$7$g$&$+!#e(B
NetBSDe$B$G$O!"e(Bsysctl(9)e$B$J!"e(B

machdep.sse = 1
machdep.sse2 = 1

e$B$H$$$C$?$"$?$j!"e(BMacOS 10.5.3e$B$G$O!"e(B

hw.optional.sse4_2: 0
hw.optional.sse4_1: 0
hw.optional.supplementalsse3: 1
hw.optional.sse3: 1
hw.optional.sse2: 1
hw.optional.sse: 1
machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR
PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM SSE3
MON DSCPL VMX EST TM2 SSSE3 CX16 TPR PDCM

e$B$H$$$C$?e(Bsysctle$B$N%N!<%I$,$"$k$h$&$G$9!#e(B

On Thu, 12 Jun 2008 11:42:47 +0900
In article [email protected]
[[ruby-dev:35067] Re: SSE2e$BHGe(B strlen]
Tietew [email protected] wrote:

e$B$$$^$9!#>e$NJ8=q$O!"e(BDebian e$B$Oe(B mmx e$B$K2C$($Fe(B cmov e$B$NH=JL$r4{Dj$G%*%s$K$7$?e(B
e$B7P0^$+$H!#%Q%C%1!<%8%a%s%F%J$KH=CG$7$F$b$i$($P$h$$$N$G$O$J$$$G$7$g$&$+!#e(B

Debian etch e$B$@$H$3$s$J46$8$G$9!#e(B

tietew@argon:~$ strace ruby -e0
:
open(“/usr/local/lib/tls/i686/sse2/cmov/libruby.so.1.8”, O_RDONLY) = -1
ENOENT (No such file or directory)
open(“/usr/local/lib/tls/i686/sse2/libruby.so.1.8”, O_RDONLY) = -1
ENOENT (No such file or directory)
open(“/usr/local/lib/tls/i686/cmov/libruby.so.1.8”, O_RDONLY) = -1
ENOENT (No such file or directory)
open(“/usr/local/lib/tls/i686/libruby.so.1.8”, O_RDONLY) = -1 ENOENT (No
such file or directory)
open(“/usr/local/lib/tls/sse2/cmov/libruby.so.1.8”, O_RDONLY) = -1
ENOENT (No such file or directory)
open(“/usr/local/lib/tls/sse2/libruby.so.1.8”, O_RDONLY) = -1 ENOENT (No
such file or directory)
open(“/usr/local/lib/tls/cmov/libruby.so.1.8”, O_RDONLY) = -1 ENOENT (No
such file or directory)
open(“/usr/local/lib/tls/libruby.so.1.8”, O_RDONLY) = -1 ENOENT (No such
file or directory)
open(“/usr/local/lib/i686/sse2/cmov/libruby.so.1.8”, O_RDONLY) = -1
ENOENT (No such file or directory)
open(“/usr/local/lib/i686/sse2/libruby.so.1.8”, O_RDONLY) = -1 ENOENT
(No such file or directory)
open(“/usr/local/lib/i686/cmov/libruby.so.1.8”, O_RDONLY) = -1 ENOENT
(No such file or directory)
open(“/usr/local/lib/i686/libruby.so.1.8”, O_RDONLY) = -1 ENOENT (No
such file or directory)
open(“/usr/local/lib/sse2/cmov/libruby.so.1.8”, O_RDONLY) = -1 ENOENT
(No such file or directory)
open(“/usr/local/lib/sse2/libruby.so.1.8”, O_RDONLY) = -1 ENOENT (No
such file or directory)
open(“/usr/local/lib/cmov/libruby.so.1.8”, O_RDONLY) = -1 ENOENT (No
such file or directory)
open(“/usr/local/lib/libruby.so.1.8”, O_RDONLY) = 3
:

成瀬です。

SASADA Koichi wrote:

　それはともかく，評価データが見てみたいかも．

色々ベンチの取り方を工夫してみたりしたのですが、
おおむね以下のような傾向のようです。

r17098 の test_m17n_comb.rb の場合で、

RHEL4 Xeon5150 @ 2.66GHz i386

builtin: 18.21s user 0.02s system 99% cpu 18.244 total
SSE2: 18.25s user 0.02s system 99% cpu 18.286 total

FreeBSD7 Athlon64BE-2300 2.0GHz amd64

builtin: 19.87s user 0.07s system 99% cpu 19.997 total
SSE2: 20.18s user 0.09s system 99% cpu 20.308 total

つまるところ、ほとんど変わらないか若干遅くなるようです。
というわけで、SSE2 版 strlen については取り下げます。

早くなるとしたら 32bit 環境での search_nonascii や
UTF-8 文字カウントかなぁ。