成瀬です。
サイボウズの光成さんの所に SSE2 を用いた strlen
が載っているのですが、
これを取り込みませんか。
http://labs.cybozu.co.jp/blog/mitsunari/
SSE2 は CPU 依存が・・・とお思いになるかもしれませんが、
本家の Intel では Pentium4 (2000年11月)
から搭載されていますし、
AMD でも Athlon64 から搭載されています。
また、AMD64 規格では SSE2
が必須であるため、今後の両者の CPU、
さらに (もし登場すれば) 他の AMD64 な CPU
でも恩恵を受けることができます。
許諾等はわたしがやります。
#if SSE2
#include <stddef.h>
#include <emmintrin.h>  /* SSE2: __m128i, _mm_cmpeq_epi8, _mm_movemask_epi8 */
#ifdef _MSC_VER
#include <intrin.h>     /* _BitScanForward */
#endif

/*
 * Return the index (0-based) of the lowest set bit of x.
 * The result is undefined when x == 0; callers must check first.
 */
static unsigned int
bsf(unsigned int x)
{
#ifdef _MSC_VER
    unsigned long idx;
    _BitScanForward(&idx, x);
    return (unsigned int)idx;
#else
    return (unsigned int)__builtin_ctz(x);
#endif
}

/*
 * SSE2 implementation of strlen().
 *
 * p: pointer to a NUL-terminated string.
 * Returns the number of bytes preceding the terminating NUL.
 *
 * The string is scanned in 16-byte chunks using aligned loads only.  A chunk
 * may extend before the start of the string or past its terminator, but an
 * aligned 16-byte load never straddles a page boundary, so every load stays
 * on a page that holds at least one byte of the string.  For the same reason
 * the loop examines one chunk at a time: loading a second chunk before the
 * first has been checked could touch an unmapped page.
 */
size_t
rb_strlen(const char *p)
{
    const char *const top = p;
    const __m128i zero = _mm_setzero_si128();
    /* 16 byte alignment */
    const size_t n = (size_t)p & 15;
    unsigned int mask;

    if (n > 0) {
        /* Load the aligned chunk containing p and ignore the n bytes that
         * lie before the start of the string. */
        const __m128i *head = (const __m128i *)((size_t)p & ~(size_t)15);
        mask = (unsigned int)_mm_movemask_epi8(
            _mm_cmpeq_epi8(_mm_load_si128(head), zero));
        mask &= ~0U << n;
        if (mask) {
            return bsf(mask) - n;
        }
        p += 16 - n;
    }
    for (;;) {
        /* p is 16-byte aligned here; bit i of mask is set when p[i] == 0. */
        mask = (unsigned int)_mm_movemask_epi8(
            _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero));
        if (mask) {
            return (size_t)(p - top) + bsf(mask);
        }
        p += 16;
    }
}
#else
#include <string.h>
#define rb_strlen(s) strlen(s)
#endif