Use of murmur hash in st.c

nobu · September 26, 2009, 11:09am

e$B$J$+$@$G$9!#e(B

String#hashe$B$K$Oe(BMurmurHashe$B$r;H$C$F$$$^$9$,!"e(Bstrhashe$B$O$^$@e(BFNV-1ae$B$Ne(B
e$B$^$^$G$9!#e(Bst.ce$B$be(BMurmure$B$KE}0l$7$F$$$$$N$G$O$J$$$+$H;W$$$^$9!#e(B

Index: include/ruby/intern.h

— include/ruby/intern.h (revision 25102)
+++ include/ruby/intern.h (working copy)
@@ -640,4 +640,7 @@ st_index_t rb_hash_uint32(st_index_t, ui
st_index_t rb_hash_uint(st_index_t, st_index_t);
st_index_t rb_hash_end(st_index_t);
+#define rb_hash_uint32(h, i) st_hash_uint32(h, i)
+#define rb_hash_uint(h, i) st_hash_uint(h, i)
+#define rb_hash_end(h) st_hash_end(h)
st_index_t rb_str_hash(VALUE);
int rb_str_hash_cmp(VALUE,VALUE);
Index: include/ruby/st.h

— include/ruby/st.h (revision 25102)
+++ include/ruby/st.h (working copy)
@@ -109,4 +109,10 @@ int st_strcasecmp(const char *s1, const
int st_strncasecmp(const char *s1, const char *s2, size_t n);
size_t st_memsize(const st_table *);
+st_index_t st_hash(const void *ptr, size_t len, st_index_t h);
+st_index_t st_hash_uint32(st_index_t h, unsigned int i);
+st_index_t st_hash_uint(st_index_t h, st_index_t i);
+st_index_t st_hash_end(st_index_t h);
+st_index_t st_hash_start(st_index_t h);
+#define st_hash_start(h) (h)

#if defined(__cplusplus)
Index: string.c

— string.c (revision 25102)
+++ string.c (working copy)
@@ -1977,243 +1977,16 @@ rb_str_concat(VALUE str1, VALUE str2)
}

-#ifndef UNALIGNED_WORD_ACCESS
-# if defined i386 || defined _M_IX86
-# define UNALIGNED_WORD_ACCESS 1
-# endif
-#endif
-#ifndef UNALIGNED_WORD_ACCESS
-# define UNALIGNED_WORD_ACCESS 0
-#endif

-/* MurmurHash described in http://murmurhash.googlepages.com/ */
-#ifndef MURMUR
-#define MURMUR 2
-#endif

-typedef char check_murmur_voidp[SIZEOF_VOIDP == (int)sizeof(st_index_t)
? 1 : -1];
-#define SIZEOF_ST_INDEX_T SIZEOF_VOIDP

-#if MURMUR == 1
-#define MurmurMagic 0xc6a4a793
-#elif MURMUR == 2
-#if SIZEOF_ST_INDEX_T > 4
-#define MurmurMagic 0xc6a4a7935bd1e995
-#else
-#define MurmurMagic 0x5bd1e995
-#endif
-#endif

-static inline st_index_t
-murmur(st_index_t h, st_index_t k, int r)
-{

const st_index_t m = MurmurMagic;
-#if MURMUR == 1
h += k;
h *= m;
h ^= h >> r;
-#elif MURMUR == 2
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
-#endif
return h;
-}

-static inline st_index_t
-murmur_finish(st_index_t h)
-{
-#if MURMUR == 1

h = murmur(h, 0, 10);
h = murmur(h, 0, 17);
-#elif MURMUR == 2
h ^= h >> 13;
h *= MurmurMagic;
h ^= h >> 15;
-#endif
return h;
-}

-#define murmur_step(h, k) murmur(h, k, 16)

-#if MURMUR == 1
-#define murmur1(h) murmur_step(h, 16)
-#else
-#define murmur1(h) murmur_step(h, 24)
-#endif

-static st_index_t
-hash(const void *ptr, size_t len, st_index_t h)
-{

const char *data = ptr;
st_index_t t = 0;
h += 0xdeadbeef;

-#define data_at(n) (st_index_t)((unsigned char)data[n])
-#define UNALIGNED_ADD_4 UNALIGNED_ADD(2); UNALIGNED_ADD(1);
UNALIGNED_ADD(0)
-#if SIZEOF_ST_INDEX_T > 4
-#define UNALIGNED_ADD_8 UNALIGNED_ADD(6); UNALIGNED_ADD(5);
UNALIGNED_ADD(4); UNALIGNED_ADD(3); UNALIGNED_ADD_4
-#if SIZEOF_ST_INDEX_T > 8
-#define UNALIGNED_ADD_16 UNALIGNED_ADD(14); UNALIGNED_ADD(13);
UNALIGNED_ADD(12); UNALIGNED_ADD(11); \

UNALIGNED_ADD(10); UNALIGNED_ADD(9); UNALIGNED_ADD(8);
UNALIGNED_ADD(7); UNALIGNED_ADD_8
-#define UNALIGNED_ADD_ALL UNALIGNED_ADD_16
-#endif
-#define UNALIGNED_ADD_ALL UNALIGNED_ADD_8
-#else
-#define UNALIGNED_ADD_ALL UNALIGNED_ADD_4
-#endif
if (len >= sizeof(st_index_t)) {
-#if !UNALIGNED_WORD_ACCESS
int align = (int)((st_data_t)data % sizeof(st_index_t));
if (align) {
```
 st_index_t d = 0;
```
```
 int sl, sr, pack;
```
```
 switch (align) {
```

-#ifdef WORDS_BIGENDIAN
-# define UNALIGNED_ADD(n) case SIZEOF_ST_INDEX_T - (n) - 1: \

t |= data_at(n) << CHAR_BIT*(SIZEOF_ST_INDEX_T - (n) - 2)
-#else
-# define UNALIGNED_ADD(n) case SIZEOF_ST_INDEX_T - (n) - 1: \
t |= data_at(n) << CHAR_BIT*(n)
-#endif
UNALIGNED_ADD_ALL;
-#undef UNALIGNED_ADD
```
 }
```

-#ifdef WORDS_BIGENDIAN

```
 t >>= (CHAR_BIT * align) - CHAR_BIT;
```

-#else

```
 t <<= (CHAR_BIT * align);
```

-#endif

```
 data += sizeof(st_index_t)-align;
```
```
 len -= sizeof(st_index_t)-align;
```

 sl = CHAR_BIT * (SIZEOF_ST_INDEX_T-align);

```
 sr = CHAR_BIT * align;
```
```
 while (len >= sizeof(st_index_t)) {
```
d = *(st_index_t *)data;
-#ifdef WORDS_BIGENDIAN
t = (t << sr) | (d >> sl);
-#else
t = (t >> sr) | (d << sl);
-#endif
h = murmur_step(h, t);
t = d;
data += sizeof(st_index_t);
len -= sizeof(st_index_t);
```
 }
```

 pack = len < (size_t)align ? (int)len : align;

```
 d = 0;
```
```
 switch (pack) {
```

-#ifdef WORDS_BIGENDIAN
-# define UNALIGNED_ADD(n) case (n) + 1: \

d |= data_at(n) << CHAR_BIT*(SIZEOF_ST_INDEX_T - (n) - 1)
-#else
-# define UNALIGNED_ADD(n) case (n) + 1: \
d |= data_at(n) << CHAR_BIT*(n)
-#endif
UNALIGNED_ADD_ALL;
-#undef UNALIGNED_ADD
```
 }
```

-#ifdef WORDS_BIGENDIAN

```
 t = (t << sr) | (d >> sl);
```

-#else

```
 t = (t >> sr) | (d << sl);
```

-#endif

-#if MURMUR == 2

 if (len < (size_t)align) goto skip_tail;

-#endif

```
 h = murmur_step(h, t);
```
```
 data += pack;
```
```
 len -= pack;
```
}
else
-#endif
{
```
 do {
```
h = murmur_step(h, *(st_index_t *)data);
data += sizeof(st_index_t);
len -= sizeof(st_index_t);
```
 } while (len >= sizeof(st_index_t));
```
}
}
t = 0;
switch (len) {
-#ifdef WORDS_BIGENDIAN
-# define UNALIGNED_ADD(n) case (n) + 1: \
t |= data_at(n) << CHAR_BIT*(SIZEOF_ST_INDEX_T - (n) - 1)
-#else
-# define UNALIGNED_ADD(n) case (n) + 1: \
t |= data_at(n) << CHAR_BIT*(n)
-#endif
UNALIGNED_ADD_ALL;
-#undef UNALIGNED_ADD
-#if MURMUR == 1
h = murmur_step(h, t);
-#elif MURMUR == 2
-# if !UNALIGNED_WORD_ACCESS
```
 skip_tail:
```

-# endif

h ^= t;
h *= MurmurMagic;
-#endif
}
return murmur_finish(h);
-}

-st_index_t
-rb_hash_uint32(st_index_t h, uint32_t i)
-{

return murmur_step(h + i, 16);
-}

-st_index_t
-rb_hash_uint(st_index_t h, st_index_t i)
-{

st_index_t v = 0;
h += i;
-#ifdef WORDS_BIGENDIAN
-#if SIZEOF_ST_INDEX_TCHAR_BIT > 128
v = murmur1(v + (h >> 128));
-#endif
-#if SIZEOF_ST_INDEX_TCHAR_BIT > 8*8
v = murmur1(v + (h >> 88));
-#endif
-#if SIZEOF_ST_INDEX_TCHAR_BIT > 4*8
v = murmur1(v + (h >> 4*8));
-#endif
-#endif
v = murmur1(v + h);
-#ifndef WORDS_BIGENDIAN
-#if SIZEOF_ST_INDEX_TCHAR_BIT > 48
v = murmur1(v + (h >> 48));
-#endif
-#if SIZEOF_ST_INDEX_TCHAR_BIT > 8*8
v = murmur1(v + (h >> 88));
-#endif
-#if SIZEOF_ST_INDEX_TCHAR_BIT > 12*8
v = murmur1(v + (h >> 12*8));
-#endif
-#endif
return v;
-}

-st_index_t
-rb_hash_end(st_index_t h)
-{

h = murmur_step(h, 10);
h = murmur_step(h, 17);
return h;
-}
+#undef rb_hash_uint32
+#undef rb_hash_uint
+#undef rb_hash_end
+RUBY_ALIAS_FUNCTION_TYPE(st_index_t,

  rb_hash_uint32(st_index_t h, unsigned int i),

```
  st_hash_uint32, (h, i));
```

+RUBY_ALIAS_FUNCTION_TYPE(st_index_t,

  rb_hash_uint(st_index_t h, st_index_t i),

```
  st_hash_uint, (h, i));
```

+RUBY_ALIAS_FUNCTION_TYPE(st_index_t,

```
  rb_hash_end(st_index_t h),
```
```
  st_hash_end, (h));
```

st_index_t
@@ -2240,5 +2013,5 @@ rb_hash_start(st_index_t h)
}

return hashseed + h;

return st_hash_start(hashseed + h);
}

@@ -2246,5 +2019,5 @@ st_index_t
rb_memhash(const void *ptr, long len)
{

return hash(ptr, len, rb_hash_start(0));

return st_hash(ptr, len, rb_hash_start(0));
}

Index: st.c

— st.c (revision 25102)
+++ st.c (working copy)
@@ -967,4 +967,5 @@ st_reverse_foreach(st_table *table, int
#define FNV_32_PRIME 0x01000193

+#ifdef ST_USE_FNV1
static st_index_t
strhash(st_data_t arg)
@@ -985,4 +986,260 @@ strhash(st_data_t arg)
return hval;
}
+#else
+
+#ifndef UNALIGNED_WORD_ACCESS
+# if defined i386 || defined _M_IX86
+# define UNALIGNED_WORD_ACCESS 1
+# endif
+#endif
+#ifndef UNALIGNED_WORD_ACCESS
+# define UNALIGNED_WORD_ACCESS 0
+#endif
+
+/* MurmurHash described in http://murmurhash.googlepages.com/ */
+#ifndef MURMUR
+#define MURMUR 2
+#endif
+
+typedef char check_murmur_voidp[SIZEOF_VOIDP == (int)sizeof(st_index_t)
? 1 : -1];
+#define SIZEOF_ST_INDEX_T SIZEOF_VOIDP
+
+#if MURMUR == 1
+#define MurmurMagic 0xc6a4a793
+#elif MURMUR == 2
+#if SIZEOF_ST_INDEX_T > 4
+#define MurmurMagic 0xc6a4a7935bd1e995
+#else
+#define MurmurMagic 0x5bd1e995
+#endif
+#endif
+
+static inline st_index_t
+murmur(st_index_t h, st_index_t k, int r)
+{

const st_index_t m = MurmurMagic;
+#if MURMUR == 1
h += k;
h *= m;
h ^= h >> r;
+#elif MURMUR == 2
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
+#endif
return h;
+}

+static inline st_index_t
+murmur_finish(st_index_t h)
+{
+#if MURMUR == 1

h = murmur(h, 0, 10);
h = murmur(h, 0, 17);
+#elif MURMUR == 2
h ^= h >> 13;
h *= MurmurMagic;
h ^= h >> 15;
+#endif
return h;
+}

+#define murmur_step(h, k) murmur(h, k, 16)
+
+#if MURMUR == 1
+#define murmur1(h) murmur_step(h, 16)
+#else
+#define murmur1(h) murmur_step(h, 24)
+#endif
+
+st_index_t
+st_hash(const void *ptr, size_t len, st_index_t h)
+{

const char *data = ptr;
st_index_t t = 0;
h += 0xdeadbeef;

+#define data_at(n) (st_index_t)((unsigned char)data[n])
+#define UNALIGNED_ADD_4 UNALIGNED_ADD(2); UNALIGNED_ADD(1);
UNALIGNED_ADD(0)
+#if SIZEOF_ST_INDEX_T > 4
+#define UNALIGNED_ADD_8 UNALIGNED_ADD(6); UNALIGNED_ADD(5);
UNALIGNED_ADD(4); UNALIGNED_ADD(3); UNALIGNED_ADD_4
+#if SIZEOF_ST_INDEX_T > 8
+#define UNALIGNED_ADD_16 UNALIGNED_ADD(14); UNALIGNED_ADD(13);
UNALIGNED_ADD(12); UNALIGNED_ADD(11); \

UNALIGNED_ADD(10); UNALIGNED_ADD(9); UNALIGNED_ADD(8);
UNALIGNED_ADD(7); UNALIGNED_ADD_8
+#define UNALIGNED_ADD_ALL UNALIGNED_ADD_16
+#endif
+#define UNALIGNED_ADD_ALL UNALIGNED_ADD_8
+#else
+#define UNALIGNED_ADD_ALL UNALIGNED_ADD_4
+#endif
if (len >= sizeof(st_index_t)) {
+#if !UNALIGNED_WORD_ACCESS
int align = (int)((st_data_t)data % sizeof(st_index_t));
if (align) {
```
 st_index_t d = 0;
```
```
 int sl, sr, pack;
```
```
 switch (align) {
```

+#ifdef WORDS_BIGENDIAN
+# define UNALIGNED_ADD(n) case SIZEOF_ST_INDEX_T - (n) - 1: \

t |= data_at(n) << CHAR_BIT*(SIZEOF_ST_INDEX_T - (n) - 2)
+#else
+# define UNALIGNED_ADD(n) case SIZEOF_ST_INDEX_T - (n) - 1: \
t |= data_at(n) << CHAR_BIT*(n)
+#endif
UNALIGNED_ADD_ALL;
+#undef UNALIGNED_ADD
```
 }
```

+#ifdef WORDS_BIGENDIAN

```
 t >>= (CHAR_BIT * align) - CHAR_BIT;
```

+#else

```
 t <<= (CHAR_BIT * align);
```

+#endif
+

```
 data += sizeof(st_index_t)-align;
```
```
 len -= sizeof(st_index_t)-align;
```

 sl = CHAR_BIT * (SIZEOF_ST_INDEX_T-align);

```
 sr = CHAR_BIT * align;
```
```
 while (len >= sizeof(st_index_t)) {
```
d = *(st_index_t *)data;
+#ifdef WORDS_BIGENDIAN
t = (t << sr) | (d >> sl);
+#else
t = (t >> sr) | (d << sl);
+#endif
h = murmur_step(h, t);
t = d;
data += sizeof(st_index_t);
len -= sizeof(st_index_t);
```
 }
```

 pack = len < (size_t)align ? (int)len : align;

```
 d = 0;
```
```
 switch (pack) {
```

+#ifdef WORDS_BIGENDIAN
+# define UNALIGNED_ADD(n) case (n) + 1: \

d |= data_at(n) << CHAR_BIT*(SIZEOF_ST_INDEX_T - (n) - 1)
+#else
+# define UNALIGNED_ADD(n) case (n) + 1: \
d |= data_at(n) << CHAR_BIT*(n)
+#endif
UNALIGNED_ADD_ALL;
+#undef UNALIGNED_ADD
```
 }
```

+#ifdef WORDS_BIGENDIAN

```
 t = (t << sr) | (d >> sl);
```

+#else

```
 t = (t >> sr) | (d << sl);
```

+#endif
+
+#if MURMUR == 2

 if (len < (size_t)align) goto skip_tail;

+#endif

```
 h = murmur_step(h, t);
```
```
 data += pack;
```
```
 len -= pack;
```
}
else
+#endif
{
```
 do {
```
h = murmur_step(h, *(st_index_t *)data);
data += sizeof(st_index_t);
len -= sizeof(st_index_t);
```
 } while (len >= sizeof(st_index_t));
```
}
}
t = 0;
switch (len) {
+#ifdef WORDS_BIGENDIAN
+# define UNALIGNED_ADD(n) case (n) + 1: \
t |= data_at(n) << CHAR_BIT*(SIZEOF_ST_INDEX_T - (n) - 1)
+#else
+# define UNALIGNED_ADD(n) case (n) + 1: \
t |= data_at(n) << CHAR_BIT*(n)
+#endif
UNALIGNED_ADD_ALL;
+#undef UNALIGNED_ADD
+#if MURMUR == 1
h = murmur_step(h, t);
+#elif MURMUR == 2
+# if !UNALIGNED_WORD_ACCESS
```
 skip_tail:
```

+# endif

h ^= t;
h *= MurmurMagic;
+#endif
}
return murmur_finish(h);
+}

+st_index_t
+st_hash_uint32(st_index_t h, uint32_t i)
+{

return murmur_step(h + i, 16);
+}

+st_index_t
+st_hash_uint(st_index_t h, st_index_t i)
+{

st_index_t v = 0;
h += i;
+#ifdef WORDS_BIGENDIAN
+#if SIZEOF_ST_INDEX_TCHAR_BIT > 128
v = murmur1(v + (h >> 128));
+#endif
+#if SIZEOF_ST_INDEX_TCHAR_BIT > 8*8
v = murmur1(v + (h >> 88));
+#endif
+#if SIZEOF_ST_INDEX_TCHAR_BIT > 4*8
v = murmur1(v + (h >> 4*8));
+#endif
+#endif
v = murmur1(v + h);
+#ifndef WORDS_BIGENDIAN
+#if SIZEOF_ST_INDEX_TCHAR_BIT > 48
v = murmur1(v + (h >> 48));
+#endif
+#if SIZEOF_ST_INDEX_TCHAR_BIT > 8*8
v = murmur1(v + (h >> 88));
+#endif
+#if SIZEOF_ST_INDEX_TCHAR_BIT > 12*8
v = murmur1(v + (h >> 12*8));
+#endif
+#endif
return v;
+}

+st_index_t
+st_hash_end(st_index_t h)
+{

h = murmur_step(h, 10);
h = murmur_step(h, 17);
return h;
+}

+#undef st_hash_start
+st_index_t
+st_hash_start(st_index_t h)
+{

return h;
+}

+static st_index_t
+strhash(st_data_t arg)
+{

register const char *string = (const char *)arg;
return st_hash(string, strlen(string), FNV1_32A_INIT);
+}
+#endif

int

nobu · September 26, 2009, 4:20pm

e$B$^$D$b$He(B e$B$f$-$R$m$G$9e(B

In message “Re: [ruby-dev:39376] use of murmur hash in st.c”
on Sat, 26 Sep 2009 18:09:27 +0900, Nobuyoshi N.
[email protected] writes:

|String#hashe$B$K$Oe(BMurmurHashe$B$r;H$C$F$$$^$9$,!"e(Bstrhashe$B$O$^$@e(BFNV-1ae$B$Ne(B
|e$B$^$^$G$9!#e(Bst.ce$B$be(BMurmure$B$KE}0l$7$F$$$$$N$G$O$J$$$+$H;W$$$^$9!#e(B

e$B$*4j$$$7$^$9!#e(B

Use of murmur hash in st.c

Index: include/ruby/intern.h

#if defined(__cplusplus) Index: string.c

-#ifndef UNALIGNED_WORD_ACCESS -# if defined i386 || defined _M_IX86 -# define UNALIGNED_WORD_ACCESS 1 -# endif -#endif -#ifndef UNALIGNED_WORD_ACCESS -# define UNALIGNED_WORD_ACCESS 0 -#endif

-/* MurmurHash described in http://murmurhash.googlepages.com/ */ -#ifndef MURMUR -#define MURMUR 2 -#endif

-typedef char check_murmur_voidp[SIZEOF_VOIDP == (int)sizeof(st_index_t) ? 1 : -1]; -#define SIZEOF_ST_INDEX_T SIZEOF_VOIDP

-#if MURMUR == 1 -#define MurmurMagic 0xc6a4a793 -#elif MURMUR == 2 -#if SIZEOF_ST_INDEX_T > 4 -#define MurmurMagic 0xc6a4a7935bd1e995 -#else -#define MurmurMagic 0x5bd1e995 -#endif -#endif

-#define murmur_step(h, k) murmur(h, k, 16)

-#if MURMUR == 1 -#define murmur1(h) murmur_step(h, 16) -#else -#define murmur1(h) murmur_step(h, 24) -#endif

-#endif

-#endif

Index: st.c

#if defined(__cplusplus)
Index: string.c

-#ifndef UNALIGNED_WORD_ACCESS
-# if defined i386 || defined _M_IX86
-# define UNALIGNED_WORD_ACCESS 1
-# endif
-#endif
-#ifndef UNALIGNED_WORD_ACCESS
-# define UNALIGNED_WORD_ACCESS 0
-#endif

-/* MurmurHash described in http://murmurhash.googlepages.com/ */
-#ifndef MURMUR
-#define MURMUR 2
-#endif

-typedef char check_murmur_voidp[SIZEOF_VOIDP == (int)sizeof(st_index_t)
? 1 : -1];
-#define SIZEOF_ST_INDEX_T SIZEOF_VOIDP

-#if MURMUR == 1
-#define MurmurMagic 0xc6a4a793
-#elif MURMUR == 2
-#if SIZEOF_ST_INDEX_T > 4
-#define MurmurMagic 0xc6a4a7935bd1e995
-#else
-#define MurmurMagic 0x5bd1e995
-#endif
-#endif

-#if MURMUR == 1
-#define murmur1(h) murmur_step(h, 16)
-#else
-#define murmur1(h) murmur_step(h, 24)
-#endif