Ruby Forum Ruby-core > Copy-on-write friendly garbage collector

Posted by Hongli Lai (Guest)
on 03.03.2008 10:49
(Received via mailing list)
Hi.

I've written patch which makes Ruby's garbage collector copy-on-write
friendly. Details can be found on my blog,
http://izumi.plan99.net/blog/index.php/category/optimizing-rails/, in
the "Making Ruby's garbage collector copy-on-write friendly" series.

Matz had shown interest in merging the patch into Ruby 1.9. I'm
wondering whether that has already been done, and if not, whether I can
be of any assistance.

Regards,
Hongli Lai
Posted by Daniel DeLorme (Guest)
on 03.03.2008 13:39
(Received via mailing list)
Hongli Lai wrote:
> I've written patch which makes Ruby's garbage collector copy-on-write 
> friendly. Details can be found on my blog, 
> http://izumi.plan99.net/blog/index.php/category/optimizing-rails/, in 
> the "Making Ruby's garbage collector copy-on-write friendly" series.
> 
> Matz had shown interest in merging the patch into Ruby 1.9. I'm 
> wondering whether that has already been done, and if not, whether I can 
> be of any assistance.

AFAIK, the 1.9 garbage collector has always been COW-friendly.
Posted by Yukihiro Matsumoto (Guest)
on 03.03.2008 14:12
(Received via mailing list)
Hi,

In message "Re: Copy-on-write friendly garbage collector"
    on Mon, 3 Mar 2008 18:48:34 +0900, Hongli Lai <hongli@plan99.net> 
writes:

|I've written patch which makes Ruby's garbage collector copy-on-write 
|friendly. Details can be found on my blog, 
|http://izumi.plan99.net/blog/index.php/category/optimizing-rails/, in 
|the "Making Ruby's garbage collector copy-on-write friendly" series.
|
|Matz had shown interest in merging the patch into Ruby 1.9. I'm 
|wondering whether that has already been done, and if not, whether I can 
|be of any assistance.

Here's the patch against the latest trunk (r15675).  It's still 8-10%
slower than the current implementation.

              matz.
diff --git a/debug.c b/debug.c
index d7f99ed..dfcb523 100644
--- a/debug.c
+++ b/debug.c
@@ -29,8 +29,8 @@ static const union {
         RUBY_ENC_CODERANGE_7BIT    = ENC_CODERANGE_7BIT,
         RUBY_ENC_CODERANGE_VALID   = ENC_CODERANGE_VALID,
         RUBY_ENC_CODERANGE_BROKEN  = ENC_CODERANGE_BROKEN,
-        RUBY_FL_MARK        = FL_MARK,
-        RUBY_FL_RESERVED    = FL_RESERVED,
+        RUBY_FL_RESERVED0   = FL_RESERVED0,
+        RUBY_FL_RESERVED1   = FL_RESERVED1,
         RUBY_FL_FINALIZE    = FL_FINALIZE,
         RUBY_FL_TAINT       = FL_TAINT,
         RUBY_FL_EXIVAR      = FL_EXIVAR,
diff --git a/gc.c b/gc.c
index c47f8a0..38a9fe7 100644
--- a/gc.c
+++ b/gc.c
@@ -22,8 +22,14 @@
 #include "gc.h"
 #include <stdio.h>
 #include <setjmp.h>
+#include <math.h>
 #include <sys/types.h>

+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
 #ifdef HAVE_SYS_TIME_H
 #include <sys/time.h>
 #endif
@@ -145,6 +151,8 @@ static struct heaps_slot {
     void *membase;
     RVALUE *slot;
     int limit;
+    int *marks;
+    int marks_size;
 } *heaps;
 static int heaps_length = 0;
 static int heaps_used   = 0;
@@ -322,6 +330,36 @@ ruby_xfree(void *x)
   RUBY_CRITICAL(free(x));
 }

+static int debugging = 0;
+
+#define DEBUG_POINT(message) \
+  do { \
+    if (debugging) { \
+      printf("%s\n", message); \
+      getchar(); \
+    } \
+  } while (0)
+
+#define OPTION_ENABLED(name) (getenv((name)) && *getenv((name)) && 
*getenv((name)) != '0')
+
+static void *
+alloc_ruby_heap(size_t size)
+{
+    return malloc(size);
+}
+
+static void
+free_ruby_heap(void *heap)
+{
+    free(heap);
+}
+
+static void
+init_debugging()
+{
+    debugging = OPTION_ENABLED("RUBY_GC_DEBUG");
+}
+

 /*
  *  call-seq:
@@ -413,6 +451,106 @@ rb_gc_unregister_address(VALUE *addr)
     }
 }

+static struct heaps_slot *last_heap = NULL;
+
+static inline struct heaps_slot *
+find_heap_slot_for_object(RVALUE *object)
+{
+    struct heaps_slot *heap;
+    register long hi, lo, mid;
+
+    /* Look in the cache first. */
+    if (last_heap != NULL && object >= last_heap->slot
+  && object < last_heap->slot + last_heap->limit) {
+  return last_heap;
+    }
+    /* find heap_slot for object using bsearch*/
+    lo = 0;
+    hi = heaps_used;
+    while (lo < hi) {
+  mid = (lo + hi) / 2;
+  heap = &heaps[mid];
+  if (heap->slot <= object) {
+      if (object < heap->slot + heap->limit) {
+    /* Cache this result. According to empirical evidence, the chance 
is
+     * high that the next lookup will be for the same heap slot.
+     */
+    last_heap = heap;
+    return heap;
+      }
+      lo = mid + 1;
+  }
+  else {
+      hi = mid;
+  }
+    }
+    return NULL;
+}
+
+static inline void
+find_position_in_bitfield(struct heaps_slot *hs, RVALUE *object,
+                          unsigned int *bitfield_index, unsigned int 
*bitfield_offset)
+{
+    unsigned int index;
+
+    index = object - hs->slot;
+    *bitfield_index = index / (sizeof(int) * 8);
+    *bitfield_offset = index % (sizeof(int) * 8);
+}
+
+
+static void
+rb_mark_table_add(RVALUE *object)
+{
+    struct heaps_slot *hs;
+    unsigned int bitfield_index, bitfield_offset;
+
+    hs = find_heap_slot_for_object(object);
+    if (hs != NULL) {
+  find_position_in_bitfield(hs, object, &bitfield_index, 
&bitfield_offset);
+  hs->marks[bitfield_index] |= (1 << bitfield_offset);
+    }
+}
+
+static inline int
+rb_mark_table_heap_contains(struct heaps_slot *hs, RVALUE *object)
+{
+    unsigned int bitfield_index, bitfield_offset;
+
+    find_position_in_bitfield(hs, object, &bitfield_index, 
&bitfield_offset);
+    return hs->marks[bitfield_index] & (1 << bitfield_offset);
+}
+
+static inline int
+rb_mark_table_contains(RVALUE *object)
+{
+    struct heaps_slot *hs;
+
+    hs = find_heap_slot_for_object(object);
+    if (hs != NULL) {
+  return rb_mark_table_heap_contains(hs, object);
+    }
+}
+
+static inline void
+rb_mark_table_heap_remove(struct heaps_slot *hs, RVALUE *object)
+{
+    unsigned int bitfield_index, bitfield_offset;
+    find_position_in_bitfield(hs, object, &bitfield_index, 
&bitfield_offset);
+    hs->marks[bitfield_index] &= ~(1 << bitfield_offset);
+}
+
+static void
+rb_mark_table_remove(RVALUE *object)
+{
+    struct heaps_slot *hs;
+
+    hs = find_heap_slot_for_object(object);
+    if (hs != NULL) {
+  rb_mark_table_heap_remove(hs, object);
+    }
+}
+
 static int
 heap_cmp(const void *ap, const void *bp, void *dummy)
 {
@@ -445,7 +583,7 @@ add_heap(void)
     }

     for (;;) {
-  RUBY_CRITICAL(p = (RVALUE*)malloc(sizeof(RVALUE)*(heap_slots+1)));
+  RUBY_CRITICAL(p = 
(RVALUE*)alloc_ruby_heap(sizeof(RVALUE)*(heap_slots+1)));
   if (p == 0) {
       if (heap_slots == HEAP_MIN_SLOTS) {
     rb_memerror();
@@ -460,6 +598,8 @@ add_heap(void)
       p = (RVALUE*)((VALUE)p + sizeof(RVALUE) - ((VALUE)p % 
sizeof(RVALUE)));
   heaps[heaps_used].slot = p;
   heaps[heaps_used].limit = heap_slots;
+        heaps[heaps_used].marks_size = (int) (ceil(heap_slots / 
(sizeof(int) * 8.0)));
+        heaps[heaps_used].marks = (int *) 
calloc(heaps[heaps_used].marks_size, sizeof(int));
   break;
     }
     pend = p + heap_slots;
@@ -494,6 +634,7 @@ rb_newobj_from_heap(void)
     freelist = freelist->as.free.next;

     MEMZERO((void*)obj, RVALUE, 1);
+    RANY(obj)->as.free.flags = 0;
 #ifdef GC_DEBUG
     RANY(obj)->file = rb_sourcefile();
     RANY(obj)->line = rb_sourceline();
@@ -702,8 +843,7 @@ gc_mark_all(void)
     for (i = 0; i < heaps_used; i++) {
   p = heaps[i].slot; pend = p + heaps[i].limit;
   while (p < pend) {
-      if ((p->as.basic.flags & FL_MARK) &&
-    (p->as.basic.flags != FL_MARK)) {
+      if (rb_mark_table_contains(p) && (p->as.basic.flags != 0)) {
     gc_mark_children((VALUE)p, 0);
       }
       p++;
@@ -737,21 +877,8 @@ is_pointer_to_heap(void *ptr)
     if (p < lomem || p > himem) return Qfalse;
     if ((VALUE)p % sizeof(RVALUE) != 0) return Qfalse;

-    /* check if p looks like a pointer using bsearch*/
-    lo = 0;
-    hi = heaps_used;
-    while (lo < hi) {
-  mid = (lo + hi) / 2;
-  heap = &heaps[mid];
-  if (heap->slot <= p) {
-      if (p < heap->slot + heap->limit)
-    return Qtrue;
-      lo = mid + 1;
-  }
-  else {
-      hi = mid;
-  }
-    }
+    if (find_heap_slot_for_object(p))
+  return Qtrue;
     return Qfalse;
 }

@@ -857,8 +984,8 @@ gc_mark(VALUE ptr, int lev)
     obj = RANY(ptr);
     if (rb_special_const_p(ptr)) return; /* special const not marked */
     if (obj->as.basic.flags == 0) return;       /* free cell */
-    if (obj->as.basic.flags & FL_MARK) return;  /* already marked */
-    obj->as.basic.flags |= FL_MARK;
+    if (rb_mark_table_contains(obj)) return;  /* already marked */
+    rb_mark_table_add(obj);

     if (lev > GC_LEVEL_MAX || (lev == 0 && ruby_stack_check())) {
   if (!mark_stack_overflow) {
@@ -892,8 +1019,8 @@ gc_mark_children(VALUE ptr, int lev)
     obj = RANY(ptr);
     if (rb_special_const_p(ptr)) return; /* special const not marked */
     if (obj->as.basic.flags == 0) return;       /* free cell */
-    if (obj->as.basic.flags & FL_MARK) return;  /* already marked */
-    obj->as.basic.flags |= FL_MARK;
+    if (rb_mark_table_contains(obj)) return;  /* already marked */
+    rb_mark_table_add(obj);

   marking:
     if (FL_TEST(obj, FL_EXIVAR)) {
@@ -1147,10 +1274,15 @@ finalize_list(RVALUE *p)
     while (p) {
   RVALUE *tmp = p->as.free.next;
   run_final((VALUE)p);
-  if (!FL_TEST(p, FL_SINGLETON)) { /* not freeing page */
+  /* Don't free objects that are singletons, or objects that are 
already freed.
+   * The latter is to prevent the unnecessary marking of memory pages 
as dirty,
+   * which can destroy copy-on-write semantics.
+   */
+  if (!FL_TEST(p, FL_SINGLETON) && p->as.free.flags != 0) {
             VALGRIND_MAKE_MEM_UNDEFINED((void*)p, sizeof(RVALUE));
       p->as.free.flags = 0;
       p->as.free.next = freelist;
+      rb_mark_table_remove(p);
       freelist = p;
   }
   p = tmp;
@@ -1164,7 +1296,8 @@ free_unused_heaps(void)

     for (i = j = 1; j < heaps_used; i++) {
   if (heaps[i].limit == 0) {
-      free(heaps[i].membase);
+      free_ruby_heap(heaps[i].membase);
+      free(heaps[i].marks);
       heaps_used--;
   }
   else {
@@ -1208,29 +1341,34 @@ gc_sweep(void)

   p = heaps[i].slot; pend = p + heaps[i].limit;
   while (p < pend) {
-      if (!(p->as.basic.flags & FL_MARK)) {
+      if (!rb_mark_table_contains(p)) {
     if (p->as.basic.flags) {
         obj_free((VALUE)p);
     }
     if (need_call_final && FL_TEST(p, FL_FINALIZE)) {
-        p->as.free.flags = FL_MARK; /* remain marked */
+        p->as.free.flags = FL_FINALIZE;
         p->as.free.next = final_list;
         final_list = p;
     }
     else {
                     VALGRIND_MAKE_MEM_UNDEFINED((void*)p, 
sizeof(RVALUE));
-        p->as.free.flags = 0;
-        p->as.free.next = freelist;
+        /* Do not touch the fields if they don't have to be modified.
+         * This is in order to preserve copy-on-write semantics.
+         */
+        if (p->as.free.flags != 0)
+      p->as.free.flags = 0;
+        if (p->as.free.next != freelist)
+      p->as.free.next = freelist;
         freelist = p;
     }
     n++;
       }
-      else if (RBASIC(p)->flags == FL_MARK) {
+      else if (RBASIC(p)->flags == FL_FINALIZE) {
     /* objects to be finalized */
-    /* do nothing remain marked */
+    /* do nothing here */
       }
       else {
-    RBASIC(p)->flags &= ~FL_MARK;
+    rb_mark_table_heap_remove(&heaps[i], p);
     live++;
       }
       p++;
@@ -1272,6 +1410,7 @@ rb_gc_force_recycle(VALUE p)
     VALGRIND_MAKE_MEM_UNDEFINED((void*)p, sizeof(RVALUE));
     RANY(p)->as.free.flags = 0;
     RANY(p)->as.free.next = freelist;
+    rb_mark_table_remove((RVALUE *) p);
     freelist = RANY(p);
 }

@@ -1462,6 +1601,7 @@ mark_current_machine_context(rb_thread_t *th)

     FLUSH_REGISTER_WINDOWS;
     /* This assumes that all registers are saved into the jmp_buf (and 
stack) */
+    memset(save_regs_gc_mark, 0, sizeof(save_regs_gc_mark));
     setjmp(save_regs_gc_mark);
     mark_locations_array((VALUE*)save_regs_gc_mark,
        sizeof(save_regs_gc_mark) / sizeof(VALUE));
@@ -1501,6 +1641,7 @@ garbage_collect(void)

     SET_STACK_END;

+    last_heap = NULL;
     init_mark_stack();

     th->vm->self ? rb_gc_mark(th->vm->self) : rb_vm_mark(th->vm);
@@ -1602,6 +1743,18 @@ ruby_set_stack_size(size_t size)
     rb_gc_stack_maxsize = size;
 }

+int
+rb_gc_is_thread_marked(the_thread)
+    VALUE the_thread;
+{
+    if (FL_ABLE(the_thread)) {
+  return rb_mark_table_contains((RVALUE *) the_thread);
+    }
+    else {
+  return 0;
+    }
+}
+
 void
 Init_stack(VALUE *addr)
 {
@@ -2037,6 +2190,7 @@ rb_gc_call_finalizer_at_exit(void)
     DATA_PTR(p) && RANY(p)->as.data.dfree &&
     RANY(p)->as.basic.klass != rb_cThread) {
     p->as.free.flags = 0;
+    rb_mark_table_remove(p);
     if ((long)RANY(p)->as.data.dfree == -1) {
         RUBY_CRITICAL(free(DATA_PTR(p)));
     }
@@ -2048,6 +2202,7 @@ rb_gc_call_finalizer_at_exit(void)
       else if (BUILTIN_TYPE(p) == T_FILE) {
     if (rb_io_fptr_finalize(RANY(p)->as.file.fptr)) {
         p->as.free.flags = 0;
+        rb_mark_table_remove(p);
                     VALGRIND_MAKE_MEM_UNDEFINED((void*)p, 
sizeof(RVALUE));
     }
       }
@@ -2268,6 +2423,61 @@ count_objects(int argc, VALUE *argv, VALUE os)
     return hash;
 }

+static VALUE
+os_statistics()
+{
+    int i;
+    int n = 0;
+    unsigned int objects = 0;
+    unsigned int total_heap_size = 0;
+    unsigned int ast_nodes = 0;
+    char message[1024];
+
+    for (i = 0; i < heaps_used; i++) {
+  RVALUE *p, *pend;
+
+  p = heaps[i].slot;
+  pend = p + heaps[i].limit;
+  for (;p < pend; p++) {
+      if (p->as.basic.flags) {
+    int isAST = 0;
+    switch (TYPE(p)) {
+      case T_ICLASS:
+      case T_NODE:
+        isAST = 1;
+        break;
+      case T_CLASS:
+        if (FL_TEST(p, FL_SINGLETON)) {
+            isAST = 1;
+            break;
+        }
+      default:
+        break;
+    }
+    objects++;
+    if (isAST) {
+       ast_nodes++;
+    }
+      }
+  }
+  total_heap_size += (void*)pend - heaps[i].membase;
+    }
+
+    snprintf(message, sizeof(message),
+        "Number of objects: %d (%d AST nodes, %.2f%%)\n"
+        "Heap slot size: %d\n"
+        "Number of heaps: %d\n"
+        "Total size of objects: %.2f KB\n"
+        "Total size of heaps: %.2f KB\n",
+        objects, ast_nodes, ast_nodes * 100 / (double) objects,
+        sizeof(RVALUE),
+        heaps_used,
+        objects * sizeof(RVALUE) / 1024.0,
+        total_heap_size / 1024.0
+    );
+    return rb_str_new2(message);
+}
+
 /*
  *  The <code>GC</code> module provides an interface to Ruby's mark and
  *  sweep garbage collection mechanism. Some of the underlying methods
@@ -2300,6 +2510,8 @@ Init_GC(void)

     rb_define_module_function(rb_mObSpace, "_id2ref", id2ref, 1);

+    rb_define_module_function(rb_mObSpace, "statistics", os_statistics, 
0);
+
     rb_gc_register_address(&rb_mObSpace);
     rb_global_variable(&finalizers);
     rb_gc_unregister_address(&rb_mObSpace);
@@ -2315,4 +2527,6 @@ Init_GC(void)
     rb_define_method(rb_mKernel, "object_id", rb_obj_id, 0);

     rb_define_module_function(rb_mObSpace, "count_objects", 
count_objects, -1);
+
+    init_debugging();
 }
diff --git a/include/ruby/ruby.h b/include/ruby/ruby.h
index 4438bc3..1626a7e 100644
--- a/include/ruby/ruby.h
+++ b/include/ruby/ruby.h
@@ -621,8 +621,8 @@ struct RBignum {
 #define RVALUES(obj) (R_CAST(RValues)(obj))

 #define FL_SINGLETON FL_USER0
-#define FL_MARK      (((VALUE)1)<<5)
-#define FL_RESERVED  (((VALUE)1)<<6) /* will be used in the future GC 
*/
+#define FL_RESERVED0 (((VALUE)1)<<5) /* will be used in the future GC 
*/
+#define FL_RESERVED1 (((VALUE)1)<<6) /* will be used in the future GC 
*/
 #define FL_FINALIZE  (((VALUE)1)<<7)
 #define FL_TAINT     (((VALUE)1)<<8)
 #define FL_EXIVAR    (((VALUE)1)<<9)
@@ -716,6 +716,7 @@ void rb_global_variable(VALUE*);
 void rb_register_mark_object(VALUE);
 void rb_gc_register_address(VALUE*);
 void rb_gc_unregister_address(VALUE*);
+int rb_gc_is_thread_marked(VALUE);

 ID rb_intern(const char*);
 ID rb_intern2(const char*, long);
Posted by Gonzalo Garramuño (Guest)
on 04.03.2008 12:33
(Received via mailing list)
Yukihiro Matsumoto wrote:
> 
> Here's the patch against the latest trunk (r15675).  It's still 8-10%
> slower than the current implementation.
> 

Matz,

I cannot speak for the implementation, but that function he added of
GC.statistics is something that probably should be merged as it is
useful for debugging memory issues.

--
Gonzalo 
Garramuñoggarra@advancedsl.com.ar

AMD4400 - ASUS48N-E
GeForce7300GT
Xubuntu Gutsy
Posted by Hongli Lai (Guest)
on 07.03.2008 13:05
(Received via mailing list)
Yukihiro Matsumoto wrote:
> Hi,
> Here's the patch against the latest trunk (r15675).  It's still 8-10%
> slower than the current implementation.

Hi.

It's good to know that the patch isn't in limbo. :) But what is
preventing it from being applied to trunk?
Posted by Paul Brannan (cout)
on 07.03.2008 16:21
(Received via mailing list)
On Fri, Mar 07, 2008 at 09:04:45PM +0900, Hongli Lai wrote:
> It's good to know that the patch isn't in limbo. :) But what is 
> preventing it from being applied to trunk?

If what Matz says is true, that it's 8-10% slower than the current
implementation, isn't that enough of a yellow flag?

Is it possible to improve performance of the patch?  I looked over the
implementation for a little while, but didn't see any obvious
opportunities for improvement.  I'd be interested to see profiler output
to see where time is being spent.

Paul
Posted by Hongli Lai (Guest)
on 07.03.2008 17:23
(Received via mailing list)
Paul Brannan wrote:
> If what Matz says is true, that it's 8-10% slower than the current
> implementation, isn't that enough of a yellow flag?
> 
> Is it possible to improve performance of the patch?  I looked over the
> implementation for a little while, but didn't see any obvious
> opportunities for improvement.  I'd be interested to see profiler output
> to see where time is being spent.

I've looked for ways to optimize it, but the bitfield implementation
already seems to be near-optimal. All the old garbage collector had to
do was setting a flag. But this one has to:
1. Call a mark function.
2. Find the heap that the object is on. This operation is O(n), with n =
the number of heaps. Though it uses a cache to speed up the average 
case.
3. Calculate the location in the bitfield. This requires a division (/)
and a remainder (%) operation.

(1), function call overhead, could be solved with aggressive inlining
and aggressive use of macros. The downside is excessive use of macros
would make the code very hard to read.

(2) could be solved as follows: put an index field in each VALUE object.
Then the mark function can find the location of the heap (and thus the
bitfield) in O(1) time. But this will make each VALUE object at least 4
bytes bigger (on x86). Is this increase in memory usage acceptable for
8-10% performance gain?
I've also thought about ways to make this operation O(1) without an
index field. *Maybe* it is possible by ensuring that heap addresses are
aligned on some multiple of an integer constant. Then finding the heap
from an object would only involve the extraction of a few bits in the
object address. But this is highly platform-dependent and I'm not sure
whether it is viable to implement this without making the code
unmaintainable and unportable.

(3): I don't think this can be solved.

In any case, all of these operations need at least several pointer
dereferences. The old GC only needed 1.

And because Windows does not support fork(), the garbage collector
doesn't have to use a bit field for marking when compiled on Windows.
This could be configured with a macro.
Posted by Joel VanderWerf (Guest)
on 07.03.2008 19:48
(Received via mailing list)
Hongli Lai wrote:
> But this will make each VALUE object at least 4 
> bytes bigger (on x86). Is this increase in memory usage acceptable for 
> 8-10% performance gain?

And 4 extra bytes per object will use more cache, so it's not clear
you'd get all of that 8-10%.
Posted by Daniel DeLorme (Guest)
on 08.03.2008 06:35
(Received via mailing list)
Yukihiro Matsumoto wrote:
> Here's the patch against the latest trunk (r15675).  It's still 8-10%
> slower than the current implementation.

Matz, what do you consider would be an acceptable tradeoff in speed for
the benefits of copy-on-write? Is 5% slower acceptable or does it *have*
to match the current performance?

I tried my hand at optimization; assuming that
  1) a given object is more likely to be in one of the larger heaps,
  2) there are not enough heaps to justify the overhead of a bsearch,
I changed the bsearch to a linear search in the function below and found
a slight performance improvement.



static inline struct heaps_slot *
find_heap_slot_for_object(RVALUE *object)
{
     struct heaps_slot *heap;
     register int i;

     /* Look in the cache first. */
     if (last_heap != NULL && object >= last_heap->slot
  && object < last_heap->slot + last_heap->limit) {
  return last_heap;
     }
     /* find heap_slot for object using bsearch*/
     for(i=0; i<heaps_used; i++) {
  heap = &heaps[i];
  if (heap->slot <= object) {
      if (object < heap->slot + heap->limit) {
    /* Cache this result. According to empirical evidence, the chance is
     * high that the next lookup will be for the same heap slot.
     */
    last_heap = heap;
    return heap;
      }
  }
     }
     return NULL;
}
Posted by Daniel DeLorme (Guest)
on 08.03.2008 08:53
(Received via mailing list)
I believe I managed to close the performance gap to only 6% slower than
the current implementation (but I must admit I have trouble getting
consistently reproducible benchmarks). The attached patch should be
added on top of the one in Matz' email.
Posted by Daniel DeLorme (Guest)
on 08.03.2008 11:02
(Received via mailing list)
Third email in a row... I hope that's not considered spamming ~_~

Since the patch was based on r15675, I tried to merge that plus my
changes into the trunk (r15730) but ran into some conflicts. While
resolving them, I realized that r15683 and 15684 (the apparent source of
the conflict) were sorting the heap list by order of memory location.
But that's not needed if a linear search is more efficient than a
bsearch, so I removed the sorting. New heaps are now added at the end of
the list and the list is searched from last to first (therefore largest
to smallest).

So now, for the following test case which generates 9 heaps
   GC.disable
   x=(1..1000000).map{"x"*10}
   GC.enable
   GC.start
The COW-friendly version is only 1-2% slower than r15730.

I hope this makes it into the trunk but please do review the code.
Posted by Yukihiro Matsumoto (Guest)
on 08.03.2008 16:40
(Received via mailing list)
Hi,

In message "Re: Copy-on-write friendly garbage collector"
    on Sat, 8 Mar 2008 19:01:51 +0900, Daniel DeLorme <dan-ml@dan42.com> 
writes:

|But that's not needed if a linear search is more efficient than a 
|bsearch, so I removed the sorting.

Binary search is a preparation for making heap segment size smaller
(say 512 objects per a segment) to increase chances to free unused
heaps to the OS.

              matz.
Posted by Hongli Lai (Guest)
on 12.03.2008 18:24
(Received via mailing list)
Daniel DeLorme wrote:
> I believe I managed to close the performance gap to only 6% slower than 
> the current implementation (but I must admit I have trouble getting 
> consistently reproducible benchmarks). The attached patch should be 
> added on top of the one in Matz' email.
> 
> -- 
> Daniel
> 

Great work Daniel. I don't measure the same amount of speedup that you
claim in your email, but there is definitely a small speedup.

I've added some further further optimizations.
find_position_in_bitfield() now uses bit operators instead of division
and modulo operators. This should speed things up a little more.

The attached patch is created against Ruby 1.8, but it shows what I've
exactly changed.
Posted by Yukihiro Matsumoto (Guest)
on 12.03.2008 18:39
(Received via mailing list)
Hi,

In message "Re: Copy-on-write friendly garbage collector"
    on Thu, 13 Mar 2008 02:23:54 +0900, Hongli Lai <hongli@plan99.net> 
writes:

|Great work Daniel. I don't measure the same amount of speedup that you 
|claim in your email, but there is definitely a small speedup.
|
|I've added some further further optimizations. 
|find_position_in_bitfield() now uses bit operators instead of division 
|and modulo operators. This should speed things up a little more.
|
|The attached patch is created against Ruby 1.8, but it shows what I've 
|exactly changed.

I will merge the patch into my personal repository.  Thank you.
Stay tuned until I will merge it to the trunk.

              matz.
Posted by Daniel DeLorme (Guest)
on 13.03.2008 01:48
(Received via mailing list)
Hongli Lai wrote:
> Great work Daniel. I don't measure the same amount of speedup that you 
> claim in your email, but there is definitely a small speedup.
> 
> I've added some further further optimizations. 
> find_position_in_bitfield() now uses bit operators instead of division 
> and modulo operators. This should speed things up a little more.
> 
> The attached patch is created against Ruby 1.8, but it shows what I've 
> exactly changed.

Thank you. I also looked at find_position_in_bitfield() but I assumed
the compiler would be able to make those optimizations, so ended up not
touching it.

I notice that this patch is relative to one of the patches that I
submitted. But in those patches I changed the bsearch to a linear
search, which may not be desirable as Matz pointed out. But I'm not sure
it's a good idea to sacrifice performance now for the sake of a
hypothetical change in heap size in the future (YAGNI). Maybe a #define
could take care of that? e.g.
   #ifdef FEW_HEAPS
     order heaps by size, linear search
   #else
     order heaps by memory location, bsearch
   #endif
Posted by Hongli Lai (Guest)
on 13.03.2008 12:06
(Received via mailing list)
Daniel DeLorme wrote:
>   #ifdef FEW_HEAPS
>     order heaps by size, linear search
>   #else
>     order heaps by memory location, bsearch
>   #endif

My patch is against 1.8, which doesn't have sorted heaps.
Posted by Hongli Lai (Guest)
on 13.03.2008 12:19
(Received via mailing list)
Daniel DeLorme wrote:
> Matz, what do you consider would be an acceptable tradeoff in speed for 
> the benefits of copy-on-write? Is 5% slower acceptable or does it *have* 
> to match the current performance?

Yes, I'd like to know this as well. What would be an acceptable
performance tradeoff?

And will this patch ever make a chance to be accepted into Ruby 1.8? g.c
is very large, and it's generally hard to navigate in. I'd like to
refactor a few things so that it's easier to navigate and to change, but
this comes at the risk that my patch will conflict with future Ruby 1.8
releases, or with other peoples' patches. If this gets merged into Ruby
1.8 then that problem can be solved, so I'm really interested in the
conditions for acceptance.

Regards,
Hongli Lai
Posted by Hongli Lai (Guest)
on 14.03.2008 04:22
(Received via mailing list)
Hi.

More optimization updates. I'm currently working on pluggable mark table
implementations. This makes it possible to choose, during runtime,
whether to use the copy-on-write friendly mark table, or the classic
"mark table" (i.e. setting the FL_MARK flag on objects). Using the
classic "mark table" with this approach results in only 1% performance
loss (which is caused by function call overhead and can't be avoided).
I'll post a patch soon.

Regards,
Hongli Lai
Posted by Daniel DeLorme (Guest)
on 14.03.2008 05:45
(Received via mailing list)
Hongli Lai wrote:
> More optimization updates. I'm currently working on pluggable mark table 
> implementations. This makes it possible to choose, during runtime, 
> whether to use the copy-on-write friendly mark table, or the classic 
> "mark table" (i.e. setting the FL_MARK flag on objects). Using the 
> classic "mark table" with this approach results in only 1% performance 
> loss (which is caused by function call overhead and can't be avoided). 
> I'll post a patch soon.

So the GC method could be dynamically changed to cow-friendly just
before doing a Process.fork... very nice!
Posted by Hongli Lai (Guest)
on 14.03.2008 12:26
(Received via mailing list)
Daniel DeLorme wrote:
> So the GC method could be dynamically changed to cow-friendly just 
> before doing a Process.fork... very nice!

Actually, I'm thinking about requiring the script call "GC.cow_friendly
= true". Most scripts do not fork, and when they do, they usually don't
use it as a means to save memory.
Posted by Meinrad Recheis (Guest)
on 14.03.2008 13:02
(Received via mailing list)
On Fri, Mar 14, 2008 at 12:25 PM, Hongli Lai <hongli@plan99.net> wrote:
> Daniel DeLorme wrote:
>  > So the GC method could be dynamically changed to cow-friendly just
>  > before doing a Process.fork... very nice!
>
>  Actually, I'm thinking about requiring the script call "GC.cow_friendly
>  = true". Most scripts do not fork, and when they do, they usually don't
>  use it as a means to save memory.
>

the name cow_friendly could be misinterpreted by some cow boys ^^. how
about calling it #copy_on_write_friendly or even better
#supports_copy_on_write ?

cow, fork ... *chuckle*
-- henon
Posted by Daniel Berger (Guest)
on 14.03.2008 16:00
(Received via mailing list)
On Fri, Mar 14, 2008 at 5:25 AM, Hongli Lai <hongli@plan99.net> wrote:
> Daniel DeLorme wrote:
>  > So the GC method could be dynamically changed to cow-friendly just
>  > before doing a Process.fork... very nice!
>
>  Actually, I'm thinking about requiring the script call "GC.cow_friendly
>  = true". Most scripts do not fork, and when they do, they usually don't
>  use it as a means to save memory.

I recommend against this. Most people are simply not going to know
when they should invoke it and when they shouldn't. It's an
implementation detail that should be hidden from the user IMHO.
Besides, it really worth splitting it out like this over a 5% speed
hit? Or am I missing some other implementation detail?

Regards,

Dan
Posted by Hongli Lai (Guest)
on 14.03.2008 16:55
(Received via mailing list)
Daniel Berger wrote:
> I recommend against this. Most people are simply not going to know
> when they should invoke it and when they shouldn't. It's an
> implementation detail that should be hidden from the user IMHO.
> Besides, it really worth splitting it out like this over a 5% speed
> hit? Or am I missing some other implementation detail?

Apparently people care about even a 5% speed hit, or we wouldn't still
be discussing this.

However, it is true that most scripts will not benefit from a
copy-on-write friendly garbage collector. Only very few scripts rely on
it for saving memory. As far as I know, the program that I'm developing
is the first Ruby program that uses copy-on-write friendliness for
important stuff. There is both a legit reason for wanting the garbage
collector to be fast in the normal case, and for wanting it to be
not-so-fast but copy-on-write friendly in some edge cases.

That said, exposing implementation details is generally not a nice thing
to do. That's why I've written extensive documentation for the
"copy_on_write_friendly?" and "copy_on_write_friendly=" methods, which
explains that the behavior might be different across different Ruby
implementations and/or different platforms, and that they should only
use these methods if they know what they're doing.

Please see the attached patch. This patch is made against Ruby 1.8,
relative to my last patch.
Posted by Joel VanderWerf (Guest)
on 14.03.2008 18:35
(Received via mailing list)
Hongli Lai wrote:
> Daniel Berger wrote:
>> I recommend against this. Most people are simply not going to know
>> when they should invoke it and when they shouldn't. It's an
>> implementation detail that should be hidden from the user IMHO.
>> Besides, it really worth splitting it out like this over a 5% speed
>> hit? Or am I missing some other implementation detail?
> 
> Apparently people care about even a 5% speed hit, or we wouldn't still 
> be discussing this.

I can certainly see a need for this as an _option_, even if it does cost 
5%.

IIUC, it would make a difference if your main process (say, an in-memory
database, or a large simulation) constructed a big set of objects. Then
you want to fork to generate some graphs from the simulation or perform
a search of the database. Currently, the best solution is GC.disable (in
the fork), but that's not good if the fork needs to live a while (and
the parent will eventually GC anyway).

The 5% cost is much less than the cost of thrashing because all these
forks can't share memory. If the simulation or database process doesn't
need to GC much, the 5% is negligible.

OTOH, most programs probably don't benefit, so shouldn't pay the 5%.

Thanks for your work on cowruby, Hongli.
Posted by Hongli Lai (Guest)
on 15.03.2008 17:16
(Received via mailing list)
Hongli Lai wrote:
> Great work Daniel. I don't measure the same amount of speedup that you 
> claim in your email, but there is definitely a small speedup.
> 
> I've added some further further optimizations. 
> find_position_in_bitfield() now uses bit operators instead of division 
> and modulo operators. This should speed things up a little more.
> 
> The attached patch is created against Ruby 1.8, but it shows what I've 
> exactly changed.

I've been such a fool. The patch I sent uses platform-specific macros to
figure out the base 2 log of sizeof(int)*8.

Here is a new patch, which uses straight C code to figure out that
value. There is no performance overhead because the code is easily
optimizable by the compiler, and is a lot more portable.
Posted by Hongli Lai (Guest)
on 20.03.2008 12:00
(Received via mailing list)
Hi.

Sorry for mentioning this again, but it has been almost a week since my
last patch. I'd really appreciate it if a core committer can comment on 
it.

Thanks in advance.
Posted by Yukihiro Matsumoto (Guest)
on 20.03.2008 12:56
(Received via mailing list)
Hi,

In message "Re: Copy-on-write friendly garbage collector"
    on Thu, 20 Mar 2008 19:59:16 +0900, Hongli Lai <hongli@plan99.net> 
writes:

|Sorry for mentioning this again, but it has been almost a week since my 
|last patch. I'd really appreciate it if a core committer can comment on it.

It works on my machine.  But since it slows down Ruby a bit, we
haven't decided yet to merge it officially.  Do you (or anyone else)
have something we can use for GC benchmark?

              matz.
Posted by Hongli Lai (Guest)
on 20.03.2008 13:54
(Received via mailing list)
Yukihiro Matsumoto wrote:
> have something we can use for GC benchmark?
> 
>               matz.

Hi.

Yes, I use the following tool for benchmarking:
http://pastebin.com/m26839d72

The results are as follows:

- Standard Ruby: 13.600 seconds
- My Ruby: 16.600 seconds

So the copy-on-write friendly GC seems to be about 20% slower. (At
least, on this machine. I've noticed that the percentage can vary wildly
depending on the specific machine. On some machines it only seems to be
about 5% slower.)

However, I've recently submitted a patch which implements pluggable
marking tables. By default, it uses the non-copy-on-write-friendly
implementation (i.e. the same as the old one, which uses mark flags on
the objects directly). So if I remove the "GC.copy_on_write_friendly =
true" line, then the result is as follows:

- My Ruby: 13.394 seconds

So by default, performance is on par with standard Ruby. Only if the
programmer has explicitly set "GC.copy_on_write_friendly = true" will
there be a speed hit.

The standard Ruby version I use is 1.8.6 patchlevel 114. My modified
Ruby is also based on that version. Both Ruby binaries are compiled with
'-g' (i.e. no optimizations).

Finally, I've benchmarked some Ruby on Rails applications. Only 10% of
the total application time seems to be spent on the garbage collector.
So if the garbage collector is 20% slower, then it only results in a 2%
speed reduction in total.

I'd like to know whether my patches are acceptable, now that it's just
as fast as the normal Ruby by default. And if it's not acceptable, I'd
like to know how many % speed hit would be acceptable.

Thanks.

- Hongli Lai
Posted by Yukihiro Matsumoto (Guest)
on 20.03.2008 15:24
(Received via mailing list)
Hi,

In message "Re: Copy-on-write friendly garbage collector"
    on Thu, 20 Mar 2008 21:54:04 +0900, Hongli Lai <hongli@plan99.net> 
writes:

|Yes, I use the following tool for benchmarking:
|http://pastebin.com/m26839d72

I feel this one is bit too simple.  Since GC behavior heavily depends
on object allocation pattern of programs, I guess we need more
realistic cases to benchmark.  Perhaps we should contact Lloyd for his
cases in <http://lloydforge.org/projects/ruby/>.

By the way, I'd like to know how much effective this copy-on-write
friendly patch is.  Does anyone have a good (even artificial) case?

|I'd like to know whether my patches are acceptable, now that it's just 
|as fast as the normal Ruby by default. And if it's not acceptable, I'd 
|like to know how many % speed hit would be acceptable.

2% loss for almost all cases are good enough.  But I cannot be sure
for 2% for this specific case.

              matz.
Posted by Hongli Lai (Guest)
on 20.03.2008 15:46
(Received via mailing list)
Yukihiro Matsumoto wrote:
> By the way, I'd like to know how much effective this copy-on-write
> friendly patch is.  Does anyone have a good (even artificial) case?

I use the following test:
http://pastebin.com/f353f2ade
It loads Ruby on Rails (2.0.2), then runs garbage collection in a child
process.

The results are as follows:

== Standard Ruby:

After forking, before gc:
- Parent process: 64 KB private dirty memory
- Child process 1: 60 KB private dirty memory
- Child process 2: 128 KB private dirty memory

After forking, after gc:
- Parent process: 64 KB private dirty memory
- Child process 1: 60 KB private dirty memory
- Child process 2: 9.1 MB private dirty memory

== My Ruby:

After forking, before gc:
- Parent process: 64 KB private dirty memory
- Child process 1: 60 KB private dirty memory
- Child process 2: 156 KB private dirty memory

After forking, after gc:
- Parent process: 64 KB private dirty memory
- Child process 1: 60 KB private dirty memory
- Child process 2: 1.1 MB private dirty memory

Some parts of the memory are made dirty for as yet unknown reasons. It
seems to have something to do with the way the system malloc() is
implemented.
Posted by Joel VanderWerf (Guest)
on 20.03.2008 16:29
(Received via mailing list)
Yukihiro Matsumoto wrote:
> By the way, I'd like to know how much effective this copy-on-write
> friendly patch is.  Does anyone have a good (even artificial) case?

Very artificial: [ruby-talk:186561]