Hi. I've written patch which makes Ruby's garbage collector copy-on-write friendly. Details can be found on my blog, http://izumi.plan99.net/blog/index.php/category/optimizing-rails/, in the "Making Ruby's garbage collector copy-on-write friendly" series. Matz had shown interest in merging the patch into Ruby 1.9. I'm wondering whether that has already been done, and if not, whether I can be of any assistance. Regards, Hongli Lai
on 03.03.2008 10:49
on 03.03.2008 13:39
Hongli Lai wrote: > I've written patch which makes Ruby's garbage collector copy-on-write > friendly. Details can be found on my blog, > http://izumi.plan99.net/blog/index.php/category/optimizing-rails/, in > the "Making Ruby's garbage collector copy-on-write friendly" series. > > Matz had shown interest in merging the patch into Ruby 1.9. I'm > wondering whether that has already been done, and if not, whether I can > be of any assistance. AFAIK, the 1.9 garbage collector has always been COW-friendly.
on 03.03.2008 14:12
Hi,
In message "Re: Copy-on-write friendly garbage collector"
on Mon, 3 Mar 2008 18:48:34 +0900, Hongli Lai <hongli@plan99.net>
writes:
|I've written patch which makes Ruby's garbage collector copy-on-write
|friendly. Details can be found on my blog,
|http://izumi.plan99.net/blog/index.php/category/optimizing-rails/, in
|the "Making Ruby's garbage collector copy-on-write friendly" series.
|
|Matz had shown interest in merging the patch into Ruby 1.9. I'm
|wondering whether that has already been done, and if not, whether I can
|be of any assistance.
Here's the patch against the latest trunk (r15675). It's still 8-10%
slower than the current implementation.
matz.
diff --git a/debug.c b/debug.c
index d7f99ed..dfcb523 100644
--- a/debug.c
+++ b/debug.c
@@ -29,8 +29,8 @@ static const union {
RUBY_ENC_CODERANGE_7BIT = ENC_CODERANGE_7BIT,
RUBY_ENC_CODERANGE_VALID = ENC_CODERANGE_VALID,
RUBY_ENC_CODERANGE_BROKEN = ENC_CODERANGE_BROKEN,
- RUBY_FL_MARK = FL_MARK,
- RUBY_FL_RESERVED = FL_RESERVED,
+ RUBY_FL_RESERVED0 = FL_RESERVED0,
+ RUBY_FL_RESERVED1 = FL_RESERVED1,
RUBY_FL_FINALIZE = FL_FINALIZE,
RUBY_FL_TAINT = FL_TAINT,
RUBY_FL_EXIVAR = FL_EXIVAR,
diff --git a/gc.c b/gc.c
index c47f8a0..38a9fe7 100644
--- a/gc.c
+++ b/gc.c
@@ -22,8 +22,14 @@
#include "gc.h"
#include <stdio.h>
#include <setjmp.h>
+#include <math.h>
#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
@@ -145,6 +151,8 @@ static struct heaps_slot {
void *membase;
RVALUE *slot;
int limit;
+ int *marks;
+ int marks_size;
} *heaps;
static int heaps_length = 0;
static int heaps_used = 0;
@@ -322,6 +330,36 @@ ruby_xfree(void *x)
RUBY_CRITICAL(free(x));
}
+static int debugging = 0;
+
+#define DEBUG_POINT(message) \
+ do { \
+ if (debugging) { \
+ printf("%s\n", message); \
+ getchar(); \
+ } \
+ } while (0)
+
+#define OPTION_ENABLED(name) (getenv((name)) && *getenv((name)) &&
*getenv((name)) != '0')
+
+static void *
+alloc_ruby_heap(size_t size)
+{
+ return malloc(size);
+}
+
+static void
+free_ruby_heap(void *heap)
+{
+ free(heap);
+}
+
+static void
+init_debugging()
+{
+ debugging = OPTION_ENABLED("RUBY_GC_DEBUG");
+}
+
/*
* call-seq:
@@ -413,6 +451,106 @@ rb_gc_unregister_address(VALUE *addr)
}
}
+static struct heaps_slot *last_heap = NULL;
+
+static inline struct heaps_slot *
+find_heap_slot_for_object(RVALUE *object)
+{
+ struct heaps_slot *heap;
+ register long hi, lo, mid;
+
+ /* Look in the cache first. */
+ if (last_heap != NULL && object >= last_heap->slot
+ && object < last_heap->slot + last_heap->limit) {
+ return last_heap;
+ }
+ /* find heap_slot for object using bsearch*/
+ lo = 0;
+ hi = heaps_used;
+ while (lo < hi) {
+ mid = (lo + hi) / 2;
+ heap = &heaps[mid];
+ if (heap->slot <= object) {
+ if (object < heap->slot + heap->limit) {
+ /* Cache this result. According to empirical evidence, the chance
is
+ * high that the next lookup will be for the same heap slot.
+ */
+ last_heap = heap;
+ return heap;
+ }
+ lo = mid + 1;
+ }
+ else {
+ hi = mid;
+ }
+ }
+ return NULL;
+}
+
+static inline void
+find_position_in_bitfield(struct heaps_slot *hs, RVALUE *object,
+ unsigned int *bitfield_index, unsigned int
*bitfield_offset)
+{
+ unsigned int index;
+
+ index = object - hs->slot;
+ *bitfield_index = index / (sizeof(int) * 8);
+ *bitfield_offset = index % (sizeof(int) * 8);
+}
+
+
+static void
+rb_mark_table_add(RVALUE *object)
+{
+ struct heaps_slot *hs;
+ unsigned int bitfield_index, bitfield_offset;
+
+ hs = find_heap_slot_for_object(object);
+ if (hs != NULL) {
+ find_position_in_bitfield(hs, object, &bitfield_index,
&bitfield_offset);
+ hs->marks[bitfield_index] |= (1 << bitfield_offset);
+ }
+}
+
+static inline int
+rb_mark_table_heap_contains(struct heaps_slot *hs, RVALUE *object)
+{
+ unsigned int bitfield_index, bitfield_offset;
+
+ find_position_in_bitfield(hs, object, &bitfield_index,
&bitfield_offset);
+ return hs->marks[bitfield_index] & (1 << bitfield_offset);
+}
+
+static inline int
+rb_mark_table_contains(RVALUE *object)
+{
+ struct heaps_slot *hs;
+
+ hs = find_heap_slot_for_object(object);
+ if (hs != NULL) {
+ return rb_mark_table_heap_contains(hs, object);
+ }
+}
+
+static inline void
+rb_mark_table_heap_remove(struct heaps_slot *hs, RVALUE *object)
+{
+ unsigned int bitfield_index, bitfield_offset;
+ find_position_in_bitfield(hs, object, &bitfield_index,
&bitfield_offset);
+ hs->marks[bitfield_index] &= ~(1 << bitfield_offset);
+}
+
+static void
+rb_mark_table_remove(RVALUE *object)
+{
+ struct heaps_slot *hs;
+
+ hs = find_heap_slot_for_object(object);
+ if (hs != NULL) {
+ rb_mark_table_heap_remove(hs, object);
+ }
+}
+
static int
heap_cmp(const void *ap, const void *bp, void *dummy)
{
@@ -445,7 +583,7 @@ add_heap(void)
}
for (;;) {
- RUBY_CRITICAL(p = (RVALUE*)malloc(sizeof(RVALUE)*(heap_slots+1)));
+ RUBY_CRITICAL(p =
(RVALUE*)alloc_ruby_heap(sizeof(RVALUE)*(heap_slots+1)));
if (p == 0) {
if (heap_slots == HEAP_MIN_SLOTS) {
rb_memerror();
@@ -460,6 +598,8 @@ add_heap(void)
p = (RVALUE*)((VALUE)p + sizeof(RVALUE) - ((VALUE)p %
sizeof(RVALUE)));
heaps[heaps_used].slot = p;
heaps[heaps_used].limit = heap_slots;
+ heaps[heaps_used].marks_size = (int) (ceil(heap_slots /
(sizeof(int) * 8.0)));
+ heaps[heaps_used].marks = (int *)
calloc(heaps[heaps_used].marks_size, sizeof(int));
break;
}
pend = p + heap_slots;
@@ -494,6 +634,7 @@ rb_newobj_from_heap(void)
freelist = freelist->as.free.next;
MEMZERO((void*)obj, RVALUE, 1);
+ RANY(obj)->as.free.flags = 0;
#ifdef GC_DEBUG
RANY(obj)->file = rb_sourcefile();
RANY(obj)->line = rb_sourceline();
@@ -702,8 +843,7 @@ gc_mark_all(void)
for (i = 0; i < heaps_used; i++) {
p = heaps[i].slot; pend = p + heaps[i].limit;
while (p < pend) {
- if ((p->as.basic.flags & FL_MARK) &&
- (p->as.basic.flags != FL_MARK)) {
+ if (rb_mark_table_contains(p) && (p->as.basic.flags != 0)) {
gc_mark_children((VALUE)p, 0);
}
p++;
@@ -737,21 +877,8 @@ is_pointer_to_heap(void *ptr)
if (p < lomem || p > himem) return Qfalse;
if ((VALUE)p % sizeof(RVALUE) != 0) return Qfalse;
- /* check if p looks like a pointer using bsearch*/
- lo = 0;
- hi = heaps_used;
- while (lo < hi) {
- mid = (lo + hi) / 2;
- heap = &heaps[mid];
- if (heap->slot <= p) {
- if (p < heap->slot + heap->limit)
- return Qtrue;
- lo = mid + 1;
- }
- else {
- hi = mid;
- }
- }
+ if (find_heap_slot_for_object(p))
+ return Qtrue;
return Qfalse;
}
@@ -857,8 +984,8 @@ gc_mark(VALUE ptr, int lev)
obj = RANY(ptr);
if (rb_special_const_p(ptr)) return; /* special const not marked */
if (obj->as.basic.flags == 0) return; /* free cell */
- if (obj->as.basic.flags & FL_MARK) return; /* already marked */
- obj->as.basic.flags |= FL_MARK;
+ if (rb_mark_table_contains(obj)) return; /* already marked */
+ rb_mark_table_add(obj);
if (lev > GC_LEVEL_MAX || (lev == 0 && ruby_stack_check())) {
if (!mark_stack_overflow) {
@@ -892,8 +1019,8 @@ gc_mark_children(VALUE ptr, int lev)
obj = RANY(ptr);
if (rb_special_const_p(ptr)) return; /* special const not marked */
if (obj->as.basic.flags == 0) return; /* free cell */
- if (obj->as.basic.flags & FL_MARK) return; /* already marked */
- obj->as.basic.flags |= FL_MARK;
+ if (rb_mark_table_contains(obj)) return; /* already marked */
+ rb_mark_table_add(obj);
marking:
if (FL_TEST(obj, FL_EXIVAR)) {
@@ -1147,10 +1274,15 @@ finalize_list(RVALUE *p)
while (p) {
RVALUE *tmp = p->as.free.next;
run_final((VALUE)p);
- if (!FL_TEST(p, FL_SINGLETON)) { /* not freeing page */
+ /* Don't free objects that are singletons, or objects that are
already freed.
+ * The latter is to prevent the unnecessary marking of memory pages
as dirty,
+ * which can destroy copy-on-write semantics.
+ */
+ if (!FL_TEST(p, FL_SINGLETON) && p->as.free.flags != 0) {
VALGRIND_MAKE_MEM_UNDEFINED((void*)p, sizeof(RVALUE));
p->as.free.flags = 0;
p->as.free.next = freelist;
+ rb_mark_table_remove(p);
freelist = p;
}
p = tmp;
@@ -1164,7 +1296,8 @@ free_unused_heaps(void)
for (i = j = 1; j < heaps_used; i++) {
if (heaps[i].limit == 0) {
- free(heaps[i].membase);
+ free_ruby_heap(heaps[i].membase);
+ free(heaps[i].marks);
heaps_used--;
}
else {
@@ -1208,29 +1341,34 @@ gc_sweep(void)
p = heaps[i].slot; pend = p + heaps[i].limit;
while (p < pend) {
- if (!(p->as.basic.flags & FL_MARK)) {
+ if (!rb_mark_table_contains(p)) {
if (p->as.basic.flags) {
obj_free((VALUE)p);
}
if (need_call_final && FL_TEST(p, FL_FINALIZE)) {
- p->as.free.flags = FL_MARK; /* remain marked */
+ p->as.free.flags = FL_FINALIZE;
p->as.free.next = final_list;
final_list = p;
}
else {
VALGRIND_MAKE_MEM_UNDEFINED((void*)p,
sizeof(RVALUE));
- p->as.free.flags = 0;
- p->as.free.next = freelist;
+ /* Do not touch the fields if they don't have to be modified.
+ * This is in order to preserve copy-on-write semantics.
+ */
+ if (p->as.free.flags != 0)
+ p->as.free.flags = 0;
+ if (p->as.free.next != freelist)
+ p->as.free.next = freelist;
freelist = p;
}
n++;
}
- else if (RBASIC(p)->flags == FL_MARK) {
+ else if (RBASIC(p)->flags == FL_FINALIZE) {
/* objects to be finalized */
- /* do nothing remain marked */
+ /* do nothing here */
}
else {
- RBASIC(p)->flags &= ~FL_MARK;
+ rb_mark_table_heap_remove(&heaps[i], p);
live++;
}
p++;
@@ -1272,6 +1410,7 @@ rb_gc_force_recycle(VALUE p)
VALGRIND_MAKE_MEM_UNDEFINED((void*)p, sizeof(RVALUE));
RANY(p)->as.free.flags = 0;
RANY(p)->as.free.next = freelist;
+ rb_mark_table_remove((RVALUE *) p);
freelist = RANY(p);
}
@@ -1462,6 +1601,7 @@ mark_current_machine_context(rb_thread_t *th)
FLUSH_REGISTER_WINDOWS;
/* This assumes that all registers are saved into the jmp_buf (and
stack) */
+ memset(save_regs_gc_mark, 0, sizeof(save_regs_gc_mark));
setjmp(save_regs_gc_mark);
mark_locations_array((VALUE*)save_regs_gc_mark,
sizeof(save_regs_gc_mark) / sizeof(VALUE));
@@ -1501,6 +1641,7 @@ garbage_collect(void)
SET_STACK_END;
+ last_heap = NULL;
init_mark_stack();
th->vm->self ? rb_gc_mark(th->vm->self) : rb_vm_mark(th->vm);
@@ -1602,6 +1743,18 @@ ruby_set_stack_size(size_t size)
rb_gc_stack_maxsize = size;
}
+int
+rb_gc_is_thread_marked(the_thread)
+ VALUE the_thread;
+{
+ if (FL_ABLE(the_thread)) {
+ return rb_mark_table_contains((RVALUE *) the_thread);
+ }
+ else {
+ return 0;
+ }
+}
+
void
Init_stack(VALUE *addr)
{
@@ -2037,6 +2190,7 @@ rb_gc_call_finalizer_at_exit(void)
DATA_PTR(p) && RANY(p)->as.data.dfree &&
RANY(p)->as.basic.klass != rb_cThread) {
p->as.free.flags = 0;
+ rb_mark_table_remove(p);
if ((long)RANY(p)->as.data.dfree == -1) {
RUBY_CRITICAL(free(DATA_PTR(p)));
}
@@ -2048,6 +2202,7 @@ rb_gc_call_finalizer_at_exit(void)
else if (BUILTIN_TYPE(p) == T_FILE) {
if (rb_io_fptr_finalize(RANY(p)->as.file.fptr)) {
p->as.free.flags = 0;
+ rb_mark_table_remove(p);
VALGRIND_MAKE_MEM_UNDEFINED((void*)p,
sizeof(RVALUE));
}
}
@@ -2268,6 +2423,61 @@ count_objects(int argc, VALUE *argv, VALUE os)
return hash;
}
+static VALUE
+os_statistics()
+{
+ int i;
+ int n = 0;
+ unsigned int objects = 0;
+ unsigned int total_heap_size = 0;
+ unsigned int ast_nodes = 0;
+ char message[1024];
+
+ for (i = 0; i < heaps_used; i++) {
+ RVALUE *p, *pend;
+
+ p = heaps[i].slot;
+ pend = p + heaps[i].limit;
+ for (;p < pend; p++) {
+ if (p->as.basic.flags) {
+ int isAST = 0;
+ switch (TYPE(p)) {
+ case T_ICLASS:
+ case T_NODE:
+ isAST = 1;
+ break;
+ case T_CLASS:
+ if (FL_TEST(p, FL_SINGLETON)) {
+ isAST = 1;
+ break;
+ }
+ default:
+ break;
+ }
+ objects++;
+ if (isAST) {
+ ast_nodes++;
+ }
+ }
+ }
+ total_heap_size += (void*)pend - heaps[i].membase;
+ }
+
+ snprintf(message, sizeof(message),
+ "Number of objects: %d (%d AST nodes, %.2f%%)\n"
+ "Heap slot size: %d\n"
+ "Number of heaps: %d\n"
+ "Total size of objects: %.2f KB\n"
+ "Total size of heaps: %.2f KB\n",
+ objects, ast_nodes, ast_nodes * 100 / (double) objects,
+ sizeof(RVALUE),
+ heaps_used,
+ objects * sizeof(RVALUE) / 1024.0,
+ total_heap_size / 1024.0
+ );
+ return rb_str_new2(message);
+}
+
/*
* The <code>GC</code> module provides an interface to Ruby's mark and
* sweep garbage collection mechanism. Some of the underlying methods
@@ -2300,6 +2510,8 @@ Init_GC(void)
rb_define_module_function(rb_mObSpace, "_id2ref", id2ref, 1);
+ rb_define_module_function(rb_mObSpace, "statistics", os_statistics,
0);
+
rb_gc_register_address(&rb_mObSpace);
rb_global_variable(&finalizers);
rb_gc_unregister_address(&rb_mObSpace);
@@ -2315,4 +2527,6 @@ Init_GC(void)
rb_define_method(rb_mKernel, "object_id", rb_obj_id, 0);
rb_define_module_function(rb_mObSpace, "count_objects",
count_objects, -1);
+
+ init_debugging();
}
diff --git a/include/ruby/ruby.h b/include/ruby/ruby.h
index 4438bc3..1626a7e 100644
--- a/include/ruby/ruby.h
+++ b/include/ruby/ruby.h
@@ -621,8 +621,8 @@ struct RBignum {
#define RVALUES(obj) (R_CAST(RValues)(obj))
#define FL_SINGLETON FL_USER0
-#define FL_MARK (((VALUE)1)<<5)
-#define FL_RESERVED (((VALUE)1)<<6) /* will be used in the future GC
*/
+#define FL_RESERVED0 (((VALUE)1)<<5) /* will be used in the future GC
*/
+#define FL_RESERVED1 (((VALUE)1)<<6) /* will be used in the future GC
*/
#define FL_FINALIZE (((VALUE)1)<<7)
#define FL_TAINT (((VALUE)1)<<8)
#define FL_EXIVAR (((VALUE)1)<<9)
@@ -716,6 +716,7 @@ void rb_global_variable(VALUE*);
void rb_register_mark_object(VALUE);
void rb_gc_register_address(VALUE*);
void rb_gc_unregister_address(VALUE*);
+int rb_gc_is_thread_marked(VALUE);
ID rb_intern(const char*);
ID rb_intern2(const char*, long);
on 04.03.2008 12:33
Yukihiro Matsumoto wrote: > > Here's the patch against the latest trunk (r15675). It's still 8-10% > slower than the current implementation. > Matz, I cannot speak for the implementation, but that function he added of GC.statistics is something that probably should be merged as it is useful for debugging memory issues. -- Gonzalo Garramuñoggarra@advancedsl.com.ar AMD4400 - ASUS48N-E GeForce7300GT Xubuntu Gutsy
on 07.03.2008 13:05
Yukihiro Matsumoto wrote: > Hi, > Here's the patch against the latest trunk (r15675). It's still 8-10% > slower than the current implementation. Hi. It's good to know that the patch isn't in limbo. :) But what is preventing it from being applied to trunk?
on 07.03.2008 16:21
On Fri, Mar 07, 2008 at 09:04:45PM +0900, Hongli Lai wrote: > It's good to know that the patch isn't in limbo. :) But what is > preventing it from being applied to trunk? If what Matz says is true, that it's 8-10% slower than the current implementation, isn't that enough of a yellow flag? Is it possible to improve performance of the patch? I looked over the implementation for a little while, but didn't see any obvious opportunities for improvement. I'd be interested to see profiler output to see where time is being spent. Paul
on 07.03.2008 17:23
Paul Brannan wrote: > If what Matz says is true, that it's 8-10% slower than the current > implementation, isn't that enough of a yellow flag? > > Is it possible to improve performance of the patch? I looked over the > implementation for a little while, but didn't see any obvious > opportunities for improvement. I'd be interested to see profiler output > to see where time is being spent. I've looked for ways to optimize it, but the bitfield implementation already seems to be near-optimal. All the old garbage collector had to do was setting a flag. But this one has to: 1. Call a mark function. 2. Find the heap that the object is on. This operation is O(n), with n = the number of heaps. Though it uses a cache to speed up the average case. 3. Calculate the location in the bitfield. This requires a division (/) and a remainder (%) operation. (1), function call overhead, could be solved with aggressive inlining and aggressive use of macros. The downside is excessive use of macros would make the code very hard to read. (2) could be solved as follows: put an index field in each VALUE object. Then the mark function can find the location of the heap (and thus the bitfield) in O(1) time. But this will make each VALUE object at least 4 bytes bigger (on x86). Is this increase in memory usage acceptable for 8-10% performance gain? I've also thought about ways to make this operation O(1) without an index field. *Maybe* it is possible by ensuring that heap addresses are aligned on some multiple of an integer constant. Then finding the heap from an object would only involve the extraction of a few bits in the object address. But this is highly platform-dependent and I'm not sure whether it is viable to implement this without making the code unmaintainable and unportable. (3): I don't think this can be solved. In any case, all of these operations need at least several pointer dereferences. The old GC only needed 1. And because Windows does not support fork(), the garbage collector doesn't have to use a bit field for marking when compiled on Windows. This could be configured with a macro.
on 07.03.2008 19:48
Hongli Lai wrote: > But this will make each VALUE object at least 4 > bytes bigger (on x86). Is this increase in memory usage acceptable for > 8-10% performance gain? And 4 extra bytes per object will use more cache, so it's not clear you'd get all of that 8-10%.
on 08.03.2008 06:35
Yukihiro Matsumoto wrote: > Here's the patch against the latest trunk (r15675). It's still 8-10% > slower than the current implementation. Matz, what do you consider would be an acceptable tradeoff in speed for the benefits of copy-on-write? Is 5% slower acceptable or does it *have* to match the current performance? I tried my hand at optimization; assuming that 1) a given object is more likely to be in one of the larger heaps, 2) there are not enough heaps to justify the overhead of a bsearch, I changed the bsearch to a linear search in the function below and found a slight performance improvement. static inline struct heaps_slot * find_heap_slot_for_object(RVALUE *object) { struct heaps_slot *heap; register int i; /* Look in the cache first. */ if (last_heap != NULL && object >= last_heap->slot && object < last_heap->slot + last_heap->limit) { return last_heap; } /* find heap_slot for object using bsearch*/ for(i=0; i<heaps_used; i++) { heap = &heaps[i]; if (heap->slot <= object) { if (object < heap->slot + heap->limit) { /* Cache this result. According to empirical evidence, the chance is * high that the next lookup will be for the same heap slot. */ last_heap = heap; return heap; } } } return NULL; }
on 08.03.2008 08:53
I believe I managed to close the performance gap to only 6% slower than the current implementation (but I must admit I have trouble getting consistently reproducible benchmarks). The attached patch should be added on top of the one in Matz' email.
on 08.03.2008 11:02
Third email in a row... I hope that's not considered spamming ~_~
Since the patch was based on r15675, I tried to merge that plus my
changes into the trunk (r15730) but ran into some conflicts. While
resolving them, I realized that r15683 and 15684 (the apparent source of
the conflict) were sorting the heap list by order of memory location.
But that's not needed if a linear search is more efficient than a
bsearch, so I removed the sorting. New heaps are now added at the end of
the list and the list is searched from last to first (therefore largest
to smallest).
So now, for the following test case which generates 9 heaps
GC.disable
x=(1..1000000).map{"x"*10}
GC.enable
GC.start
The COW-friendly version is only 1-2% slower than r15730.
I hope this makes it into the trunk but please do review the code.
on 08.03.2008 16:40
Hi,
In message "Re: Copy-on-write friendly garbage collector"
on Sat, 8 Mar 2008 19:01:51 +0900, Daniel DeLorme <dan-ml@dan42.com>
writes:
|But that's not needed if a linear search is more efficient than a
|bsearch, so I removed the sorting.
Binary search is a preparation for making heap segment size smaller
(say 512 objects per a segment) to increase chances to free unused
heaps to the OS.
matz.
on 12.03.2008 18:24
Daniel DeLorme wrote: > I believe I managed to close the performance gap to only 6% slower than > the current implementation (but I must admit I have trouble getting > consistently reproducible benchmarks). The attached patch should be > added on top of the one in Matz' email. > > -- > Daniel > Great work Daniel. I don't measure the same amount of speedup that you claim in your email, but there is definitely a small speedup. I've added some further further optimizations. find_position_in_bitfield() now uses bit operators instead of division and modulo operators. This should speed things up a little more. The attached patch is created against Ruby 1.8, but it shows what I've exactly changed.
on 12.03.2008 18:39
Hi,
In message "Re: Copy-on-write friendly garbage collector"
on Thu, 13 Mar 2008 02:23:54 +0900, Hongli Lai <hongli@plan99.net>
writes:
|Great work Daniel. I don't measure the same amount of speedup that you
|claim in your email, but there is definitely a small speedup.
|
|I've added some further further optimizations.
|find_position_in_bitfield() now uses bit operators instead of division
|and modulo operators. This should speed things up a little more.
|
|The attached patch is created against Ruby 1.8, but it shows what I've
|exactly changed.
I will merge the patch into my personal repository. Thank you.
Stay tuned until I will merge it to the trunk.
matz.
on 13.03.2008 01:48
Hongli Lai wrote: > Great work Daniel. I don't measure the same amount of speedup that you > claim in your email, but there is definitely a small speedup. > > I've added some further further optimizations. > find_position_in_bitfield() now uses bit operators instead of division > and modulo operators. This should speed things up a little more. > > The attached patch is created against Ruby 1.8, but it shows what I've > exactly changed. Thank you. I also looked at find_position_in_bitfield() but I assumed the compiler would be able to make those optimizations, so ended up not touching it. I notice that this patch is relative to one of the patches that I submitted. But in those patches I changed the bsearch to a linear search, which may not be desirable as Matz pointed out. But I'm not sure it's a good idea to sacrifice performance now for the sake of a hypothetical change in heap size in the future (YAGNI). Maybe a #define could take care of that? e.g. #ifdef FEW_HEAPS order heaps by size, linear search #else order heaps by memory location, bsearch #endif
on 13.03.2008 12:06
Daniel DeLorme wrote: > #ifdef FEW_HEAPS > order heaps by size, linear search > #else > order heaps by memory location, bsearch > #endif My patch is against 1.8, which doesn't have sorted heaps.
on 13.03.2008 12:19
Daniel DeLorme wrote: > Matz, what do you consider would be an acceptable tradeoff in speed for > the benefits of copy-on-write? Is 5% slower acceptable or does it *have* > to match the current performance? Yes, I'd like to know this as well. What would be an acceptable performance tradeoff? And will this patch ever make a chance to be accepted into Ruby 1.8? g.c is very large, and it's generally hard to navigate in. I'd like to refactor a few things so that it's easier to navigate and to change, but this comes at the risk that my patch will conflict with future Ruby 1.8 releases, or with other peoples' patches. If this gets merged into Ruby 1.8 then that problem can be solved, so I'm really interested in the conditions for acceptance. Regards, Hongli Lai
on 14.03.2008 04:22
Hi. More optimization updates. I'm currently working on pluggable mark table implementations. This makes it possible to choose, during runtime, whether to use the copy-on-write friendly mark table, or the classic "mark table" (i.e. setting the FL_MARK flag on objects). Using the classic "mark table" with this approach results in only 1% performance loss (which is caused by function call overhead and can't be avoided). I'll post a patch soon. Regards, Hongli Lai
on 14.03.2008 05:45
Hongli Lai wrote: > More optimization updates. I'm currently working on pluggable mark table > implementations. This makes it possible to choose, during runtime, > whether to use the copy-on-write friendly mark table, or the classic > "mark table" (i.e. setting the FL_MARK flag on objects). Using the > classic "mark table" with this approach results in only 1% performance > loss (which is caused by function call overhead and can't be avoided). > I'll post a patch soon. So the GC method could be dynamically changed to cow-friendly just before doing a Process.fork... very nice!
on 14.03.2008 12:26
Daniel DeLorme wrote: > So the GC method could be dynamically changed to cow-friendly just > before doing a Process.fork... very nice! Actually, I'm thinking about requiring the script call "GC.cow_friendly = true". Most scripts do not fork, and when they do, they usually don't use it as a means to save memory.
on 14.03.2008 13:02
On Fri, Mar 14, 2008 at 12:25 PM, Hongli Lai <hongli@plan99.net> wrote: > Daniel DeLorme wrote: > > So the GC method could be dynamically changed to cow-friendly just > > before doing a Process.fork... very nice! > > Actually, I'm thinking about requiring the script call "GC.cow_friendly > = true". Most scripts do not fork, and when they do, they usually don't > use it as a means to save memory. > the name cow_friendly could be misinterpreted by some cow boys ^^. how about calling it #copy_on_write_friendly or even better #supports_copy_on_write ? cow, fork ... *chuckle* -- henon
on 14.03.2008 16:00
On Fri, Mar 14, 2008 at 5:25 AM, Hongli Lai <hongli@plan99.net> wrote: > Daniel DeLorme wrote: > > So the GC method could be dynamically changed to cow-friendly just > > before doing a Process.fork... very nice! > > Actually, I'm thinking about requiring the script call "GC.cow_friendly > = true". Most scripts do not fork, and when they do, they usually don't > use it as a means to save memory. I recommend against this. Most people are simply not going to know when they should invoke it and when they shouldn't. It's an implementation detail that should be hidden from the user IMHO. Besides, it really worth splitting it out like this over a 5% speed hit? Or am I missing some other implementation detail? Regards, Dan
on 14.03.2008 16:55
Daniel Berger wrote: > I recommend against this. Most people are simply not going to know > when they should invoke it and when they shouldn't. It's an > implementation detail that should be hidden from the user IMHO. > Besides, it really worth splitting it out like this over a 5% speed > hit? Or am I missing some other implementation detail? Apparently people care about even a 5% speed hit, or we wouldn't still be discussing this. However, it is true that most scripts will not benefit from a copy-on-write friendly garbage collector. Only very few scripts rely on it for saving memory. As far as I know, the program that I'm developing is the first Ruby program that uses copy-on-write friendliness for important stuff. There is both a legit reason for wanting the garbage collector to be fast in the normal case, and for wanting it to be not-so-fast but copy-on-write friendly in some edge cases. That said, exposing implementation details is generally not a nice thing to do. That's why I've written extensive documentation for the "copy_on_write_friendly?" and "copy_on_write_friendly=" methods, which explains that the behavior might be different across different Ruby implementations and/or different platforms, and that they should only use these methods if they know what they're doing. Please see the attached patch. This patch is made against Ruby 1.8, relative to my last patch.
on 14.03.2008 18:35
Hongli Lai wrote: > Daniel Berger wrote: >> I recommend against this. Most people are simply not going to know >> when they should invoke it and when they shouldn't. It's an >> implementation detail that should be hidden from the user IMHO. >> Besides, it really worth splitting it out like this over a 5% speed >> hit? Or am I missing some other implementation detail? > > Apparently people care about even a 5% speed hit, or we wouldn't still > be discussing this. I can certainly see a need for this as an _option_, even if it does cost 5%. IIUC, it would make a difference if your main process (say, an in-memory database, or a large simulation) constructed a big set of objects. Then you want to fork to generate some graphs from the simulation or perform a search of the database. Currently, the best solution is GC.disable (in the fork), but that's not good if the fork needs to live a while (and the parent will eventually GC anyway). The 5% cost is much less than the cost of thrashing because all these forks can't share memory. If the simulation or database process doesn't need to GC much, the 5% is negligible. OTOH, most programs probably don't benefit, so shouldn't pay the 5%. Thanks for your work on cowruby, Hongli.
on 15.03.2008 17:16
Hongli Lai wrote: > Great work Daniel. I don't measure the same amount of speedup that you > claim in your email, but there is definitely a small speedup. > > I've added some further further optimizations. > find_position_in_bitfield() now uses bit operators instead of division > and modulo operators. This should speed things up a little more. > > The attached patch is created against Ruby 1.8, but it shows what I've > exactly changed. I've been such a fool. The patch I sent uses platform-specific macros to figure out the base 2 log of sizeof(int)*8. Here is a new patch, which uses straight C code to figure out that value. There is no performance overhead because the code is easily optimizable by the compiler, and is a lot more portable.
on 20.03.2008 12:00
Hi. Sorry for mentioning this again, but it has been almost a week since my last patch. I'd really appreciate it if a core committer can comment on it. Thanks in advance.
on 20.03.2008 12:56
Hi,
In message "Re: Copy-on-write friendly garbage collector"
on Thu, 20 Mar 2008 19:59:16 +0900, Hongli Lai <hongli@plan99.net>
writes:
|Sorry for mentioning this again, but it has been almost a week since my
|last patch. I'd really appreciate it if a core committer can comment on it.
It works on my machine. But since it slows down Ruby a bit, we
haven't decided yet to merge it officially. Do you (or anyone else)
have something we can use for GC benchmark?
matz.
on 20.03.2008 13:54
Yukihiro Matsumoto wrote: > have something we can use for GC benchmark? > > matz. Hi. Yes, I use the following tool for benchmarking: http://pastebin.com/m26839d72 The results are as follows: - Standard Ruby: 13.600 seconds - My Ruby: 16.600 seconds So the copy-on-write friendly GC seems to be about 20% slower. (At least, on this machine. I've noticed that the percentage can vary wildly depending on the specific machine. On some machines it only seems to be about 5% slower.) However, I've recently submitted a patch which implements pluggable marking tables. By default, it uses the non-copy-on-write-friendly implementation (i.e. the same as the old one, which uses mark flags on the objects directly). So if I remove the "GC.copy_on_write_friendly = true" line, then the result is as follows: - My Ruby: 13.394 seconds So by default, performance is on par with standard Ruby. Only if the programmer has explicitly set "GC.copy_on_write_friendly = true" will there be a speed hit. The standard Ruby version I use is 1.8.6 patchlevel 114. My modified Ruby is also based on that version. Both Ruby binaries are compiled with '-g' (i.e. no optimizations). Finally, I've benchmarked some Ruby on Rails applications. Only 10% of the total application time seems to be spent on the garbage collector. So if the garbage collector is 20% slower, then it only results in a 2% speed reduction in total. I'd like to know whether my patches are acceptable, now that it's just as fast as the normal Ruby by default. And if it's not acceptable, I'd like to know how many % speed hit would be acceptable. Thanks. - Hongli Lai
on 20.03.2008 15:24
Hi,
In message "Re: Copy-on-write friendly garbage collector"
on Thu, 20 Mar 2008 21:54:04 +0900, Hongli Lai <hongli@plan99.net>
writes:
|Yes, I use the following tool for benchmarking:
|http://pastebin.com/m26839d72
I feel this one is bit too simple. Since GC behavior heavily depends
on object allocation pattern of programs, I guess we need more
realistic cases to benchmark. Perhaps we should contact Lloyd for his
cases in <http://lloydforge.org/projects/ruby/>.
By the way, I'd like to know how much effective this copy-on-write
friendly patch is. Does anyone have a good (even artificial) case?
|I'd like to know whether my patches are acceptable, now that it's just
|as fast as the normal Ruby by default. And if it's not acceptable, I'd
|like to know how many % speed hit would be acceptable.
2% loss for almost all cases are good enough. But I cannot be sure
for 2% for this specific case.
matz.
on 20.03.2008 15:46
Yukihiro Matsumoto wrote: > By the way, I'd like to know how much effective this copy-on-write > friendly patch is. Does anyone have a good (even artificial) case? I use the following test: http://pastebin.com/f353f2ade It loads Ruby on Rails (2.0.2), then runs garbage collection in a child process. The results are as follows: == Standard Ruby: After forking, before gc: - Parent process: 64 KB private dirty memory - Child process 1: 60 KB private dirty memory - Child process 2: 128 KB private dirty memory After forking, after gc: - Parent process: 64 KB private dirty memory - Child process 1: 60 KB private dirty memory - Child process 2: 9.1 MB private dirty memory == My Ruby: After forking, before gc: - Parent process: 64 KB private dirty memory - Child process 1: 60 KB private dirty memory - Child process 2: 156 KB private dirty memory After forking, after gc: - Parent process: 64 KB private dirty memory - Child process 1: 60 KB private dirty memory - Child process 2: 1.1 MB private dirty memory Some parts of the memory are made dirty for as yet unknown reasons. It seems to have something to do with the way the system malloc() is implemented.
on 20.03.2008 16:29
Yukihiro Matsumoto wrote: > By the way, I'd like to know how much effective this copy-on-write > friendly patch is. Does anyone have a good (even artificial) case? Very artificial: [ruby-talk:186561]