Parsing JSON (#155)

On Feb 4, 2008, at 7:29 PM, James G. wrote:

Here’s my own recursive descent parser (based on the not-quite-
correct quiz tests):

This is a version I built some time ago when experimenting with peggy:

#!/usr/bin/env ruby -wKU

require "lib/parser"
require "lib/builder"

class JSONParser < Peggy::Builder
  KEYWORDS = {"true" => true, "false" => false, "null" => nil}
  ESCAPES  = Hash[*%W[b \b f \f n \n r \r t \t]]

  def self.parse(json_string)
    parser = self.new
    parser.source_text = json_string
    parser.parse?(:value) or raise "Failed to parse: #{json_string.inspect}"
    parser.to_ruby
  end

  def initialize
    super

    self.ignore_productions = [:space]
    space { lit /\s+/ }

    value {
      seq {
        opt { space }
        one {
          object
          array
          string
          number
          keyword
        }
        opt { space }
      }
    }

    object {
      seq {
        lit /\{\s*/
        one {
          seq {
            opt { many { seq { string; lit /\s*:/; value; lit /,\s*/ } } }
            seq { string; lit /\s*:/; value }
            lit "}"
          }
          lit "}"
        }
      }
    }

    array {
      seq {
        lit "["
        one {
          seq {
            opt { many { seq { value; lit "," } } }; value; lit "]"
          }
          lit "]"
        }
      }
    }

    string {
      seq {
        lit '"'
        one {
          lit '"'
          seq {
            many {
              one {
                seq { string_content; opt { escape         } }
                seq { escape;         opt { string_content } }
              }
            }
            lit '"'
          }
        }
      }
    }
    string_content { lit(/[^\\"]+/) }
    escape {
      one {
        escape_literal
        escape_sequence
        escape_unicode
      }
    }

    escape_literal  { lit(%r{\\["\\/]})      }
    escape_sequence { lit(/\\[bfnrt]/)       }
    escape_unicode  { lit(/\\u[0-9a-f]{4}/i) }

    number  { lit(/-?(?:0|[1-9]\d*)(?:\.\d+(?:[eE][+-]?\d+)?)?\b/) }
    keyword { lit(/\b(?:true|false|null)\b/)                       }
  end

  def to_ruby(from = parse_results.keys.min)
    kind = parse_results[from][:found_order].first
    to   = parse_results[from][kind]
    send("to_ruby_#{kind}", from, to)
  end

  private

  def to_ruby_object(from, to)
    p parse_results
    object   = Hash.new
    skip_to  = nil
    last_key = nil
    parse_results.keys.select { |k| k > from and k < to }.sort.each do |key|
      content = parse_results[key]
      next if skip_to and key < skip_to
      next unless content[:found_order] and
                  ( ( content[:found_order].size == 2 and
                      content[:found_order][1] == :value ) or
                    content[:found_order] == [:string] )
      if content[:found_order] == [:string]
        last_key = to_ruby_string(key, content[:string])
      else
        case content[:found_order].first
        when :object
          object[last_key] = to_ruby_object(key, content[:object])
          skip_to = content[:object]
        when :array
          object[last_key] = to_ruby_array(key, content[:array])
          skip_to = content[:array]
        else
          object[last_key] = to_ruby(key)
        end
      end
    end
    object
  end

  def to_ruby_array(from, to)
    array   = Array.new
    skip_to = nil
    parse_results.keys.select { |k| k > from and k < to }.sort.each do |key|
      content = parse_results[key]
      next if skip_to and key < skip_to
      next unless content[:found_order] and
                  content[:found_order].size == 2 and
                  content[:found_order][1] == :value
      case content[:found_order].first
      when :object
        array << to_ruby_object(key, content[:object])
        skip_to = content[:object]
      when :array
        array << to_ruby_array(key, content[:array])
        skip_to = content[:array]
      else
        array << to_ruby(key)
      end
    end
    array
  end

  def to_ruby_string(from, to)
    string = String.new
    parse_results.keys.select { |k| k > from and k < to }.sort.each do |key|
      content = parse_results[key]
      next unless content[:found_order]
      case content[:found_order].first
      when :string_content
        string << source_text[key...content[:string_content]]
      when :escape_literal
        string << source_text[content[:escape_literal] - 1, 1]
      when :escape_sequence
        string << ESCAPES[source_text[content[:escape_sequence] - 1, 1]]
      when :escape_unicode
        string << [Integer("0x#{source_text[key + 2, 4]}")].pack("U")
      end
    end
    string
  end

  def to_ruby_number(from, to)
    num = source_text[from...to]
    num.include?(".") ? Float(num) : Integer(num)
  end

  def to_ruby_keyword(from, to)
    KEYWORDS[source_text[from...to]]
  end
end

__END__

I guess you can see why that library didn’t win me over. :)

James Edward G. II

Here’s my own recursive descent parser (based on the not-quite-correct
quiz tests):

#!/usr/bin/env ruby -wKU

require "strscan"

# http://json.org/

class JSONParser
  AST = Struct.new(:value)

  def parse(input)
    @input = StringScanner.new(input.strip)
    parse_value.value
  end

  private

  def parse_value
    parse_object  or
    parse_array   or
    parse_string  or
    parse_number  or
    parse_keyword or
    error("Illegal JSON value")
  end

  def parse_object
    if @input.scan(/{\s*/)
      object = Hash.new
      while key = parse_string
        @input.scan(/\s*:\s*/) or error("Expecting object separator")
        object[key.value] = parse_value.value
        @input.scan(/\s*,\s*/) or break
      end
      @input.scan(/\s*}\s*/) or error("Unclosed object")
      AST.new(object)
    else
      false
    end
  end

  def parse_array
    if @input.scan(/\[\s*/)
      array = Array.new
      while contents = parse_value rescue nil
        array << contents.value
        @input.scan(/\s*,\s*/) or break
      end
      @input.scan(/\s*]\s*/) or error("Unclosed array")
      AST.new(array)
    else
      false
    end
  end

  def parse_string
    if @input.scan(/"/)
      string = String.new
      while contents = parse_string_content || parse_string_escape
        string << contents.value
      end
      @input.scan(/"\s*/) or error("Unclosed string")
      AST.new(string)
    else
      false
    end
  end

  def parse_string_content
    @input.scan(/[^\\"]+/) and AST.new(@input.matched)
  end

  def parse_string_escape
    if @input.scan(%r{\\["\\/]})
      AST.new(@input.matched[-1])
    elsif @input.scan(/\\[bfnrt]/)
      AST.new(eval(%Q{"#{@input.matched}"}))
    elsif @input.scan(/\\u[0-9a-fA-F]{4}/)
      AST.new([Integer("0x#{@input.matched[2..-1]}")].pack("U"))
    else
      false
    end
  end

  def parse_number
    @input.scan(/-?(?:0|[1-9]\d*)(?:\.\d+(?:[eE][+-]?\d+)?)?\b/) and
      AST.new(eval(@input.matched))
  end

  def parse_keyword
    @input.scan(/\b(?:true|false|null)\b/) and
      AST.new(eval(@input.matched.sub("null", "nil")))
  end

  def error(message)
    if @input.eos?
      raise "Unexpected end of input."
    else
      raise "#{message}: #{@input.peek(@input.string.length)}"
    end
  end
end

__END__

James Edward G. II

On Feb 4, 2008, at 7:29 PM, James G. wrote:

Here’s my own recursive descent parser (based on the not-quite-
correct quiz tests):

Finally, here’s the JSON parser right out of Ghost Wheel’s tests:

#!/usr/bin/env ruby -wKU

JSONParser = GhostWheel.build_parser( %q{
  keyword = 'true' { true } | 'false' { false } | 'null' { nil }

  number = /-?(?:0|[1-9]\d*)(?:\.\d+(?:[eE][+-]?\d+)?)?/
           { ast.include?(".") ? Float(ast) : Integer(ast) }

  string_content = /\\["\\/]/ { ast[-1, 1] }
                 | /\\[bfnrt]/
                   { Hash[*%W[b \n f \f n \n r \r t \t]][ast[-1, 1]] }
                 | /\\u[0-9a-fA-F]{4}/
                   { [Integer("0x#{ast[2..-1]}")].pack("U") }
                 | /[^\\"]+/
  string = '"' string_content* '"' { ast.flatten[1..-2].join }

  array_content = value_with_array_sep+ value
                  { ast[0].inject([]) { |a, v| a.push(v) } + ast[-1..-1] }
                | value? { ast.is_a?(EmptyParseResult) ? [] : [ast] }
  array = /\[\s*/ array_content /\s*\]/ { ast[1] }

  object_pair = string /\s*:\s*/ value { {ast[0] => ast[-1]} }
  object_pair_and_sep = object_pair /\s*;\s*/ { ast[0] }
  object_content = object_pair_and_sep+ object_pair { ast.flatten }
                 | object_pair? { ast.is_a?(EmptyParseResult) ? [] : [ast] }
  object = /\{\s*/ object_content /\}\s*/
           { ast[1].inject({}) { |h, p| h.merge(p) } }

  value_space = /\s*/
  value_content = keyword | number | string | array | object
  value = value_space value_content value_space { ast[1] }
  value_with_array_sep = value /\s*,\s*/ { ast[0] }

  json := value EOF { ast[0] }
} )

__END__

James Edward G. II

It seems though, if I may say so, that *cough* certain solutions don’t
pass all test cases:

assert_raise(RuntimeError) { @parser.parse(%{[], p "Foo"}) }
assert_raise(RuntimeError) { @parser.parse(%{""; p 123; "Foo"}) }
assert_raise(RuntimeError) { @parser.parse(%{"" p 123; ""}) }

From the original set:
assert_raises(RuntimeError) { @parser.parse("-5.-4") }
assert_raises(RuntimeError) { @parser.parse(%Q{{ "a" : 2, }}) }
assert_raise(RuntimeError) { @parser.parse(%q{true false}) }

My eval() based solution seems to fail on A Stedile’s test case unless
the stricter RFC rules, which allow only an array or object at the top
level, are applied:

assert_raise(RuntimeError) { @parser.parse(%Q{"a" "b"}) }

Thomas.

On Feb 4, 2008 7:29 PM, James G. [email protected] wrote:

Here’s my own recursive descent parser (based on the not-quite-correct
quiz tests):

Hi James,

I benchmarked your code. I also was curious how well a good hand-crafted
Regexp recursive descent parser (using the StringScanner) would compare
to my hand-crafted LL(1) (1 character lookahead) parser. So, I took your
code and applied some of the same optimizations that I used:

  • minimize method calls (inline at least where a method is called
    once)
  • minimize object creation (put results in the caller’s output buffer
    instead of returning an AST; see the sketch below)
  • minimize exceptions
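
To make the second point concrete, here is a minimal, hypothetical
before/after sketch (not taken from any actual submission) of the
output-buffer style; parse_number and AST are illustrative names:

# Before: every successful parse allocates a wrapper object.
def parse_number
  @input.scan(/-?\d+/) and AST.new(Integer(@input.matched))
end

# After: the caller passes a buffer; on success the value is pushed
# into it and the (truthy) buffer is returned, so no per-value
# wrapper object is created.
def parse_number(out)
  @input.scan(/-?\d+/) and out << Integer(@input.matched)
end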

Here are the benchmark results with this and the other parsers you posted
(couldn’t get the ghostwheel parser to work well):

   ch/s author/gem

      - Pawel R. (RE, couldn’t parse benchmark JSON)
      - ghostwheel (ghostwheel, couldn’t parse benchmark JSON)
   1226 James Edward G. II (peggy, fails one test)
   3214 Justin E. (RE lexer, ruby eval, fixed number parsing)
   4054 Eric M. (Grammar0, no lexer, no parser generation)
   4078 Eric I (Treetop, unicode broken)
   6534 oksteev (Treetop, mismatches in benchmark)
   8313 Clifford H. (Treetop, had to remove handling of "/")
  17320 Alexander Stedile (RE, recursive descent)
  54586 Eric M. (Grammar, no lexer, v0.5)
 137989 Paolo B. (RE, recursive descent)
 166041 Thomas Link (RE lexer, ruby eval, ruby 1.9 results)
 186042 James Edward G. II (RE, recursive descent)
 220289 json (pure ruby version)
 223486 Eric M. (Grammar, no lexer, unreleased)
 224823 fjson (uses C extensions)
 287292 James Edward G. II (RE, recursive descent, Eric optimized)
 333368 Thomas Link & Paolo B. (RE + eval, unicode broken)
 388670 Eric M. (recursive descent)
 553081 Eric M. (Grammar, no lexer, unreleased, w/ ruby2cext)
1522250 json (w/ C extensions)

I’d like to see a RACC parser for JSON.

Here is the optimized version of your recursive-descent
Regexp/StringScanner parser:

require "strscan"

class JSONParser

  def parse(input)
    @input = StringScanner.new(input.strip)
    parse_value(out=[]) or error("Illegal JSON value")
    out[0]
  end

  private

  def parse_value(out)
    if @input.scan(/{\s*/)
      object = {}
      kv = []
      while @input.scan(/"/)
        parse_string(kv)
        @input.scan(/\s*:\s*/) or error("Expecting object separator")
        parse_value(kv) or error("Illegal JSON value")
        object[kv[0]] = kv[1]
        @input.scan(/\s*,\s*/) or break
        kv.clear
      end
      @input.scan(/\s*}\s*/) or error("Unclosed object")
      out << object
    elsif @input.scan(/\[\s*/)
      array = []
      while parse_value(array)
        @input.scan(/\s*,\s*/) or break
      end
      @input.scan(/\s*]\s*/) or error("Unclosed array")
      out << array
    elsif @input.scan(/"/)
      parse_string(out)
    elsif @input.scan(/-?(?:0|[1-9]\d*)(?:\.\d+(?:[eE][+-]?\d+)?)?\b/)
      out << eval(@input.matched)
    elsif @input.scan(/\b(?:true|false|null)\b/)
      out << eval(@input.matched.sub("null", "nil"))
    end
  end

  def parse_string(out)
    string = ""
    while true
      if @input.scan(/[^\\"]+/)
        string << @input.matched
      elsif @input.scan(%r{\\["\\/]})
        string << @input.matched[-1]
      elsif @input.scan(/\\[bfnrt]/)
        string << eval(%Q{"#{@input.matched}"})
      elsif @input.scan(/\\u[0-9a-fA-F]{4}/)
        string << [Integer("0x#{@input.matched[2..-1]}")].pack("U")
      else
        break
      end
    end
    @input.scan(/"\s*/) or error("Unclosed string")
    out << string
  end

  def error(message)
    if @input.eos?
      raise "Unexpected end of input."
    else
      raise "#{message}: #{@input.peek(@input.string.length)}"
    end
  end
end

I think it’s just that ?<...> is slower than $n. Could you try
comparing my solution (the one based on yours) on both 1.8 and 1.9?

That’s true of course (i_18 is the $n version):

$ ruby benchmark_eric_mahurin.rb i_18
"quiz155i_18"
205653 chars/second

$ ruby19 benchmark_eric_mahurin.rb i_18
"quiz155i_18"
271050 chars/second

$ ruby19 benchmark_eric_mahurin.rb i
"quiz155i"
207100 chars/second

But just having the pleasure of being able to use more than 9 groups,
and of not having to care about group order, IMHO justifies the use of
named groups. More than once I wasted time searching for the source of
some problem, only to find out that the group order had changed or that
the group numbers differed between, well, only mostly similar
programmatically generated regexps.
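
For readers unfamiliar with the feature, a minimal illustration of the
difference (Ruby 1.9 syntax; the regexp here is made up for the example):

# Named groups: accessed by name, so reordering the pattern is safe.
m = /(?<sign>-?)(?<int>\d+)(?:\.(?<frac>\d+))?/.match("-3.14")
m[:sign]   # => "-"
m[:frac]   # => "14"

# Numbered groups: the same match, but inserting or reordering a
# group silently renumbers $1, $2, ... everywhere they are used.
m = /(-?)(\d+)(?:\.(\d+))?/.match("-3.14")
m[1]       # => "-"
m[3]       # => "14"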

But the reason why I’m asking is that, e.g., your solution finishes the
benchmark at 107323 chars/second when run with ruby18. With ruby19,
though, it’s an astonishing 1703 chars/second. I’m not sure what is
causing this slowdown, and I’m wondering if this isn’t an artefact of
the benchmark. The full run looks like this:

             user     system      total        real
 8  24142   0.541000   0.000000   0.541000 (  0.601000)
 9  25988   0.621000   0.000000   0.621000 (  0.641000)
10 588993 246.555000  93.324000 339.879000 (345.657000)
1703 chars/second

So the slowdown only happens after Step #9.

My version that uses Regexp#match() (but still eval based) makes:

            user     system      total        real
7   3518   0.090000   0.000000   0.090000 (  0.120000)
8  24142   2.894000   0.010000   2.904000 (  2.934000)
8228 chars/second

But the time limit is exceeded at step #8 already. This makes me wonder
what is causing this “interesting” behaviour of your solution at
step #10 when run with ruby19.

BTW, my own benchmarks (using random object generation of different
lengths, based on Ara’s proposal but slightly enhanced) show the
following picture:

First the net timing spent on random object generation alone:
user system total real
10 2.003000 0.000000 2.003000 ( 2.033000)
20 6.799000 0.000000 6.799000 ( 6.940000)
30 17.636000 0.000000 17.636000 ( 17.786000)

10: n=10000 avg.size=47
20: n=10000 avg.size=159
30: n=10000 avg.size=406

“quiz155i” (my original submission)
user system total real
10 3.495000 0.000000 3.495000 ( 3.535000)
20 10.024000 0.000000 10.024000 ( 10.095000)
30 25.988000 0.000000 25.988000 ( 26.027000)

10: n=10000 avg.size=46
20: n=10000 avg.size=148
30: n=10000 avg.size=391

“quiz155i_18” (modified for 1.8 compatibility)
user system total real
10 3.345000 0.000000 3.345000 ( 3.385000)
20 9.964000 0.000000 9.964000 ( 10.014000)
30 24.455000 0.000000 24.455000 ( 24.506000)

10: n=10000 avg.size=47
20: n=10000 avg.size=157
30: n=10000 avg.size=396

“quiz155b” (json based, i.e. ragel-based C extension)
user system total real
10 2.263000 0.010000 2.273000 ( 2.303000)
20 7.351000 0.000000 7.351000 ( 7.391000)
30 18.636000 0.000000 18.636000 ( 18.687000)

10: n=10000 avg.size=46
20: n=10000 avg.size=156
30: n=10000 avg.size=399

“solution_paolo_bonzini.rb”
user system total real
10 4.226000 0.000000 4.226000 ( 4.256000)
20 13.349000 0.000000 13.349000 ( 13.450000)
30 34.470000 0.070000 34.540000 ( 36.001000)

10: n=10000 avg.size=47
20: n=10000 avg.size=154
30: n=10000 avg.size=388

BTW I also measured steve’s version of a treetop parser but with 1000
iterations only:

“solution_steve.rb”
user system total real
10 5.037000 0.000000 5.037000 ( 5.068000)
20 14.651000 0.010000 14.661000 ( 14.961000)
30 40.298000 0.020000 40.318000 ( 41.330000)

10: n=1000 avg.size=48
20: n=1000 avg.size=148
30: n=1000 avg.size=407

What would the size of an average json snippet an ajax app has to deal
with be? I’m not in the webapp development business, but from my
understanding this would be rather small, wouldn’t it?

Regards,
Thomas.

The differences between ruby18 and ruby19 are quite interesting though.
Can somebody with deeper knowledge of ruby19 affirm that the
“copy on write” strategy is still in use?

I think it’s just that ?<...> is slower than $n. Could you try
comparing my solution (the one based on yours) on both 1.8 and 1.9?

Thanks!

Paolo

But the reason why I’m asking is that, e.g., your solution finishes the
benchmark at 107323 chars/second when run with ruby18.
With ruby19, though, it’s an astonishing 1703 chars/second.

Uh-oh. Someone should write to ruby-core?

Paolo

On Feb 5, 2008 3:09 AM, tho_mica_l [email protected] wrote:

My baseline assumption was that runtime was relatively linear with
respect to the data size. This assumption is broken in the above case
(I think I noticed this too at some point). Going from a depth of 9 to
10 increased the length by ~20X, but the runtime went up by ~400X.
There is obviously an O(n*n) component in there (20*20=400). Sounds
like there is a ruby 1.9 problem.

In the benchmark, you could move the print of the performance to inside
the loop, right before the break. If there is a consistent downward
trend in chars/second, you may have an O(n*n) solution, and
chars/second makes no sense (for arbitrary data size). Otherwise, maybe
we should be looking at the best performance between the longest two
data sizes, so that there is no penalty for a solution getting to a
larger but possibly more difficult dataset. Running this test multiple
times (maybe with 4.times{} around the whole benchmark, including
creating the generator) would also be good.
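
A sketch of that suggestion; the generator, parser, and time limit are
placeholders for whatever the benchmark actually uses:

require "benchmark"

# Report throughput at every depth step, so a consistent downward
# trend (suggesting super-linear runtime) becomes visible instead of
# being hidden in one aggregate number at the end.
(1..10).each do |depth|
  json = generate_random_json(depth)   # placeholder generator
  time = Benchmark.realtime { parser.parse(json) }
  puts "depth=%2d size=%8d %10.0f chars/second" %
       [depth, json.size, json.size / time]
  break if time > time_limit           # placeholder limit
end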

What would the size of an average json snippet an ajax app has to deal
with be? I’m not in the webapp development business but from my
understanding this would be rather small, wouldn’t it?

Maybe, but then making a fast parser wouldn’t be any fun :)


Maybe, but then making a fast parser wouldn’t be any fun :)

Since I ran my first preliminary benchmark I have been asking myself
how big the advantage of a C-based parser would actually be. So I
elaborated a little bit on this question. In order to also answer the
question how your solutions “scale”, I cleaned up my benchmarks a
little bit. The following includes all submissions that I could make
run with ruby19 – for whatever reason. I don’t have json for ruby18
installed, which is why I didn’t run this test with ruby18.

The objects are generated before the test. The tests are run in a
tight loop, the influence of the benchmarking code should thus be
rather marginal.

Objects were generated whose JSON representation adds up to about 2MB,
in 4 different chunk sizes ranging from about 45 to 900 bytes. The
object set is identical for all solutions, so the numbers are quite
comparable. Since the figures differ slightly from Eric Mahurin’s
benchmark, it’s possible that I did something wrong. But in that case I
did it equally wrong for all solutions. The code is down below.

Regards,
Thomas.

Input chunks:
10: n=43475 avg.size=46.01 tot.size=2000236
20: n=12856 avg.size=155.61 tot.size=2000543
30: n=4897 avg.size=408.51 tot.size=2000483
40: n=2236 avg.size=894.47 tot.size=2000045

Ruby19 json
user system total real
10 2.274000 0.000000 2.274000 ( 2.294000)
20 1.402000 0.000000 1.402000 ( 1.432000)
30 1.041000 0.000000 1.041000 ( 1.061000)
40 1.282000 0.000000 1.282000 ( 1.302000)

10 871942 chars/sec (2000236/2.29)
20 1397027 chars/sec (2000543/1.43)
30 1885469 chars/sec (2000483/1.06)
40 1536132 chars/sec (2000045/1.30)

“solution_tml.rb”
user system total real
10 8.452000 0.010000 8.462000 ( 8.633000)
20 6.570000 0.000000 6.570000 ( 6.599000)
30 6.068000 0.000000 6.068000 ( 6.119000)
40 5.659000 0.000000 5.659000 ( 5.698000)

10 231696 chars/sec (2000236/8.63)
20 303158 chars/sec (2000543/6.60)
30 326929 chars/sec (2000483/6.12)
40 351008 chars/sec (2000045/5.70)

“solution_tml_pb.rb” (modified by P Bonzini)
user system total real
10 8.151000 0.000000 8.151000 ( 8.192000)
20 5.849000 0.000000 5.849000 ( 5.879000)
30 5.307000 0.000000 5.307000 ( 5.337000)
40 5.238000 0.000000 5.238000 ( 5.268000)

10 244169 chars/sec (2000236/8.19)
20 340286 chars/sec (2000543/5.88)
30 374832 chars/sec (2000483/5.34)
40 379659 chars/sec (2000045/5.27)

“solution_eric_i.rb”
user system total real
10 158.318000 0.040000 158.358000 (158.798000)
20 162.133000 0.030000 162.163000 (162.845000)
30 170.305000 0.030000 170.335000 (170.525000)
40 193.187000 0.070000 193.257000 (193.458000)

10 12596 chars/sec (2000236/158.80)
20 12284 chars/sec (2000543/162.85)
30 11731 chars/sec (2000483/170.53)
40 10338 chars/sec (2000045/193.46)

“solution_eric_mahurin3.rb”
user system total real
10 7.631000 0.000000 7.631000 ( 7.641000)
20 6.319000 0.000000 6.319000 ( 6.329000)
30 6.179000 0.000000 6.179000 ( 6.179000)
40 5.769000 0.000000 5.769000 ( 5.778000)

10 261776 chars/sec (2000236/7.64)
20 316091 chars/sec (2000543/6.33)
30 323755 chars/sec (2000483/6.18)
40 346148 chars/sec (2000045/5.78)

“solution_james_gray.rb”
user system total real
10 13.820000 0.000000 13.820000 ( 13.890000)
20 12.117000 0.000000 12.117000 ( 12.138000)
30 12.909000 0.000000 12.909000 ( 12.918000)
40 15.051000 0.010000 15.061000 ( 15.082000)

10 144005 chars/sec (2000236/13.89)
20 164816 chars/sec (2000543/12.14)
30 154860 chars/sec (2000483/12.92)
40 132611 chars/sec (2000045/15.08)

“solution_justin_ethier.rb”
user system total real
10 17.025000 0.000000 17.025000 ( 17.025000)
20 17.915000 0.040000 17.955000 ( 17.985000)
30 28.001000 0.021000 28.022000 ( 28.041000)
40 51.253000 0.070000 51.323000 ( 51.394000)

10 117488 chars/sec (2000236/17.03)
20 111233 chars/sec (2000543/17.98)
30 71341 chars/sec (2000483/28.04)
40 38915 chars/sec (2000045/51.39)

“solution_paolo_bonzini.rb”
user system total real
10 11.036000 0.000000 11.036000 ( 11.036000)
20 17.045000 0.030000 17.075000 ( 17.104000)
30 32.717000 0.020000 32.737000 ( 32.857000)
40 69.119000 0.070000 69.189000 ( 69.310000)

10 181246 chars/sec (2000236/11.04)
20 116963 chars/sec (2000543/17.10)
30 60884 chars/sec (2000483/32.86)
40 28856 chars/sec (2000045/69.31)

“solution_steve.rb”
user system total real
10 210.152000 0.040000 210.192000 (210.573000)
20 215.260000 0.060000 215.320000 (215.590000)
30 223.201000 0.110000 223.311000 (228.368000)
40 241.257000 0.260000 241.517000 (248.868000)

10 9499 chars/sec (2000236/210.57)
20 9279 chars/sec (2000543/215.59)
30 8759 chars/sec (2000483/228.37)
40 8036 chars/sec (2000045/248.87)

Benchmark code:

require 'benchmark'
require 'json/pure'
require 'json'

N = 2000
S = [10, 20, 30, 40]

# This is a slightly enhanced version of Ara's object generator.
# Objects are generated via RandomObject.generate(nil, DEPTH)
# -- the first argument defines which object types are eligible
# and can be ignored in this context.
require 'tml/random-object'

puts 'Preparing objects ...'
sizes = Hash.new
objects = S.inject({}) do |h, s|
  size = 0
  a = h[s] = []
  n = N * 1000
  while size < n
    o = RandomObject.generate(nil, s)
    j = o.to_json
    a << [o, j]
    size += j.size
  end
  sizes[s] = size.to_f
  h
end

throughput = Hash.new {|h, k| h[k] = Hash.new(0)}

ARGV.each do |arg|
  p arg
  require arg

  parser = JSONParser.new

  throughput = []
  Benchmark.bm do |b|
    S.each do |s|
      t = b.report(s.to_s) do
        objects[s].each do |o, j|
          if o != parser.parse(j)
            raise RuntimeError
          end
        end
      end
      throughput << "%s %d chars/sec (%d/%0.2f)" %
                    [s, sizes[s] / t.real, sizes[s], t.real]
    end
  end
  puts
  puts throughput.join("\n")
  puts
  puts
end

objects.each do |s, z|
  puts "%s: n=%d avg.size=%0.2f tot.size=%d" %
       [s, z.size, sizes[s].to_f / z.size, sizes[s]]
end
puts

On Feb 5, 2008 11:44 AM, tho_mica_l [email protected] wrote:

Maybe, but then making a fast parser wouldn’t be any fun :)

Since the figures differ slightly from Eric
Mahurin’s benchmark it’s possible that I did something wrong. But in
this case I did it equally wrong for all solutions. The code is down
below.

We should probably assume all of these benchmarks have ±50% error. The
performance is highly data-set and phase-of-the-moon dependent. You can
still judge whether something has non-linear performance (i.e.
quadratic runtime), or whether one solution is 5-10X faster than
another. But if two solutions are within 2X of each other in a
benchmark, I don’t think there is a clear winner.

It does look like some solutions have quadratic runtime on ruby 1.9. I
didn’t observe this on 1.8.6.

I added all of the unit tests I found in this thread, plus this one:

def test_int_parsing
  assert_same(0, @parser.parse("0"))
  assert_same(42, @parser.parse("42"))
  assert_same(-13, @parser.parse("-13"))
end

and removed these that don’t seem correct:

#assert_raise(RuntimeError) { @parser.parse(%{"\u0022; p 123; \u0022Busted"}) }
#assert_equal("\u0022; p 123; \u0022Busted",
#             @parser.parse(%{"\u0022; p 123; \u0022Busted"}))

Here is a tally of failures (F) and errors (E) using this expanded unit
test suite:

   ch/s F E author/gem

      - 5 0 Pawel R. (RE, recursive descent)
      - 6 2 ghostwheel (ghostwheel)
   1226 3 2 James Edward G. II (peggy)
   3214 5 1 Justin E. (RE lexer, ruby eval, fixed numbers)
   4054 0 0 Eric M. (Grammar0, no lexer, no parser generation)
   4078 2 0 Eric I (Treetop, unicode broken)
   6534 2 0 Steve (Treetop, mismatches in benchmark)
   8313 1 1 Clifford H. (Treetop, removed handling of "/")
  17320 0 0 Alexander Stedile (RE, recursive descent)
  54586 0 0 Eric M. (Grammar, no lexer, v0.5)
 137989 2 1 Paolo B. (RE, recursive descent)
 166041 2 1 Thomas Link (RE lexer, ruby eval, ruby 1.9 results)
 186042 5 0 James Edward G. II (RE, recursive descent)
 220289 1 7* json
 223486 0 0 Eric M. (Grammar, no lexer, unreleased)
 224823 6 0 fjson (uses C extensions)
 287292 5 0 James Edward G. II (RE, recursive, Eric optimized)
 333368 3 0 Thomas Link & Paolo B. (RE + eval, unicode broken)
 388670 0 0 Eric M. (recursive descent)
 553081 4 9 Eric M. (Grammar, no lexer, unreleased, ruby2cext)
1522250 0 7* json (w/ C extensions)

For the json gem, all of the failures happen because the tests are
invalid -
top-level json should only be an array or an object.

My Grammar with ruby2cext didn’t work well with unit testing because it
didn’t handle creating the parser multiple times. Need to fix that.

Has anyone been able to benchmark the ghostwheel json parser? I would
like
to see how well it does.

Here is the complete set of unit tests I used:

require "test/unit"

class TestJSONParser < Test::Unit::TestCase
  def setup
    @parser = JSONParser.new
  end

  def test_keyword_parsing
    assert_equal(true,  @parser.parse("true"))
    assert_equal(false, @parser.parse("false"))
    assert_equal(nil,   @parser.parse("null"))
  end

  def test_number_parsing
    assert_equal(42,     @parser.parse("42"))
    assert_equal(-13,    @parser.parse("-13"))
    assert_equal(3.1415, @parser.parse("3.1415"))
    assert_equal(-0.01,  @parser.parse("-0.01"))

    assert_equal(0.2e1,  @parser.parse("0.2e1"))
    assert_equal(0.2e+1, @parser.parse("0.2e+1"))
    assert_equal(0.2e-1, @parser.parse("0.2e-1"))
    assert_equal(0.2E1,  @parser.parse("0.2e1"))
  end

  def test_string_parsing
    assert_equal(String.new, @parser.parse(%Q{""}))
    assert_equal("JSON",     @parser.parse(%Q{"JSON"}))

    assert_equal( %Q{nested "quotes"},
                  @parser.parse('"nested \\"quotes\\""') )
    assert_equal("\n", @parser.parse(%Q{"\\n"}))
    assert_equal( "a",
                  @parser.parse(%Q{"\\u#{"%04X" % ?a}"}) )
  end

  def test_array_parsing
    assert_equal(Array.new, @parser.parse(%Q{[]}))
    assert_equal( ["JSON", 3.1415, true],
                  @parser.parse(%Q{["JSON", 3.1415, true]}) )
    assert_equal([1, [2, [3]]], @parser.parse(%Q{[1, [2, [3]]]}))
  end

  def test_object_parsing
    assert_equal(Hash.new, @parser.parse(%Q{{}}))
    assert_equal( {"JSON" => 3.1415, "data" => true},
                  @parser.parse(%Q{{"JSON": 3.1415, "data": true}}) )
    assert_equal( { "Array"  => [1, 2, 3],
                    "Object" => {"nested" => "objects"} },
                  @parser.parse(<<-END_OBJECT) )
    {"Array": [1, 2, 3], "Object": {"nested": "objects"}}
    END_OBJECT
  end

  def test_parse_errors
    assert_raise(RuntimeError) { @parser.parse("{") }
    assert_raise(RuntimeError) { @parser.parse(%q{{"key": true false}}) }

    assert_raise(RuntimeError) { @parser.parse("[") }
    assert_raise(RuntimeError) { @parser.parse("[1,,2]") }

    assert_raise(RuntimeError) { @parser.parse(%Q{"}) }
    assert_raise(RuntimeError) { @parser.parse(%Q{"\\i"}) }

    assert_raise(RuntimeError) { @parser.parse("$1,000") }
    assert_raise(RuntimeError) { @parser.parse("1_000") }
    assert_raise(RuntimeError) { @parser.parse("1K") }

    assert_raise(RuntimeError) { @parser.parse("unknown") }
  end

  def test_int_parsing
    assert_same(0,   @parser.parse("0"))
    assert_same(42,  @parser.parse("42"))
    assert_same(-13, @parser.parse("-13"))
  end

  def test_more_numbers
    assert_equal(5,  @parser.parse("5"))
    assert_equal(-5, @parser.parse("-5"))
    assert_equal 45.33, @parser.parse("45.33")
    assert_equal 0.33,  @parser.parse("0.33")
    assert_equal 0.0,   @parser.parse("0.0")
    assert_equal 0,     @parser.parse("0")
    assert_raises(RuntimeError) { @parser.parse("-5.-4") }
    assert_raises(RuntimeError) { @parser.parse("01234") }
    assert_equal(0.2e1, @parser.parse("0.2E1"))
    assert_equal(42e10, @parser.parse("42E10"))
  end

  def test_more_string
    assert_equal("abc\befg", @parser.parse(%Q{"abc\\befg"}))
    assert_equal("abc\nefg", @parser.parse(%Q{"abc\\nefg"}))
    assert_equal("abc\refg", @parser.parse(%Q{"abc\\refg"}))
    assert_equal("abc\fefg", @parser.parse(%Q{"abc\\fefg"}))
    assert_equal("abc\tefg", @parser.parse(%Q{"abc\\tefg"}))
    assert_equal("abc\\efg", @parser.parse(%Q{"abc\\\\efg"}))
    assert_equal("abc/efg",  @parser.parse(%Q{"abc\\/efg"}))
  end

  def test_more_object_parsing
    assert_equal({'a'=>2,'b'=>4}, @parser.parse(%Q{{ "a" : 2 , "b":4 }}))
    assert_raises(RuntimeError) { @parser.parse(%Q{{ "a" : 2, }}) }
    assert_raises(RuntimeError) { @parser.parse(%Q{[ "a" , 2, ]}) }
  end

  def test_alexander
    assert_raise(RuntimeError) { @parser.parse(%Q{"a" "b"}) }
  end

  def test_thomas
    assert_raise(RuntimeError) { @parser.parse(%{p "Busted"}) }
    assert_raise(RuntimeError) { @parser.parse(%{[], p "Busted"}) }
    assert_raise(RuntimeError) { @parser.parse(%{[p "Busted"]}) }
    assert_raise(RuntimeError) { @parser.parse(%{{1 => STDOUT.puts("Busted")}}) }
    #assert_raise(RuntimeError) { @parser.parse(%{"\u0022; p 123; \u0022Busted"}) }
    assert_raise(RuntimeError) { @parser.parse(%{"" p 123; ""}) }
    #assert_equal("\u0022; p 123; \u0022Busted",
    #             @parser.parse(%{"\u0022; p 123; \u0022Busted"}))

    assert_equal('#{p 123}',   @parser.parse(%q{"#{p 123}"}))
    assert_equal(['#{ls -r}'], @parser.parse(%q{["#{ls -r}"]}))
    assert_equal('#{p 123}',   @parser.parse(%q{"\u0023{p 123}"}))
    assert_equal('#{p 123}',   @parser.parse(%q{"\u0023{p 123}"}))
  end

  def test_thomas2
    assert_raise(RuntimeError) { @parser.parse(%{[], p "Foo"}) }
    assert_raise(RuntimeError) { @parser.parse(%{""; p 123; "Foo"}) }
    assert_raise(RuntimeError) { @parser.parse(%{"" p 123; ""}) }

    assert_raises(RuntimeError) { @parser.parse("-5.-4") }
    assert_raises(RuntimeError) { @parser.parse(%Q{{ "a" : 2, }}) }
    assert_raise(RuntimeError) { @parser.parse(%q{true false}) }
  end
end

On Feb 5, 2008, at 1:00 PM, Eric M. wrote:

Has anyone been able to benchmark the ghostwheel json parser?

Sorry about that. GhostWheel doesn’t need to instantiate the parser
before calling parse(). Drop the .new in your setup and it will work.

James Edward G. II

Eric M. wrote:

I definitely need to go learn about packrat/PEG stuff. It sounds
interesting after looking at Wikipedia. I still don’t really understand
LR/LALR parsers. My main focus has been LL/recursive-descent, and PEG
sounds like it is recursive descent.

Yes, it’s a simple concept. I give a little comparison of parsing
techniques, which has the flavour of how LR parsing works, in my
presentation on Treetop at http://dataconstellation.com/blog.
You’ll need to grab the audio though.

You can add packrat behaviour to any recursive-descent LL parser with
backtracking simply by including, at the start of each rule, code
that says:

return m if (m = memo[location, :rule_name])
start_location = location

and before any other return statement,

return memo[start_location, :rule_name] = parse_result
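
Putting those two fragments together, a self-contained sketch of the
idea might look like this (the names are illustrative, not taken from
Treetop or any other library):

class PackratParser
  def initialize(input)
    @input    = input
    @location = 0
    @memo     = {}
  end

  # Memoizing wrapper: cache each rule's result, and the position it
  # finished at, keyed by (start position, rule name).
  def rule(name)
    key = [@location, name]
    if @memo.key?(key)
      result, @location = @memo[key]   # replay the cached outcome
      return result
    end
    start  = @location
    result = yield
    @memo[[start, name]] = [result, @location]
    result
  end

  # Example rule written against the wrapper: an optionally signed
  # integer; returns nil on failure, leaving the position untouched.
  def number
    rule(:number) do
      if md = /\A-?\d+/.match(@input[@location..-1])
        @location += md[0].length
        Integer(md[0])
      end
    end
  end
end

PackratParser.new("-42").number   # => -42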

I’d love to see you bend your considerable mind towards working
out how to minimise the memoization in a packrat parser. The sheer
number of objects created is clearly the performance-limiting
factor.

Clifford H.

On Feb 5, 2008, at 5:27 PM, Eric M. wrote:

benchmark to get some results.
Oops. ;)

Thanks for figuring most of it out.

Notice that there isn’t much advantage to use a parser generator in
this
case. Since JSON is relatively simple, you don’t save much (or any)
coding by using a parser generator.

Right. One of the main points of recursive descent parsing is that it
is supposed to be easy to hand roll. For many cases, you really don’t
need the parser generator to hold your hand. Of course, Treetop adds
the nice memoization and whatnot. Tradeoffs.

James Edward G. II

On Feb 5, 2008 3:14 PM, James G. [email protected] wrote:

On Feb 5, 2008, at 1:00 PM, Eric M. wrote:

Has anyone been able to benchmark the ghostwheel json parser?

Sorry about that. GhostWheel doesn’t need to instantiate the parser
before calling parse(). Drop the .new in your setup and it will work.

I figured that part out, but there were some significant bugs in the
ghostwheel grammar spec:

  • was using ";" instead of "," between key:value pairs
  • \b was converted to \n
  • not handling numbers with exponent and fractional part

I fixed those, but it still has a bug where it sometimes spits out
arrays where an object/hash should be. I just skipped the self-checking
in my benchmark to get some results.

I added a new column to the results to show how much coding is needed
for the parser. I stripped out comments and leading whitespace and
measured the gzip size. For the parser generators, I only measured the
parser spec (i.e. treetop file) size. Here are the results:

   ch/s F E gzip author/gem

      - 5 0  545 Pawel R. (RE, recursive descent)
   1226 3 2 1074 James Edward G. II (peggy)
   3214 5 1  683 Justin E. (RE lexer, ruby eval, fixes)
   4054 0 0  608 Eric M. (Grammar0, no lexer, no code-gen)
   4076 6 2  588 ghostwheel (ghostwheel, fixes)
   4078 2 0  706 Eric I (Treetop, unicode broken)
   6534 2 0  631 Steve (Treetop, mismatches in benchmark)
   8313 1 1  545 Clifford H. (Treetop, removed handling of "/")
  17320 0 0  842 Alexander Stedile (RE, recursive descent)
  54586 0 0  723 Eric M. (Grammar, no lexer, v0.5)
 137989 2 1  660 Paolo B. (RE, recursive descent)
 166041 2 1  445 Thomas Link (RE lexer, ruby eval, ruby 1.9 results)
 186042 5 0  685 James Edward G. II (RE, recursive descent)
 220289 1 0    - json
 223486 0 0  653 Eric M. (Grammar, no lexer, unreleased)
 224823 6 0    - fjson (uses C extensions)
 287292 5 0  606 James Edward G. II (RE, recursive, Eric optimized)
 333368 3 0  405 Thomas Link & Paolo B. (RE + eval)
 388670 0 0  827 Eric M. (recursive descent)
 553081 4 9  653 Eric M. (Grammar, no lexer, unreleased, ruby2cext)
1522250 0 0    - json (w/ C extensions)

Notice that there isn’t much advantage to using a parser generator in
this case. Since JSON is relatively simple, you don’t save much (or
any) coding by using one.

Eric M. wrote:

With Regexp, you find it easiest
and safest to read the entire file/stream into a String first. IMHO, a
“real” parser should not need to read the entire file into memory.

Agreed. It should however be possible to modify a Regexp engine to
work on an IO stream, reading more input only as required. Because
of the lookahead required, it is necessary to be able to wind back
excessive input that has been read.

On Feb 5, 2008 5:32 PM, James G. [email protected] wrote:


Right. One of the main points of recursive descent parsing is that it
is supposed to be easy to hand roll. For many cases, you really don’t
need the parser generator to hold your hand. Of course, Treetop adds
the nice memoization and whatnot. Tradeoffs.

A point I would like to make based on these benchmarks is that you can
build a very efficient parser (or lexer) without using Regexp. The
fastest (on my machine and my benchmark) pure-ruby solution doesn’t use
a single Regexp; it is just a straight single-character-lookahead
recursive descent parser. I think the benefit of Regexp goes down the
more of them you are using.
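
As an illustration of the style (a sketch, not Eric’s actual
submission), here is an optionally signed integer parsed with one
character of lookahead and no Regexp, assuming Ruby 1.9 IO semantics
where #getc returns a one-character string:

require "stringio"

# Parse an optionally signed integer from anything that supports
# #getc/#ungetc (an IO, StringIO, ...), one character at a time.
def parse_int(io)
  digits = ""
  c = io.getc
  if c == "-"
    digits << c
    c = io.getc
  end
  raise "digit expected" unless c && ("0".."9").include?(c)
  while c && ("0".."9").include?(c)
    digits << c
    c = io.getc
  end
  io.ungetc(c) if c   # push the one-character lookahead back
  Integer(digits)
end

parse_int(StringIO.new("-42]"))   # => -42, leaving "]" unread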

Not using Regexp also gives a lot more flexibility. A Regexp
unfortunately
only operates on a String. This makes it very difficult to deal with
files
when a given Regexp may cover an arbitrary number of lines (like when
parsing a string or a multi-line comment). With Regexp, you find it
easiest
and safest to read the entire file/stream into a String first. IMHO, a
“real” parser should not need to read the entire file into memory.

Eric

On Feb 5, 2008 6:44 PM, Clifford H. [email protected] wrote:

Eric M. wrote:

With Regexp, you find it easiest
and safest to read the entire file/stream into a String first. IMHO, a
“real” parser should not need to read the entire file into memory.

Agreed. It should however be possible to modify a Regexp engine to
work on an IO stream, reading more input only as required. Because
of the lookahead required, it is necessary to be able to wind back
excessive input that has been read.

When I first started writing my parser generator, I wanted it to be able
to
handle a regex at the leaf. I tried putting some layer on top, then I
complained about the inflexibility of regex. Finally I gave up and
decided
not to use them. I think I ended up with something much cleaner since a
regex is already a mini-parser. Using two parsing languages (low-level
regexp and high-level BNF-like thing) just wasn’t as appealing anymore.
Fortunately, I found out much later that this decision didn’t really
cost
anything in terms of performance.

But, yes, regex should be changed to be more flexible. In my opinion,
it should be duck-typed to work on any String-like object (using a
subset of string methods; #[] would be minimal). It’s not too difficult
to wrap an IO to look like a read-only String. It would of course
optimize the real String case, but that shouldn’t affect the
functionality.
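
A rough sketch of such a wrapper (illustrative only; a production
version would also need a way to discard buffered input once
backtracking past it is no longer possible):

class IOString
  def initialize(io)
    @io  = io
    @buf = ""
  end

  # The minimal String subset: index/length access, reading lazily
  # from the underlying IO and buffering what has been read.
  def [](index, length = nil)
    fill(length ? index + length : index + 1)
    length ? @buf[index, length] : @buf[index]
  end

  def length
    fill(nil)   # nil: read to EOF
    @buf.length
  end

  private

  # Read in chunks until the buffer covers +need+ bytes (or all of
  # the stream, when +need+ is nil).
  def fill(need)
    while (need.nil? || @buf.length < need) && (chunk = @io.read(4096))
      @buf << chunk
    end
  end
end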

Even C++ has “duck-typed” a regex:

http://www.boost.org/libs/regex/doc/index.html

This works on arbitrary characters and arbitrary (external) iterators
(into
containers). Of course it uses the ugly template syntax to map to
static
types.

Eric M. wrote:

Fortunately, I found out much later that this decision didn’t really cost
anything in terms of performance.

I submit that if your code is as fast as a Ruby Regexp,
that’s because Regexp isn’t as fast as it should be :-).