Parsing JSON (#155)

On Feb 4, 2008, at 7:29 PM, James G. wrote:

Here’s my own recursive descent parser (based on the not-quite-
correct quiz tests):

This is a version I built some time ago when experimenting with peggy:

#!/usr/bin/env ruby -wKU

require "lib/parser"
require "lib/builder"

class JSONParser < Peggy::Builder
  KEYWORDS = {"true" => true, "false" => false, "null" => nil}
  ESCAPES  = Hash[*%W[b \b f \f n \n r \r t \t]]

  def self.parse(json_string)
    parser = self.new
    parser.source_text = json_string
    parser.parse?(:value) or raise "Failed to parse: #{json_string.inspect}"
    parser.to_ruby
  end

  def initialize
    super

    self.ignore_productions = [:space]
    space { lit /\s+/ }

    value {
      seq {
        opt { space }
        one {
          object
          array
          string
          number
          keyword
        }
        opt { space }
      }
    }

    object {
      seq {
        lit /\{\s*/
        one {
          seq {
            opt { many { seq { string; lit /\s*:/; value; lit /,\s*/ } } }
            seq { string; lit /\s*:/; value }
            lit "}"
          }
          lit "}"
        }
      }
    }

    array {
      seq {
        lit "["
        one {
          seq {
            opt { many { seq { value; lit "," } } }; value; lit "]"
          }
          lit "]"
        }
      }
    }

    string {
      seq {
        lit '"'
        one {
          lit '"'
          seq {
            many {
              one {
                seq { string_content; opt { escape         } }
                seq { escape;         opt { string_content } }
              }
            }
            lit '"'
          }
        }
      }
    }
    string_content { lit(/[^\\"]+/) }
    escape {
      one {
        escape_literal
        escape_sequence
        escape_unicode
      }
    }

    escape_literal  { lit(%r{\\["\\/]})      }
    escape_sequence { lit(/\\[bfnrt]/)       }
    escape_unicode  { lit(/\\u[0-9a-f]{4}/i) }

    number  { lit(/-?(?:0|[1-9]\d*)(?:\.\d+(?:[eE][+-]?\d+)?)?\b/) }
    keyword { lit(/\b(?:true|false|null)\b/)                       }
  end

  def to_ruby(from = parse_results.keys.min)
    kind = parse_results[from][:found_order].first
    to   = parse_results[from][kind]
    send("to_ruby_#{kind}", from, to)
  end

  private

  def to_ruby_object(from, to)
    p parse_results
    object   = Hash.new
    skip_to  = nil
    last_key = nil
    parse_results.keys.select { |k| k > from and k < to }.sort.each do |key|
      content = parse_results[key]
      next if skip_to and key < skip_to
      next unless content[:found_order] and
                  ( ( content[:found_order].size == 2 and
                      content[:found_order][1] == :value ) or
                    content[:found_order] == [:string] )
      if content[:found_order] == [:string]
        last_key = to_ruby_string(key, content[:string])
      else
        case content[:found_order].first
        when :object
          object[last_key] = to_ruby_object(key, content[:object])
          skip_to = content[:object]
        when :array
          object[last_key] = to_ruby_array(key, content[:array])
          skip_to = content[:array]
        else
          object[last_key] = to_ruby(key)
        end
      end
    end
    object
  end

  def to_ruby_array(from, to)
    array   = Array.new
    skip_to = nil
    parse_results.keys.select { |k| k > from and k < to }.sort.each do |key|
      content = parse_results[key]
      next if skip_to and key < skip_to
      next unless content[:found_order] and
                  content[:found_order].size == 2 and
                  content[:found_order][1] == :value
      case content[:found_order].first
      when :object
        array << to_ruby_object(key, content[:object])
        skip_to = content[:object]
      when :array
        array << to_ruby_array(key, content[:array])
        skip_to = content[:array]
      else
        array << to_ruby(key)
      end
    end
    array
  end

  def to_ruby_string(from, to)
    string = String.new
    parse_results.keys.select { |k| k > from and k < to }.sort.each do |key|
      content = parse_results[key]
      next unless content[:found_order]
      case content[:found_order].first
      when :string_content
        string << source_text[key...content[:string_content]]
      when :escape_literal
        string << source_text[content[:escape_literal] - 1, 1]
      when :escape_sequence
        string << ESCAPES[source_text[content[:escape_sequence] - 1, 1]]
      when :escape_unicode
        string << [Integer("0x#{source_text[key + 2, 4]}")].pack("U")
      end
    end
    string
  end

  def to_ruby_number(from, to)
    num = source_text[from...to]
    num.include?(".") ? Float(num) : Integer(num)
  end

  def to_ruby_keyword(from, to)
    KEYWORDS[source_text[from...to]]
  end
end

__END__

I guess you can see why that library didn’t win me over. :)

James Edward G. II

Here’s my own recursive descent parser (based on the not-quite-correct
quiz tests):

#!/usr/bin/env ruby -wKU

require "strscan"

# http://json.org/

class JSONParser
  AST = Struct.new(:value)

  def parse(input)
    @input = StringScanner.new(input.strip)
    parse_value.value
  end

  private

  def parse_value
    parse_object  or
    parse_array   or
    parse_string  or
    parse_number  or
    parse_keyword or
    error("Illegal JSON value")
  end

  def parse_object
    if @input.scan(/{\s*/)
      object = Hash.new
      while key = parse_string
        @input.scan(/\s*:\s*/) or error("Expecting object separator")
        object[key.value] = parse_value.value
        @input.scan(/\s*,\s*/) or break
      end
      @input.scan(/\s*}\s*/) or error("Unclosed object")
      AST.new(object)
    else
      false
    end
  end

  def parse_array
    if @input.scan(/\[\s*/)
      array = Array.new
      while contents = parse_value rescue nil
        array << contents.value
        @input.scan(/\s*,\s*/) or break
      end
      @input.scan(/\s*]\s*/) or error("Unclosed array")
      AST.new(array)
    else
      false
    end
  end

  def parse_string
    if @input.scan(/"/)
      string = String.new
      while contents = parse_string_content || parse_string_escape
        string << contents.value
      end
      @input.scan(/"\s*/) or error("Unclosed string")
      AST.new(string)
    else
      false
    end
  end

  def parse_string_content
    @input.scan(/[^\\"]+/) and AST.new(@input.matched)
  end

  def parse_string_escape
    if @input.scan(%r{\\["\\/]})
      AST.new(@input.matched[-1])
    elsif @input.scan(/\\[bfnrt]/)
      AST.new(eval(%Q{"#{@input.matched}"}))
    elsif @input.scan(/\\u[0-9a-fA-F]{4}/)
      AST.new([Integer("0x#{@input.matched[2..-1]}")].pack("U"))
    else
      false
    end
  end

  def parse_number
    @input.scan(/-?(?:0|[1-9]\d*)(?:\.\d+(?:[eE][+-]?\d+)?)?\b/) and
      AST.new(eval(@input.matched))
  end

  def parse_keyword
    @input.scan(/\b(?:true|false|null)\b/) and
      AST.new(eval(@input.matched.sub("null", "nil")))
  end

  def error(message)
    if @input.eos?
      raise "Unexpected end of input."
    else
      raise "#{message}: #{@input.peek(@input.string.length)}"
    end
  end
end

__END__

James Edward G. II

On Feb 4, 2008, at 7:29 PM, James G. wrote:

Here’s my own recursive descent parser (based on the not-quite-
correct quiz tests):

Finally, here’s the JSON parser right out of Ghost Wheel’s tests:

#!/usr/bin/env ruby -wKU

JSONParser = GhostWheel.build_parser( %q{
  keyword = 'true' { true } | 'false' { false } | 'null' { nil }

  number = /-?(?:0|[1-9]\d*)(?:\.\d+(?:[eE][+-]?\d+)?)?/
           { ast.include?(".") ? Float(ast) : Integer(ast) }

  string_content = /\\["\\/]/ { ast[-1, 1] }
                 | /\\[bfnrt]/
                   { Hash[*%W[b \n f \f n \n r \r t \t]][ast[-1, 1]] }
                 | /\\u[0-9a-fA-F]{4}/
                   { [Integer("0x#{ast[2..-1]}")].pack("U") }
                 | /[^\\"]+/
  string = '"' string_content* '"' { ast.flatten[1..-2].join }

  array_content = value_with_array_sep+ value
                  { ast[0].inject([]) { |a, v| a.push(v) } + ast[-1..-1] }
                | value? { ast.is_a?(EmptyParseResult) ? [] : [ast] }
  array = /\[\s*/ array_content /\s*\]/ { ast[1] }

  object_pair = string /\s*:\s*/ value { {ast[0] => ast[-1]} }
  object_pair_and_sep = object_pair /\s*;\s*/ { ast[0] }
  object_content = object_pair_and_sep+ object_pair { ast.flatten }
                 | object_pair? { ast.is_a?(EmptyParseResult) ? [] : [ast] }
  object = /\{\s*/ object_content /\}\s*/
           { ast[1].inject({}) { |h, p| h.merge(p) } }

  value_space = /\s*/
  value_content = keyword | number | string | array | object
  value = value_space value_content value_space { ast[1] }
  value_with_array_sep = value /\s*,\s*/ { ast[0] }

  json := value EOF { ast[0] }
} )

__END__

James Edward G. II

It seems though, if I may say so, that *cough* certain solutions don’t
pass all test cases:

assert_raise(RuntimeError) { @parser.parse(%{[], p "Foo"}) }
assert_raise(RuntimeError) { @parser.parse(%{""; p 123; "Foo"}) }
assert_raise(RuntimeError) { @parser.parse(%{"" p 123; ""}) }

From the original set:
assert_raises(RuntimeError) { @parser.parse("-5.-4") }
assert_raises(RuntimeError) { @parser.parse(%Q{{ "a" : 2, }}) }
assert_raise(RuntimeError) { @parser.parse(%q{true false}) }

My eval() based solution seems to fail on A Stedile’s test case unless
the stricter RFC rules, which allow only an array or object at the top
level, are applied:

assert_raise(RuntimeError) { @parser.parse(%Q{"a" "b"}) }

Thomas.

On Feb 4, 2008 7:29 PM, James G. [email protected] wrote:

Here’s my own recursive descent parser (based on the not-quite-correct
quiz tests):

Hi James,

I benchmarked your code. I also was curious how well a good hand-crafted
Regexp recursive descent parser (using the StringScanner) would compare
to my hand-crafted LL(1) (1 character lookahead) parser. So, I took your
code and applied some of the same optimizations that I used:

  • minimize method calls (inline at least where a method is called
    once)
  • minimize object creation (put results in the caller’s output buffer
    instead of returning an AST; see the sketch below)
  • minimize exceptions
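
To make the second point concrete, here is a minimal, hypothetical
before/after sketch (not taken from any actual submission) of the
output-buffer style; parse_number and AST are illustrative names:

# Before: every successful parse allocates a wrapper object.
def parse_number
  @input.scan(/-?\d+/) and AST.new(Integer(@input.matched))
end

# After: the caller passes a buffer; on success the value is pushed
# into it and the (truthy) buffer is returned, so no per-value
# wrapper object is created.
def parse_number(out)
  @input.scan(/-?\d+/) and out << Integer(@input.matched)
end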

Here are the benchmark results with this and the other parsers you posted
(couldn’t get the ghostwheel parser to work well):

   ch/s author/gem

      - Pawel R. (RE, couldn’t parse benchmark JSON)
      - ghostwheel (ghostwheel, couldn’t parse benchmark JSON)
   1226 James Edward G. II (peggy, fails one test)
   3214 Justin E. (RE lexer, ruby eval, fixed number parsing)
   4054 Eric M. (Grammar0, no lexer, no parser generation)
   4078 Eric I (Treetop, unicode broken)
   6534 oksteev (Treetop, mismatches in benchmark)
   8313 Clifford H. (Treetop, had to remove handling of "/")
  17320 Alexander Stedile (RE, recursive descent)
  54586 Eric M. (Grammar, no lexer, v0.5)
 137989 Paolo B. (RE, recursive descent)
 166041 Thomas Link (RE lexer, ruby eval, ruby 1.9 results)
 186042 James Edward G. II (RE, recursive descent)
 220289 json (pure ruby version)
 223486 Eric M. (Grammar, no lexer, unreleased)
 224823 fjson (uses C extensions)
 287292 James Edward G. II (RE, recursive descent, Eric optimized)
 333368 Thomas Link & Paolo B. (RE + eval, unicode broken)
 388670 Eric M. (recursive descent)
 553081 Eric M. (Grammar, no lexer, unreleased, w/ ruby2cext)
1522250 json (w/ C extensions)

I’d like to see a RACC parser for JSON.

Here is the optimized version of your recursive-descent
Regexp/StringScanner parser:

require "strscan"

class JSONParser

  def parse(input)
    @input = StringScanner.new(input.strip)
    parse_value(out=[]) or error("Illegal JSON value")
    out[0]
  end

  private

  def parse_value(out)
    if @input.scan(/{\s*/)
      object = {}
      kv = []
      while @input.scan(/"/)
        parse_string(kv)
        @input.scan(/\s*:\s*/) or error("Expecting object separator")
        parse_value(kv) or error("Illegal JSON value")
        object[kv[0]] = kv[1]
        @input.scan(/\s*,\s*/) or break
        kv.clear
      end
      @input.scan(/\s*}\s*/) or error("Unclosed object")
      out << object
    elsif @input.scan(/\[\s*/)
      array = []
      while parse_value(array)
        @input.scan(/\s*,\s*/) or break
      end
      @input.scan(/\s*]\s*/) or error("Unclosed array")
      out << array
    elsif @input.scan(/"/)
      parse_string(out)
    elsif @input.scan(/-?(?:0|[1-9]\d*)(?:\.\d+(?:[eE][+-]?\d+)?)?\b/)
      out << eval(@input.matched)
    elsif @input.scan(/\b(?:true|false|null)\b/)
      out << eval(@input.matched.sub("null", "nil"))
    end
  end

  def parse_string(out)
    string = ""
    while true
      if @input.scan(/[^\\"]+/)
        string << @input.matched
      elsif @input.scan(%r{\\["\\/]})
        string << @input.matched[-1]
      elsif @input.scan(/\\[bfnrt]/)
        string << eval(%Q{"#{@input.matched}"})
      elsif @input.scan(/\\u[0-9a-fA-F]{4}/)
        string << [Integer("0x#{@input.matched[2..-1]}")].pack("U")
      else
        break
      end
    end
    @input.scan(/"\s*/) or error("Unclosed string")
    out << string
  end

  def error(message)
    if @input.eos?
      raise "Unexpected end of input."
    else
      raise "#{message}: #{@input.peek(@input.string.length)}"
    end
  end
end

I think it’s just that ?<...> is slower than $n. Could you try
comparing my solution (the one based on yours) on both 1.8 and 1.9?

That’s true of course (i_18 is the $n version):

$ ruby benchmark_eric_mahurin.rb i_18
"quiz155i_18"
205653 chars/second

$ ruby19 benchmark_eric_mahurin.rb i_18
"quiz155i_18"
271050 chars/second

$ ruby19 benchmark_eric_mahurin.rb i
"quiz155i"
207100 chars/second

But just having the pleasure of being able to use more than 9 groups,
and of not having to care about group order, IMHO justifies the use of
named groups. More than once I wasted time searching for the source of
some problem, only to find out that the group order had changed or that
the group numbers differed between, well, only mostly similar
programmatically generated regexps.
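
For readers unfamiliar with the feature, a minimal illustration of the
difference (Ruby 1.9 syntax; the regexp here is made up for the example):

# Named groups: accessed by name, so reordering the pattern is safe.
m = /(?<sign>-?)(?<int>\d+)(?:\.(?<frac>\d+))?/.match("-3.14")
m[:sign]   # => "-"
m[:frac]   # => "14"

# Numbered groups: the same match, but inserting or reordering a
# group silently renumbers $1, $2, ... everywhere they are used.
m = /(-?)(\d+)(?:\.(\d+))?/.match("-3.14")
m[1]       # => "-"
m[3]       # => "14"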

But the reason why I’m asking is that, e.g., your solution finishes the
benchmark at 107323 chars/second when run with ruby18. With ruby19,
though, it’s an astonishing 1703 chars/second. I’m not sure what is
causing this slowdown, and I’m wondering if this isn’t an artefact of
the benchmark. The full run looks like this:

             user     system      total        real
 8  24142   0.541000   0.000000   0.541000 (  0.601000)
 9  25988   0.621000   0.000000   0.621000 (  0.641000)
10 588993 246.555000  93.324000 339.879000 (345.657000)
1703 chars/second

So the slowdown only happens after Step #9.

My version that uses Regexp#match() (but still eval based) makes:

            user     system      total        real
7   3518   0.090000   0.000000   0.090000 (  0.120000)
8  24142   2.894000   0.010000   2.904000 (  2.934000)
8228 chars/second

But the time limit is exceeded at step #8 already. This makes me wonder
what is causing this “interesting” behaviour of your solution at
step #10 when run with ruby19.

BTW, my own benchmarks (using random object generation of different
lengths, based on Ara’s proposal but slightly enhanced) show the
following picture:

First the net timing spent on random object generation alone:
user system total real
10 2.003000 0.000000 2.003000 ( 2.033000)
20 6.799000 0.000000 6.799000 ( 6.940000)
30 17.636000 0.000000 17.636000 ( 17.786000)

10: n=10000 avg.size=47
20: n=10000 avg.size=159
30: n=10000 avg.size=406

“quiz155i” (my original submission)
user system total real
10 3.495000 0.000000 3.495000 ( 3.535000)
20 10.024000 0.000000 10.024000 ( 10.095000)
30 25.988000 0.000000 25.988000 ( 26.027000)

10: n=10000 avg.size=46
20: n=10000 avg.size=148
30: n=10000 avg.size=391

“quiz155i_18” (modified for 1.8 compatibility)
user system total real
10 3.345000 0.000000 3.345000 ( 3.385000)
20 9.964000 0.000000 9.964000 ( 10.014000)
30 24.455000 0.000000 24.455000 ( 24.506000)

10: n=10000 avg.size=47
20: n=10000 avg.size=157
30: n=10000 avg.size=396

“quiz155b” (json based, i.e. ragel-based C extension)
user system total real
10 2.263000 0.010000 2.273000 ( 2.303000)
20 7.351000 0.000000 7.351000 ( 7.391000)
30 18.636000 0.000000 18.636000 ( 18.687000)

10: n=10000 avg.size=46
20: n=10000 avg.size=156
30: n=10000 avg.size=399

“solution_paolo_bonzini.rb”
user system total real
10 4.226000 0.000000 4.226000 ( 4.256000)
20 13.349000 0.000000 13.349000 ( 13.450000)
30 34.470000 0.070000 34.540000 ( 36.001000)

10: n=10000 avg.size=47
20: n=10000 avg.size=154
30: n=10000 avg.size=388

BTW I also measured steve’s version of a treetop parser but with 1000
iterations only:

“solution_steve.rb”
user system total real
10 5.037000 0.000000 5.037000 ( 5.068000)
20 14.651000 0.010000 14.661000 ( 14.961000)
30 40.298000 0.020000 40.318000 ( 41.330000)

10: n=1000 avg.size=48
20: n=1000 avg.size=148
30: n=1000 avg.size=407

What would the size of an average json snippet an ajax app has to deal
with be? I’m not in the webapp development business, but from my
understanding this would be rather small, wouldn’t it?

Regards,
Thomas.

The differences between ruby18 and ruby19 are quite interesting though.
Can somebody with deeper knowledge of ruby19 affirm that the
“copy on write” strategy is still in use?

I think it’s just that ?<...> is slower than $n. Could you try
comparing my solution (the one based on yours) on both 1.8 and 1.9?

Thanks!

Paolo

But the reason why I’m asking is that, e.g., your solution finishes the
benchmark at 107323 chars/second when run with ruby18.
With ruby19, though, it’s an astonishing 1703 chars/second.

Uh-oh. Someone should write to ruby-core?

Paolo

On Feb 5, 2008 3:09 AM, tho_mica_l [email protected] wrote:

My baseline assumption was that runtime was relatively linear with
respect to the data size. This assumption is broken in the above case
(I think I noticed this too at some point). Going from a depth of 9 to
10 increased the length by ~20X, but the runtime went up by ~400X.
There is obviously an O(n*n) component in there (20*20=400). Sounds
like there is a ruby 1.9 problem.

In the benchmark, you could move the print of the performance to inside
the loop, right before the break. If there is a consistent downward
trend in chars/second, you may have an O(n*n) solution, and
chars/second makes no sense (for arbitrary data size). Otherwise, maybe
we should be looking at the best performance between the longest two
data sizes, so that there is no penalty for a solution getting to a
larger but possibly more difficult dataset. Running this test multiple
times (maybe with 4.times{} around the whole benchmark, including
creating the generator) would also be good.
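
A sketch of that suggestion; the generator, parser, and time limit are
placeholders for whatever the benchmark actually uses:

require "benchmark"

# Report throughput at every depth step, so a consistent downward
# trend (suggesting super-linear runtime) becomes visible instead of
# being hidden in one aggregate number at the end.
(1..10).each do |depth|
  json = generate_random_json(depth)   # placeholder generator
  time = Benchmark.realtime { parser.parse(json) }
  puts "depth=%2d size=%8d %10.0f chars/second" %
       [depth, json.size, json.size / time]
  break if time > time_limit           # placeholder limit
end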

What would the size of an average json snippet an ajax app has to deal
with be? I’m not in the webapp development business but from my
understanding this would be rather small, wouldn’t it?

Maybe, but then making a fast parser wouldn’t be any fun :)


Maybe, but then making a fast parser wouldn’t be any fun :)

Since I ran my first preliminary benchmark I have been asking myself
how big the advantage of a C-based parser would actually be. So I
elaborated a little bit on this question. In order to also answer the
question how your solutions “scale”, I cleaned up my benchmarks a
little bit. The following includes all submissions that I could make
run with ruby19 – for whatever reason. I don’t have json for ruby18
installed, which is why I didn’t run this test with ruby18.

The objects are generated before the test. The tests are run in a
tight loop, the influence of the benchmarking code should thus be
rather marginal.

Objects were generated whose JSON representation adds up to about 2MB,
in 4 different chunk sizes ranging from about 45 to 900 bytes. The
object set is identical for all solutions, so the numbers are quite
comparable. Since the figures differ slightly from Eric Mahurin’s
benchmark, it’s possible that I did something wrong. But in that case I
did it equally wrong for all solutions. The code is down below.

Regards,
Thomas.

Input chunks:
10: n=43475 avg.size=46.01 tot.size=2000236
20: n=12856 avg.size=155.61 tot.size=2000543
30: n=4897 avg.size=408.51 tot.size=2000483
40: n=2236 avg.size=894.47 tot.size=2000045

Ruby19 json
user system total real
10 2.274000 0.000000 2.274000 ( 2.294000)
20 1.402000 0.000000 1.402000 ( 1.432000)
30 1.041000 0.000000 1.041000 ( 1.061000)
40 1.282000 0.000000 1.282000 ( 1.302000)

10 871942 chars/sec (2000236/2.29)
20 1397027 chars/sec (2000543/1.43)
30 1885469 chars/sec (2000483/1.06)
40 1536132 chars/sec (2000045/1.30)

“solution_tml.rb”
user system total real
10 8.452000 0.010000 8.462000 ( 8.633000)
20 6.570000 0.000000 6.570000 ( 6.599000)
30 6.068000 0.000000 6.068000 ( 6.119000)
40 5.659000 0.000000 5.659000 ( 5.698000)

10 231696 chars/sec (2000236/8.63)
20 303158 chars/sec (2000543/6.60)
30 326929 chars/sec (2000483/6.12)
40 351008 chars/sec (2000045/5.70)

“solution_tml_pb.rb” (modified by P Bonzini)
user system total real
10 8.151000 0.000000 8.151000 ( 8.192000)
20 5.849000 0.000000 5.849000 ( 5.879000)
30 5.307000 0.000000 5.307000 ( 5.337000)
40 5.238000 0.000000 5.238000 ( 5.268000)

10 244169 chars/sec (2000236/8.19)
20 340286 chars/sec (2000543/5.88)
30 374832 chars/sec (2000483/5.34)
40 379659 chars/sec (2000045/5.27)

“solution_eric_i.rb”
user system total real
10 158.318000 0.040000 158.358000 (158.798000)
20 162.133000 0.030000 162.163000 (162.845000)
30 170.305000 0.030000 170.335000 (170.525000)
40 193.187000 0.070000 193.257000 (193.458000)

10 12596 chars/sec (2000236/158.80)
20 12284 chars/sec (2000543/162.85)
30 11731 chars/sec (2000483/170.53)
40 10338 chars/sec (2000045/193.46)

“solution_eric_mahurin3.rb”
user system total real
10 7.631000 0.000000 7.631000 ( 7.641000)
20 6.319000 0.000000 6.319000 ( 6.329000)
30 6.179000 0.000000 6.179000 ( 6.179000)
40 5.769000 0.000000 5.769000 ( 5.778000)

10 261776 chars/sec (2000236/7.64)
20 316091 chars/sec (2000543/6.33)
30 323755 chars/sec (2000483/6.18)
40 346148 chars/sec (2000045/5.78)

“solution_james_gray.rb”
user system total real
10 13.820000 0.000000 13.820000 ( 13.890000)
20 12.117000 0.000000 12.117000 ( 12.138000)
30 12.909000 0.000000 12.909000 ( 12.918000)
40 15.051000 0.010000 15.061000 ( 15.082000)

10 144005 chars/sec (2000236/13.89)
20 164816 chars/sec (2000543/12.14)
30 154860 chars/sec (2000483/12.92)
40 132611 chars/sec (2000045/15.08)

“solution_justin_ethier.rb”
user system total real
10 17.025000 0.000000 17.025000 ( 17.025000)
20 17.915000 0.040000 17.955000 ( 17.985000)
30 28.001000 0.021000 28.022000 ( 28.041000)
40 51.253000 0.070000 51.323000 ( 51.394000)

10 117488 chars/sec (2000236/17.03)
20 111233 chars/sec (2000543/17.98)
30 71341 chars/sec (2000483/28.04)
40 38915 chars/sec (2000045/51.39)

“solution_paolo_bonzini.rb”
user system total real
10 11.036000 0.000000 11.036000 ( 11.036000)
20 17.045000 0.030000 17.075000 ( 17.104000)
30 32.717000 0.020000 32.737000 ( 32.857000)
40 69.119000 0.070000 69.189000 ( 69.310000)

10 181246 chars/sec (2000236/11.04)
20 116963 chars/sec (2000543/17.10)
30 60884 chars/sec (2000483/32.86)
40 28856 chars/sec (2000045/69.31)

“solution_steve.rb”
user system total real
10 210.152000 0.040000 210.192000 (210.573000)
20 215.260000 0.060000 215.320000 (215.590000)
30 223.201000 0.110000 223.311000 (228.368000)
40 241.257000 0.260000 241.517000 (248.868000)

10 9499 chars/sec (2000236/210.57)
20 9279 chars/sec (2000543/215.59)
30 8759 chars/sec (2000483/228.37)
40 8036 chars/sec (2000045/248.87)

Benchmark code:

require 'benchmark'
require 'json/pure'
require 'json'

N = 2000
S = [10, 20, 30, 40]

# This is a slightly enhanced version of Ara's object generator.
# Objects are generated via RandomObject.generate(nil, DEPTH)
# -- the first argument defines which object types are eligible
# and can be ignored in this context.
require 'tml/random-object'

puts 'Preparing objects ...'
sizes = Hash.new
objects = S.inject({}) do |h, s|
  size = 0
  a = h[s] = []
  n = N * 1000
  while size < n
    o = RandomObject.generate(nil, s)
    j = o.to_json
    a << [o, j]
    size += j.size
  end
  sizes[s] = size.to_f
  h
end

throughput = Hash.new {|h, k| h[k] = Hash.new(0)}

ARGV.each do |arg|
  p arg
  require arg

  parser = JSONParser.new

  throughput = []
  Benchmark.bm do |b|
    S.each do |s|
      t = b.report(s.to_s) do
        objects[s].each do |o, j|
          if o != parser.parse(j)
            raise RuntimeError
          end
        end
      end
      throughput << "%s %d chars/sec (%d/%0.2f)" %
                    [s, sizes[s] / t.real, sizes[s], t.real]
    end
  end
  puts
  puts throughput.join("\n")
  puts
  puts
end

objects.each do |s, z|
  puts "%s: n=%d avg.size=%0.2f tot.size=%d" %
       [s, z.size, sizes[s].to_f / z.size, sizes[s]]
end
puts

On Feb 5, 2008 11:44 AM, tho_mica_l [email protected] wrote:

Maybe, but then making a fast parser wouldn’t be any fun :)

Since the figures differ slightly from Eric
Mahurin’s benchmark it’s possible that I did something wrong. But in
this case I did it equally wrong for all solutions. The code is down
below.

We should probably assume all of these benchmarks have ±50% error. The
performance is highly data-set and phase-of-the-moon dependent. You can
still judge whether something has non-linear performance (i.e.
quadratic runtime), or whether one solution is 5-10X faster than
another. But if two solutions are within 2X of each other in a
benchmark, I don’t think there is a clear winner.

It does look like some solutions have quadratic runtime on ruby 1.9. I
didn’t observe this on 1.8.6.

I added all of the unit tests I found in this thread, plus this one:

def test_int_parsing
  assert_same(0, @parser.parse("0"))
  assert_same(42, @parser.parse("42"))
  assert_same(-13, @parser.parse("-13"))
end

and removed these that don’t seem correct:

#assert_raise(RuntimeError) { @parser.parse(%{"\u0022; p 123; \u0022Busted"}) }
#assert_equal("\u0022; p 123; \u0022Busted",
#             @parser.parse(%{"\u0022; p 123; \u0022Busted"}))

Here is a tally of failures (F) and errors (E) using this expanded unit
test suite:

   ch/s F E author/gem

      - 5 0 Pawel R. (RE, recursive descent)
      - 6 2 ghostwheel (ghostwheel)
   1226 3 2 James Edward G. II (peggy)
   3214 5 1 Justin E. (RE lexer, ruby eval, fixed numbers)
   4054 0 0 Eric M. (Grammar0, no lexer, no parser generation)
   4078 2 0 Eric I (Treetop, unicode broken)
   6534 2 0 Steve (Treetop, mismatches in benchmark)
   8313 1 1 Clifford H. (Treetop, removed handling of "/")
  17320 0 0 Alexander Stedile (RE, recursive descent)
  54586 0 0 Eric M. (Grammar, no lexer, v0.5)
 137989 2 1 Paolo B. (RE, recursive descent)
 166041 2 1 Thomas Link (RE lexer, ruby eval, ruby 1.9 results)
 186042 5 0 James Edward G. II (RE, recursive descent)
 220289 1 7* json
 223486 0 0 Eric M. (Grammar, no lexer, unreleased)
 224823 6 0 fjson (uses C extensions)
 287292 5 0 James Edward G. II (RE, recursive, Eric optimized)
 333368 3 0 Thomas Link & Paolo B. (RE + eval, unicode broken)
 388670 0 0 Eric M. (recursive descent)
 553081 4 9 Eric M. (Grammar, no lexer, unreleased, ruby2cext)
1522250 0 7* json (w/ C extensions)

For the json gem, all of the failures happen because the tests are
invalid -
top-level json should only be an array or an object.

My Grammar with ruby2cext didn’t work well with unit testing because it
didn’t handle creating the parser multiple times. Need to fix that.

Has anyone been able to benchmark the ghostwheel json parser? I would
like
to see how well it does.

Here is the complete set of unit tests I used:

require "test/unit"

class TestJSONParser < Test::Unit::TestCase
  def setup
    @parser = JSONParser.new
  end

  def test_keyword_parsing
    assert_equal(true,  @parser.parse("true"))
    assert_equal(false, @parser.parse("false"))
    assert_equal(nil,   @parser.parse("null"))
  end

  def test_number_parsing
    assert_equal(42,     @parser.parse("42"))
    assert_equal(-13,    @parser.parse("-13"))
    assert_equal(3.1415, @parser.parse("3.1415"))
    assert_equal(-0.01,  @parser.parse("-0.01"))

    assert_equal(0.2e1,  @parser.parse("0.2e1"))
    assert_equal(0.2e+1, @parser.parse("0.2e+1"))
    assert_equal(0.2e-1, @parser.parse("0.2e-1"))
    assert_equal(0.2E1,  @parser.parse("0.2e1"))
  end

  def test_string_parsing
    assert_equal(String.new, @parser.parse(%Q{""}))
    assert_equal("JSON",     @parser.parse(%Q{"JSON"}))

    assert_equal( %Q{nested "quotes"},
                  @parser.parse('"nested \\"quotes\\""') )
    assert_equal("\n", @parser.parse(%Q{"\\n"}))
    assert_equal( "a",
                  @parser.parse(%Q{"\\u#{"%04X" % ?a}"}) )
  end

  def test_array_parsing
    assert_equal(Array.new, @parser.parse(%Q{[]}))
    assert_equal( ["JSON", 3.1415, true],
                  @parser.parse(%Q{["JSON", 3.1415, true]}) )
    assert_equal([1, [2, [3]]], @parser.parse(%Q{[1, [2, [3]]]}))
  end

  def test_object_parsing
    assert_equal(Hash.new, @parser.parse(%Q{{}}))
    assert_equal( {"JSON" => 3.1415, "data" => true},
                  @parser.parse(%Q{{"JSON": 3.1415, "data": true}}) )
    assert_equal( { "Array"  => [1, 2, 3],
                    "Object" => {"nested" => "objects"} },
                  @parser.parse(<<-END_OBJECT) )
    {"Array": [1, 2, 3], "Object": {"nested": "objects"}}
    END_OBJECT
  end

  def test_parse_errors
    assert_raise(RuntimeError) { @parser.parse("{") }
    assert_raise(RuntimeError) { @parser.parse(%q{{"key": true false}}) }

    assert_raise(RuntimeError) { @parser.parse("[") }
    assert_raise(RuntimeError) { @parser.parse("[1,,2]") }

    assert_raise(RuntimeError) { @parser.parse(%Q{"}) }
    assert_raise(RuntimeError) { @parser.parse(%Q{"\\i"}) }

    assert_raise(RuntimeError) { @parser.parse("$1,000") }
    assert_raise(RuntimeError) { @parser.parse("1_000") }
    assert_raise(RuntimeError) { @parser.parse("1K") }

    assert_raise(RuntimeError) { @parser.parse("unknown") }
  end

  def test_int_parsing
    assert_same(0,   @parser.parse("0"))
    assert_same(42,  @parser.parse("42"))
    assert_same(-13, @parser.parse("-13"))
  end

  def test_more_numbers
    assert_equal(5,  @parser.parse("5"))
    assert_equal(-5, @parser.parse("-5"))
    assert_equal 45.33, @parser.parse("45.33")
    assert_equal 0.33,  @parser.parse("0.33")
    assert_equal 0.0,   @parser.parse("0.0")
    assert_equal 0,     @parser.parse("0")
    assert_raises(RuntimeError) { @parser.parse("-5.-4") }
    assert_raises(RuntimeError) { @parser.parse("01234") }
    assert_equal(0.2e1, @parser.parse("0.2E1"))
    assert_equal(42e10, @parser.parse("42E10"))
  end

  def test_more_string
    assert_equal("abc\befg", @parser.parse(%Q{"abc\\befg"}))
    assert_equal("abc\nefg", @parser.parse(%Q{"abc\\nefg"}))
    assert_equal("abc\refg", @parser.parse(%Q{"abc\\refg"}))
    assert_equal("abc\fefg", @parser.parse(%Q{"abc\\fefg"}))
    assert_equal("abc\tefg", @parser.parse(%Q{"abc\\tefg"}))
    assert_equal("abc\\efg", @parser.parse(%Q{"abc\\\\efg"}))
    assert_equal("abc/efg",  @parser.parse(%Q{"abc\\/efg"}))
  end

  def test_more_object_parsing
    assert_equal({'a'=>2,'b'=>4}, @parser.parse(%Q{{ "a" : 2 , "b":4 }}))
    assert_raises(RuntimeError) { @parser.parse(%Q{{ "a" : 2, }}) }
    assert_raises(RuntimeError) { @parser.parse(%Q{[ "a" , 2, ]}) }
  end

  def test_alexander
    assert_raise(RuntimeError) { @parser.parse(%Q{"a" "b"}) }
  end

  def test_thomas
    assert_raise(RuntimeError) { @parser.parse(%{p "Busted"}) }
    assert_raise(RuntimeError) { @parser.parse(%{[], p "Busted"}) }
    assert_raise(RuntimeError) { @parser.parse(%{[p "Busted"]}) }
    assert_raise(RuntimeError) { @parser.parse(%{{1 => STDOUT.puts("Busted")}}) }
    #assert_raise(RuntimeError) { @parser.parse(%{"\u0022; p 123; \u0022Busted"}) }
    assert_raise(RuntimeError) { @parser.parse(%{"" p 123; ""}) }
    #assert_equal("\u0022; p 123; \u0022Busted",
    #             @parser.parse(%{"\u0022; p 123; \u0022Busted"}))

    assert_equal('#{p 123}',   @parser.parse(%q{"#{p 123}"}))
    assert_equal(['#{ls -r}'], @parser.parse(%q{["#{ls -r}"]}))
    assert_equal('#{p 123}',   @parser.parse(%q{"\u0023{p 123}"}))
    assert_equal('#{p 123}',   @parser.parse(%q{"\u0023{p 123}"}))
  end

  def test_thomas2
    assert_raise(RuntimeError) { @parser.parse(%{[], p "Foo"}) }
    assert_raise(RuntimeError) { @parser.parse(%{""; p 123; "Foo"}) }
    assert_raise(RuntimeError) { @parser.parse(%{"" p 123; ""}) }

    assert_raises(RuntimeError) { @parser.parse("-5.-4") }
    assert_raises(RuntimeError) { @parser.parse(%Q{{ "a" : 2, }}) }
    assert_raise(RuntimeError) { @parser.parse(%q{true false}) }
  end
end

On Feb 5, 2008, at 1:00 PM, Eric M. wrote:

Has anyone been able to benchmark the ghostwheel json parser?

Sorry about that. GhostWheel doesn’t need to instantiate the parser
before calling parse(). Drop the .new in your setup and it will work.

James Edward G. II

Eric M. wrote:

I definitely need to go learn about packrat/PEG stuff. It sounds
interesting after looking at Wikipedia. I still don’t really understand
LR/LALR parsers. My main focus has been LL/recursive-descent, and PEG
sounds like it is recursive descent.

Yes, it’s a simple concept. I give a little comparison of parsing
techniques, which has the flavour of how LR parsing works, in my
presentation on Treetop at http://dataconstellation.com/blog.
You’ll need to grab the audio though.

You can add packrat behaviour to any recursive-descent LL parser with
backtracking simply by including, at the start of each rule, code
that says:

return m if (m = memo[location, :rule_name])
start_location = location

and before any other return statement,

return memo[start_location, :rule_name] = parse_result
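
Putting those two fragments together, a self-contained sketch of the
idea might look like this (the names are illustrative, not taken from
Treetop or any other library):

class PackratParser
  def initialize(input)
    @input    = input
    @location = 0
    @memo     = {}
  end

  # Memoizing wrapper: cache each rule's result, and the position it
  # finished at, keyed by (start position, rule name).
  def rule(name)
    key = [@location, name]
    if @memo.key?(key)
      result, @location = @memo[key]   # replay the cached outcome
      return result
    end
    start  = @location
    result = yield
    @memo[[start, name]] = [result, @location]
    result
  end

  # Example rule written against the wrapper: an optionally signed
  # integer; returns nil on failure, leaving the position untouched.
  def number
    rule(:number) do
      if md = /\A-?\d+/.match(@input[@location..-1])
        @location += md[0].length
        Integer(md[0])
      end
    end
  end
end

PackratParser.new("-42").number   # => -42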

I’d love to see you bend your considerable mind towards working
out how to minimise the memoization in a packrat parser. The sheer
number of objects created is clearly the performance-limiting
factor.

Clifford H.

On Feb 5, 2008, at 5:27 PM, Eric M. wrote:

benchmark to get some results.
Oops. ;)

Thanks for figuring most of it out.

Notice that there isn’t much advantage to use a parser generator in
this
case. Since JSON is relatively simple, you don’t save much (or any)
coding by using a parser generator.

Right. One of the main points of recursive descent parsing is that it
is supposed to be easy to hand roll. For many cases, you really don’t
need the parser generator to hold your hand. Of course, Treetop adds
the nice memoization and whatnot. Tradeoffs.

James Edward G. II

On Feb 5, 2008 3:14 PM, James G. [email protected] wrote:

On Feb 5, 2008, at 1:00 PM, Eric M. wrote:

Has anyone been able to benchmark the ghostwheel json parser?

Sorry about that. GhostWheel doesn’t need to instantiate the parser
before calling parse(). Drop the .new in your setup and it will work.

I figured that part out, but there were some significant bugs in the
ghostwheel grammar spec:

  • was using ";" instead of "," between key:value pairs
  • \b was converted to \n
  • not handling numbers with exponent and fractional part

I fixed those, but it still has a bug where it sometimes spits out
arrays where an object/hash should be. I just skipped the self-checking
in my benchmark to get some results.

I added a new column to the results to show how much coding is needed
for the parser. I stripped out comments and leading whitespace and
measured the gzip size. For the parser generators, I only measured the
parser spec (i.e. treetop file) size. Here are the results:

   ch/s F E gzip author/gem

      - 5 0  545 Pawel R. (RE, recursive descent)
   1226 3 2 1074 James Edward G. II (peggy)
   3214 5 1  683 Justin E. (RE lexer, ruby eval, fixes)
   4054 0 0  608 Eric M. (Grammar0, no lexer, no code-gen)
   4076 6 2  588 ghostwheel (ghostwheel, fixes)
   4078 2 0  706 Eric I (Treetop, unicode broken)
   6534 2 0  631 Steve (Treetop, mismatches in benchmark)
   8313 1 1  545 Clifford H. (Treetop, removed handling of "/")
  17320 0 0  842 Alexander Stedile (RE, recursive descent)
  54586 0 0  723 Eric M. (Grammar, no lexer, v0.5)
 137989 2 1  660 Paolo B. (RE, recursive descent)
 166041 2 1  445 Thomas Link (RE lexer, ruby eval, ruby 1.9 results)
 186042 5 0  685 James Edward G. II (RE, recursive descent)
 220289 1 0    - json
 223486 0 0  653 Eric M. (Grammar, no lexer, unreleased)
 224823 6 0    - fjson (uses C extensions)
 287292 5 0  606 James Edward G. II (RE, recursive, Eric optimized)
 333368 3 0  405 Thomas Link & Paolo B. (RE + eval)
 388670 0 0  827 Eric M. (recursive descent)
 553081 4 9  653 Eric M. (Grammar, no lexer, unreleased, ruby2cext)
1522250 0 0    - json (w/ C extensions)

Notice that there isn’t much advantage to using a parser generator in
this case. Since JSON is relatively simple, you don’t save much (or
any) coding by using one.

Eric M. wrote:

With Regexp, you find it easiest
and safest to read the entire file/stream into a String first. IMHO, a
“real” parser should not need to read the entire file into memory.

Agreed. It should however be possible to modify a Regexp engine to
work on an IO stream, reading more input only as required. Because
of the lookahead required, it is necessary to be able to wind back
excessive input that has been read.

On Feb 5, 2008 5:32 PM, James G. [email protected] wrote:


Right. One of the main points of recursive descent parsing is that it
is supposed to be easy to hand roll. For many cases, you really don’t
need the parser generator to hold your hand. Of course, Treetop adds
the nice memoization and whatnot. Tradeoffs.

A point I would like to make based on these benchmarks is that you can
build a very efficient parser (or lexer) without using Regexp. The
fastest (on my machine and my benchmark) pure-ruby solution doesn’t use
a single Regexp; it is just a straight single-character-lookahead
recursive descent parser. I think the benefit of Regexp goes down the
more of them you are using.
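
As an illustration of the style (a sketch, not Eric’s actual
submission), here is an optionally signed integer parsed with one
character of lookahead and no Regexp, assuming Ruby 1.9 IO semantics
where #getc returns a one-character string:

require "stringio"

# Parse an optionally signed integer from anything that supports
# #getc/#ungetc (an IO, StringIO, ...), one character at a time.
def parse_int(io)
  digits = ""
  c = io.getc
  if c == "-"
    digits << c
    c = io.getc
  end
  raise "digit expected" unless c && ("0".."9").include?(c)
  while c && ("0".."9").include?(c)
    digits << c
    c = io.getc
  end
  io.ungetc(c) if c   # push the one-character lookahead back
  Integer(digits)
end

parse_int(StringIO.new("-42]"))   # => -42, leaving "]" unread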

Not using Regexp also gives a lot more flexibility. A Regexp
unfortunately
only operates on a String. This makes it very difficult to deal with
files
when a given Regexp may cover an arbitrary number of lines (like when
parsing a string or a multi-line comment). With Regexp, you find it
easiest
and safest to read the entire file/stream into a String first. IMHO, a
“real” parser should not need to read the entire file into memory.

Eric

On Feb 5, 2008 6:44 PM, Clifford H. [email protected] wrote:

Eric M. wrote:

With Regexp, you find it easiest
and safest to read the entire file/stream into a String first. IMHO, a
“real” parser should not need to read the entire file into memory.

Agreed. It should however be possible to modify a Regexp engine to
work on an IO stream, reading more input only as required. Because
of the lookahead required, it is necessary to be able to wind back
excessive input that has been read.

When I first started writing my parser generator, I wanted it to be able
to
handle a regex at the leaf. I tried putting some layer on top, then I
complained about the inflexibility of regex. Finally I gave up and
decided
not to use them. I think I ended up with something much cleaner since a
regex is already a mini-parser. Using two parsing languages (low-level
regexp and high-level BNF-like thing) just wasn’t as appealing anymore.
Fortunately, I found out much later that this decision didn’t really
cost
anything in terms of performance.

But, yes, regex should be changed to be more flexible. In my opinion,
it should be duck-typed to work on any String-like object (using a
subset of string methods; #[] would be minimal). It’s not too difficult
to wrap an IO to look like a read-only String. It would of course
optimize the real String case, but that shouldn’t affect the
functionality.
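
A rough sketch of such a wrapper (illustrative only; a production
version would also need a way to discard buffered input once
backtracking past it is no longer possible):

class IOString
  def initialize(io)
    @io  = io
    @buf = ""
  end

  # The minimal String subset: index/length access, reading lazily
  # from the underlying IO and buffering what has been read.
  def [](index, length = nil)
    fill(length ? index + length : index + 1)
    length ? @buf[index, length] : @buf[index]
  end

  def length
    fill(nil)   # nil: read to EOF
    @buf.length
  end

  private

  # Read in chunks until the buffer covers +need+ bytes (or all of
  # the stream, when +need+ is nil).
  def fill(need)
    while (need.nil? || @buf.length < need) && (chunk = @io.read(4096))
      @buf << chunk
    end
  end
end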

Even C++ has “duck-typed” a regex:

http://www.boost.org/libs/regex/doc/index.html

This works on arbitrary characters and arbitrary (external) iterators
(into
containers). Of course it uses the ugly template syntax to map to
static
types.

Eric M. wrote:

Fortunately, I found out much later that this decision didn’t really cost
anything in terms of performance.

I submit that if your code is as fast as a Ruby Regexp,
that’s because Regexp isn’t as fast as it should be :-).