SEGV on child process by fork on GC.stress

GC.stress = true 下で fork すると子プロセスが SEGV
します。

% ./ruby -ve ’
GC.stress = true
pid = fork {}
Process.wait pid
p $?

ruby 1.9.0 (2007-11-30 patchlevel 0) [i686-linux]
#<Process::Status: pid 26804 SIGSEGV (signal 11) (core dumped)>
% gdb ruby core.26804
GNU gdb 6.4.90-debian
Copyright © 2006 Free Software Foundation, Inc.
GDB is free software, covered by the GNU General Public License, and you
are
welcome to change it and/or distribute copies of it under certain
conditions.
Type “show copying” to see the conditions.
There is absolutely no warranty for GDB. Type “show warranty” for
details.
This GDB was configured as “i486-linux-gnu”…Using host libthread_db
library “/lib/tls/i686/cmov/libthread_db.so.1”.

warning: Can’t read pathname for load map:
入力/出力エラーです.
Reading symbols from /lib/tls/i686/cmov/libpthread.so.0…done.
Loaded symbols for /lib/tls/i686/cmov/libpthread.so.0
Reading symbols from /lib/tls/i686/cmov/librt.so.1…done.
Loaded symbols for /lib/tls/i686/cmov/librt.so.1
Reading symbols from /lib/tls/i686/cmov/libdl.so.2…done.
Loaded symbols for /lib/tls/i686/cmov/libdl.so.2
Reading symbols from /lib/tls/i686/cmov/libcrypt.so.1…done.
Loaded symbols for /lib/tls/i686/cmov/libcrypt.so.1
Reading symbols from /lib/tls/i686/cmov/libm.so.6…done.
Loaded symbols for /lib/tls/i686/cmov/libm.so.6
Reading symbols from /lib/tls/i686/cmov/libc.so.6…done.
Loaded symbols for /lib/tls/i686/cmov/libc.so.6
Reading symbols from /lib/ld-linux.so.2…done.
Loaded symbols for /lib/ld-linux.so.2
Core was generated by `./ruby -ve
GC.stress = true
pid = fork {}
Process.wait pid
p $?
'.
Program terminated with signal 11, Segmentation fault.
#0 vm_get_sourceline (cfp=0xb7d78f60) at vm.c:729
729 line_no = iseq->insn_info_table[i - 1].line_no;
(gdb) bt
#0 vm_get_sourceline (cfp=0xb7d78f60) at vm.c:729
#1 0x0805a2b1 in rb_sourceline () at eval_error.ci:27
#2 0x081065d7 in rb_bug (fmt=0x8131c1d “Segmentation fault”) at
error.c:225
#3 0x080ba4a0 in sigsegv (sig=11) at signal.c:535
#4 0xb7f2f420 in ?? ()
#5 0x0000000b in ?? ()
#6 0x00000033 in ?? ()
#7 0x00000000 in ?? ()
(gdb) p iseq
$1 = (rb_iseq_t *) 0xb7d78f54
(gdb) p iseq->insn_info_table
$2 = (struct iseq_insn_info_entry *) 0x0
(gdb)

まつもと ゆきひろです

In message “Re: [ruby-dev:32404] SEGV on child process by fork on
GC.stress.”
on Fri, 30 Nov 2007 17:45:23 +0900, Tanaka A. [email protected]
writes:

|GC.stress = true 下で fork すると子プロセスが SEGV します。

forke$B$7$?%5%V%W%m%;%9$r%G%P%C%0$9$kNI$$J}K!$r$4B8$8$NJ}$$$i$Ce(B
e$B$7$c$$$^$;$s$+!)e(B gdbe$B$N;H$$J}$,0-$$$N$+$J!)e(B

In article E1Iy7HA-0006zn-37@x31,
Yukihiro M. [email protected] writes:

forke$B$7$?%5%V%W%m%;%9$r%G%P%C%0$9$kNI$$J}K!$r$4B8$8$NJ}$$$i$Ce(B
e$B$7$c$$$^$;$s$+!)e(B gdbe$B$N;H$$J}$,0-$$$N$+$J!)e(B

set follow-fork-mode child でしょうか。

(gdb) set follow-fork-mode child
(gdb) run z.rb
Starting program: /home/akr/ruby/yarvo0/ruby/ruby z.rb
Failed to read a valid object file image from memory.
[Thread debugging using libthread_db enabled]
[New Thread -1209889920 (LWP 6738)]
[New Thread -1210627152 (LWP 6741)]

Program received signal SIGSEGV, Segmentation fault.
[Switching to LWP 6742]
0x0805b89b in rb_block_given_p () at eval.c:854
854 if (GC_GUARDED_PTR_REF(th->cfp->lfp[0])) {
(gdb)

まつもと ゆきひろです

In message “Re: [ruby-dev:32413] Re: SEGV on child process by fork on
GC.stress.”
on Sat, 1 Dec 2007 00:11:25 +0900, Tanaka A. [email protected]
writes:

|> forke$B$7$?%5%V%W%m%;%9$r%G%P%C%0$9$kNI$$J}K!$r$4B8$8$NJ}$$$i$Ce(B
|> e$B$7$c$$$^$;$s$+!)e(B gdbe$B$N;H$$J}$,0-$$$N$+$J!)e(B
|
|set follow-fork-mode child でしょうか。

ありがとうございます。

|0x0805b89b in rb_block_given_p () at eval.c:854
|854 if (GC_GUARDED_PTR_REF(th->cfp->lfp[0])) {
|(gdb)

gc_sweep()e$B$Ge(Bth->cfp->lfpe$B$,e(B0e$B$K$J$k$H$3$m$^$G$ODI$$$+$1$^$7$?e(B
e$B$,!"$=$3$+$i@h$O$h$/$o$+$j$^$;$s$G$7$?e(B(cfpe$B$de(Blfpe$B$,$I$&$$$&Iwe(B
e$B$K;H$o$l$F$$$k$N$+CN$i$J$$$N$Ge(B)e$B!#%^!<%/O3$l$+$b$7$l$^$;$s!#e(B

　ささだです．

Yukihiro M. wrote:

gc_sweep()e$B$Ge(Bth->cfp->lfpe$B$,e(B0e$B$K$J$k$H$3$m$^$G$ODI$$$+$1$^$7$?e(B
e$B$,!"$=$3$+$i@h$O$h$/$o$+$j$^$;$s$G$7$?e(B(cfpe$B$de(Blfpe$B$,$I$&$$$&Iwe(B
e$B$K;H$o$l$F$$$k$N$+CN$i$J$$$N$Ge(B)e$B!#%^!<%/O3$l$+$b$7$l$^$;$s!#e(B

　こちら，私のほうで追っておきます．

　細かいバグよりも，m17n
のほうを頑張って貰った方がいいかも．

まつもと ゆきひろです

In message “Re: [ruby-dev:32422] Re: SEGV on child process by fork on
GC.stress.”
on Sat, 1 Dec 2007 13:50:07 +0900, SASADA Koichi [email protected]
writes:

|　細かいバグよりも，m17n のほうを頑張って貰った方がいいかも．

1.9.1以降、仕様が(あまり)変動しないことが望まれているようで
すから、そちらに注力します。で、私は忘れっぽいので、この辺が
途中だというようなことに気がついた方がいらっしゃれば遠慮なく
指摘してください。

In article E1IyLG2-0007Bo-VF@x31,
Yukihiro M. [email protected] writes:

1.9.1以降、仕様が(あまり)変動しないことが望まれているようで
すから、そちらに注力します。で、私は忘れっぽいので、この辺が
途中だというようなことに気がついた方がいらっしゃれば遠慮なく
指摘してください。

文字列がそのエンコーディングとして正しいかどうかを確認する機
能とか。

In article [email protected],
Tanaka A. [email protected] writes:

文字列がそのエンコーディングとして正しいかどうかを確認する機
能とか。

とうのを実装するには Oniguruma レベルの mbclen
あたりで間違っ
たエンコーディングをまともに扱って、その情報を提供させないと
いけないわけですが、やってみるとこんな感じですかね。

とりあえず、その情報を使って String#inspect
を変なバイトをちゃ
んと判別してエスケープするようにしてみました。

% ./ruby -e ‘p “\xa1x”.force_encoding(“euc-jp”)’|cat -v
“M-!x”

というように文字になってない 8bit バイトが出てくるのが

% ./ruby -e ‘p “\xa1x”.force_encoding(“euc-jp”)’|cat -v
“\241x”

というようにエスケープされるようになります。

他にも

% ./ruby -e ‘p “\374”.force_encoding(“utf-8”)’
“\000”

というように、嘘つけといいたくなるようなのが

% ./ruby -e ‘p “\374”.force_encoding(“utf-8”)’
“\374”

と出るようになります。

Index: encoding.c

— encoding.c (revision 14084)
+++ encoding.c (working copy)
@@ -495,6 +495,12 @@ rb_enc_mbclen(const char *p, const char
}

int
+rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
+{

  • return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
    +}

+int
rb_enc_codelen(int c, rb_encoding *enc)
{
int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
Index: include/ruby/encoding.h

— include/ruby/encoding.h (revision 14084)
+++ include/ruby/encoding.h (working copy)
@@ -71,6 +71,12 @@ rb_encoding * rb_enc_find(const char na
/
ptr,encoding → mbclen /
int rb_enc_mbclen(const char
, const char , rb_encoding);

+/* ptr,encoding → mbclen, invalid or needmore /
+int rb_enc_precise_mbclen(const char
, const char , rb_encoding);
+#define MBCLEN_CHARFOUND(ret) ONIGENC_MBCLEN_CHARFOUND(ret)
+#define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret)
+#define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret)
+
/* code,encoding → codelen /
int rb_enc_codelen(int, rb_encoding
);

Index: include/ruby/oniguruma.h

— include/ruby/oniguruma.h (revision 14084)
+++ include/ruby/oniguruma.h (working copy)
@@ -144,7 +144,7 @@ typedef struct {
typedef int (OnigApplyAllCaseFoldFunc)(OnigCodePoint from,
OnigCodePoint
to, int to_len, void* arg);

typedef struct OnigEncodingTypeST {

  • int (mbc_enc_len)(const OnigUChar p,const OnigUChar* e, struct
    OnigEncodingTypeST* enc);
  • int (precise_mbc_enc_len)(const OnigUChar p,const OnigUChar* e,
    struct OnigEncodingTypeST* enc);
    const char* name;
    int max_enc_len;
    int min_enc_len;
    @@ -282,7 +282,32 @@ ONIG_EXTERN OnigEncodingType OnigEncodin
    #define ONIGENC_STEP_BACK(enc,start,s,n)
    onigenc_step_back((enc),(start),(s),(n))

-#define ONIGENC_MBC_ENC_LEN(enc,p,e)
(enc)->mbc_enc_len(p,e,enc)
+
+#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) (n)
+#define ONIGENC_CONSTRUCT_MBCLEN_INVALID() (-1)
+#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n) (-1-n)
+
+static inline int onigenc_mbclen_charfound(int r) { return 0 < r ? r :
0; }
+static inline int onigenc_mbclen_needmore(int r) { return r < -1 ? -1 -
r : 0; }
+#define ONIGENC_MBCLEN_CHARFOUND(r) onigenc_mbclen_charfound(r)
+#define ONIGENC_MBCLEN_INVALID(r) ((r) == -1)
+#define ONIGENC_MBCLEN_NEEDMORE(r) onigenc_mbclen_needmore(r)
+
+#define ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e)
(enc)->precise_mbc_enc_len(p,e,enc)
+
+static inline int onigenc_mbclen_recover(const OnigUChar* p,const
OnigUChar* e, struct OnigEncodingTypeST* enc)
+{

  • int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e);
  • int r;
  • if (ONIGENC_MBCLEN_INVALID(ret))
  •    return 1;
    
  • else if ((r = ONIGENC_MBCLEN_NEEDMORE(ret)))
  •    return e-p+r;
    
  • else
  •    return ONIGENC_MBCLEN_CHARFOUND(ret);
    

+}
+
+#define ONIGENC_MBC_ENC_LEN(enc,p,e)
onigenc_mbclen_recover(p,e,enc)
#define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len)
#define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc)
#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len)
Index: enc/euc_jp.c

— enc/euc_jp.c (revision 14084)
+++ enc/euc_jp.c (working copy)
@@ -50,10 +50,85 @@ static const int EncLen_EUCJP[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};

+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {

  • { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
  • /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  • },
  • { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  • },
  • { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  • },

+};
+#undef A
+#undef F
+
static int
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{

  • return EncLen_EUCJP[*p];
  • int firstbyte = *p++;
  • state_t s;
  • s = trans[0][firstbyte];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
  • s = trans[s][*p++];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
  • s = trans[s][*p++];
  • return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
  •                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    

}

static OnigCodePoint
Index: enc/utf8.c

— enc/utf8.c (revision 14084)
+++ enc/utf8.c (working copy)
@@ -59,10 +59,155 @@ static const int EncLen_UTF8[] = {
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2, S3, S4, S5 }
state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {

  • { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* e */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  • /* f */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, F, F
  • },
  • { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  • /* 9 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  • /* a */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  • /* b */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  • /* 9 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  • /* a */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  • /* b */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • }
    +};
    +#undef A
    +#undef F

static int
utf8_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{

  • return EncLen_UTF8[*p];
  • int firstbyte = *p++;
  • state_t s;
  • s = trans[0][firstbyte];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
  • s = trans[s][*p++];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
  • s = trans[s][*p++];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
  • s = trans[s][*p++];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-4);
  • s = trans[s][*p++];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(5)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-5);
  • s = trans[s][*p++];
  • return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(6) :
  •                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    

}

static int
Index: enc/sjis.c

— enc/sjis.c (revision 14084)
+++ enc/sjis.c (working copy)
@@ -70,10 +70,62 @@ static const char SJIS_CAN_BE_TRAIL_TABL
#define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1)
#define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)]

+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {

  • { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
  • },
  • { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
  • /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
  • }
    +};
    +#undef A
    +#undef F

static int
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{

  • return EncLen_SJIS[*p];
  • int firstbyte = *p++;
  • state_t s;
  • s = trans[0][firstbyte];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
  • s = trans[s][*p++];
  • return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
  •                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    

}

static int
Index: string.c

— string.c (revision 14084)
+++ string.c (working copy)
@@ -2919,10 +2919,19 @@ rb_str_inspect(VALUE str)
str_cat_char(result, ‘"’, enc);
p = RSTRING_PTR(str); pend = RSTRING_END(str);
while (p < pend) {

  • int c = rb_enc_codepoint(p, pend, enc);
  • int n = rb_enc_codelen(c, enc);
  • int c;

  • int n;
    int cc;

  •    n = rb_enc_precise_mbclen(p, pend, enc);
    
  •    if (!MBCLEN_CHARFOUND(n)) {
    
  •        c = (unsigned char)*p++;
    
  •        goto escape_byte;
    
  •    }
    
  • c = rb_enc_codepoint(p, pend, enc);

  • n = rb_enc_codelen(c, enc);

  • p += n;
    if (c == ‘"’|| c == ‘\’ ||
    (c == ‘#’ && (cc = rb_enc_codepoint(p,pend,enc),
    @@ -2961,8 +2970,10 @@ rb_str_inspect(VALUE str)
    }
    else {
    char buf[5];

  •  char *s = buf;
    
  •  char *s;
    

+escape_byte:

  •  s = buf;
     sprintf(buf, "\\%03o", c & 0377);
     while (*s) {
    

    str_cat_char(result, *s++, enc);
    Index: test/ruby/test_m17n.rb
    ===================================================================
    — test/ruby/test_m17n.rb (revision 14084)
    +++ test/ruby/test_m17n.rb (working copy)
    @@ -36,6 +36,38 @@ class TestM17N < Test::Unit::TestCase
    assert_nothing_raised { eval(u(%{“\u{6666}\xc0\xa0”})) }
    end

  • def test_string_inspect

  • assert_equal(‘“\376”’, e(“\xfe”).inspect)

  • assert_equal(‘“\216”’, e(“\x8e”).inspect)

  • assert_equal(‘“\217”’, e(“\x8f”).inspect)

  • assert_equal(‘“\217\241”’, e(“\x8f\xa1”).inspect)

  • assert_equal(‘“\357”’, s(“\xef”).inspect)

  • assert_equal(‘“\300”’, u(“\xc0”).inspect)

  • assert_equal(‘“\340\200”’, u(“\xe0\x80”).inspect)

  • assert_equal(‘“\360\200\200”’, u(“\xf0\x80\x80”).inspect)

  • assert_equal(‘“\370\200\200\200”’, u(“\xf8\x80\x80\x80”).inspect)

  • assert_equal(‘“\374\200\200\200\200”’,
    u(“\xfc\x80\x80\x80\x80”).inspect)

  • assert_equal(‘"\376 "’, e("\xfe ").inspect)

  • assert_equal(‘"\216 "’, e("\x8e ").inspect)

  • assert_equal(‘"\217 "’, e("\x8f ").inspect)

  • assert_equal(‘"\217\241 "’, e("\x8f\xa1 ").inspect)

  • assert_equal(‘"\357 "’, s("\xef ").inspect)

  • assert_equal(‘"\300 "’, u("\xc0 ").inspect)

  • assert_equal(‘"\340\200 "’, u("\xe0\x80 ").inspect)

  • assert_equal(‘"\360\200\200 "’, u("\xf0\x80\x80 ").inspect)

  • assert_equal(‘"\370\200\200\200 "’, u("\xf8\x80\x80\x80 ").inspect)

  • assert_equal(‘"\374\200\200\200\200 "’, u("\xfc\x80\x80\x80\x80
    ").inspect)

  • assert_equal(e(“"\241\x8f\xa1\xa1"”),
    e(“\xa1\x8f\xa1\xa1”).inspect)

  • assert_equal(‘“\201.”’, s(“\x81.”).inspect)

  • assert_equal(s(“"\x81@"”), s(“\x81@”).inspect)

  • assert_equal(‘“\374”’, u(“\xfc”).inspect)

  • end

  • def test_regexp_too_short_multibyte_character
    assert_raise(SyntaxError) { eval(‘/\xfe/e’) }
    assert_raise(SyntaxError) { eval(‘/\x8e/e’) }

In article [email protected],
Tanaka A. [email protected] writes:

In article [email protected],
Tanaka A. [email protected] writes:

文字列がそのエンコーディングとして正しいかどうかを確認する機
能とか。

とうのを実装するには Oniguruma レベルの mbclen あたりで間違っ
たエンコーディングをまともに扱って、その情報を提供させないと
いけないわけですが、やってみるとこんな感じですかね。

String#valid_encoding? で確認するだけのをつけて、UTF-8 を厳
密化するとこんな感じでしょうか。
(string.c と utf8.c 以外は変わってないので省略)

Index: string.c

— string.c (revision 14088)
+++ string.c (working copy)
@@ -2919,10 +2919,19 @@ rb_str_inspect(VALUE str)
str_cat_char(result, ‘"’, enc);
p = RSTRING_PTR(str); pend = RSTRING_END(str);
while (p < pend) {

  • int c = rb_enc_codepoint(p, pend, enc);
  • int n = rb_enc_codelen(c, enc);
  • int c;

  • int n;
    int cc;

  •    n = rb_enc_precise_mbclen(p, pend, enc);
    
  •    if (!MBCLEN_CHARFOUND(n)) {
    
  •        c = (unsigned char)*p++;
    
  •        goto escape_byte;
    
  •    }
    
  • c = rb_enc_codepoint(p, pend, enc);

  • n = rb_enc_codelen(c, enc);

  • p += n;
    if (c == ‘"’|| c == ‘\’ ||
    (c == ‘#’ && (cc = rb_enc_codepoint(p,pend,enc),
    @@ -2961,8 +2970,10 @@ rb_str_inspect(VALUE str)
    }
    else {
    char buf[5];

  •  char *s = buf;
    
  •  char *s;
    

+escape_byte:

  •  s = buf;
     sprintf(buf, "\\%03o", c & 0377);
     while (*s) {
    
    str_cat_char(result, *s++, enc);
    @@ -5232,6 +5243,25 @@ rb_str_force_encoding(VALUE str, VALUE e
    return str;
    }

+static VALUE
+rb_str_valid_encoding_p(VALUE str)
+{

  • char *p = RSTRING_PTR(str);
  • char *pend = RSTRING_END(str);
  • rb_encoding *enc = rb_enc_get(str);
  • while (p < pend) {
  • int n;
  •    n = rb_enc_precise_mbclen(p, pend, enc);
    
  •    if (!MBCLEN_CHARFOUND(n)) {
    
  •        return Qfalse;
    
  •    }
    
  •    p += n;
    
  • }
  • return Qtrue;
    +}

/**********************************************************************

  • Document-class: Symbol

@@ -5644,6 +5674,7 @@ Init_String(void)

 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in 

encoding.c */
rb_define_method(rb_cString, “force_encoding”,
rb_str_force_encoding, 1);

  • rb_define_method(rb_cString, “valid_encoding?”,
    rb_str_valid_encoding_p, 0);

    id_to_s = rb_intern(“to_s”);

Index: enc/utf8.c

— enc/utf8.c (revision 14088)
+++ enc/utf8.c (working copy)
@@ -56,13 +56,189 @@ static const int EncLen_UTF8[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

  • 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
  • 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    };

+typedef enum {

  • FAILURE = -2,
  • ACCEPT,
  • S0, S1, S2, S3,
  • S4, S5, S6, S7
    +} state_t;
    +#define A ACCEPT
    +#define F FAILURE
    +static const signed char trans[][0x100] = {
  • { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
  • /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  • /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
  • { /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  • /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  • /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
  • /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  • },
    +};
    +#undef A
    +#undef F

static int
utf8_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{

  • return EncLen_UTF8[*p];
  • int firstbyte = *p++;
  • state_t s;
  • s = trans[0][firstbyte];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
  • s = trans[s][*p++];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
  • s = trans[s][*p++];
  • if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3)
    :
  •                              ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    
  • if (p == e) return
    ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
  • s = trans[s][*p++];
  • return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
  •                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
    

}

static int