In article [email protected],
Tanaka A. [email protected] writes:
文字列がそのエンコーディングとして正しいかどうかを確認する機能とか。
というのを実装するには Oniguruma レベルの mbclen あたりで間違った
エンコーディングをまともに扱って、その情報を提供させないと
いけないわけですが、やってみるとこんな感じですかね。
とりあえず、その情報を使って String#inspect を変なバイトをちゃんと
判別してエスケープするようにしてみました。
% ./ruby -e 'p "\xa1x".force_encoding("euc-jp")'|cat -v
"M-!x"
というように文字になってない 8bit バイトが出てくるのが
% ./ruby -e 'p "\xa1x".force_encoding("euc-jp")'|cat -v
"\241x"
というようにエスケープされるようになります。
他にも
% ./ruby -e 'p "\374".force_encoding("utf-8")'
"\000"
というように、嘘つけといいたくなるようなのが
% ./ruby -e 'p "\374".force_encoding("utf-8")'
"\374"
と出るようになります。
Index: encoding.c
--- encoding.c (revision 14084)
+++ encoding.c (working copy)
@@ -495,6 +495,12 @@ rb_enc_mbclen(const char *p, const char
}
int
+rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
+{
- return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
+}
-
+int
rb_enc_codelen(int c, rb_encoding *enc)
{
int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
Index: include/ruby/encoding.h
--- include/ruby/encoding.h (revision 14084)
+++ include/ruby/encoding.h (working copy)
@@ -71,6 +71,12 @@ rb_encoding * rb_enc_find(const char na
/ ptr,encoding → mbclen /
int rb_enc_mbclen(const char, const char , rb_encoding);
+/* ptr,encoding → mbclen, invalid or needmore /
+int rb_enc_precise_mbclen(const char, const char , rb_encoding);
+#define MBCLEN_CHARFOUND(ret) ONIGENC_MBCLEN_CHARFOUND(ret)
+#define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret)
+#define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret)
+
/* code,encoding → codelen /
int rb_enc_codelen(int, rb_encoding);
Index: include/ruby/oniguruma.h
--- include/ruby/oniguruma.h (revision 14084)
+++ include/ruby/oniguruma.h (working copy)
@@ -144,7 +144,7 @@ typedef struct {
typedef int (OnigApplyAllCaseFoldFunc)(OnigCodePoint from,
OnigCodePoint to, int to_len, void* arg);
typedef struct OnigEncodingTypeST {
- int (mbc_enc_len)(const OnigUChar p,const OnigUChar* e, struct
OnigEncodingTypeST* enc);
- int (precise_mbc_enc_len)(const OnigUChar p,const OnigUChar* e,
struct OnigEncodingTypeST* enc);
const char* name;
int max_enc_len;
int min_enc_len;
@@ -282,7 +282,32 @@ ONIG_EXTERN OnigEncodingType OnigEncodin
#define ONIGENC_STEP_BACK(enc,start,s,n)
onigenc_step_back((enc),(start),(s),(n))
-#define ONIGENC_MBC_ENC_LEN(enc,p,e)
(enc)->mbc_enc_len(p,e,enc)
+
+#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) (n)
+#define ONIGENC_CONSTRUCT_MBCLEN_INVALID() (-1)
+#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n) (-1-n)
+
+static inline int onigenc_mbclen_charfound(int r) { return 0 < r ? r :
0; }
+static inline int onigenc_mbclen_needmore(int r) { return r < -1 ? -1 -
r : 0; }
+#define ONIGENC_MBCLEN_CHARFOUND(r) onigenc_mbclen_charfound(r)
+#define ONIGENC_MBCLEN_INVALID(r) ((r) == -1)
+#define ONIGENC_MBCLEN_NEEDMORE(r) onigenc_mbclen_needmore(r)
+
+#define ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e)
(enc)->precise_mbc_enc_len(p,e,enc)
+
+static inline int onigenc_mbclen_recover(const OnigUChar* p,const
OnigUChar* e, struct OnigEncodingTypeST* enc)
+{
+}
+
+#define ONIGENC_MBC_ENC_LEN(enc,p,e)
onigenc_mbclen_recover(p,e,enc)
#define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len)
#define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc)
#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len)
Index: enc/euc_jp.c
--- enc/euc_jp.c (revision 14084)
+++ enc/euc_jp.c (working copy)
@@ -50,10 +50,85 @@ static const int EncLen_EUCJP[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {
- { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
- /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
- },
- { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
- },
- { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
- },
-
+};
+#undef A
+#undef F
+
static int
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{
- int firstbyte = *p++;
- state_t s;
- s = trans[0][firstbyte];
- if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1)
:
-
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
- if (p == e) return
ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
- s = trans[s][*p++];
- if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2)
:
-
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
- if (p == e) return
ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
- s = trans[s][*p++];
- return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
-
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
}
static OnigCodePoint
Index: enc/utf8.c
--- enc/utf8.c (revision 14084)
+++ enc/utf8.c (working copy)
@@ -59,10 +59,155 @@ static const int EncLen_UTF8[] = {
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};
+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2, S3, S4, S5 }
state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {
- { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* e */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- /* f */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, F, F
- },
- { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
- },
- { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
- },
- { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 8 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- /* 9 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- /* a */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- /* b */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
- },
- { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
- },
- { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 8 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- /* 9 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- /* a */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- /* b */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
- }
+};
+#undef A
+#undef F
-
static int
utf8_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{
- int firstbyte = *p++;
- state_t s;
- s = trans[0][firstbyte];
- if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1)
:
-
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
-
- if (p == e) return
ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
- s = trans[s][*p++];
- if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2)
:
-
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
-
- if (p == e) return
ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
- s = trans[s][*p++];
- if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3)
:
-
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
-
- if (p == e) return
ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
- s = trans[s][*p++];
- if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4)
:
-
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
-
- if (p == e) return
ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-4);
- s = trans[s][*p++];
- if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(5)
:
-
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
-
- if (p == e) return
ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-5);
- s = trans[s][*p++];
- return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(6) :
-
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
}
static int
Index: enc/sjis.c
--- enc/sjis.c (revision 14084)
+++ enc/sjis.c (working copy)
@@ -70,10 +70,62 @@ static const char SJIS_CAN_BE_TRAIL_TABL
#define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1)
#define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)]
+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {
- { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
- },
- { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
- /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
- /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
- /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
- }
+};
+#undef A
+#undef F
-
static int
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{
}
static int
Index: string.c
--- string.c (revision 14084)
+++ string.c (working copy)
@@ -2919,10 +2919,19 @@ rb_str_inspect(VALUE str)
str_cat_char(result, ‘"’, enc);
p = RSTRING_PTR(str); pend = RSTRING_END(str);
while (p < pend) {
- int c = rb_enc_codepoint(p, pend, enc);
- int n = rb_enc_codelen(c, enc);
-
int c;
-
int n;
int cc;
-
n = rb_enc_precise_mbclen(p, pend, enc);
-
if (!MBCLEN_CHARFOUND(n)) {
-
c = (unsigned char)*p++;
-
goto escape_byte;
-
}
-
-
c = rb_enc_codepoint(p, pend, enc);
-
n = rb_enc_codelen(c, enc);
-
p += n;
if (c == ‘"’|| c == ‘\’ ||
(c == ‘#’ && (cc = rb_enc_codepoint(p,pend,enc),
@@ -2961,8 +2970,10 @@ rb_str_inspect(VALUE str)
}
else {
char buf[5];
+escape_byte:
-
s = buf;
sprintf(buf, "\\%03o", c & 0377);
while (*s) {
str_cat_char(result, *s++, enc);
Index: test/ruby/test_m17n.rb
===================================================================
--- test/ruby/test_m17n.rb (revision 14084)
+++ test/ruby/test_m17n.rb (working copy)
@@ -36,6 +36,38 @@ class TestM17N < Test::Unit::TestCase
assert_nothing_raised { eval(u(%{“\u{6666}\xc0\xa0”})) }
end
-
def test_string_inspect
-
assert_equal(‘“\376”’, e(“\xfe”).inspect)
-
assert_equal(‘“\216”’, e(“\x8e”).inspect)
-
assert_equal(‘“\217”’, e(“\x8f”).inspect)
-
assert_equal(‘“\217\241”’, e(“\x8f\xa1”).inspect)
-
assert_equal(‘“\357”’, s(“\xef”).inspect)
-
assert_equal(‘“\300”’, u(“\xc0”).inspect)
-
assert_equal(‘“\340\200”’, u(“\xe0\x80”).inspect)
-
assert_equal(‘“\360\200\200”’, u(“\xf0\x80\x80”).inspect)
-
assert_equal(‘“\370\200\200\200”’, u(“\xf8\x80\x80\x80”).inspect)
-
assert_equal(‘“\374\200\200\200\200”’,
u(“\xfc\x80\x80\x80\x80”).inspect)
-
-
assert_equal(‘"\376 "’, e("\xfe ").inspect)
-
assert_equal(‘"\216 "’, e("\x8e ").inspect)
-
assert_equal(‘"\217 "’, e("\x8f ").inspect)
-
assert_equal(‘"\217\241 "’, e("\x8f\xa1 ").inspect)
-
assert_equal(‘"\357 "’, s("\xef ").inspect)
-
assert_equal(‘"\300 "’, u("\xc0 ").inspect)
-
assert_equal(‘"\340\200 "’, u("\xe0\x80 ").inspect)
-
assert_equal(‘"\360\200\200 "’, u("\xf0\x80\x80 ").inspect)
-
assert_equal(‘"\370\200\200\200 "’, u("\xf8\x80\x80\x80 ").inspect)
-
assert_equal(‘"\374\200\200\200\200 "’, u("\xfc\x80\x80\x80\x80
").inspect)
-
-
-
assert_equal(e(“"\241\x8f\xa1\xa1"”),
e(“\xa1\x8f\xa1\xa1”).inspect)
-
-
assert_equal(‘“\201.”’, s(“\x81.”).inspect)
-
assert_equal(s(“"\x81@"”), s(“\x81@”).inspect)
-
-
assert_equal(‘“\374”’, u(“\xfc”).inspect)
-
end
-
def test_regexp_too_short_multibyte_character
assert_raise(SyntaxError) { eval(‘/\xfe/e’) }
assert_raise(SyntaxError) { eval(‘/\x8e/e’) }