Iso-2022-jp implementation

ISO-2022-JP の実装ですが、0x1e が shift だといってみたり、

% ./ruby -ve 'p "\x1e".encode("euc-jp", "iso-2022-jp")'
ruby 1.9.0 (2008-08-07 revision 18416) [i686-linux]
-e:1:in `encode': shift is not supported (RuntimeError)
        from -e:1:in `<main>'

:invalid=>:replace, :undef=>:replace の両方を指定しても例外が出たり、

% ./ruby -e 'p "\e(X".encode("EUC-JP", "ISO-2022-JP",
:invalid=>:replace, :undef=>:replace)'
-e:1:in `encode': this mode is not supported (ESC ( X) (RuntimeError)
        from -e:1:in `<main>'

ƒGƒ‰[ƒƒbƒZ[ƒW‚É NUL ‚ª“ü‚Á‚½‚èA

% ./ruby -e 'p "\e".encode("EUC-JP", "ISO-2022-JP")'|& cat -v
-e:1:in `encode': this mode is not supported (ESC ^@) (RuntimeError)
        from -e:1:in `<main>'

-1 bytes left という怪しいメッセージが出たり、

% ./ruby -e 'p "\e$(Da".encode("EUC-JP", "ISO-2022-JP")'
-e:1:in `encode': not fully converted, -1 bytes left (ArgumentError)
        from -e:1:in `<main>'

‚·‚é‚̂ŁAì‚è‚È‚¨‚µ‚Ĉȉº‚̂悤‚É‚·‚é‚Ì‚Í‚Ç‚¤‚Å‚µ‚傤‚©B

Index: enc/trans/iso2022.erb.c

— enc/trans/iso2022.erb.c (revision 0)
+++ enc/trans/iso2022.erb.c (revision 0)
@@ -0,0 +1,142 @@
+#include “transcode_data.h”
+
+<%

  • map = {}
  • map[“1b2842”] = :func_so # designate US-ASCII to G0.
    “ESC ( B”
  • map[“1b284a”] = :func_so # designate JIS X 0201 latin to G0.
    “ESC ( J”
  • map[“1b2440”] = :func_so # designate JIS X 0208 1978 to G0.
    “ESC $ @”
  • map[“1b2442”] = :func_so # designate JIS X 0208 1983 to G0.
    “ESC $ B”
  • map["{00-0d,10-1a,1c-7f}"] = :func_si
  • map_jisx0208_rest = {}
  • map_jisx0208_rest["{21-7e}"] = :func_so
    +%>

+<%= transcode_generate_node(ActionMap.parse(map), “iso2022jp_to_eucjp”,
[]) %>
+<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest),
“iso2022jp_to_eucjp_jisx0208_rest”, []) %>
+
+static VALUE
+fun_si_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s,
size_t l)
+{

  • if (t->stateful[0] == 0)
  •    return (VALUE)NOMAP;
    
  • else if (0x21 <= s[0] && s[0] <= 0x7e)
  •    return (VALUE)&iso2022jp_to_eucjp_jisx0208_rest;
    
  • else
  •    return (VALUE)INVALID;
    

+}
+
+static int
+fun_so_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s,
size_t l, unsigned char* o)
+{

  • if (s[0] == 0x1b) {
  •    if (s[1] == '(') {
    
  •        switch (s[l-1]) {
    
  •          case 'B':
    
  •          case 'J':
    
  •            t->stateful[0] = 0;
    
  •            break;
    
  •        }
    
  •    }
    
  •    else {
    
  •        switch (s[l-1]) {
    
  •          case '@':
    
  •          case 'B':
    
  •            t->stateful[0] = 1;
    
  •            break;
    
  •        }
    
  •    }
    
  •    return 0;
    
  • }
  • else {
  •    o[0] = s[0] | 0x80;
    
  •    o[1] = s[1] | 0x80;
    
  •    return 2;
    
  • }
    +}

+static const rb_transcoder
+rb_ISO_2022_JP_to_EUC_JP = {

  • “ISO-2022-JP”, “EUC-JP”, &iso2022jp_to_eucjp, 3, 0,
  • NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp
    +};

+<%

  • map_eucjp = {
  • “{0e,0f,1b}” => :undef,
  • “{00-0d,10-1a,1c-7f}” => :func_so,
  • “{a1-fe}{a1-fe}” => :func_so,
  • “8e{a1-fe}” => :undef,
  • “8f{a1-fe}{a1-fe}” => :undef,
  • }
    +%>

+<%= transcode_generate_node(ActionMap.parse(map_eucjp),
“eucjp_to_iso2022jp”, []) %>
+
+static int
+fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s,
size_t l, unsigned char *o)
+{

  • unsigned char *output0 = o;
  • if (t->stateful[0] == 0) {
  •    t->stateful[0] = 1; /* initialized flag */
    
  •    t->stateful[1] = 1; /* ASCII mode */
    
  • }
  • if (l != t->stateful[1]) {
  •    if (l == 1) {
    
  •        *o++ = 0x1b;
    
  •        *o++ = '(';
    
  •        *o++ = 'B';
    
  •        t->stateful[1] = 1;
    
  •    }
    
  •    else {
    
  •        *o++ = 0x1b;
    
  •        *o++ = '$';
    
  •        *o++ = 'B';
    
  •        t->stateful[1] = 2;
    
  •    }
    
  • }
  • if (l == 1) {
  •    *o++ = s[0] & 0x7f;
    
  • }
  • else {
  •    *o++ = s[0] & 0x7f;
    
  •    *o++ = s[1] & 0x7f;
    
  • }
  • return o - output0;
    +}

+static int
+finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o)
+{

  • unsigned char *output0 = o;
  • if (t->stateful[0] == 0)
  •    return 0;
    
  • if (t->stateful[1] != 1) {
  •    *o++ = 0x1b;
    
  •    *o++ = '(';
    
  •    *o++ = 'B';
    
  •    t->stateful[1] = 1;
    
  • }
  • return o - output0;
    +}

+static const rb_transcoder
+rb_EUC_JP_to_ISO_2022_JP = {

  • “EUC-JP”, “ISO-2022-JP”, &eucjp_to_iso2022jp, 5, 0,
  • NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp,
    finish_eucjp_to_iso2022jp
    +};

+void
+Init_iso2022(void)
+{

  • rb_register_transcoder(&rb_ISO_2022_JP_to_EUC_JP);
  • rb_register_transcoder(&rb_EUC_JP_to_ISO_2022_JP);
    +}

Index: enc/trans/japanese.erb.c

— enc/trans/japanese.erb.c (revision 18417)
+++ enc/trans/japanese.erb.c (working copy)
@@ -17,235 +17,8 @@
<%= transcode_tblgen “UTF-8”, “EUC-JP”, [["{00-7f}", :nomap],
*UCS_TO_EUCJP_TBL] %>
<%= transcode_tblgen “UTF-8”, “CP51932”, [["{00-7f}", :nomap],
*UCS_TO_EUCJP_TBL] %>

-#define ISO_2022_ENCODING(escseq, byte) ((escseq<<8)|byte)
-enum ISO_2022_ESCSEQ {

  • ISO_2022_CZD = ‘!’,
  • ISO_2022_C1D = ‘"’,
  • ISO_2022_GZD4 = ‘(’,
  • ISO_2022_G1D4 = ‘)’,
  • ISO_2022_G2D4 = ‘*’,
  • ISO_2022_G3D4 = ‘+’,
  • ISO_2022_G1D6 = ‘-’,
  • ISO_2022_G2D6 = ‘.’,
  • ISO_2022_G3D6 = ‘/’,
  • ISO_2022_GZDM4 = ISO_2022_ENCODING(’$’,’(’),
  • ISO_2022_G1DM4 = ISO_2022_ENCODING(’$’,’)’),
  • ISO_2022_G2DM4 = ISO_2022_ENCODING(’$’,’*’),
  • ISO_2022_G3DM4 = ISO_2022_ENCODING(’$’,’+’),
  • ISO_2022_G1DM6 = ISO_2022_ENCODING(’$’,’-’),
  • ISO_2022_G2DM6 = ISO_2022_ENCODING(’$’,’.’),
  • ISO_2022_G3DM6 = ISO_2022_ENCODING(’$’,’/’),
  • ISO_2022_DOCS = ISO_2022_ENCODING(’%’,‘I’),
  • ISO_2022_IRR = ‘&’
    -};

-#define ISO_2022_GZ_ASCII
ISO_2022_ENCODING(ISO_2022_GZD4, ‘B’)
-#define ISO_2022_GZ_JIS_X_0201_Katakana
ISO_2022_ENCODING(ISO_2022_GZD4, ‘I’)
-#define ISO_2022_GZ_JIS_X_0201_Roman
ISO_2022_ENCODING(ISO_2022_GZD4, ‘J’)
-#define ISO_2022_GZ_JIS_C_6226_1978
ISO_2022_ENCODING(ISO_2022_GZDM4,’@’)
-#define ISO_2022_GZ_JIS_X_0208_1983
ISO_2022_ENCODING(ISO_2022_GZDM4,‘B’)
-#define ISO_2022_GZ_JIS_X_0212_1990
ISO_2022_ENCODING(ISO_2022_GZDM4,‘D’)
-#define ISO_2022_GZ_JIS_X_0213_2000_1
ISO_2022_ENCODING(ISO_2022_GZDM4,‘O’)
-#define ISO_2022_GZ_JIS_X_0213_2000_2
ISO_2022_ENCODING(ISO_2022_GZDM4,‘P’)
-#define ISO_2022_GZ_JIS_X_0213_2004_1
ISO_2022_ENCODING(ISO_2022_GZDM4,‘Q’)

-#define UNSUPPORTED_MODE TRANSCODE_ERROR

-static int
-get_iso_2022_mode(const unsigned char **in_pos)
-{

  • int new_mode;
  • const unsigned char *in_p = *in_pos;
  • switch (*in_p++) {
  •  case '(':
    
  • switch (*in_p++) {
  • case ‘B’: case ‘I’: case ‘J’:
  •  new_mode = ISO_2022_ENCODING(ISO_2022_GZD4, *(in_p-1));
    
  •  break;
    
  • default:
  •  rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC ( 
    

%c)", *(in_p-1));

  •  break;
    
  • }
  • break;
  •  case '$':
    
  • switch (*in_p++) {
  • case ‘@’: case ‘A’: case ‘B’:
  •  new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1));
    
  •  break;
    
  • case ‘(’:
  •  switch (*in_p++) {
    
  •    case 'D': case 'O': case 'P': case 'Q':
    
  • new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1));
  • break;
  •    default:
    
  • rb_raise(UNSUPPORTED_MODE, “this mode is not supported (ESC $ (
    %c)”, *(in_p-1));
  • break;
  •  }
    
  •  break;
    
  • default:
  •  rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ 
    

%c)", *(in_p-1));

  •  break;
    
  • }
  • break;
  •  default:
    
  • rb_raise(UNSUPPORTED_MODE, “this mode is not supported (ESC %c)”,
    *(in_p-1));
  • break;
  • }
  • *in_pos = in_p;
  • return new_mode;
    -}

-static void
-from_iso_2022_jp_transcoder_preprocessor(const unsigned char **in_pos,
unsigned char **out_pos,

  •       const unsigned char *in_stop, unsigned char *out_stop,
    
  •       rb_transcoding *my_transcoding)
    

-{

  • const rb_transcoder *my_transcoder = my_transcoding->transcoder;
  • const unsigned char *in_p = *in_pos;
  • unsigned char *out_p = *out_pos;
  • int cur_mode = ISO_2022_GZ_ASCII;
  • unsigned char c1;
  • unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
  • while (in_p < in_stop) {
  • if (out_p >= out_s) {
  •  int len = (out_p - *out_pos);
    
  •  int new_len = (len + my_transcoder->max_output) * 2;
    
  •  *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, 
    

new_len);

  •  out_p = *out_pos + len;
    
  •  out_s = *out_pos + new_len - my_transcoder->max_output;
    
  • }
  • c1 = *in_p++;
  • if (c1 == 0x1B) {
  •  cur_mode = get_iso_2022_mode(&in_p);
    
  • }
  • else if (c1 == 0x1E || c1 == 0x1F) {
  •  /* SHIFT */
    
  •  rb_raise(UNSUPPORTED_MODE, "shift is not supported");
    
  • }
  • else if (c1 >= 0x80) {
  •  rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
    
  • }
  • else {
  •  switch (cur_mode) {
    
  •    case ISO_2022_GZ_ASCII:
    
  •    case ISO_2022_GZ_JIS_X_0201_Roman:
    
  • *out_p++ = c1;
  • break;
  •    case ISO_2022_GZ_JIS_X_0201_Katakana:
    
  • *out_p++ = 0x8E;
  • *out_p++ = c1 | 0x80;
  • break;
  •    case ISO_2022_GZ_JIS_X_0212_1990:
    
  • *out_p++ = 0x8F;
  •    case ISO_2022_GZ_JIS_C_6226_1978:
    
  •    case ISO_2022_GZ_JIS_X_0208_1983:
    
  • *out_p++ = c1 | 0x80;
  • *out_p++ = *in_p++ | 0x80;
  • break;
  •  }
    
  • }
  • }
  • /* cleanup */
  • *in_pos = in_p;
  • *out_pos = out_p;
    -}

-static int
-select_iso_2022_mode(unsigned char **out_pos, int new_mode)
-{

  • unsigned char *out_p = *out_pos;
  • *out_p++ = ‘\x1b’;
  • switch (new_mode>>8) {
  •  case ISO_2022_GZD4:
    
  • *out_p++ = new_mode >> 8;
  • *out_p++ = new_mode & 0x7F;
  • break;
  •  case ISO_2022_GZDM4:
    
  • *out_p++ = new_mode >> 16;
  • if ((new_mode & 0x7F) != ‘@’ &&
  •  (new_mode & 0x7F) != 'A' &&
    
  •  (new_mode & 0x7F) != 'B')
    
  • {
  •  *out_p++ = (new_mode>>8) & 0x7F;
    
  • }
  • *out_p++ = new_mode & 0x7F;
  • break;
  •  default:
    
  • rb_raise(UNSUPPORTED_MODE, “this mode is not supported.”);
  • break;
  • }
  • *out_pos = out_p;
  • return new_mode;
    -}

-static void
-to_iso_2022_jp_transcoder_postprocessor(const unsigned char **in_pos,
unsigned char **out_pos,

  •      const unsigned char *in_stop, unsigned char *out_stop,
    
  •      rb_transcoding *my_transcoding)
    

-{

  • const rb_transcoder *my_transcoder = my_transcoding->transcoder;
  • const unsigned char *in_p = *in_pos;
  • unsigned char *out_p = *out_pos;
  • int cur_mode = ISO_2022_GZ_ASCII, new_mode = 0;
  • unsigned char next_byte;
  • unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
  • while (in_p < in_stop) {
  • if (out_p >= out_s) {
  •  int len = (out_p - *out_pos);
    
  •  int new_len = (len + my_transcoder->max_output) * 2;
    
  •  *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, 
    

new_len);

  •  out_p = *out_pos + len;
    
  •  out_s = *out_pos + new_len - my_transcoder->max_output;
    
  • }
  • next_byte = *in_p++;
  • if (next_byte < 0x80) {
  •  new_mode = ISO_2022_GZ_ASCII;
    
  • }
  • else if (next_byte == 0x8E) {
  •  new_mode = ISO_2022_GZ_JIS_X_0201_Katakana;
    
  •  next_byte = *in_p++;
    
  • }
  • else if (next_byte == 0x8F) {
  •  new_mode = ISO_2022_GZ_JIS_X_0212_1990;
    
  •  next_byte = *in_p++;
    
  • }
  • else {
  •  new_mode = ISO_2022_GZ_JIS_X_0208_1983;
    
  • }
  • if (cur_mode != new_mode)
  •  cur_mode = select_iso_2022_mode(&out_p, new_mode);
    
  • if (cur_mode < 0xFFFF) {
  •  *out_p++ = next_byte & 0x7F;
    
  • }
  • else {
  •  *out_p++ = next_byte & 0x7F;
    
  •  *out_p++ = *in_p++ & 0x7F;
    
  • }
  • }
  • if (cur_mode != ISO_2022_GZ_ASCII)
  • cur_mode = select_iso_2022_mode(&out_p, ISO_2022_GZ_ASCII);
  • /* cleanup */
  • *in_pos = in_p;
  • *out_pos = out_p;
    -}

-static const rb_transcoder
-rb_from_ISO_2022_JP = {

  • “ISO-2022-JP”, “UTF-8”, &from_EUC_JP, 8, 0,
  • &from_iso_2022_jp_transcoder_preprocessor, NULL,
    -};

-static const rb_transcoder
-rb_to_ISO_2022_JP = {

  • “UTF-8”, “ISO-2022-JP”, &to_EUC_JP, 8, 1,
  • NULL, &to_iso_2022_jp_transcoder_postprocessor,
    -};

void
Init_japanese(void)
{
<%= transcode_register_code %>

  • rb_register_transcoder(&rb_from_ISO_2022_JP);
  • rb_register_transcoder(&rb_to_ISO_2022_JP);
    }
    Index: enc/trans/utf_16_32.erb.c
    ===================================================================
    — enc/trans/utf_16_32.erb.c (revision 18417)
    +++ enc/trans/utf_16_32.erb.c (working copy)
    @@ -1,7 +1,7 @@
    #include “transcode_data.h”

static int
-fun_so_from_utf_16be(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_16be(rb_transcoding* t, const unsigned char* s, size_t
l, unsigned char* o)
{
if (!s[0] && s[1]<0x80) {
o[0] = s[1];
@@ -29,7 +29,7 @@ fun_so_from_utf_16be(const unsigned char
}

static int
-fun_so_to_utf_16be(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_16be(rb_transcoding* t, const unsigned char* s, size_t l,
unsigned char* o)
{
if (!(s[0]&0x80)) {
o[0] = 0x00;
@@ -57,7 +57,7 @@ fun_so_to_utf_16be(const unsigned char*
}

static int
-fun_so_from_utf_16le(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_16le(rb_transcoding* t, const unsigned char* s, size_t
l, unsigned char* o)
{
if (!s[1] && s[0]<0x80) {
o[0] = s[0];
@@ -85,7 +85,7 @@ fun_so_from_utf_16le(const unsigned char
}

static int
-fun_so_to_utf_16le(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_16le(rb_transcoding* t, const unsigned char* s, size_t l,
unsigned char* o)
{
if (!(s[0]&0x80)) {
o[1] = 0x00;
@@ -113,7 +113,7 @@ fun_so_to_utf_16le(const unsigned char*
}

static int
-fun_so_from_utf_32be(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_32be(rb_transcoding* t, const unsigned char* s, size_t
l, unsigned char* o)
{
if (!s[1]) {
if (s[2]==0 && s[3]<0x80) {
@@ -142,7 +142,7 @@ fun_so_from_utf_32be(const unsigned char
}

static int
-fun_so_to_utf_32be(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_32be(rb_transcoding* t, const unsigned char* s, size_t l,
unsigned char* o)
{
o[0] = 0;
if (!(s[0]&0x80)) {
@@ -168,13 +168,13 @@ fun_so_to_utf_32be(const unsigned char*
}

static int
-fun_so_from_utf_32le(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_32le(rb_transcoding* t, const unsigned char* s, size_t
l, unsigned char* o)
{
return 1;
}

static int
-fun_so_to_utf_32le(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l,
unsigned char* o)
{
return 4;
}
@@ -191,7 +191,7 @@ fun_so_to_utf_32le(const unsigned char*
static const rb_transcoder
rb_from_UTF_16BE = {
“UTF-16BE”, “UTF-8”, &from_UTF_16BE, 4, 0,

  • NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16be
  • NULL, NULL, NULL, &fun_so_from_utf_16be
    };

<%=
@@ -217,7 +217,7 @@ rb_from_UTF_16BE = {
static const rb_transcoder
rb_to_UTF_16BE = {
“UTF-8”, “UTF-16BE”, &to_UTF_16BE, 4, 1,

  • NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16be
  • NULL, NULL, NULL, &fun_so_to_utf_16be
    };

<%=
@@ -232,13 +232,13 @@ rb_to_UTF_16BE = {
static const rb_transcoder
rb_from_UTF_16LE = {
“UTF-16LE”, “UTF-8”, &from_UTF_16LE, 4, 0,

  • NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16le
  • NULL, NULL, NULL, &fun_so_from_utf_16le
    };

static const rb_transcoder
rb_to_UTF_16LE = {
“UTF-8”, “UTF-16LE”, &to_UTF_16BE, 4, 1,

  • NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16le
  • NULL, NULL, NULL, &fun_so_to_utf_16le
    };

<%=
@@ -254,13 +254,13 @@ rb_to_UTF_16LE = {
static const rb_transcoder
rb_from_UTF_32BE = {
“UTF-32BE”, “UTF-8”, &from_UTF_32BE, 4, 0,

  • NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32be
  • NULL, NULL, NULL, &fun_so_from_utf_32be
    };

static const rb_transcoder
rb_to_UTF_32BE = {
“UTF-8”, “UTF-32BE”, &to_UTF_16BE, 4, 1,

  • NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32be
  • NULL, NULL, NULL, &fun_so_to_utf_32be
    };

<%=
@@ -276,13 +276,13 @@ rb_to_UTF_32BE = {
static const rb_transcoder
rb_from_UTF_32LE = {
“UTF-32LE”, “UTF-8”, &from_UTF_32LE, 4, 0,

  • NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32le
  • NULL, NULL, NULL, &fun_so_from_utf_32le
    };

static const rb_transcoder
rb_to_UTF_32LE = {
“UTF-8”, “UTF-32LE”, &to_UTF_16BE, 4, 1,

  • NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32le
  • NULL, NULL, NULL, &fun_so_to_utf_32le
    };

void
Index: transcode_data.h

— transcode_data.h (revision 18417)
+++ transcode_data.h (working copy)
@@ -63,6 +63,8 @@ typedef struct rb_transcoding {
VALUE ruby_string_dest; /* the String used as the conversion
destination,
or NULL if something else is being converted */
unsigned char *(flush_func)(struct rb_transcoding, int, int);
+

  • unsigned char stateful[256]; /* opaque data for stateful encoding
    */
    } rb_transcoding;

/* static structure, one per supported encoding pair */
@@ -72,12 +74,11 @@ typedef struct rb_transcoder {
const BYTE_LOOKUP *conv_tree_start;
int max_output;
int from_utf8;

  • void (preprocessor)(const unsigned char*, unsigned char**, const
    unsigned char*, unsigned char*, struct rb_transcoding *);
  • void (postprocessor)(const unsigned char*, unsigned char**, const
    unsigned char*, unsigned char*, struct rb_transcoding *);
  • VALUE (func_ii)(VALUE); / info -> info */
  • VALUE (*func_si)(const unsigned char ); / start -> info */
  • int (func_io)(VALUE, const unsigned char); /* info -> output */
  • int (func_so)(const unsigned char, unsigned char*); /* start ->
    output */
  • VALUE (func_ii)(rb_transcoding, VALUE); /* info -> info */
  • VALUE (func_si)(rb_transcoding, const unsigned char*, size_t); /*
    start -> info */
  • int (func_io)(rb_transcoding, VALUE, const unsigned char*); /*
    info -> output */
  • int (func_so)(rb_transcoding, const unsigned char*, size_t,
    unsigned char*); /* start -> output */
  • int (finish_func)(rb_transcoding, unsigned char*); /* -> output
    */
    } rb_transcoder;

void rb_declare_transcoder(const char *enc1, const char *enc2, const
char *lib);
Index: tool/transcode-tblgen.rb

— tool/transcode-tblgen.rb (revision 18417)
+++ tool/transcode-tblgen.rb (working copy)
@@ -234,6 +234,12 @@ class ActionMap
“UNDEF”
when :invalid
“INVALID”

  • when :func_ii
  •  "FUNii"
    
  • when :func_si
  •  "FUNsi"
    
  • when :func_io
  •  "FUNio"
    
    when :func_so
    “FUNso”
    when /\A([0-9a-f][0-9a-f])\z/i
    Index: test/ruby/test_transcode.rb
    ===================================================================
    — test/ruby/test_transcode.rb (revision 18417)
    +++ test/ruby/test_transcode.rb (working copy)
    @@ -321,12 +321,13 @@ class TestTranscode < Test::Unit::TestCa
    assert_raise(RuntimeError) { “\x1b(A”.encode(“utf-8”,
    “iso-2022-jp”) }
    assert_raise(RuntimeError) { “\x1b$(A”.encode(“utf-8”,
    “iso-2022-jp”) }
    assert_raise(RuntimeError) { “\x1b$C”.encode(“utf-8”,
    “iso-2022-jp”) }
  • assert_raise(RuntimeError) { “\x1e”.encode(“utf-8”, “iso-2022-jp”)
    }
  • assert_raise(RuntimeError) { “\x0e”.encode(“utf-8”, “iso-2022-jp”)
    }
    assert_raise(RuntimeError) { “\x80”.encode(“utf-8”, “iso-2022-jp”)
    }
    assert_raise(RuntimeError) { “\x1b$(Dd!\x1b(B”.encode(“utf-8”,
    “iso-2022-jp”) }
    assert_raise(RuntimeError) { “\u9299”.encode(“iso-2022-jp”) }
  • #@@@@ TODO: the next test should actually fail, because iso-2022-jp
    does not include half-width kana
  • check_both_ways("\uff71\uff72\uff73\uff74\uff75",
    “\x1b(I12345\x1b(B”, “iso-2022-jp”) # JIS X 0201 §¨©ª«
  • assert_raise(RuntimeError) { “\u9299”.encode(“iso-2022-jp”) }
  • assert_raise(RuntimeError) {
    “\uff71\uff72\uff73\uff74\uff75”.encode(“iso-2022-jp”) }
  • assert_raise(RuntimeError) { “\x1b(I12345\x1b(B”.encode(“utf-8”,
    “iso-2022-jp”) }
    end

def test_iso_2022_jp_1
Index: transcode.c

— transcode.c (revision 18417)
+++ transcode.c (working copy)
@@ -25,53 +25,78 @@ static VALUE sym_invalid, sym_undef, sym

  • Dispatch data and logic
    */

-static st_table *transcoder_table, *transcoder_lib_table;
+typedef struct {

  • const char *from;
  • const char *to;
  • const char lib; / maybe null. it means that don’t load the
    library. */
  • const rb_transcoder *transcoder;
    +} transcoder_entry_t;

-#define TRANSCODER_INTERNAL_SEPARATOR ‘\t’
+static st_table *transcoder_table;

-static char *
-transcoder_key(const char *from_e, const char *to_e)
+static transcoder_entry_t *
+make_transcoder_entry(const char *from, const char *to)
{

  • int to_len = strlen(to_e);
  • int from_len = strlen(from_e);
  • char *const key = xmalloc(to_len + from_len + 2);
  • st_data_t val;
  • st_table *table2;
  • memcpy(key, to_e, to_len);
  • memcpy(key + to_len + 1, from_e, from_len + 1);
  • key[to_len] = TRANSCODER_INTERNAL_SEPARATOR;
  • return key;
  • if (!st_lookup(transcoder_table, (st_data_t)from, &val)) {
  •    val = (st_data_t)st_init_strcasetable();
    
  •    st_add_direct(transcoder_table, (st_data_t)from, val);
    
  • }
  • table2 = (st_table *)val;
  • if (!st_lookup(table2, (st_data_t)to, &val)) {
  •    transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
    
  •    entry->from = from;
    
  •    entry->to = to;
    
  •    entry->lib = NULL;
    
  •    entry->transcoder = NULL;
    
  •    val = (st_data_t)entry;
    
  •    st_add_direct(table2, (st_data_t)to, val);
    
  • }
  • return (transcoder_entry_t *)val;
    +}

+static transcoder_entry_t *
+get_transcoder_entry(const char *from, const char *to)
+{

  • st_data_t val;
  • st_table *table2;
  • if (!st_lookup(transcoder_table, (st_data_t)from, &val)) {
  •    return NULL;
    
  • }
  • table2 = (st_table *)val;
  • if (!st_lookup(table2, (st_data_t)to, &val)) {
  •    return NULL;
    
  • }
  • return (transcoder_entry_t *)val;
    }

void
rb_register_transcoder(const rb_transcoder *tr)
{

  • st_data_t k, val = 0;
    const char *const from_e = tr->from_encoding;
    const char *const to_e = tr->to_encoding;

  • char *const key = transcoder_key(from_e, to_e);

  • if (st_lookup(transcoder_table, (st_data_t)key, &val)) {

  • xfree(key);

  • transcoder_entry_t *entry;
  • entry = make_transcoder_entry(from_e, to_e);
  • if (entry->transcoder) {
    rb_raise(rb_eArgError, “transcoder from %s to %s has been already
    registered”,
    from_e, to_e);
    }
  • k = (st_data_t)key;
  • if (st_delete(transcoder_lib_table, &k, &val)) {
  • xfree((char *)k);
  • }
  • st_insert(transcoder_table, (st_data_t)key, (st_data_t)tr);
  • entry->transcoder = tr;
    }

static void
declare_transcoder(const char *to, const char *from, const char *lib)
{

  • const char *const key = transcoder_key(to, from);
  • st_data_t k = (st_data_t)key, val;
  • transcoder_entry_t *entry;
  • if (st_delete(transcoder_lib_table, &k, &val)) {
  • xfree((char *)k);
  • }
  • st_insert(transcoder_lib_table, (st_data_t)key, (st_data_t)lib);
  • entry = make_transcoder_entry(from, to);
  • entry->lib = lib;
    }

#define MAX_TRANSCODER_LIBNAME_LEN 64
@@ -90,38 +115,166 @@ rb_declare_transcoder(const char *enc1,

#define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)

+typedef struct search_path_queue_tag {

  • struct search_path_queue_tag *next;
  • const char *enc;
    +} search_path_queue_t;

+typedef struct {

  • st_table *visited;
  • search_path_queue_t *queue;
  • search_path_queue_t **queue_last_ptr;
  • const char *base_enc;
    +} search_path_bfs_t;

+static int
+transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
+{

  • const char *to = (const char *)key;
  • search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
  • search_path_queue_t *q;
  • if (st_lookup(bfs->visited, (st_data_t)to, &val)) {
  •    return ST_CONTINUE;
    
  • }
  • q = ALLOC(search_path_queue_t);
  • q->enc = to;
  • q->next = NULL;
  • *bfs->queue_last_ptr = q;
  • bfs->queue_last_ptr = &q->next;
  • st_add_direct(bfs->visited, (st_data_t)to,
    (st_data_t)bfs->base_enc);
  • return ST_CONTINUE;
    +}

+static int
+transcode_search_path(const char *from, const char *to,

  • void (*callback)(const char *from, const char *to, int depth, void
    *arg),
  • void *arg)
    +{
  • search_path_bfs_t bfs;
  • search_path_queue_t *q;
  • st_data_t val;
  • st_table *table2;
  • int found;
  • q = ALLOC(search_path_queue_t);
  • q->enc = from;
  • q->next = NULL;
  • bfs.queue_last_ptr = &q->next;
  • bfs.queue = q;
  • bfs.visited = st_init_strcasetable();
  • st_add_direct(bfs.visited, (st_data_t)from, (st_data_t)NULL);
  • while (bfs.queue) {
  •    q = bfs.queue;
    
  •    bfs.queue = q->next;
    
  •    if (!bfs.queue)
    
  •        bfs.queue_last_ptr = &bfs.queue;
    
  •    if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
    
  •        xfree(q);
    
  •        continue;
    
  •    }
    
  •    table2 = (st_table *)val;
    
  •    if (st_lookup(table2, (st_data_t)to, &val)) {
    
  •        st_add_direct(bfs.visited, (st_data_t)to, 
    

(st_data_t)q->enc);

  •        xfree(q);
    
  •        found = 1;
    
  •        goto cleanup;
    
  •    }
    
  •    bfs.base_enc = q->enc;
    
  •    st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
    
  •    bfs.base_enc = NULL;
    
  •    xfree(q);
    
  • }
  • found = 0;

+cleanup:

  • while (bfs.queue) {
  •    q = bfs.queue;
    
  •    bfs.queue = q->next;
    
  •    xfree(q);
    
  • }
  • if (found) {
  •    const char *enc = to;
    
  •    int depth = 0;
    
  •    while (1) {
    
  •        st_lookup(bfs.visited, (st_data_t)enc, &val);
    
  •        if (!val)
    
  •            break;
    
  •        depth++;
    
  •        enc = (const char *)val;
    
  •    }
    
  •    enc = to;
    
  •    while (1) {
    
  •        st_lookup(bfs.visited, (st_data_t)enc, &val);
    
  •        if (!val)
    
  •            break;
    
  •        callback((const char *)val, enc, --depth, arg);
    
  •        enc = (const char *)val;
    
  •    }
    
  • }
  • st_free_table(bfs.visited);
  • return found;
    +}

+static void
+transcode_dispatch_cb(const char *from, const char *to, int depth, void
*arg)
+{

  • const rb_transcoder **first_transcoder_ptr = (const rb_transcoder
    **)arg;
  • transcoder_entry_t *entry;
  • if (!*first_transcoder_ptr)
  •    return;
    
  • entry = get_transcoder_entry(from, to);
  • if (!entry)
  •    goto failed;
    
  • if (!entry->transcoder && entry->lib) {
  •    const char *lib = entry->lib;
    
  •    int len = strlen(lib);
    
  •    char path[sizeof(transcoder_lib_prefix) + 
    

MAX_TRANSCODER_LIBNAME_LEN];
+

  •    entry->lib = NULL;
    
  •    if (len > MAX_TRANSCODER_LIBNAME_LEN) goto failed;
    
  •    memcpy(path, transcoder_lib_prefix, 
    

sizeof(transcoder_lib_prefix) - 1);

  •    memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
    
  •    if (!rb_require(path)) goto failed;
    
  • }
  • if (!entry->transcoder)
  •    goto failed;
    
  • if (depth == 0)
  •    *first_transcoder_ptr = entry->transcoder;
    
  • return;

+failed:

  • *first_transcoder_ptr = NULL;
  • return;
    +}

static const rb_transcoder *
transcode_dispatch(const char *from_encoding, const char *to_encoding)
{

  • char *const key = transcoder_key(from_encoding, to_encoding);
  • st_data_t k, val = 0;
  • const rb_transcoder *first_transcoder = (rb_transcoder *)1;
  • while (!st_lookup(transcoder_table, (k = (st_data_t)key), &val) &&
  • st_delete(transcoder_lib_table, &k, &val)) {
    
  • const char *const lib = (const char *)val;
  • int len = strlen(lib);
  • char path[sizeof(transcoder_lib_prefix) +
    MAX_TRANSCODER_LIBNAME_LEN];
  • xfree((char *)k);
  • if (len > MAX_TRANSCODER_LIBNAME_LEN) return NULL;
  • memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) -
    1);
  • memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
  • if (!rb_require(path)) return NULL;
  • }
  • if (!val) {
  • if (!st_lookup(transcoder_table, (st_data_t)key, &val)) {
  •  xfree(key);
    
  •  /* multistep logic, via UTF-8 */
    
  •  if (!encoding_equal(from_encoding, "UTF-8") &&
    
  • !encoding_equal(to_encoding, “UTF-8”) &&
  • transcode_dispatch(“UTF-8”, to_encoding)) { /* check that we have
    a second step */
  • return transcode_dispatch(from_encoding, “UTF-8”); /* return first
    step */
  •  }
    
  •  return NULL;
    
  • }
  • if (transcode_search_path(from_encoding, to_encoding,
    transcode_dispatch_cb, (void *)&first_transcoder)) {
  •    return first_transcoder;
    
    }
  • xfree(key);
  • return (rb_transcoder *)val;
  • return NULL;
    }

static void
@@ -245,17 +398,17 @@ transcode_loop(const unsigned char **in_
*out_p++ = getBT3(next_info);
continue;
case FUNii:

  •  next_info = (VALUE)(*my_transcoder->func_ii)(next_info);
    
  •  next_info = (VALUE)(*my_transcoder->func_ii)(my_transcoding, 
    

next_info);
goto follow_info;
case FUNsi:

  •  next_info = (VALUE)(*my_transcoder->func_si)(char_start);
    
  •  next_info = (VALUE)(*my_transcoder->func_si)(my_transcoding, 
    

char_start, (size_t)(in_p-char_start));
goto follow_info;
break;
case FUNio:

  •  out_p += (VALUE)(*my_transcoder->func_io)(next_info, out_p);
    
  •  out_p += (VALUE)(*my_transcoder->func_io)(my_transcoding, 
    

next_info, out_p);
break;
case FUNso:

  •  out_p += (VALUE)(*my_transcoder->func_so)(char_start, out_p);
    
  •  out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, 
    

char_start, (size_t)(in_p-char_start), out_p);
break;
case INVALID:
goto invalid;
@@ -290,6 +443,16 @@ transcode_loop(const unsigned char *in_
continue;
}
/
cleanup */

  • if (my_transcoder->finish_func) {
  • if (out_p >= out_s) {
  •  int len = (out_p - *out_pos);
    
  •  int new_len = (len + my_transcoder->max_output) * 2;
    
  •  *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, 
    

new_len);

  •  out_p = *out_pos + len;
    
  •  out_s = *out_pos + new_len - my_transcoder->max_output;
    
  • }
  •    out_p += my_transcoder->finish_func(my_transcoding, out_p);
    
  • }
    *in_pos = in_p;
    *out_pos = out_p;
    }
    @@ -401,21 +564,8 @@ str_transcode(int argc, VALUE *argv, VAL
    }

my_transcoding.transcoder = my_transcoder;

  •    memset(my_transcoding.stateful, 0, 
    

sizeof(my_transcoding.stateful));

  • if (my_transcoder->preprocessor) {
  •  fromp = sp = (unsigned char *)RSTRING_PTR(str);
    
  •  slen = RSTRING_LEN(str);
    
  •  blen = slen + 30; /* len + margin */
    
  •  dest = rb_str_tmp_new(blen);
    
  •  bp = (unsigned char *)RSTRING_PTR(dest);
    
  •  my_transcoding.ruby_string_dest = dest;
    
  •  (*my_transcoder->preprocessor)(&fromp, &bp, (sp+slen), (bp+blen), 
    

&my_transcoding);

  •  if (fromp != sp+slen) {
    
  • rb_raise(rb_eArgError, “not fully converted, %“PRIdPTRDIFF” bytes
    left”, sp+slen-fromp);
  •  }
    
  •  rb_str_set_len(dest, (char *)bp - RSTRING_PTR(dest));
    
  •  str = dest;
    
  • }
    fromp = sp = (unsigned char )RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    blen = slen + 30; /
    len + margin */
    @@ -431,21 +581,6 @@ str_transcode(int argc, VALUE *argv, VAL
    buf = (unsigned char *)RSTRING_PTR(dest);
    *bp = ‘\0’;
    rb_str_set_len(dest, bp - buf);
  • if (my_transcoder->postprocessor) {
  •  str = dest;
    
  •  fromp = sp = (unsigned char *)RSTRING_PTR(str);
    
  •  slen = RSTRING_LEN(str);
    
  •  blen = slen + 30; /* len + margin */
    
  •  dest = rb_str_tmp_new(blen);
    
  •  bp = (unsigned char *)RSTRING_PTR(dest);
    
  •  my_transcoding.ruby_string_dest = dest;
    
  •  (*my_transcoder->postprocessor)(&fromp, &bp, (sp+slen), 
    

(bp+blen), &my_transcoding);

  •  if (fromp != sp+slen) {
    
  • rb_raise(rb_eArgError, “not fully converted, %“PRIdPTRDIFF” bytes
    left”, sp+slen-fromp);

  •  }
    
  •  buf = (unsigned char *)RSTRING_PTR(dest);
    
  •  rb_str_set_len(dest, bp - buf);
    
  • }

    if (encoding_equal(my_transcoder->to_encoding, to_e)) {
    final_encoding = 1;
    @@ -541,7 +676,6 @@ void
    Init_transcode(void)
    {
    transcoder_table = st_init_strcasetable();

  • transcoder_lib_table = st_init_strcasetable();

    sym_invalid = ID2SYM(rb_intern(“invalid”));
    sym_undef = ID2SYM(rb_intern(“undef”));

    [田中 哲][たなか あきら][Tanaka A.]

This forum is not affiliated to the Ruby language, Ruby on Rails framework, nor any Ruby applications discussed here.

| Privacy Policy | Terms of Service | Remote Ruby Jobs