[FEATURE:trunk] EncDet again


#1

Yuguie$B$G$9!#e(B

ZnZe$B$5$s$NF|5-e(B(http://znz.s1.xrea.com/t/?date=20090102#c01
)e$B7PM3$G$+$D$Fe(B
e$B$Ne(BEncDete$B%i%$%V%i%j$N5DO@e(B[ruby-dev:33628]e$B$r;W$$=P$7$^$7$?!#e(B

e$B$5$F!"8=:_;d$,CN$k8B$je(BRDoce$B$He(BERBe$B$He(BIRBe$B$,$=$l$>$lFH<+$K%^%8%C%/%3%a%s%H$r2re(B
e$B<a$7$F%U%!%$%k$r3+$/5!G=$r<BAu$7$F$$$^$9!#$3$N=EJ#6q9g$O2?$i$+$N6&DL2=$Ne(B
e$BI,MW@-$r<($7$F$$$k$N$G$O$J$$$+$H;W$$$^$9!#e(B

e$BA0$N5DO@$G$O%U%!%$%kL>$G<g$K0U8+$,0lCW$;$:$KH/;6$7$F$7$^$C$?$h$&$G$9!#e(B
encdet.rb <-> encoding/detector.rb

e$B;d$Oe(BIOe$B$X$N5!G=DI2C$,NI$$$N$G$O$J$$$+$H;W$$$^$7$?!#e(B
io/encdet.rb
IO::magic_open(*args) -> e$BFbIt$Ge(BIO::opene$B$r8F$S=P$7e(B

e$B<BAu$rD4@0$7$J$$$H%i%s%@%`%"%/%;%9$G$-$J$$e(BIOe$B$G$O:$$k$o$1$G$9$,!#e(B


#2

まつもと ゆきひろです

In message “Re: [ruby-dev:37679] [FEATURE:trunk] EncDet again”
on Sat, 3 Jan 2009 19:21:37 +0900, “Yugui (Yuki S.)”
removed_email_address@domain.invalid writes:

|e$B;d$Oe(BIOe$B$X$N5!G=DI2C$,NI$$$N$G$O$J$$$+$H;W$$$^$7$?!#e(B
| io/encdet.rb

ence$B$Oe(Bencodinge$B$N>JN,$H$7$Fe(B(e$B;d$K$Oe(B)e$BG’CN$G$-$^$9$,!“e(Bdete$B$re(Bdetect
e$B$N>JN,7A$H$7$FG’<1$9$k$N$K$O$d$d:$Fq$G$9!#$?$@!”$I$&$7$F$b%@e(B
e$B%a$H$$$&$[$I$G$O$J$$$G$9!#e(B

| IO::magic_open(*args) -> e$BFbIt$Ge(BIO::opene$B$r8F$S=P$7e(B

e$BH?BP!#e(Bmagice$B$G$be(Bmagicale$B$G$bL>A0$,!VKbK!!W$9$.$F$J$K$r$9$k$N$+e(B
e$B$^$C$?$/$o$+$j$^$;$s!#e(B


#3

　ささだです．

NARUSE, Yui wrote::

いっそ prelude に入れてしまいませんか。
e$B$=$&$9$l$P%i%$%V%i%jL>$K4X$7$F$OG:$`I,MW$,$J$/$J$j$^$9!#e(B

　そんなに使うものなんですか？


#4

成瀬です。

Yukihiro M. wrote:

e$B%a$H$$$&$[$I$G$O$J$$$G$9!#e(B
e$B$$$C$=e(B prelude e$B$KF~$l$F$7$^$$$^$;$s$+!#e(B
e$B$=$&$9$l$P%i%$%V%i%jL>$K4X$7$F$OG:$`I,MW$,$J$/$J$j$^$9!#e(B

| IO::magic_open(*args) -> e$BFbIt$Ge(BIO::opene$B$r8F$S=P$7e(B

e$BH?BP!#e(Bmagice$B$G$be(Bmagicale$B$G$bL>A0$,!VKbK!!W$9$.$F$J$K$r$9$k$N$+e(B
e$B$^$C$?$/$o$+$j$^$;$s!#e(B

素直に IO::detect_open あたりは。


#5

チケット #973 が更新されました。 (by Kazuhiro NISHIYAMA)

Pythone$B$@$He(Bchardete$B$H$$$&L>A0$N%i%$%V%i%j$,$"$k$h$&$J$N$G!“e(B
encdete$B$G$bNI$5$=$&$J5$$,$7$^$9$,!”$I$&$G$7$g$&$+e(B?

http://chardet.feedparser.org/

http://redmine.ruby-lang.org/issues/show/973


#6

チケット #973 が更新されました。 (by Yui NARUSE)

chardete$B$O%"%&%H$@$H;W$$$^$9!“e(B"det"e$B$8$c$J$/$Fe(B"char"e$B$NJ}$,!#e(B
dete$B$NJ}$b=t<j$r5s$2$F;?@.$H$$$&$o$1$G$O$”$j$^$;$s!#e(B

e$B$7$+$7!"8=>ue(Brdoce$BMm$_$de(Berbe$B<~$j$J$I!"e(BEncDete$B$r:FH/L@$7$h$&$H$7$?5s6g$K<:GT$7$F$7$^$C$?Nc$,;68+$5$l$F$*$j!"e(B
e$B$=$m$=$m$3$N%i%$%V%i%j$OI8=`E:IU$7$J$$$H0-$7$-0d;:$r;D$7$+$M$J$$$HM+N8$7$F$$$^$9!#e(B

e$B8@$$49$($k$H!"$3$N%i%$%V%i%j$NMQES$OMM!9$J>lLL$GI,MW$G$"$k0lJ}!"<BAu$,0U30$HFq$7$$$N$G!"e(B
e$BE,@Z$J%i%$%V%i%j$rI8=`E:IU$GDs6!$7$J$1$l$P$J$i$J$$$H;W$C$F$$$^$9!#e(B
e$B$?$H$(L>A0$G9g0U$,$D$+$J$+$C$?$H$7$F$b!"e(Byuguie$B$5$sH=CG$GL>A0$r7hDj$7!“E:IU$9$k$Y$-$G$”$m$&$H!#e(B

e$B$G!"$o$?$7$Oe(B encdet e$B$G$b$$$$$H;W$C$F$$$^$9!#e(B
e$B$J$<$J$i!"$3$l$O;H$o$l$k%i%$%V%i%j$G$"$j!";H$C$F$l$P$I$&$;47$l$k$+$i!#e(B

http://redmine.ruby-lang.org/issues/show/973


#7

須藤です。

In removed_email_address@domain.invalid
“[ruby-dev:39775] [Feature #973] EncDet again” on Thu, 26 Nov 2009
00:19:42 +0900,
Kazuhiro NISHIYAMA removed_email_address@domain.invalid wrote:

Pythone$B$@$He(Bchardete$B$H$$$&L>A0$N%i%$%V%i%j$,$"$k$h$&$J$N$G!“e(B
encdete$B$G$bNI$5$=$&$J5$$,$7$^$9$,!”$I$&$G$7$g$&$+e(B?

http://chardet.feedparser.org/

e$B$o$6$o$6$o$+$j$K$/$$L>A0$r??;w$9$kI,MW$O$J$$$H;W$$$^$9!#e(B
e$B!Je(Be$B;d$K$Oe(Be$B$o$+$j$K$/$$$G$9!#!Ke(B


#8

チケット #973 が更新されました。 (by Akinori MUSHA)

ライブラリ名は encoding でいいんじゃないですか？

e$B>-Mh$[$+$K$be(BEncodinge$BMm$_$NDI2C%i%$%V%i%j$NI,MW$,=P$F$-$?$i!“e(B
encoding/detect e$B$J$I$K0$7$Fe(B encoding e$B$O$h$/;H$$$=$&$Je(B
encoding/* e$B$r$9$Y$Fe(B require
e$B!J$”$k$$$Oe(Bautoloade$B!K$9$k$h$&$K$9$l$P8_49@-$rJ]$F$^$9!#e(B

e$B$^$?%/%i%9L>$b!"<BAu$Oe(B Encoding::Detector
e$B$J$I$o$+$j$d$9$$L>A0$N2<$G9T$$$D$D!“e(B
APIe$B$O!J>/$J$/$H$b%”%W%j%1!<%7%g%s$O!Ke(BEncodinge$B0J30$N%/%i%9L>$r;H$o$J$/$F:Q$`$h$&$Ke(B
e$B9)IW$9$l$P$$$$$H;W$$$^$9!#e(B

http://redmine.ruby-lang.org/issues/show/973


#9

チケット #973 が更新されました。 (by Yui NARUSE)

e$B%i%$%V%i%jL>$Oe(B encoding e$B$G$$$$$s$8$c$J$$$G$9$+!)e(B

encoding.rb はちょっと時期尚早じゃないかなぁ。

e$B$^$?%/%i%9L>$b!"<BAu$Oe(B Encoding::Detector e$B$J$I$o$+$j$d$9$$L>A0$N2<$G9T$$$D$D!"e(B

e$B$=$N<j$Ne(B Java e$BE*$J!V$o$+$j$d$9$$L>A0!W$,!"I,$:$7$be(B Ruby
e$B$K$*$$$F!V$$$$L>A0!W$G$O$J$$!"e(B
e$B$C$F$N$O$3$NOC$NO@E@$N0l$D$G$9$h$M!#e(B

APIe$B$O!J>/$J$/$H$b%"%W%j%1!<%7%g%s$O!Ke(BEncodinge$B0J30$N%/%i%9L>$r;H$o$J$/$F:Q$`$h$&$Ke(B
e$B9)IW$9$l$P$$$$$H;W$$$^$9!#e(B

e$B$=$N9)IW$,;W$$$D$+$J$/$F$?$J;/$7$K$J$C$F$$$k8=>u!"M-8z$J0F$@$H$O;W$($^$;$s!#e(B

e$B$J$*!"e(Bopen e$BEy$Ke(B detect e$B$r$D$C$3$a$k$[$Ie(B EncDet
e$B$K$D$$$FCN8+$,=8$^$C$F$$$k$H$b9M$($,$?$$$G$9!#e(B

http://redmine.ruby-lang.org/issues/show/973


#10

須藤です。

In removed_email_address@domain.invalid
“[ruby-dev:39781] [Feature #973] EncDet again” on Thu, 26 Nov 2009
02:27:32 +0900,
Yui NARUSE removed_email_address@domain.invalid wrote:

e$B$^$?%/%i%9L>$b!"<BAu$Oe(B Encoding::Detector e$B$J$I$o$+$j$d$9$$L>A0$N2<$G9T$$$D$D!"e(B

e$B$=$N<j$Ne(B Java e$BE*$J!V$o$+$j$d$9$$L>A0!W$,!"I,$:$7$be(B Ruby e$B$K$*$$$F!V$$$$L>A0!W$G$O$J$$!"e(B
e$B$C$F$N$O$3$NOC$NO@E@$N0l$D$G$9$h$M!#e(B

http://doc.okkez.net/static/192/library/builtin.html
e$B$K$"$k%/%i%9L>!&%b%8%e!<%kL>$r$6$C$/$j8+$F$
$k$H!"%/%i%9L>!&e(B
e$B%b%8%e!<%kL>$r>JN,$7$F$$$J$$J}$,B?$$$h$&$K8+$($^$9!#e(B

e$B$o$+$j$d$9$$$H$$$&$N$O>JN,$7$J$$$H$$$&$3$H$H2r<a$7$?$N$G$9$,!"e(B
e$B$o$+$j$d$9$$$H$$$&$N$He(BJavae$BE*$H$$$&$N$O4X78$J$$5$$,$7$^$9!#e(B


#11

チケット #973 が更新されました。 (by Akinori MUSHA)

e$B%i%$%V%i%jL>$Oe(B encoding e$B$G$$$$$s$8$c$J$$$G$9$+!)e(B

encoding.rb e$B$O$A$g$C$H;~4|>0Aa$8$c$J$$$+$J$!!#e(B

e$B$=$NM}M3$K$D$$$F2?$i$+$N:`NA$r$$$?$@$1$J$$$G$7$g$&$+!#e(B
e$B;d$N<gD%$O%i%$%V%i%jL>$H$7$F:#;H$C$F$bLdBj$J$$$@$m$&$H$$$&E@$J$N$G!#e(B

e$B$^$?%/%i%9L>$b!"<BAu$Oe(B Encoding::Detector e$B$J$I$o$+$j$d$9$$L>A0$N2<$G9T$$$D$D!"e(B

e$B$=$N<j$Ne(B Java e$BE*$J!V$o$+$j$d$9$$L>A0!W$,!"I,$:$7$be(B Ruby e$B$K$*$$$F!V$$$$L>A0!W$G$O$J$$!"e(B
e$B$C$F$N$O$3$NOC$NO@E@$N0l$D$G$9$h$M!#e(B

e$B%(%s%I%f!<%6$,;H$o$J$$ItJ,$NL>A0$H$7$F5s$2$?$N$G!"$=$3$O$I$&$G$b$$$$$H;W$$$^$9!#e(B

APIe$B$O!J>/$J$/$H$b%"%W%j%1!<%7%g%s$O!Ke(BEncodinge$B0J30$N%/%i%9L>$r;H$o$J$/$F:Q$`$h$&$Ke(B
e$B9)IW$9$l$P$$$$$H;W$$$^$9!#e(B

e$B$=$N9)IW$,;W$$$D$+$J$/$F$?$J;/$7$K$J$C$F$$$k8=>u!"M-8z$J0F$@$H$O;W$($^$;$s!#e(B

e$B$^$:$O%i%$%V%i%jL>$d<BAu%/%i%9L>$NLdBj$r<h$j=|$$$F!"%(%s%I%f!<%6$,;H$&e(BAPIe$B$Ke(B
e$B%U%)!<%+%9$9$l$P$3$Ne(Bissuee$B$O%7%s%W%k$K$J$k$N$G$O$J$$$G$7$g$&$+!#e(B

e$B$J$*!"e(Bopen e$BEy$Ke(B detect e$B$r$D$C$3$a$k$[$Ie(B EncDet e$B$K$D$$$FCN8+$,=8$^$C$F$$$k$H$b9M$($,$?$$$G$9!#e(B

e$B$4<+J,$,5s$2$i$l$?e(B IO::detect_open
e$B$KB(:B$NH?BP$O4s$;$i$l$F$$$J$$$7!"e(B
e$B==J,$$$$L>A0$@$H;W$$$^$9$h!#2?$re(Bdetecte$B$9$k$N$+$H$$$&Ld$$$O!"e(BEncodinge$B<+BN$re(B
e$BI,$:$7$bJ8;z%3!<%I$@$1$K<}$^$i$J$$35G0$H$9$l$PEz$($K$J$k$G$7$g$&!#e(B
e$B!J$=$3$NJ}?K$r;d$OGD0.$7$F$$$J$$$N$G$9$,!Ke(B

[ruby-dev:33628]e$B$+$iAae(B20e$B%v7n!#e(B1.9e$B$K$O$A$c$s$HHV?M$,5o$F$/$l$k$N$@$7!"e(B
trunke$B$KF~$l$F$$F!";H$C$F$$F$OD>$7$r7+$jJV$7$F$3$=LdBj$b8+$($FMh$k$O$:!#e(B

e$B;d$J$I$O?7$7$/=q$/%3!<%I$K$be(BNKFe$B$r;H$&M-MM$G$9$7!"e(BNokogirie$B$J$I$r8+$F$b!"e(B
e$B$$s$JF1$8$J$s$@$J$"$H;W$$$^$9!#:G=*HG$G$J$/$F$$$$$+$i!"!V$$$:$l$3$s$J46$8$Ge(B
e$B$G$-$k$h$&$K$J$k$h!W$H$$$&$N$r8+$;$F$[$7$$$H$
$s$J;W$C$F$$$^$9$h!#e(B

http://redmine.ruby-lang.org/issues/show/973


#12

成瀬です。

Kouhei S. wrote:

e$B$^$?%/%i%9L>$b!"<BAu$Oe(B Encoding::Detector e$B$J$I$o$+$j$d$9$$L>A0$N2<$G9T$$$D$D!"e(B
e$B$=$N<j$Ne(B Java e$BE*$J!V$o$+$j$d$9$$L>A0!W$,!"I,$:$7$be(B Ruby e$B$K$*$$$F!V$$$$L>A0!W$G$O$J$$!"e(B
e$B$C$F$N$O$3$NOC$NO@E@$N0l$D$G$9$h$M!#e(B

http://doc.okkez.net/static/192/library/builtin.html
e$B$K$"$k%/%i%9L>!&%b%8%e!<%kL>$r$6$C$/$j8+$F$
$k$H!"%/%i%9L>!&e(B
e$B%b%8%e!<%kL>$r>JN,$7$F$$$J$$J}$,B?$$$h$&$K8+$($^$9!#e(B

e$B$=$3$K$"$k?t!9$N%/%i%9!&%b%8%e!<%k$N$&$A!“e(B
e$B<+J,$G$=$NL>A0$r=q$/J*$C$F0lIt$G$O$J$$$G$7$g$&$+!#e(B
e$BNc$($P!“e(BBasicObject, Encoding::Converter, File::Constants
e$B$J$I$,e(B
e$BG0F,$K$”$k$N$@$H;W$$$^$9$,!”$I$l$bDL>o=q$/;v$O$J$$$O$:$G$9!#e(B

e$B$^$?!“N,$5$l$F$$$k$N$Oe(B
ARGF, Bignum, Dir, ENV, Fixnum, GC, Hash, Proc, Regexp
e$B$”$?$j$G$9$,!"$I$l$bD6M-L>%/%i%9$G$9$h$M!#e(B

e$B$o$+$j$d$9$$$H$$$&$N$O>JN,$7$J$$$H$$$&$3$H$H2r<a$7$?$N$G$9$,!"e(B
e$B$o$+$j$d$9$$$H$$$&$N$He(BJavae$BE*$H$$$&$N$O4X78$J$$5$$,$7$^$9!#e(B

Java e$B$G$O!V$o$+$j$d$9$$L>A0!We(B(=e$B>JN,$7$J$$L>A0e(B)e$B$r!“e(B
e$B!V$$$$L>A0!W$G$”$k$H$7$F$$$k$H$$$&%$%a!<%8$,$"$k$N$G!#e(B

Ruby e$B$K$*$$$F>JN,$7$J$$;v$,I,$:$7$b$$$$;v$G$O$J$/!“e(B
e$B$`$7$m$”$^$j;H$&$Y$-$G$J$$$b$N$KBP$7$F;H$o$l$k;v$,B?$$$N$Oe(B
e$BA0Ds$KCV$/$Y$-$8$c$J$$$G$9$+$M!#e(B


#13

須藤です。

e$B$J$s$+!"K<AE*$JOC$8$c$J$$N.$l$J5$$,$7$^$9$,!#!#!#e(B

In removed_email_address@domain.invalid
“[ruby-dev:39784] Re: [Feature #973] EncDet again” on Thu, 26 Nov 2009
03:45:54 +0900,
“NARUSE, Yui” removed_email_address@domain.invalid wrote:

ARGF, Bignum, Dir, ENV, Fixnum, GC, Hash, Proc, Regexp
e$B$"$?$j$G$9$,!"$I$l$bD6M-L>%/%i%9$G$9$h$M!#e(B

Bignum, Fixnum, GC(!), Hash, (Proc,) Regexpe$B$ODL>o=q$/$3$H$Oe(B
e$B$J$$$H;W$$$^$9!#$H$$$&$N$O$h$$$H$7$F!"N,$5$l$F$$$k$b$N$G$b!"e(B
e$B$=$&$$$&Iw$KN,$5$l$k$3$H$,B?$$$b$N$J5$$,$7$^$9!#!Je(BARGFe$B$O0c$$e(B
e$B$^$9$,!#!Ke(B

e$BDL>o=q$/$3$H$,$"$j!“N,$5$l$F$$$J$$D6M-L>%/%i%9$be(BFile,
Thread(?), Time(?), LoadErrore$B$J$I$,$”$C$?$j$7$^$9!#e(B

e$BN,$9$K$7$F$b$=$l$,$J$s$J$N$+$,A[A|$G$-$k$b$N$,$h$$$J$!$H;W$$e(B
e$B$^$9!#;d$K$Oe(BDete$B$OFq$7$$$G$9!#e(B

Ruby e$B$K$*$$$F>JN,$7$J$$;v$,I,$:$7$b$$$$;v$G$O$J$/!“e(B
e$B$`$7$m$”$^$j;H$&$Y$-$G$J$$$b$N$KBP$7$F;H$o$l$k;v$,B?$$$N$Oe(B
e$BA0Ds$KCV$/$Y$-$8$c$J$$$G$9$+$M!#e(B

e$B$=$&$$$&$o$1$8$c$J$$$H;W$C$F$$$^$7$?!#e(B
e$B$^$:$O!">JN,$;$:$K$7$F$*$$$F!"$h$/;H$&$+$iC;$/$7$?$$$J$!!"$8$ce(B
e$B$"!“C;$/$9$k$+!”$H$$$&N.$l$+$H;W$C$F$$$^$7$?!#e(B

e$B%(%s%3!<%G%#%s%08!=P=hM}$O!"$I$N$/$i$$$h$/;H$o$l$k=hM}!#!#!#e(B
e$B$J$s$G$9$+$M$’!#$I$&$J$N$+$7$i!#e(B


#14

成瀬です。

Akinori MUSHA wrote:

e$B%i%$%V%i%jL>$Oe(B encoding e$B$G$$$$$s$8$c$J$$$G$9$+!)e(B
encoding.rb e$B$O$A$g$C$H;~4|>0Aa$8$c$J$$$+$J$!!#e(B

e$B$=$NM}M3$K$D$$$F2?$i$+$N:`NA$r$$$?$@$1$J$$$G$7$g$&$+!#e(B
e$B;d$N<gD%$O%i%$%V%i%jL>$H$7$F:#;H$C$F$bLdBj$J$$$@$m$&$H$$$&E@$J$N$G!#e(B

e$B$^$:A0Ds$H$7$F!"e(BRuby e$B$N%(%s%3!<%G%#%s%0$NL?L>$O!“e(B
IANA Charset e$B$N<B:]>e$N1?MQ$H$O0[$J$C$F$$$^$9!#e(B
e$B$D$^$j!”@$4V$G$Oe(B Shift_JIS e$B$H$$$&L>A0$re(B Windows-31J
e$B$H$7$F;H$C$F$$$k$N$KBP$7!“e(B
Ruby e$B$O$=$N6hJL$r873J$K$9$k;v$r5a$a!”%k!<%:$K$7$F$$$k$He(B Windows
e$B4D6-$G$Oe(B
e$BNc30$,>e$,$k$h$&$K@_7W$5$l$F$$$^$9!#e(B

Encoding e$B$d!“e(Bmagic comment e$B$rFI$`e(B EncDet
e$B$O$3$NOHFb$GF0$$$F$$$^$9!#e(B
e$B$N$G!”$3$3$^$G$Oe(B Encoding e$B$KE}9g2DG=$G$O$"$k$N$G$9$,!“e(B
Encoding e$B$r$”$^$jHnBg2=$5$;$k$H!"$=$l0J30$N<B:]>e$Ne(B IANA Charset
e$BE*$J!"e(B
Ruby
e$B$H$7$F$O4V0c$C$?@$3&$N$b$N$b07$&I,MW$,=P$F$/$k$h$&$K;W$$$^$9!#e(B
e$B$=$N>l9g$NH=CG$O8e=R$NM}M3$+$i!"8=;~E@$G$OHr$1$?$$$H9M$($F$$$^$9!#e(B

e$B$D$^$j!"e(BEncoding e$B$NK<A$K$D$$$F:#$OH=CG$rHr$1$?$$$N$G$9!#e(B

APIe$B$O!J>/$J$/$H$b%"%W%j%1!<%7%g%s$O!Ke(BEncodinge$B0J30$N%/%i%9L>$r;H$o$J$/$F:Q$`$h$&$Ke(B
e$B9)IW$9$l$P$$$$$H;W$$$^$9!#e(B
e$B$=$N9)IW$,;W$$$D$+$J$/$F$?$J;/$7$K$J$C$F$$$k8=>u!"M-8z$J0F$@$H$O;W$($^$;$s!#e(B

e$B$^$:$O%i%$%V%i%jL>$d<BAu%/%i%9L>$NLdBj$r<h$j=|$$$F!"%(%s%I%f!<%6$,;H$&e(BAPIe$B$Ke(B
e$B%U%)!<%+%9$9$l$P$3$Ne(Bissuee$B$O%7%s%W%k$K$J$k$N$G$O$J$$$G$7$g$&$+!#e(B

これは賛成です。

e$B$J$*!"e(Bopen e$BEy$Ke(B detect e$B$r$D$C$3$a$k$[$Ie(B EncDet e$B$K$D$$$FCN8+$,=8$^$C$F$$$k$H$b9M$($,$?$$$G$9!#e(B

e$B$4<+J,$,5s$2$i$l$?e(B IO::detect_open e$B$KB(:B$NH?BP$O4s$;$i$l$F$$$J$$$7!"e(B
e$B==J,$$$$L>A0$@$H;W$$$^$9$h!#2?$re(Bdetecte$B$9$k$N$+$H$$$&Ld$$$O!"e(BEncodinge$B<+BN$re(B
e$BI,$:$7$bJ8;z%3!<%I$@$1$K<}$^$i$J$$35G0$H$9$l$PEz$($K$J$k$G$7$g$&!#e(B
e$B!J$=$3$NJ}?K$r;d$OGD0.$7$F$$$J$$$N$G$9$,!Ke(B

Encoding
e$B$OJ8;z%3!<%I$N$_$r07$&$Y$-$@$H!"$o$?$7$O8=;~E@$G;W$C$F$$$^$9!#e(B

e$B;d$J$I$O?7$7$/=q$/%3!<%I$K$be(BNKFe$B$r;H$&M-MM$G$9$7!"e(BNokogirie$B$J$I$r8+$F$b!"e(B
e$B$$s$JF1$8$J$s$@$J$"$H;W$$$^$9!#:G=*HG$G$J$/$F$$$$$+$i!"!V$$$:$l$3$s$J46$8$Ge(B
e$B$G$-$k$h$&$K$J$k$h!W$H$$$&$N$r8+$;$F$[$7$$$H$
$s$J;W$C$F$$$^$9$h!#e(B

Nokogiri e$B$OA0=R$Ne(B IANA Charset e$B%Y!<%9$NOC$J$N$G!"e(BEncDet
e$B$h$jOC$OHa;4$G$9!#e(B
e$B$D$^$j!"e(Bcharset=Shift_JIS e$B$^$o$j$GCOMk$rF’$`$3$H$G$7$g$&!#e(B

e$B$3$C$A$,$^$H$b$K$J$k$N$O$b$&$7$P$i$/$+$+$k$H;W$o$l$^$9!#e(B
e$B$3$NJU$K$D$$$F!“e(BIANA
e$BB&$GF0$/5$G[$,$”$k$N$G!"$=$A$i$NF0$-$,8+$($k$^$G$Oe(B
Ruby e$BB&$GBP=h$9$k$D$b$j$O$"$j$^$;$s!#e(B


#15

In removed_email_address@domain.invalid
“[ruby-dev:39804] Re: [Feature #973] EncDet again” on Sat, 28 Nov 2009
00:34:59 +0900,
“NARUSE, Yui” removed_email_address@domain.invalid wrote:

[ruby-dev:37679] e$B$K$F!"I8=`%i%$%V%i%jFb$@$1$G;0NcJs9p$5$l$F$$$^$9!#e(B

e$B$5$F!"8=:_;d$,CN$k8B$je(BRDoce$B$He(BERBe$B$He(BIRBe$B$,$=$l$>$lFH<+$K%^%8%C%/%3%a%s%H$r2re(B
e$B<a$7$F%U%!%$%k$r3+$/5!G=$r<BAu$7$F$$$^$9!#$3$N=EJ#6q9g$O2?$i$+$N6&DL2=$Ne(B
e$BI,MW@-$r<($7$F$$$k$N$G$O$J$$$+$H;W$$$^$9!#e(B

e$B;d$K$H$C$Fe(B3e$BNc$OB?$/$J$$$N$G!"e(BEnce$B$HN,$9$[$I$G$O$J$$$H46$8$^$9!#e(B


#16

Kouhei S. wrote:

e$BN,$9$K$7$F$b$=$l$,$J$s$J$N$+$,A[A|$G$-$k$b$N$,$h$$$J$!$H;W$$e(B
e$B$^$9!#;d$K$Oe(BDete$B$OFq$7$$$G$9!#e(B

初期の名称案に EncDetect はありますね。
これでもダメですか。

e$B%(%s%3!<%G%#%s%08!=P=hM}$O!"$I$N$/$i$$$h$/;H$o$l$k=hM}!#!#!#e(B
e$B$J$s$G$9$+$M$’!#$I$&$J$N$+$7$i!#e(B

[ruby-dev:37679]
e$B$K$F!"I8=`%i%$%V%i%jFb$@$1$G;0NcJs9p$5$l$F$$$^$9!#e(B


#17

2009/11/28 Kouhei S. removed_email_address@domain.invalid:

e$B;d$K$H$C$Fe(B3e$BNc$OB?$/$J$$$N$G!"e(BEnce$B$HN,$9$[$I$G$O$J$$$H46$8$^$9!#e(B

Encodinge$B%/%i%9$,$"$k$N$K!“e(BEncXXXe$B$rDj5A$9$k$N$O$H$F$b7y$G$9!#e(B
e$B@.@%$5$s$N7|G0$bJ,$+$j$^$9$N$G!!e(Bencoding.rbe$B$OHr$1$k$H$7$F$b!”$d$O$je(BEncoding::XXXe$B$G$"$C$FM_$7$$$G$9!#e(B

e$B$H$3$m$G!"@hF|e(BIRCe$B$G5s$,$C$?OCBj$G$9$,!"e(BEncDete$B$O2?$rDs6!$9$k$N$G$7$g$&$+!#e(B
e$BEv=iDs0F$5$l$?e(BEncDete$B$N5!G=$O%^%8%C%/%3%a%s%H$He(BBOMe$B$N2r<a$G$7$?!#$3$N$&$A!“e(BBOMe$B$O4{$Ke(BFile.opene$B$K<BAu$5$l$F$$$^$9!#e(B
e$B$G$9$+$i!”:#A[Dj$5$l$k$N$O%U%!%$%k$N:G=i$N0l9T$+$i%^%8%C%/%3%a%s%H$C$]$$%P%$%HNs$rC5$7$F$=$l$K4p$E$$$F3+$/5!G=$N$$G$9!#e(B
e$B$3$N5!G=$,J#?t2U=j$G<BAu$5$l$F$$$k$N$G$3$l$O!“2?$i$+$NNI$$%G%U%)%k%H<BAu$rDs6!$7$J$1$l$P$J$i$J$$$H$$$&$N$,;d$N<gD%$G$7$?!#e(B
e$B$3$N5!G=$@$1$G$”$k$H$9$k$H!"<B$OEvLLe(BFile.opene$B$N5!G=3HD%$N$
$G:Q$s$G$7$^$&$N$G$O$J$$$G$7$g$&$+!#$?$H$($P!"e(B

File.open(path, "r:magic-comment")
ないし
File.open(path, "r:auto")
のようにして。


#18

チケット #973 が更新されました。 (by Yusuke E.)

遠藤です。

EncDet の件、どうしますか？
Yugui さんが以下のような提案もしています。

e$B<B$OEvLLe(BFile.opene$B$N5!G=3HD%$N$_$G:Q$s$G$7$^$&$N$G$O$J$$$G$7$g$&$+!#$?$H$($P!"e(B

File.open(path, "r:magic-comment")
ないし
File.open(path, "r:auto")
のようにして。

さて、

  1. EncDet 方式で決定する
    (名前はまつもとさんの好みで決める)
  2. File.open 方式で決定する
  3. 1.9.2 を見送ってじっくり議論する

のどれにしましょうか。

2 e$B2s$bL>A0$N5DO@$GH/;6$7$?$h$&$J$N$G!“5DO@$r:F3+$9$k$h$j!”$^$D$b$He(B
e$B$5$s$N9%$_$G7h$a$F$7$^$&$N$,$h$$$N$G$O$J$$$+$H;W$$$^$9!#e(B

3 e$B7nKv$^$G$K7hDj$G$-$J$$$H<+F0E*$Ke(B #3 e$B$K$J$j$^$9!#e(B

#2 e$B$NA*Br;h$O<B8=2DG=@-$,8!>Z$5$l$F$$$J$$5$$,$9$k$N$G!"!V%Q%C%A$re(B
e$B=q$$$F$_$?$i<B$OFq$7$$$3$H$,$o$+$C$?e(B e$B"*e(B 1.9.2
e$B8+Aw$j!W$H$$$&4m81$,e(B
あるかもしれません。


Yusuke ENDOH removed_email_address@domain.invalid

http://redmine.ruby-lang.org/issues/show/973


#19

なかだです。

At Wed, 17 Mar 2010 22:46:43 +0900,
Yusuke E. wrote in [ruby-dev:40687]:

  1. File.open e$BJ}<0$G7hDj$9$ke(B

#2 e$B$NA*Br;h$O<B8=2DG=@-$,8!>Z$5$l$F$$$J$$5$$,$9$k$N$G!"!V%Q%C%A$re(B
e$B=q$$$F$_$?$i<B$OFq$7$$$3$H$,$o$+$C$?e(B e$B"*e(B 1.9.2 e$B8+Aw$j!W$H$$$&4m81$,e(B
あるかもしれません。

別に難しくはありません。

$ ./ruby -Eus-ascii -e 'ARGV.each{|file|open(file,
"r:magic-comment"){|f|p [f.path, f.external_encoding]}}' version.h
lib/rexml/rexml.rb lib/rubygems/package.rb
["version.h", #<Encoding:US-ASCII>]
["lib/rexml/rexml.rb", #<Encoding:UTF-8>]
["lib/rubygems/package.rb", #<Encoding:ISO-8859-1>]

diff --git c/include/ruby/io.h i/include/ruby/io.h
index e05a0f5..f067831 100644
--- c/include/ruby/io.h
+++ i/include/ruby/io.h
@@ -96,4 +96,5 @@ typedef struct rb_io_t {
/* #define FMODE_PREP 0x00010000 */
#define FMODE_SETENC_BY_BOM 0x00100000
+#define FMODE_SETENC_BY_MAGIC_COMMENT 0x00200000

#define GetOpenFile(obj,fp) rb_io_check_closed((fp) =
RFILE(rb_io_taint_check(obj))->fptr)
diff --git c/io.c i/io.c
index 60afd6c..60761f0 100644
--- c/io.c
+++ i/io.c
@@ -4125,4 +4125,6 @@ rb_io_ext_int_to_encs(rb_encoding *ext,
rb_encoding *intern, rb_encoding **enc,
}

+#define is_magic_comment(str) (STRCASECMP(str, “magic-comment”) == 0)
+
static void
parse_mode_enc(const char *estr, rb_encoding **enc_p, rb_encoding
**enc2_p)
@@ -4166,5 +4168,5 @@ parse_mode_enc(const char *estr, rb_encoding
**enc_p, rb_encoding **enc2_p)
ext_enc = rb_enc_from_index(idx);
else {

  • if (idx != -2)
  • if (idx != -2 && !is_magic_comment(estr))
    rb_warn(“Unsupported encoding %s ignored”, estr);
    ext_enc = NULL;
    @@ -4337,6 +4339,11 @@ rb_io_extract_modeenc(VALUE *vmode_p, VALUE
    *vperm_p, VALUE opthash,
    has_enc = 1;
    parse_mode_enc(p+1, &enc, &enc2);
  •  if (io_encname_bom_p(p+1, 0))
    
  •  if (io_encname_bom_p(p+1, 0)) {
    
    fmode |= FMODE_SETENC_BY_BOM;
  • p += 4;
  •  }
    
  •  if (is_magic_comment(p+1)) {
    
  • fmode |= FMODE_SETENC_BY_BOM | FMODE_SETENC_BY_MAGIC_COMMENT;
  •  }
       }
    
    else {
    @@ -4605,10 +4612,44 @@ io_strip_bom(VALUE io)
    }

-static void
-io_set_encoding_by_bom(VALUE io)
+int rb_magic_comment_encoding(const char *str, long len);
+
+static int
+io_parse_encoding_comment(VALUE io)
{

  • int idx = io_strip_bom(io);
  • VALUE line = rb_io_gets(io);
  • char *s;
  • long n;
  • if (NIL_P(line)) return 0;
  • s = RSTRING_PTR(line);
  • n = RSTRING_LEN(line);
  • if (n >= 2 && s[0] == ‘#’ && s[1] == ‘!’) {
  • VALUE shbang = line;
  • line = rb_io_gets(io);
  • if (NIL_P(line)) {
  •  rb_io_ungetbyte(io, shbang);
    
  •  return 0;
    
  • }
  • rb_io_ungetbyte(io, line);
  • s = RSTRING_PTR(line);
  • n = RSTRING_LEN(line);
  • line = shbang;
  • }
  • rb_io_ungetbyte(io, line);
  • while (n > 0 && (*s == ’ ’ || *s == ‘\t’)) {
  • s++;
  • n–;
  • }
  • if (n <= 0 || *s != ‘#’) return 0;
  • return rb_magic_comment_encoding(s, n);
    +}
  • if (idx) {
    +static void
    +io_guess_encoding(VALUE io, int fmode)
    +{
  • int idx;
  • if (((fmode & FMODE_SETENC_BY_BOM) &&
  • (idx = io_strip_bom(io)) != 0) ||
  • ((fmode & FMODE_SETENC_BY_MAGIC_COMMENT) &&
  • (idx = io_parse_encoding_comment(io)) != 0)) {
    rb_io_t *fptr;
    GetOpenFile(io, fptr);
    @@ -4638,5 +4679,5 @@ rb_file_open_generic(VALUE io, VALUE filename, int
    oflags, int fmode, convconfig
    fptr->fd = rb_sysopen(fptr->pathv, oflags, perm);
    io_check_tty(fptr);
  • if (fmode & FMODE_SETENC_BY_BOM) io_set_encoding_by_bom(io);
  • io_guess_encoding(io, fmode);

    return io;
    @@ -6396,5 +6437,5 @@ rb_io_initialize(int argc, VALUE *argv, VALUE io)
    fp->stdio_file = stderr;

  • if (fmode & FMODE_SETENC_BY_BOM) io_set_encoding_by_bom(io);
  • io_guess_encoding(io, fmode);
    return io;
    }
    diff --git c/parse.y i/parse.y
    index 340a825..a42c8f6 100644
    --- c/parse.y
    +++ i/parse.y
    @@ -6248,13 +6248,15 @@ magic_comment_marker(const char *str, long len)
    }

+typedef int rb_magic_comment_func(const char *name, long nlen, const
char *value, long vlen, void *arg);
+
static int
-parser_magic_comment(struct parser_params *parser, const char *str,
long len)
+parse_magic_comment(const char *str, long len, rb_magic_comment_func
*func, void *arg)
{

  • VALUE name = 0, val = 0;
  • VALUE name = 0;
    const char *beg, *end, *vbeg, *vend;
    #define str_copy(_s, _p, _n) ((_s)
    ? (rb_str_resize((_s), (_n)),
    MEMCPY(RSTRING_PTR(_s), (_p), char, (_n)), (_s)) \
  • : ((_s) = STR_NEW((_p), (_n))))
  • : ((_s) = rb_str_new((_p), (_n))))

    if (len <= 7) return FALSE;
    @@ -6266,7 +6268,4 @@ parser_magic_comment(struct parser_params parser,
    const char str, long len)
    /

    %r"([^\s’":;]+)\s
    :\s*("(?:\\.|[^"])"|[^"\s;]+)[\s;]" */
    while (len > 0) {
    -#ifndef RIPPER

  • const struct magic_comment *p = magic_comments;
    -#endif
    char *s;
    int i;
    @@ -6321,24 +6320,68 @@ parser_magic_comment(struct parser_params
    *parser, const char *str, long len)
    if (s[i] == ‘-’) s[i] = ‘_’;
    }
  • if ((*func)(s, n, vbeg, vend - vbeg, arg)) break;
  • }
  • return TRUE;
    +}

+static int
+magic_comment_i(const char *name, long nlen, const char *value, long
vlen, void *arg)
+{

  • struct parser_params *parser = arg;
    #ifndef RIPPER
  • do {
  •  if (STRNCASECMP(p->name, s, n) == 0) {
    
  • n = vend - vbeg;
  • if (p->length) {
  •    n = (*p->length)(parser, vbeg, n);
    
  • }
  • str_copy(val, vbeg, n);
  • (*p->func)(parser, s, RSTRING_PTR(val));
  • break;
  • const struct magic_comment *p = magic_comments;
  • do {
  • if (STRNCASECMP(p->name, name, nlen) == 0) {
  •  char *val;
    
  •  if (p->length) {
    
  • vlen = (*p->length)(parser, value, vlen);
    }
  • } while (++p < magic_comments + numberof(magic_comments));
  •  val = ALLOCA_N(char, vlen + 1);
    
  •  memcpy(val, value, vlen);
    
  •  val[vlen] = '\0';
    
  •  (*p->func)(parser, name, val);
    
  •  break;
    
  • }
  • } while (++p < magic_comments + numberof(magic_comments));
    #else
  • dispatch2(magic_comment, name, val);
  • dispatch2(magic_comment, name, val);
    #endif
  • }
  • return FALSE;
    +}

+static int
+parser_magic_comment(struct parser_params *parser, const char *str,
long len)
+{

  • return parse_magic_comment(str, len, magic_comment_i, (void
    *)parser);
    +}

+static int
+find_magic_comment_encoding(const char *name, long nlen, const char
*value, long vlen, void *arg)
+{

  • char *val;
  • switch (nlen) {
  •  case 8:
    
  • if (STRNCASECMP(“en”, name, 2) != 0) return FALSE;
  • name += 2;
  •  case 6:
    
  • if (STRNCASECMP(“coding”, name, 6) != 0) return FALSE;
  • }
  • vlen = parser_encode_length(0, value, vlen);
  • memcpy(val = ALLOCA_N(char, vlen + 1), value, vlen);
  • val[vlen] = ‘\0’;
  • *(int *)arg = rb_enc_find_index(val);
    return TRUE;
    }

+int
+rb_magic_comment_encoding(const char *str, long len)
+{

  • int idx = 0;
  • if (!parse_magic_comment(str, len, find_magic_comment_encoding,
    &idx)) return 0;
  • return idx;
    +}

static void
set_file_encoding(struct parser_params *parser, const char *str, const
char *send)


#20

2010/3/20 Yusuke ENDOH removed_email_address@domain.invalid:

e$B:FEY$$$$$^$9$,!"e(B3 e$B7nKv$^$G$K9g0U$G$-$J$$$H<+F0E*$KN.$l$^$9$N$G!"e(B
e$BF3F~$rA@$C$F$$$k?M$?$A$O$,$s$P$C$F$/$@$5$$!#e(B

e$B5DO@$N2aDx$G$b$&$A$g$C$H<BNc$,=8$^$k$3$H$r4|BT$7$?$s$G$9$,!"7k6Ie(B3e$BNc$N$^$^$G$9$h$M!#e(B
e$B$J$i$PN.$7$F$b$&$A$g$C$H?5=E$K8!F$$7$F$bNI$$$N$G$O$J$$$G$7$g$&$+!#$^$:$Oe(Bredminee$B$Ke(Bfeaturee$B$H$7$FEPO?$5$l$?$3$H$G>-MhE*$K8!F$$5$l$k2DG=@-$,;D$C$?$H$$$&$3$H$G;d$OK~B-$G$9!#e(B