#!/usr/bin/env ruby
# frozen_string_literal: true

# Pure Ruby version of Uconv: converts UTF-16, UCS-4 and ISO-8859-1 byte
# strings into UTF-8 using String#unpack and Array#pack.
#
# Every public method is a module function: call it as
# RUconv.u16le_to_u8(data) or, after `include RUconv`, without a receiver.
# Inputs are treated as raw bytes regardless of their Ruby encoding tag.
module RUconv
  # Byte order marks, kept as binary strings so they compare byte-for-byte.
  UTF16LE_BOM = "\xFF\xFE".b.freeze
  UTF16BE_BOM = "\xFE\xFF".b.freeze
  UTF8_BOM = "\xEF\xBB\xBF".b.freeze

  # Heuristics for BOM-less input: three ASCII characters in a row whose
  # other byte is zero. This is only a guess and can misdetect.
  UTF16LE_ASCII_GUESS = /\A(?:[\x01-\x7f]\x00){3}/n.freeze
  UTF16BE_ASCII_GUESS = /\A(?:\x00[\x01-\x7f]){3}/n.freeze

  # Code unit ranges reserved for surrogate pairs.
  HIGH_SURROGATES = (0xD800..0xDBFF).freeze
  LOW_SURROGATES = (0xDC00..0xDFFF).freeze

  private_constant :UTF16LE_BOM, :UTF16BE_BOM, :UTF8_BOM,
                   :UTF16LE_ASCII_GUESS, :UTF16BE_ASCII_GUESS,
                   :HIGH_SURROGATES, :LOW_SURROGATES

  module_function

  # Convert a UTF-16 string of unknown byte order into UTF-8.
  #
  # A leading BOM decides the byte order and is stripped from the output.
  # Without a BOM the byte order is guessed from the first three code units.
  #
  # @param str [String] UTF-16 bytes
  # @return [String] UTF-8 string
  # @raise [ArgumentError] when neither a BOM nor the heuristic matches
  # @raise [RangeError] when the data is not well-formed UTF-16
  def u16_to_u8(str)
    raw = str.b
    if raw.start_with?(UTF16LE_BOM)
      u16le_to_u8(raw.byteslice(2..-1))
    elsif raw.start_with?(UTF16BE_BOM)
      u16be_to_u8(raw.byteslice(2..-1))
    elsif raw.match?(UTF16LE_ASCII_GUESS)
      u16le_to_u8(raw)
    elsif raw.match?(UTF16BE_ASCII_GUESS)
      u16be_to_u8(raw)
    else
      raise ArgumentError, 'cannot detect encoding'
    end
  end

  # Convert a UTF-16BE string into UTF-8. A BOM, if present, is not removed.
  #
  # @param str [String] big-endian UTF-16 bytes
  # @return [String] UTF-8 string
  # @raise [RangeError] on odd length or unpaired surrogates
  def u16be_to_u8(str)
    decode_utf16(str, 'n*')
  end

  # Convert a UTF-16LE string into UTF-8. A BOM, if present, is not removed.
  #
  # @param str [String] little-endian UTF-16 bytes
  # @return [String] UTF-8 string
  # @raise [RangeError] on odd length or unpaired surrogates
  def u16le_to_u8(str)
    decode_utf16(str, 'v*')
  end

  # Byte-swap every 16-bit unit of a UTF-16 string.
  # A trailing odd byte is dropped by unpack.
  #
  # @param str [String] UTF-16 bytes in either byte order
  # @return [String] binary string in the opposite byte order
  def u16swap(str)
    str.unpack('n*').pack('v*')
  end

  # Convert a UCS-4BE string into UTF-8.
  #
  # @param str [String] big-endian 32-bit code points
  # @return [String] UTF-8 string
  def u4be_to_u8(str)
    str.unpack('N*').pack('U*')
  end

  # Convert a UCS-4LE string into UTF-8.
  #
  # @param str [String] little-endian 32-bit code points
  # @return [String] UTF-8 string
  def u4le_to_u8(str)
    str.unpack('V*').pack('U*')
  end

  # Byte-swap every 32-bit unit of a UCS-4 string.
  #
  # @param str [String] UCS-4 bytes in either byte order
  # @return [String] binary string in the opposite byte order
  def u4swap(str)
    str.unpack('N*').pack('V*')
  end

  # Convert a UCS-4 string in the unusual 2143 byte order into UTF-8.
  # Swapping each 16-bit half turns 2143 into 1234 (big-endian).
  #
  # @param str [String] UCS-4 bytes in 2143 order
  # @return [String] UTF-8 string
  def u42143_to_u8(str)
    u4be_to_u8(u16swap(str))
  end

  # Convert a UCS-4 string in the unusual 3412 byte order into UTF-8.
  # Swapping each 16-bit half turns 3412 into 4321 (little-endian).
  #
  # @param str [String] UCS-4 bytes in 3412 order
  # @return [String] UTF-8 string
  def u43412_to_u8(str)
    u4le_to_u8(u16swap(str))
  end

  # Remove a leading UTF-8 BOM (EF BB BF) if there is one.
  #
  # @param str [String] UTF-8 bytes
  # @return [String] a new string without the BOM; the same content when
  #   no BOM is present
  def u8_delete_bom(str)
    offset = str.b.start_with?(UTF8_BOM) ? UTF8_BOM.bytesize : 0
    str.byteslice(offset..-1)
  end

  # Convert an ISO-8859-1 string into UTF-8. Each byte value is used
  # directly as the code point.
  #
  # @param str [String] ISO-8859-1 bytes
  # @return [String] UTF-8 string
  def iso88591_to_u8(str)
    str.unpack('C*').pack('U*')
  end

  # Decode UTF-16 bytes into UTF-8, combining surrogate pairs.
  #
  # @param str [String] UTF-16 bytes
  # @param template [String] unpack template: 'n*' (BE) or 'v*' (LE)
  # @return [String] UTF-8 string
  # @raise [RangeError] on odd length or unpaired surrogates
  def decode_utf16(str, template)
    # unpack would silently drop a dangling byte, so reject it up front.
    raise RangeError, 'illegal UTF-16 sequence' if str.bytesize.odd?

    codepoints = []
    lead = nil # high surrogate waiting for its low partner
    str.unpack(template).each do |unit|
      if lead
        raise RangeError, 'illegal UTF-16 sequence' unless LOW_SURROGATES.cover?(unit)

        # Supplementary planes start at U+10000; each surrogate carries 10 bits.
        codepoints << (0x10000 + ((lead - 0xD800) << 10) + (unit - 0xDC00))
        lead = nil
      elsif HIGH_SURROGATES.cover?(unit)
        lead = unit
      elsif LOW_SURROGATES.cover?(unit)
        raise RangeError, 'illegal UTF-16 sequence'
      else
        codepoints << unit
      end
    end
    raise RangeError, 'illegal UTF-16 sequence' if lead

    codepoints.pack('U*')
  end
  private_class_method :decode_utf16
end

if __FILE__ == $0
  # Timing harness: read all input (files named in ARGV, or stdin) as raw
  # bytes and run the UTF-16LE conversion ten times. Nothing is printed;
  # use `print RUconv.u16_to_u8(input)` instead to see the converted text.
  ARGF.binmode
  input = ARGF.read.to_s
  10.times { RUconv.u16le_to_u8(input) }
end