require 'ruconv' # encoding converter class for XML documents. # class XMLConv class < ["UCS-4BE", :u4be_to_u8], "\x00\x00\x3C\x00" => ["UCS-4-2143", :u42143_to_u8], "\x00\x00\xFE\xFF" => ["UCS-4BE", :u4be_to_u8], "\x00\x00\xFF\xFE" => ["UCS-4-2143", :u42143_to_u8], "\x00\x3C\x00\x00" => ["UCS-4-3412", :u43412_to_u8], "\x00\x3C\x00\x3F" => ["UTF-16BE", :u16be_to_u8], "\x3C\x00\x00\x00" => ["UCS-4LE", :u4le_to_u8], "\x3C\x00\x3F\x00" => ["UTF-16LE", :u16le_to_u8], "\xEF\xBB\xBF\x3c" => ["UTF-8", :u8_delete_bom], "\xFE\xFF\x00\x00" => ["UCS-4-3412", :u43412_to_u8], "\xFE\xFF\x00\x3C" => ["UTF-16BE", :u16be_to_u8], "\xFF\xFE\x00\x00" => ["UCS-4LE", :u4le_to_u8], "\xFF\xFE\x3C\x00" => ["UTF-16LE", :u16le_to_u8], "\x3C\x3F\x78\x6D" => ["ASCII-COMPATIBLE", nil] } def enc_utf8(enc, str) case enc when /^iso[-_]?8859[-_]?1$/i XMLConv.iso88591_to_u8(str) when /^utf[-_]8$/i str else str end end # convert string as XML documents into UTF-8 # def convert_to_utf8(str) str_head = str[0,4] encoding, func = @@encode_detect[str_head] if func converted, enc = XMLConv.__send__(func, str), encoding if converted =~ /\A\xEF\xBB\xBF/n converted[0,3] = '' end return converted, enc # from XML 1.0 Rec. # [1] document ::= prolog element Misc* # [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? # [27] Misc ::= Comment | PI | S # elsif str =~ /\A(\s\000)*<\x00/ return XMLConv.u16le_to_u8(str), "UTF-16LE" elsif str =~ /\A(\000\s)*\x00/m str_head2 = $1 str_head2 =~ /<\?.*encoding=["']([^\"\']+)["']/m enc2 = $1 || "UTF-8" return enc_utf8(enc2, str), enc2 elsif str =~ /\A(\s)*