A tour of string encoding errors
12 February 2018
Tweet Follow @hazula
Tweet Follow @hazula
See https://github.com/bf4/encoded_string and specs.
Pick compatible encoding
def pick_encoding(str1, str2)
Encoding.compatible?(source_a, source_b) || Encoding.default_external
end
# picks the default external encoding for incompatible encodings
str1 = "\xa1".force_encoding("iso-8859-1")
str2 = "\xa1\xa1".force_encoding("euc-jp")
Encoding.compatible?(str1, str2) #=> nil
pick_encoding(str1, str2) #=> Encoding.default_external
# https://github.com/rubyspec/rubyspec/blob/91ce9f6549/core/encoding/compatible_spec.rb#L31
# picks a compatible encoding
str1 = "abc".force_encoding Encoding::US_ASCII
str2 = "\u3042".encode("utf-8")
pick_encoding(str1, str2) #=> Encoding::UTF_8
Raised by Encoding and String methods
In MRI 2.1 'invalid: :replace'
changed to also replace an invalid byte sequence
see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176,
https://www.ruby-forum.com/topic/6861247,
https://twitter.com/nalsh/status/553413844685438976
For example, given:
"\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
On MRI 2.1 or above: 63 `'?'`
else : 128 `"\x80"`
Ruby's default replacement string is:
U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
? ("\x3F")
REPLACE = "?"
ENCODE_UNCONVERTABLE_BYTES = {
:invalid => :replace,
:undef => :replace,
:replace => REPLACE
}
ENCODE_NO_CONVERTER = {
:invalid => :replace,
:replace => REPLACE
}
Encoding::UndefinedConversionError
when a transcoding operation fails
if the String contains characters invalid for the target encoding
e.g. "\x80".encode('UTF-8','ASCII-8BIT')
vs "\x80".encode('UTF-8','ASCII-8BIT', undef: :replace, replace: '<undef>')
# => '<undef>'
# see https://github.com/ruby/ruby/blob/34fbf57aaa/transcode.c#L4289
# ISO-8859-1 -> UTF-8 -> EUC-JP
# "\xa0" NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP
source_encoding = Encoding.find('ISO-8859-1')
incompatible_encoding = Encoding.find('EUC-JP')
string = "\xa0 hi I am not going to work".force_encoding(source_encoding)
string.encode(incompatible_encoding)
wrapped_string_template = "abaaaaaaaaaa%saaaaa"
ascii_arrow_symbol = "\xAE"
utf8_encoding = "UTF-8"
wrapped_string = sprintf(wrapped_string_template, ascii_arrow_symbol).force_encoding("ASCII-8BIT")
wrapped_string.encode(utf8_encoding)
Encoding::CompatibilityError
when Encoding.compatibile?(str1, str2) is nil
e.g. utf_16le_emoji_string.split("\n")
e.g. valid_unicode_string.encode(utf8_encoding) << ascii_string
ascii_arrow_symbol = "\xAE"
utf_8_euro_symbol = "\xE2\x82\xAC"
utf8_encoding = "UTF-8"
ascii_string = ascii_arrow_symbol.force_encoding("ASCII-8BIT")
valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8')
valid_unicode_string.encode(utf8_encoding) << ascii_string
# see https://github.com/rspec/rspec-expectations/blob/f8a1232/spec/rspec/expectations/fail_with_spec.rb#L50
# https://github.com/rspec/rspec-expectations/issues/201
# https://github.com/rspec/rspec-expectations/pull/220
binary_poop = '💩' # [128169] "\u{1F4A9}"
non_ascii_compatible_string) = "This is a pile of poo: #{binary_poop}, yuck".encode("UTF-16LE")
non_ascii_compatible_string.split("\n")
Encoding::InvalidByteSequenceError
when the string being transcoded contains a byte invalid for
either the source or target encoding
e.g. "\x80".encode('UTF-8','US-ASCII')
vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
# => '<byte>'
# invalid bytes in the target encoding
# see https://github.com/jruby/jruby/blob/c1be61a501/test/mri/ruby/test_transcode.rb#L13
source_encoding = Encoding.find('US-ASCII')
target_encoding = Encoding.find('UTF-8')
string = "I have a bad byté\x80".force_encoding(source_encoding)
string.encode(target_encoding)
# NOTE: skip on JRuby https://github.com/jruby/jruby/issues/2580
ArgumentError
when operating on a string with invalid bytes
e.g."\x80".split("\n")
utf8_encoding = "UTF-8"
message_with_invalid_byte_sequence = "\xEF \255 \xAD I have bad bytes".force_encoding(utf8_encoding)
message_with_invalid_byte_sequence.valid_encoding? #=> false
message_with_invalid_byte_sequence.split("\n")
TypeError
when a symbol is passed as an encoding
Encoding.find(:"UTF-8")
when calling force_encoding on an object
that doesn't respond to #to_str
Encoding.find(:"UTF-8")
Raised by transcoding methods
Encoding::ConverterNotFoundError
when a named encoding does not correspond with a known converter
e.g. 'abc'.force_encoding('UTF-8').encode('foo')
or a converter path cannot be found
e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
# see https://github.com/rubyspec/rubyspec/blob/91ce9f6549/core/string/shared/encode.rb#L12
source_encoding = Encoding.find('ASCII-8BIT')
no_converter_encoding = Encoding::Emacs_Mule
string = "\x80".force_encoding(source_encoding)
string.encode(no_converter_encoding)
# Skip if RUBY_VERSION < '2.1'
# See comment above ENCODE_UNCONVERTABLE_BYTES in encoded_string.rb
# for why the behavior differs by (MRI) Ruby version.
Raised by byte <-> char conversions
RangeError: out of char range
e.g. the UTF-16LE emoji: 128169.chr
128169.chr
Handling encoding errors
Convert to matching encoding
ENCODE_UNCONVERTABLE_BYTES = {
:invalid => :replace,
:undef => :replace,
:replace => REPLACE
}
ENCODE_NO_CONVERTER = {
:invalid => :replace,
:replace => REPLACE
}
def convert_to_matching_encoding(string, target_encoding)
string = remove_invalid_bytes(string)
string.encode(target_encoding)
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
string.encode(target_encoding, ENCODE_UNCONVERTABLE_BYTES)
rescue Encoding::ConverterNotFoundError
string.dup.force_encoding(target_encoding).encode(ENCODE_NO_CONVERTER)
end
Remove invalid bytes
Prevents raising an ArgumentError
REPLACE = "?"
if String.method_defined?(:scrub)
# https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
# https://github.com/ruby/ruby/blob/v2_1_0/string.c#L8242
# https://github.com/hsbt/string-scrub
# https://github.com/rubinius/rubinius/blob/v2.5.2/kernel/common/string.rb#L1913-L1972
def remove_invalid_bytes(string)
string.scrub(REPLACE)
end
else
# http://stackoverflow.com/a/8711118/879854
# Loop over chars in a string replacing chars
# with invalid encoding, which is a pretty good proxy
# for the invalid byte sequence that causes an ArgumentError
def remove_invalid_bytes(string)
string.chars.map do |char|
char.valid_encoding? ? char : REPLACE
end.join
end
end
Pick compatible encoding
Encoding.compatible?(source_a, source_b) || Encoding.default_external
Append string
string = convert_to_matching_encoding(string, target_encoding)
string << convert_to_matching_encoding(another_string, target_encoding)
Split string
string = convert_to_matching_encoding(string, target_encoding)
string_or_regex = convert_to_matching_encoding(string_or_regex)
string.split(string_or_regex)
Prepare to compare two strings
encoding = Encoding.compatible?(actual, expected) || Encoding.default_external
actual = convert_to_matching_encoding(actual, encoding)
expected = convert_to_matching_encoding(expected, encoding)
diff_output = convert_to_matching_encoding("\n", encoding)
and then you can
require 'diff/lcs'
require 'diff/lcs/hunk'
@file_length_difference = 0
expected_lines = expected.split("\n").map! { |e| e.chomp }
actual_lines = actual.split("\n").map! { |e| e.chomp }
context_lines = 3
diffs = Diff::LCS.diff(expected_lines, actual_lines)
hunks = diffs.map {|piece|
Diff::LCS::Hunk.new(
expected_lines, actual_lines, piece, context_lines, @file_length_difference
).tap do |h|
@file_length_difference = h.file_length_difference
end
}
format_type = :unified
hunks.each_cons(2) do |prev_hunk, current_hunk|
begin
if current_hunk.overlaps?(prev_hunk)
current_hunk.merge(prev_hunk)
else
diff_output << prev_hunk.diff(format_type).to_s
end
ensure
diff_output << "\n"
end
end
if (last_hunk = hunks.last)
diff_output << last_hunk.diff(format_type).to_s
diff_output << "\n"
end
puts diff_output
colorized_diff = diff_output.lines.map do |line|
case line[0].chr
when "+"
green = 32
"\e[#{green}m#{line}\e[0m"
when "-"
red = 31
"\e[#{red}m#{line}\e[0m"
when "@"
if line[1].chr == "@"
blue = 34
"\e[#{blue}m#{line}\e[0m"
else
normal = 0
"\e[#{normal}m#{line}\e[0m"
end
else
normal = 0
"\e[#{normal}m#{line}\e[0m"
end
end.join
puts colorized_diff
blog comments powered by Disqus