Overlong UTF-8 Codepoint Generation
defmodule Bits do
  @moduledoc """
  Helpers for rendering a binary as a string of individual `0`/`1` bits.
  """

  @doc """
  Returns the bits of `binary` as a string, grouped into bytes separated by a space.

  Nothing is printed; the caller decides what to do with the returned string.

  ## Examples

      iex> Bits.as_string(<<?e, ?e>>)
      "01100101 01100101"
  """
  @spec as_string(bitstring()) :: String.t()
  def as_string(binary), do: render(binary, " ")

  @doc """
  Convenience function for creating tables: same as `as_string/1`, but the
  byte groups are separated by `" | "` so they can be used as table cells.

  ## Examples

      iex> Bits.as_headers(<<?e, ?e>>)
      "01100101 | 01100101"
  """
  @spec as_headers(bitstring()) :: String.t()
  def as_headers(binary), do: render(binary, " | ")

  # Walks the input one bit at a time, groups the bits into chunks of 8 and
  # joins the chunks with `separator`. A trailing chunk shorter than 8 bits is
  # kept as-is when the input is not byte-aligned.
  defp render(binary, separator) do
    for(<<bit::1 <- binary>>, do: "#{bit}")
    |> Enum.chunk_every(8)
    |> Enum.map_join(separator, &Enum.join/1)
  end
end
Preface
The way UTF-8 is structured, it’s possible to have overlong sequences. An overlong sequence is a sequence that takes more bytes than necessary to represent the codepoint it contains.
Bits per Sequence Length
1 byte -> 7 bits
2 bytes -> 5 + 6 bits
3 bytes -> 4 + 6 + 6 bits
4 bytes -> 3 + 6 + 6 + 6 bits
5-byte and 6-byte sequences are mathematically possible but not allowed by the Unicode standard.
5 bytes -> 2 + 6 + 6 + 6 + 6 bits
6 bytes -> 1 + 6 + 6 + 6 + 6 + 6 bits
defmodule Overlong do
  @moduledoc """
  Builds overlong UTF-8 style byte sequences: encodings that use more bytes
  than the shortest form needs for the codepoint they carry.
  """

  @doc """
  Encodes `codepoint` using `extra_bytes` more bytes than its shortest UTF-8 form.

  The target length is `byte_size(<<codepoint::utf8>>) + extra_bytes`. The
  codepoint is zero-padded to the payload width of that length (11, 16, 21, 26
  or 31 bits) and spread over the lead byte and the `10xxxxxx` continuation
  bytes.

  Returns the resulting binary. With `extra_bytes` of `0` and a codepoint that
  already needs 2 or more bytes, the result is the regular UTF-8 encoding.

  Raises a `RuntimeError` when the target length is not between 2 and 6 bytes.
  Building `<<codepoint::utf8>>` itself raises for integers that are not valid
  codepoints.

  ## Examples

      iex> Overlong.make_overlong(?e)
      <<0b11000001, 0b10100101>>
  """
  @spec make_overlong(integer(), integer()) :: binary()
  def make_overlong(codepoint, extra_bytes \\ 1) when is_integer(codepoint) do
    case byte_size(<<codepoint::utf8>>) + extra_bytes do
      2 ->
        <<i::5, ii::6>> = <<codepoint::11>>
        <<0b110::3, i::5, 0b10::2, ii::6>>

      3 ->
        <<i::4, ii::6, iii::6>> = <<codepoint::16>>
        <<0b1110::4, i::4, 0b10::2, ii::6, 0b10::2, iii::6>>

      4 ->
        <<i::3, ii::6, iii::6, iv::6>> = <<codepoint::21>>
        <<0b11110::5, i::3, 0b10::2, ii::6, 0b10::2, iii::6, 0b10::2, iv::6>>

      5 ->
        <<i::2, ii::6, iii::6, iv::6, v::6>> = <<codepoint::26>>
        <<0b111110::6, i::2, 0b10::2, ii::6, 0b10::2, iii::6, 0b10::2, iv::6, 0b10::2, v::6>>

      6 ->
        <<i::1, ii::6, iii::6, iv::6, v::6, vi::6>> = <<codepoint::31>>

        <<0b1111110::7, i::1, 0b10::2, ii::6, 0b10::2, iii::6, 0b10::2, iv::6, 0b10::2, v::6,
          0b10::2, vi::6>>

      # Catch-all: also reached for totals below 2 (e.g. an ASCII codepoint
      # with extra_bytes of 0), not only for totals above 6.
      over ->
        raise "utf-8 can't fit sequences above 6 bytes (#{over} total)"
    end
  end
end
# Demo: show the regular encoding of a codepoint next to a 1-byte-overlong one.
input = ?e

# Using Bits for pretty-printing
IO.puts("original: <<0b#{Bits.as_string(<<input::utf8>>)}>>")

input
|> Overlong.make_overlong(1)
|> IO.inspect(binaries: :as_binary, base: :binary, label: "overlong")

# NOTE(review): the trailing nil presumably keeps the notebook cell from
# echoing the inspected binary a second time as its result — confirm.
nil