Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Overlong UTF-8 Codepoint Generation

notebooks/overlong_utf8.livemd

Overlong UTF-8 Codepoint Generation

defmodule Bits do
  @moduledoc """
  Helpers for rendering a binary as a string of individual `0`/`1` bits.
  """

  @doc """
  Prints all 8 bits in a byte.

  Every bit of `binary` is rendered as a `0` or `1` character. Bits are grouped
  eight at a time and the groups are separated by a single space.

  ## Examples

      iex> Bits.as_string(<<0b01100101>>)
      "01100101"

      iex> Bits.as_string(<<0xC1, 0xA5>>)
      "11000001 10100101"
  """
  @spec as_string(bitstring()) :: String.t()
  def as_string(binary), do: grouped_bits(binary, " ")

  @doc """
  Convenience function for creating tables.

  Same rendering as `as_string/1`, but the 8-bit groups are separated by
  `" | "` so the result can be dropped into a table row.

  ## Examples

      iex> Bits.as_headers(<<0xC1, 0xA5>>)
      "11000001 | 10100101"
  """
  @spec as_headers(bitstring()) :: String.t()
  def as_headers(binary), do: grouped_bits(binary, " | ")

  # Walks `binary` one bit at a time, groups the bits in chunks of 8 and joins
  # the chunks with `separator`. A trailing chunk shorter than 8 bits is kept.
  defp grouped_bits(binary, separator) do
    bits = for <<bit::1 <- binary>>, do: Integer.to_string(bit)

    bits
    |> Enum.chunk_every(8)
    |> Enum.map_join(separator, &Enum.join/1)
  end
end

Preface

Because of the way UTF-8 is structured, it's possible to have overlong sequences. An overlong sequence is a sequence that takes more bytes than necessary to represent the codepoint it contains.

Bits per Sequence Length

1 byte -> 7 bits

2 bytes -> 5 + 6 bits

3 bytes -> 4 + 6 + 6 bits

4 bytes -> 3 + 6 + 6 + 6 bits

5-byte and 6-byte sequences are mathematically possible but not allowed by the Unicode standard.

5 bytes -> 2 + 6 + 6 + 6 + 6 bits

6 bytes -> 1 + 6 + 6 + 6 + 6 + 6 bits

defmodule Overlong do
  @moduledoc """
  Builds UTF-8-style byte sequences that are longer than necessary for the
  codepoint they carry (overlong sequences).
  """

  @doc """
  Encodes `codepoint` using `extra_bytes` more bytes than its regular UTF-8
  encoding needs.

  The total length is `byte_size(<<codepoint::utf8>>) + extra_bytes`. The
  codepoint is zero-padded on the left to the number of payload bits a sequence
  of that length carries, then split across the lead byte and the `10xxxxxx`
  continuation bytes.

  Only totals from 2 to 6 bytes are handled; any other total raises a
  `RuntimeError`. `<<codepoint::utf8>>` itself raises `ArgumentError` when
  `codepoint` is not a valid Unicode scalar value.

  ## Examples

      iex> Overlong.make_overlong(?/, 1)
      <<0xC0, 0xAF>>

      iex> Overlong.make_overlong(?/, 2)
      <<0xE0, 0x80, 0xAF>>
  """
  @spec make_overlong(non_neg_integer(), integer()) :: binary()
  def make_overlong(codepoint, extra_bytes \\ 1) when is_integer(codepoint) do
    case byte_size(<<codepoint::utf8>>) + extra_bytes do
      2 ->
        # 110xxxxx 10xxxxxx -> 5 + 6 = 11 payload bits
        <<i::5, ii::6>> = <<codepoint::11>>
        <<0b110::3, i::5, 0b10::2, ii::6>>

      3 ->
        # 1110xxxx 10xxxxxx 10xxxxxx -> 4 + 6 + 6 = 16 payload bits
        <<i::4, ii::6, iii::6>> = <<codepoint::16>>
        <<0b1110::4, i::4, 0b10::2, ii::6, 0b10::2, iii::6>>

      4 ->
        # 11110xxx + 3 continuation bytes -> 3 + 6 + 6 + 6 = 21 payload bits
        <<i::3, ii::6, iii::6, iv::6>> = <<codepoint::21>>
        <<0b11110::5, i::3, 0b10::2, ii::6, 0b10::2, iii::6, 0b10::2, iv::6>>

      5 ->
        # 111110xx + 4 continuation bytes -> 2 + 6 * 4 = 26 payload bits
        <<i::2, ii::6, iii::6, iv::6, v::6>> = <<codepoint::26>>
        <<0b111110::6, i::2, 0b10::2, ii::6, 0b10::2, iii::6, 0b10::2, iv::6, 0b10::2, v::6>>

      6 ->
        # 1111110x + 5 continuation bytes -> 1 + 6 * 5 = 31 payload bits
        <<i::1, ii::6, iii::6, iv::6, v::6, vi::6>> = <<codepoint::31>>

        <<0b1111110::7, i::1, 0b10::2, ii::6, 0b10::2, iii::6, 0b10::2, iv::6, 0b10::2, v::6,
          0b10::2, vi::6>>

      over ->
        raise "utf-8 can't fit sequences above 6 bytes (#{over} total)"
    end
  end
end
# Codepoint to demonstrate with; `?e` is a single byte in regular UTF-8.
input = ?e

# Using Bits for pretty-printing
IO.puts("original: <<0b#{Bits.as_string(<<input::utf8>>)}>>")

# Encode the same codepoint with one byte more than necessary and show the
# resulting bytes in base 2.
input
|> Overlong.make_overlong(1)
|> IO.inspect(binaries: :as_binary, base: :binary, label: "overlong")

# Return nil so the notebook cell does not also render the binary itself.
nil