Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Object detection by YOLOX

YoloX.livemd

Object detection by YOLOX

0.Original work

Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian
“YOLOX: Exceeding YOLO Series in 2021”

> A technical article on YOLOX in Japanese

@koshian2 “実装から見るYOLOX:2021年のYOLOシリーズを超えて”

Thanks a lot!!!


Implementation for Elixir/Nerves using TflInterp

1.Helper module

Create the module to assist with tasks such as downloading a model.

defmodule Helper do
  @moduledoc """
  Assists with fetching and managing the YOLOX tflite model and the COCO
  label file, stored locally under `@local`.
  """

  @model_file "yolox_s.tflite"
  @label_file "coco.label"

  # GitHub release assets that host the model/label downloads.
  @wearhouse "https://github.com/shoz-f/tinyML_livebook/releases/download/model/"
  @local "/data/"

  @doc "Local path of the tflite model file."
  def model() do
    @local <> @model_file
  end

  @doc "Local path of the COCO label file."
  def label() do
    @local <> @label_file
  end

  @doc "Downloads both the model and the label file into `@local`."
  def get() do
    Req.get!(@wearhouse <> @model_file).body
    |> then(fn x -> File.write(model(), x) end)

    Req.get!(@wearhouse <> @label_file).body
    |> then(fn x -> File.write(label(), x) end)
  end

  @doc "Removes the downloaded model or label file."
  def rm(:model), do: File.rm(model())
  def rm(:label), do: File.rm(label())

  @doc "Removes both downloaded files."
  def rm() do
    rm(:model)
    rm(:label)
  end

  @doc "Checks presence of the downloaded model or label file."
  def exists?(:model), do: File.exists?(model())
  def exists?(:label), do: File.exists?(label())

  @doc "True only when both the model and the label file are present."
  def exists?() do
    # NOTE: was HTML-escaped as `&amp;&amp;` in the scraped source; restored to `&&`.
    exists?(:model) && exists?(:label)
  end
end

Get the tflite model and the coco label from @wearhouse and store them in @local.

# Download the model and label files into local storage.
Helper.get()

2.Defining the inference module: YoloX

  • Model
    Standard Model: YOLOX-s 640 converted from Pytorch model.

  • Pre-processing:
    Resize the input image to the size of @yolox_shape and create a Float32 binary sequence normalized to the range {0.0, 255.0}, NCHW, BGR.

  • Post-processing:
    Split the output tensor f32[8400][85] into class scores and bounding boxes and sieve the inference results by the score value threshold and NMS.

defmodule YoloX do
  @moduledoc """
  YOLOX-s object detection on top of TflInterp.

  Pre-processing: resize to `@yolox_shape`, f32 normalized to {0.0, 255.0},
  NCHW layout, BGR channel order.
  Post-processing: split the f32[8400][85] output tensor into boxes and
  class scores, then filter with score threshold + NMS (done in TflInterp).
  """

  # use TflInterp, model: Helper.model(), label: Helper.label()
  use TflInterp

  @yolox_shape {640, 640}

  @doc """
  Runs object detection on a JPEG binary.

  Returns the result of `TflInterp.non_max_suppression_multi_class/4`
  (per-class detections keyed by class name).
  """
  def apply(jpeg) do
    img = CImg.from_binary(jpeg)

    # preprocess: letterbox resize (pad value 114), then f32 NCHW/BGR binary
    bin =
      img
      |> CImg.resize(@yolox_shape, :ul, 114)
      |> CImg.to_binary([{:range, {0.0, 255.0}}, :nchw, :bgr])

    # prediction
    outputs =
      __MODULE__
      |> TflInterp.set_input_tensor(0, bin)
      |> TflInterp.invoke()
      |> TflInterp.get_output_tensor(0)
      |> Nx.from_binary({:f, 32}) |> Nx.reshape({:auto, 85})

    # postprocess
    boxes  = extract_boxes(outputs, scale(img))
    scores = extract_scores(outputs)

    TflInterp.non_max_suppression_multi_class(__MODULE__,
      Nx.shape(scores), Nx.to_binary(boxes), Nx.to_binary(scores)
    )
  end

  # Decodes raw box predictions: add grid offsets to the xy columns (0..1),
  # exponentiate the wh columns (2..3), then scale by the per-cell stride and
  # the image scale to map back to source-image coordinates.
  defp extract_boxes(tensor, scale) do
    {grid, strides} = grid_strides(@yolox_shape, [8, 16, 32])

    [
      Nx.add(Nx.slice_axis(tensor, 0, 2, 1), grid),
      Nx.exp(Nx.slice_axis(tensor, 2, 2, 1))
    ]
    |> Nx.concatenate(axis: 1) |> Nx.multiply(strides) |> Nx.multiply(scale)
  end

  # Builds the concatenated grid-coordinate and stride tensors for every
  # detection head (strides 8, 16, 32).
  # NOTE: the captures below were HTML-escaped (`&amp;grid/1`) in the scraped
  # source; restored to valid `&grid/1` / `&strides/1` syntax.
  defp grid_strides({wsize, hsize}, block) do
    reso = Enum.map(block, fn x -> {div(hsize, x), div(wsize, x), x} end)

    {
      Enum.map(reso, &grid/1)    |> Nx.concatenate(axis: 0),
      Enum.map(reso, &strides/1) |> Nx.concatenate(axis: 0)
    }
  end

  # {x, y} cell coordinates of one head, flattened to shape {hsize*wsize, 2}.
  defp grid({hsize, wsize, _}) do
    xv = Nx.iota({wsize}) |> Nx.tile([hsize, 1])
    yv = Nx.iota({hsize}) |> Nx.tile([wsize, 1]) |> Nx.transpose()
    Nx.stack([xv, yv], axis: 2) |> Nx.reshape({:auto, 2})
  end

  # The head's stride repeated for every cell: shape {hsize*wsize, 1}.
  defp strides({hsize, wsize, stride}) do
    Nx.tensor(stride) |> Nx.tile([hsize*wsize, 1])
  end

  # Class confidence = objectness (column 4) * per-class scores (columns 5..84).
  defp extract_scores(tensor) do
    Nx.multiply(Nx.slice_axis(tensor, 4, 1, 1), Nx.slice_axis(tensor, 5, 80, 1))
  end

  # Ratio mapping the letterboxed @yolox_shape back to the source image size.
  defp scale(img) do
    {w, h, _, _}   = CImg.shape(img)
    {wsize, hsize} = @yolox_shape
    max(w/wsize, h/hsize)
  end
end

Launch YoloX.

# Start the YoloX interpreter process with the downloaded model and labels.
YoloX.start_link(model: Helper.model(), label: Helper.label())

Displays the properties of the YoloX model.

# Show the model's properties (input/output tensor info).
TflInterp.info(YoloX)

3.Let’s try it

In one shot.

alias CImg.Builder

# Draws one class's detections onto the canvas: a red rectangle per box with
# the class name rendered just above its top-left corner.
# NOTE: the captures in this cell were HTML-escaped (`&amp;round(&amp;1)`) in
# the scraped source; restored to valid `&` capture syntax.
draw_object = fn builder, {name, boxes} ->
  Enum.reduce(boxes, builder, fn [_score | box], canvas ->
    [x0, y0, x1, y1] = Enum.map(box, &round(&1))
    CImg.draw_rect(canvas, x0, y0, x1, y1, {255, 0, 0})
    |> CImg.draw_text(x0, y0 - 16, name, 16, :red)
  end)
end

# Grab one frame from the camera.
jpeg = Picam.next_frame()

# Run detection; on success draw the result boxes, otherwise fall back to the
# raw frame. Either way, downscale and display as a Kino JPEG image.
with {:ok, res} <- YoloX.apply(jpeg) do
  # draw result box
  Enum.reduce(Map.to_list(res), Builder.from_binary(jpeg), &draw_object.(&2, &1))
  |> Builder.runit()
else
  _ -> CImg.from_binary(jpeg)
end
|> CImg.resize({640, 480})
|> CImg.to_binary(:jpeg)
|> Kino.Image.new(:jpeg)

4.TIL ;-)

Date: Feb. 6, 2022 / Nerves-livebook rpi3

TflInterp.non_max_suppression_multi_class hangs up. Oh well, I forgot that ARM is strict about word alignment. Solved this problem by adjusting the i/f structure of non_max_suppression_multi_class() to 32-bit word alignment. This will be fixed in the next version 0.1.4.

Total processing time is about 13.7 seconds, excluding camera shooting. Of that time, the YoloX inference - TflInterp.invoke(YoloX) - takes about 5.8 seconds, and the post-processing YoloX.extract_scores/1 takes about 5.2 seconds. YoloX.extract_scores/1 seems to be taking a long time to calculate Nx.tensor f32[8400][80].

The model I tried this time was too heavy for the Raspberry Pi, so I’ll try a smaller model, tiny or nano, next.