Powered by AppSignal & Oban Pro

HailoAI Remote Inference

livebooks/remote_device_inference.livemd

HailoAI Remote Inference

Setup

This notebook runs inference on a Hailo device via Livebook’s Attached Node runtime. The node must be running on the device before connecting.

Start the node on the device:

./scripts/start_node.exs

Then in Livebook go to Runtime → Attached Node and enter the node name and cookie printed by the script.

> See livebooks/download_models.livemd to download a .hef model file before running inference.

Load Model

# Resolve the app's priv directory and load the compiled HEF model from it.
priv = to_string(:code.priv_dir(:nx_hailo))
{:ok, hailo_model} = NxHailo.load("#{priv}/yolov8m.hef")

# Assertive match: this model is expected to expose exactly one input and
# one output vstream — a multi-stream HEF fails loudly here instead of
# silently picking the wrong stream later.
[%{name: name}] = hailo_model.pipeline.input_vstream_infos
[%{name: output_key}] = hailo_model.pipeline.output_vstream_infos

# `priv`, `hailo_model`, `name` and `output_key` are all used by the
# Inference cell below.
:ok

Image Preprocessing

Letterbox-pads an image to a square with neutral grey borders, then resizes it to the model’s square input size (640×640 for YOLOv8).

# Letterbox helper: pads the shorter side with neutral grey (114,114,114)
# until the frame is square, then resizes the square down/up to the model
# input size. The `{target, target}` pattern asserts a square target.
evision_resize_and_pad =
  fn frame, {height, width}, {target, target} ->
    side = max(height, width)

    # Split the padding evenly; any odd leftover pixel goes to the
    # bottom/right edge.
    top = div(side - height, 2)
    left = div(side - width, 2)
    bottom = top + rem(side - height, 2)
    right = left + rem(side - width, 2)

    frame
    |> Evision.copyMakeBorder(
      top,
      bottom,
      left,
      right,
      Evision.Constant.cv_BORDER_CONSTANT(),
      value: {114, 114, 114}
    )
    |> Evision.resize({target, target})
  end

Drawing Utilities

Draws bounding boxes and confidence labels on a frame.

defmodule YOLODraw do
  @moduledoc """
  Rendering helpers for detection results: per-class colored bounding
  boxes, confidence captions, and an FPS badge drawn onto an Evision mat.
  """

  @font_face Evision.Constant.cv_FONT_HERSHEY_SIMPLEX()
  @font_size 0.5
  @stroke_width 2
  @text_padding 5

  @doc """
  Draws the FPS badge plus a box and label for every detected object.

  Boxes are drawn for all objects first, then all labels, so captions are
  never painted over by a neighbouring box.
  """
  def draw_detected_objects(mat, detected_objects, fps_label) do
    {full_height, full_width, _channels} = Evision.Mat.shape(mat)

    badged = draw_fps(mat, fps_label, full_width, full_height)
    boxed = Enum.reduce(detected_objects, badged, fn obj, acc -> draw_box(acc, obj) end)
    Enum.reduce(detected_objects, boxed, fn obj, acc -> draw_label(acc, obj) end)
  end

  # Solid blue badge in the bottom-right corner showing `label`
  # (uses a larger 0.7 font scale than the per-object captions).
  defp draw_fps(mat, label, width, height) do
    {{text_w, text_h}, _baseline} = Evision.getTextSize(label, @font_face, 0.7, 1)
    badge_w = text_w + 2 * @text_padding
    badge_h = text_h + 2 * @text_padding

    mat
    |> Evision.rectangle({width - badge_w, height - badge_h}, {width, height}, {255, 0, 0},
      thickness: -1
    )
    |> Evision.putText(
      label,
      {width - badge_w + @text_padding, height - @text_padding},
      @font_face,
      0.7,
      {255, 255, 255},
      thickness: 1,
      lineType: Evision.Constant.cv_LINE_AA()
    )
  end

  # Bounding box in the class's color.
  defp draw_box(mat, obj) do
    Evision.rectangle(
      mat,
      {obj.xmin, obj.ymin},
      {obj.xmax, obj.ymax},
      class_color(obj.class_id),
      thickness: @stroke_width
    )
  end

  # Black caption strip above the box: "<class> <score>%", clamped so it
  # stays inside the frame when the box touches the top edge.
  defp draw_label(mat, obj) do
    caption = "#{obj.class_name} #{round(obj.score * 100)}%"
    {{text_w, text_h}, baseline} = Evision.getTextSize(caption, @font_face, @font_size, 1)

    strip_tl = {obj.xmin, max(obj.ymin - text_h - 2 * @text_padding - baseline, 0)}
    strip_br = {obj.xmin + text_w + 2 * @text_padding, max(obj.ymin - baseline, 0)}

    mat
    |> Evision.rectangle(strip_tl, strip_br, {0, 0, 0}, thickness: -1)
    |> Evision.putText(
      caption,
      {obj.xmin + @text_padding, max(obj.ymin - @text_padding - baseline, text_h + @text_padding)},
      @font_face,
      @font_size,
      {255, 255, 255},
      thickness: 1,
      lineType: Evision.Constant.cv_LINE_AA()
    )
  end

  # Fixed palette of hex colors, one per class index, converted to
  # OpenCV-style {B, G, R} tuples once, at compile time.
  @class_colors [
                  "#FF0000", "#00FF00", "#0000FF", "#FFFF00", "#FF00FF", "#00FFFF",
                  "#800000", "#008000", "#000080", "#FF00FF", "#800080", "#008080",
                  "#C0C0C0", "#FFA500", "#A52A2A", "#8A2BE2", "#5F9EA0", "#7FFF00",
                  "#D2691E", "#FF7F50", "#6495ED", "#DC143C", "#00FFFF", "#00008B",
                  "#008B8B", "#B8860B", "#A9A9A9", "#006400", "#BDB76B", "#8B008B",
                  "#556B2F", "#FF8C00", "#9932CC", "#8B0000", "#E9967A", "#8FBC8F",
                  "#483D8B", "#2F4F4F", "#00CED1", "#9400D3", "#FF1493", "#00BFFF",
                  "#696969", "#1E90FF", "#B22222", "#FFFAF0", "#228B22", "#FF00FF",
                  "#DCDCDC", "#F8F8FF", "#FFD700", "#DAA520", "#808080", "#ADFF2F",
                  "#F0FFF0", "#FF69B4", "#CD5C5C", "#4B0082", "#FFFFF0", "#F0E68C",
                  "#E6E6FA", "#FFF0F5", "#7CFC00", "#FFFACD", "#ADD8E6", "#F08080",
                  "#E0FFFF", "#FAFAD2", "#D3D3D3", "#90EE90", "#FFB6C1", "#FFA07A",
                  "#20B2AA", "#87CEFA", "#778899", "#B0C4DE", "#FFFFE0", "#00FF7F",
                  "#4682B4", "#D2B48C", "#008080", "#D8BFD8", "#FF6347", "#40E0D0",
                  "#EE82EE", "#F5DEB3", "#FFFFFF", "#F5F5F5"
                ]
                |> Enum.with_index()
                |> Map.new(fn {hex, index} ->
                  rgb =
                    hex
                    |> String.replace_prefix("#", "")
                    |> String.to_integer(16)

                  bgr =
                    {Bitwise.band(rgb, 0xFF), Bitwise.band(Bitwise.bsr(rgb, 8), 0xFF),
                     Bitwise.band(Bitwise.bsr(rgb, 16), 0xFF)}

                  {index, bgr}
                end)

  @doc "BGR tuple for a class index; falls back to blue for unknown classes."
  def class_color(class_idx), do: Map.get(@class_colors, class_idx, {255, 0, 0})
end

Camera Setup

Choose one of the sections below depending on your camera type, then proceed to the Inference section.

Option A — Standard V4L2 Camera

Use this for USB webcams or any camera accessible via /dev/video*.

# Probes every /dev/video* node and returns `{capture, device}` for the
# first one that both opens and can grab a frame. Handles that fail either
# check are released so the device node isn't left held open.
find_working_cap = fn ->
  Enum.find_value(Path.wildcard("/dev/video*"), fn device ->
    cap = Evision.VideoCapture.videoCapture(device)

    cond do
      not Evision.VideoCapture.isOpened(cap) ->
        Evision.VideoCapture.release(cap)
        false

      not Evision.VideoCapture.grab(cap) ->
        Evision.VideoCapture.release(cap)
        false

      true ->
        {cap, device}
    end
  end)
end

# `Enum.find_value/2` returns nil when no device passes — fail with a clear
# message instead of an opaque MatchError on `nil`.
{capture, device} =
  find_working_cap.() ||
    raise "No working V4L2 camera found under /dev/video* — is a camera connected?"

IO.puts("Using camera: #{device}")

# Keep the driver buffer at a single frame so each read returns the
# freshest frame. The property persists on the handle, so set it once
# here rather than on every capture.
Evision.VideoCapture.set(capture, Evision.Constant.cv_CAP_PROP_BUFFERSIZE(), 1)

# Grab+read to fetch a fresh frame; `true =` asserts the grab succeeded.
capture_frame = fn ->
  true = Evision.VideoCapture.grab(capture)
  Evision.VideoCapture.read(capture)
end

# Sample one frame to learn the camera's native resolution; `input_shape`
# is consumed by the Inference cell below.
sample_frame = capture_frame.()
{frame_h, frame_w, _} = Evision.Mat.shape(sample_frame)
input_shape = {frame_h, frame_w}

Option B — Raspberry Pi Camera (PiCam3 / libcamera)

The Pi Camera uses libcamera and is not directly accessible via OpenCV’s V4L2 backend. Instead, run rpicam-still in timelapse mode on the device (in a separate terminal) to continuously write frames to a temp file, then read that file here.

Start the camera on the device (separate terminal):

rpicam-still --nopreview -t 0 --timelapse 50 --width 640 --height 480 \
  -o /tmp/frame.jpg 2>/dev/null &

Then run this cell:

# Reads the latest frame written by `rpicam-still --timelapse` (see the
# shell command above).
# NOTE(review): a read can race the camera's write and hit a partially
# written JPEG — confirm how Evision.imread behaves on a truncated file
# and whether a retry is needed here.
capture_frame = fn ->
  Evision.imread("/tmp/frame.jpg", flags: Evision.Constant.cv_IMREAD_COLOR())
end

# Sample one frame to learn the camera resolution; `capture_frame` and
# `input_shape` are consumed by the Inference cell below.
sample_frame = capture_frame.()
{frame_h, frame_w, _} = Evision.Mat.shape(sample_frame)
input_shape = {frame_h, frame_w}

IO.puts("Frame size: #{frame_w}×#{frame_h}")

Inference

Loads class labels, then runs YOLOv8 inference on each frame and renders the annotated result.

# Square model input size (YOLOv8m expects 640x640). The reshape below is
# derived from this tuple so the two can't drift apart.
padded_shape = {640, 640}
{pad_h, pad_w} = padded_shape

# class_id => class_name lookup, loaded from the JSON list that ships
# alongside the model.
classes =
  File.read!("#{priv}/yolov8m_classes.json")
  |> Jason.decode!()
  |> Enum.with_index()
  |> Map.new(fn {v, k} -> {k, v} end)

# Kino.animate tick in milliseconds: div(1000, 50) = 20 ms, i.e. a target
# of ~50 frames/s. (The original bound this to a variable named `fps`,
# which it is not — it is the render interval.)
interval_ms = div(1000, 50)

Kino.animate(interval_ms, fn _frame_index ->
  input_image = capture_frame.()
  padded_image = evision_resize_and_pad.(input_image, input_shape, padded_shape)

  input_tensor =
    padded_image
    |> Evision.Mat.to_nx()
    |> Nx.reshape({pad_h, pad_w, 3})
    |> Nx.backend_transfer()

  {:ok, raw_objects} =
    NxHailo.infer(
      hailo_model,
      %{name => input_tensor},
      NxHailo.Parsers.YoloV8,
      classes: classes,
      key: output_key
    )

  # Fixed: the capture syntax here was HTML-escaped (`&amp;`) and would
  # not compile. Drop low-confidence detections, then map boxes from the
  # letterboxed space back to the original frame size.
  detected_objects =
    raw_objects
    |> Enum.reject(&(&1.score < 0.5))
    |> NxHailo.Parsers.YoloV8.postprocess(input_shape)

  YOLODraw.draw_detected_objects(input_image, detected_objects, "FPS: #{1000 / interval_ms}")
end)