
Human pose estimation on an image sequence

pose_estimation_image_sequence.livemd


Mix.install([
  {:tflite_elixir, "~> 0.3.0"},
  {:evision, "0.1.31"},
  # stb_image is used below to decode the GIF into frames
  {:stb_image, "~> 0.6.0"},
  {:req, "~> 0.3.0"},
  {:kino, "~> 0.9.0"}
])

Introduction

Pose estimation is the task of using an ML model to estimate the pose of a person from an image or a video by estimating the spatial locations of key body joints (keypoints).

This notebook demonstrates how to apply intelligent cropping, based on detections from the previous frame, when the input is a sequence of frames. This lets the model devote its attention and resources to the main subject, resulting in much better prediction quality without sacrificing speed.

https://www.tensorflow.org/lite/examples/pose_estimation/overview

https://www.tensorflow.org/hub/tutorials/movenet
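
At a high level, the per-frame loop carries the previous frame's crop region forward: crop, run the model, then re-center the square on the new detection. A minimal sketch of that feedback loop (crop/2, run_model/1, and next_crop_region/1 are placeholder names, not functions defined in this notebook; the working version is in PoseEstimation.Main below):

Enum.reduce(frames, initial_crop_region, fn frame, crop_region ->
  frame
  |> crop(crop_region)      # focus on where the subject was last seen
  |> run_model()            # estimate keypoints on the cropped patch
  |> next_crop_region()     # re-center the square on the detection
end)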

When we make extensive use of functions from certain modules, it is convenient to alias those modules to something shorter.

alias Evision, as: Cv
alias TFLiteElixir, as: TFLite
alias TFLiteElixir.TFLiteTensor

Download data files

# /data is the writable portion of a Nerves system
downloads_dir =
  if Code.ensure_loaded?(Nerves.Runtime), do: "/data/livebook", else: System.tmp_dir!()

download = fn url ->
  save_as = Path.join(downloads_dir, URI.encode_www_form(url))
  unless File.exists?(save_as), do: Req.get!(url, output: save_as)
  save_as
end

data_files =
  [
    input_gif:
      "https://github.com/tensorflow/tfjs-models/raw/master/pose-detection/assets/dance_input.gif",
    movenet_model:
      "https://tfhub.dev/google/lite-model/movenet/singlepose/thunder/3?lite-format=tflite"
  ]
  |> Enum.map(fn {key, url} -> {key, download.(url)} end)
  |> Map.new()

data_files
|> Enum.map(fn {k, v} -> [name: k, location: v] end)
|> Kino.DataTable.new(name: "Data files")

Define helper functions

defmodule PoseEstimation.Color do
  def black, do: {0, 0, 0}
  def blue, do: {255, 0, 0}
  def green, do: {0, 255, 0}
  def red, do: {0, 0, 255}
  def yellow, do: {0, 255, 255}
  def magenta, do: {255, 0, 255}
  def cyan, do: {255, 255, 0}
  def white, do: {255, 255, 255}
end
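
Note that Evision (OpenCV) stores pixel colors in BGR channel order, which is why blue/0 is {255, 0, 0} and red/0 is {0, 0, 255} above.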
defmodule PoseEstimation.Frame do
  # Load images from a GIF file and convert them into Nx tensors
  def load_gif_file(gif_file) do
    {:ok, frames, _} = StbImage.read_gif_file(gif_file)
    frames |> Enum.map(&StbImage.to_nx(&1))
  end

  def cv_mat_from_nx_2d(%Nx.Tensor{} = image_nx) do
    # StbImage decodes to RGB; Evision (OpenCV) works with BGR
    image_nx |> Cv.Mat.from_nx_2d() |> Cv.cvtColor(Cv.Constant.cv_COLOR_RGB2BGR())
  end

  def animate_frames([%Cv.Mat{} | _] = image_mat_list) do
    interval_ms = 100
    how_many = length(image_mat_list)

    Stream.interval(interval_ms)
    |> Stream.take(how_many)
    |> Kino.animate(&Enum.at(image_mat_list, &1))
  end
end
defmodule PoseEstimation.Cropping do
  def calc_image_in_square({original_height, original_width}, desired_size) do
    scale_factor = desired_size / Enum.max([original_height, original_width])
    inner_height = trunc(original_height * scale_factor)
    inner_width = trunc(original_width * scale_factor)

    %{
      scale_factor: scale_factor,
      desired_size: desired_size,
      inner_offset_x: div(desired_size - inner_width, 2),
      inner_offset_y: div(desired_size - inner_height, 2),
      inner_height: inner_height,
      inner_width: inner_width,
      original_height: original_height,
      original_width: original_width
    }
  end

  def resize_with_pad(%Cv.Mat{} = input_mat, desired_size) when is_number(desired_size) do
    {original_height, original_width, _} = Cv.Mat.shape(input_mat)
    data = calc_image_in_square({original_height, original_width}, desired_size)

    delta_w = desired_size - data.inner_width
    delta_h = desired_size - data.inner_height
    {top, bottom} = {div(delta_h, 2), delta_h - div(delta_h, 2)}
    {left, right} = {div(delta_w, 2), delta_w - div(delta_w, 2)}

    # The new size should be in (width, height) format
    new_mat =
      input_mat
      |> Cv.resize({data.inner_width, data.inner_height})
      |> Cv.copyMakeBorder(top, bottom, left, right, Cv.Constant.cv_BORDER_CONSTANT())

    %{data: data, mat: new_mat}
  end

  def calc_crop_region({outer_height, outer_width}, {point_x, point_y}) do
    crop_size = Enum.min([outer_height, outer_width])
    crop_size_half = div(crop_size, 2)

    if outer_height > outer_width do
      %{
        start_point: {0, point_y - crop_size_half},
        end_point: {0 + crop_size, point_y + crop_size_half}
      }
    else
      %{
        start_point: {point_x - crop_size_half, 0},
        end_point: {point_x + crop_size_half, crop_size}
      }
    end
  end
end
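
As a quick sanity check, here is what calc_image_in_square/2 returns for a hypothetical 640x480 landscape frame fitted into a 256x256 square (the values follow directly from the arithmetic above):

PoseEstimation.Cropping.calc_image_in_square({480, 640}, 256)
# => %{
#      scale_factor: 0.4,
#      desired_size: 256,
#      inner_width: 256,
#      inner_height: 192,
#      inner_offset_x: 0,
#      inner_offset_y: 32,
#      original_height: 480,
#      original_width: 640
#    }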
defmodule PoseEstimation.Keypoint do
  alias PoseEstimation.Color

  def keypoint_names do
    [
      :nose,
      :left_eye,
      :right_eye,
      :left_ear,
      :right_ear,
      :left_shoulder,
      :right_shoulder,
      :left_elbow,
      :right_elbow,
      :left_wrist,
      :right_wrist,
      :left_hip,
      :right_hip,
      :left_knee,
      :right_knee,
      :left_ankle,
      :right_ankle
    ]
  end

  def keypoint_edge_to_color() do
    %{
      {0, 1} => Color.magenta(),
      {0, 2} => Color.cyan(),
      {1, 3} => Color.magenta(),
      {2, 4} => Color.cyan(),
      {0, 5} => Color.magenta(),
      {0, 6} => Color.cyan(),
      {5, 7} => Color.magenta(),
      {7, 9} => Color.magenta(),
      {6, 8} => Color.cyan(),
      {8, 10} => Color.cyan(),
      {5, 6} => Color.yellow(),
      {5, 11} => Color.magenta(),
      {6, 12} => Color.cyan(),
      {11, 12} => Color.yellow(),
      {11, 13} => Color.magenta(),
      {13, 15} => Color.magenta(),
      {12, 14} => Color.cyan(),
      {14, 16} => Color.cyan()
    }
  end

  def calc_keypoints(keypoints_with_scores, size, {offset_x, offset_y} \\ {0, 0})
      when is_list(keypoints_with_scores) and is_number(size) do
    score_threshold = 0.11

    keypoints_with_scores
    |> Enum.zip(keypoint_names())
    |> Enum.map(fn {[y, x, score], name} ->
      if score > score_threshold do
        %{x: x * size + offset_x, y: y * size + offset_y, name: name, score: score}
      else
        nil
      end
    end)
  end

  def calc_keypoint_edges(keypoints) when is_list(keypoints) do
    keypoint_edge_to_color()
    |> Enum.reduce([], fn {{start_point_index, end_point_index}, color}, acc ->
      start_point = keypoints |> Enum.at(start_point_index)
      end_point = keypoints |> Enum.at(end_point_index)

      if is_nil(start_point) or is_nil(end_point) do
        acc
      else
        [%{start_point: start_point, end_point: end_point, color: color} | acc]
      end
    end)
  end

  def draw_keypoints(%Cv.Mat{} = image_mat, keypoints) when is_list(keypoints) do
    keypoints
    |> Enum.reduce(image_mat, fn
      nil, acc_mat ->
        acc_mat

      point, acc_mat ->
        acc_mat
        |> Cv.drawMarker({trunc(point.x), trunc(point.y)}, Color.red(),
          markerSize: 2,
          markerType: Cv.Constant.cv_MARKER_SQUARE(),
          thickness: 2
        )
    end)
  end

  def draw_keypoint_edges(%Cv.Mat{} = image_mat, edges) when is_list(edges) do
    edges
    |> Enum.reduce(image_mat, fn edge, acc_mat ->
      point1 = {trunc(edge.start_point.x), trunc(edge.start_point.y)}
      point2 = {trunc(edge.end_point.x), trunc(edge.end_point.y)}
      acc_mat |> Cv.line(point1, point2, edge.color, thickness: 2)
    end)
  end

  def calc_center_from_keypoints(keypoints) when is_list(keypoints) do
    {{center_x, center_y}, _radius} =
      keypoints
      |> Enum.reject(&is_nil/1)
      |> Enum.map(&[&1.x, &1.y])
      |> Nx.tensor()
      |> Cv.minEnclosingCircle()

    {trunc(center_x), trunc(center_y)}
  end

  def torso_visible?(keypoints, score_threshold \\ 0.2) when is_list(keypoints) do
    torso_points =
      keypoints
      |> Enum.reject(&is_nil/1)
      |> Enum.filter(&(&1.name in [:left_shoulder, :right_shoulder, :left_hip, :right_hip]))

    # Guard against the all-nil case: Enum.all?/2 on an empty list returns true
    torso_points != [] and Enum.all?(torso_points, &(&1.score > score_threshold))
  end
end
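
For reference, calc_keypoints/3 converts each normalized [y, x, score] row into pixel coordinates. A minimal example with a single made-up row (Enum.zip/2 pairs it with :nose, and its score passes the 0.11 threshold):

PoseEstimation.Keypoint.calc_keypoints([[0.5, 0.25, 0.9]], 256)
# => [%{name: :nose, score: 0.9, x: 64.0, y: 128.0}]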
defmodule PoseEstimation.Inference do
  def init(movenet_model) do
    TFLite.Interpreter.new(movenet_model)
  end

  def run(interpreter, %Nx.Tensor{} = input_image_nx) do
    # Run the model
    TFLite.Interpreter.input_tensor(interpreter, 0, Nx.to_binary(input_image_nx))
    TFLite.Interpreter.invoke(interpreter)

    # Get the model prediction
    {:ok, output_data} = TFLite.Interpreter.output_tensor(interpreter, 0)
    output_data |> Nx.from_binary({:f, 32}) |> Nx.reshape({17, 3})
  end
end
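
MoveNet outputs a [1, 1, 17, 3] tensor, so after the reshape each of the 17 rows is [y, x, score], with y and x normalized to the [0, 1] range of the input square. For example, assuming an interpreter and a prepared input_image_nx as built later in this notebook:

PoseEstimation.Inference.run(interpreter, input_image_nx)
|> Nx.to_list()
|> hd()
# => [y, x, score] for the first keypoint, :nose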

Load GIF

frame_tensors = PoseEstimation.Frame.load_gif_file(data_files.input_gif)

frame_tensors
|> Enum.map(&PoseEstimation.Frame.cv_mat_from_nx_2d/1)
|> PoseEstimation.Frame.animate_frames()

Determine initial crop region

defmodule PoseEstimation.InitialCropRegion do
  alias PoseEstimation.Cropping
  alias PoseEstimation.Keypoint
  alias PoseEstimation.Inference

  def get(interpreter, %Cv.Mat{} = image_mat) do
    {img_height, img_width, _} = Cv.Mat.shape(image_mat)

    calc_initial_crop_region(interpreter, image_mat) ||
      calc_default_crop_region({img_height, img_width})
  end

  defp calc_default_crop_region({img_height, img_width}) do
    Cropping.calc_crop_region(
      {img_height, img_width},
      {trunc(img_width / 2), trunc(img_height / 2)}
    )
  end

  defp calc_initial_crop_region(interpreter, %Cv.Mat{} = image_mat) do
    # Resize and pad the image to keep the aspect ratio and fit the expected size
    image_in_256x256_square = Cropping.resize_with_pad(image_mat, 256)

    %{
      inner_height: inner_height,
      inner_width: inner_width,
      inner_offset_x: inner_offset_x,
      inner_offset_y: inner_offset_y,
      original_height: original_height,
      original_width: original_width,
      scale_factor: scale_factor
    } = image_in_256x256_square.data

    initial_input_image_nx =
      image_in_256x256_square.mat
      |> Cv.Mat.to_nx(Nx.BinaryBackend)
      |> Nx.new_axis(0)
      |> Nx.as_type({:f, 32})

    # Run model inference
    keypoints_with_scores = Inference.run(interpreter, initial_input_image_nx)

    # Calculate keypoints
    keypoints = Keypoint.calc_keypoints(Nx.to_list(keypoints_with_scores), 256)

    if Keypoint.torso_visible?(keypoints) do
      # Calculate crop region in inner image based on midpoint
      crop_region =
        Cropping.calc_crop_region(
          {inner_height, inner_width},
          Keypoint.calc_center_from_keypoints(keypoints)
        )

      crop_size = min(original_height, original_width)

      # Project crop region onto original image
      Cropping.calc_crop_region(
        {original_height, original_width},
        {
          trunc((elem(crop_region.start_point, 0) + inner_offset_x) / scale_factor) +
            div(crop_size, 2),
          trunc((elem(crop_region.start_point, 1) + inner_offset_y) / scale_factor) +
            div(crop_size, 2)
        }
      )
    else
      nil
    end
  end
end
{:ok, interpreter} = PoseEstimation.Inference.init(data_files.movenet_model)

first_frame_mat = hd(frame_tensors) |> PoseEstimation.Frame.cv_mat_from_nx_2d()
initial_crop_region = PoseEstimation.InitialCropRegion.get(interpreter, first_frame_mat)

first_frame_mat
|> Cv.rectangle(
  initial_crop_region.start_point,
  initial_crop_region.end_point,
  PoseEstimation.Color.green(),
  thickness: 2
)

Run inference with cropping algorithm

defmodule PoseEstimation.Main do
  alias PoseEstimation.Color
  alias PoseEstimation.Cropping
  alias PoseEstimation.Frame
  alias PoseEstimation.Inference
  alias PoseEstimation.InitialCropRegion
  alias PoseEstimation.Keypoint

  def main(movenet_model, [%Nx.Tensor{} | _] = frame_tensors) do
    {:ok, interpreter} = Inference.init(movenet_model)

    first_frame_mat = hd(frame_tensors) |> Frame.cv_mat_from_nx_2d()
    {frame_height, frame_width, _} = first_frame_mat.shape
    initial_crop_region = InitialCropRegion.get(interpreter, first_frame_mat)

    frame_tensors
    |> Enum.reduce(
      %{crop_region: initial_crop_region, output_images: []},
      fn frame_tensor, acc ->
        {crop_start_x, crop_start_y} = acc.crop_region.start_point
        {crop_end_x, crop_end_y} = acc.crop_region.end_point
        input_image_mat = Frame.cv_mat_from_nx_2d(frame_tensor)

        # Crop image and resize it down to 256x256
        input_image_cropped_mat =
          input_image_mat[[{crop_start_y, crop_end_y}, {crop_start_x, crop_end_x}]]
          |> Cv.resize({256, 256})

        input_image_cropped_nx =
          input_image_cropped_mat
          |> Cv.Mat.to_nx(Nx.BinaryBackend)
          |> Nx.new_axis(0)
          |> Nx.as_type({:f, 32})

        # Run model inference
        keypoints_with_scores_nx = Inference.run(interpreter, input_image_cropped_nx)

        # Calculate keypoints
        keypoints =
          keypoints_with_scores_nx
          |> Nx.to_list()
          # The crop is a square whose side is min(frame_height, frame_width)
          |> Keypoint.calc_keypoints(min(frame_height, frame_width), {crop_start_x, crop_start_y})

        if Keypoint.torso_visible?(keypoints) do
          keypoint_edges = Keypoint.calc_keypoint_edges(keypoints)

          # Fit square to image
          new_crop_region =
            Cropping.calc_crop_region(
              {frame_height, frame_width},
              Keypoint.calc_center_from_keypoints(keypoints)
            )

          # Draw prediction on image
          frame_mat =
            input_image_mat
            |> Keypoint.draw_keypoint_edges(keypoint_edges)
            |> Keypoint.draw_keypoints(keypoints)
            |> Cv.rectangle(
              new_crop_region.start_point,
              new_crop_region.end_point,
              Color.green(),
              thickness: 2
            )

          %{acc | crop_region: new_crop_region, output_images: [frame_mat | acc.output_images]}
        else
          acc
        end
      end
    )
    |> Map.fetch!(:output_images)
    |> Enum.reverse()
  end
end
output_images = PoseEstimation.Main.main(data_files.movenet_model, frame_tensors)
PoseEstimation.Frame.animate_frames(output_images)