Human pose estimation on single image

pose_estimation_single_image.livemd

Cocoa

@cocoa-xu

tflite_elixir

Share to X

Share to Bluesky

More notebooks

Human pose estimation on single image

Mix.install([
  {:tflite_elixir, "~> 0.3.0"},
  {:evision, "0.1.31"},
  {:req, "~> 0.3.0"},
  {:kino, "~> 0.9.0"}
])

Introduction

Pose estimation is the task of using an ML model to estimate the pose of a person from an image or a video by estimating the spatial locations of key body joints (keypoints).

This session demonstrates a minimal working example of running the model on a single image to predict the 17 human keypoints.

https://www.tensorflow.org/lite/examples/pose_estimation/overview

https://www.tensorflow.org/hub/tutorials/movenet

It’s useful to alias the module as something shorter when we make extensive use of the functions from certain modules.

alias Evision, as: Cv
alias TFLiteElixir, as: TFLite
alias TFLiteElixir.TFLiteTensor

Download data files

# /data is the writable portion of a Nerves system
downloads_dir =
  if Code.ensure_loaded?(Nerves.Runtime), do: "/data/livebook", else: System.tmp_dir!()

# Downloads `url` into `downloads_dir` and returns the local file path.
#
# The file name is the URL-encoded URL, so every distinct URL gets its own cache
# entry; a file that already exists is reused without touching the network.
# Raises if the directory cannot be created, the request fails, or the server
# answers with a non-2xx status.
download = fn url ->
  save_as = Path.join(downloads_dir, URI.encode_www_form(url))

  if not File.exists?(save_as) do
    # Nothing above guarantees the directory exists, so create it before Req
    # tries to write into it.
    File.mkdir_p!(downloads_dir)

    case Req.get!(url, output: save_as) do
      %{status: status} when status in 200..299 ->
        :ok

      %{status: status} ->
        # Do not leave an error body behind: the next run would treat it as a
        # valid cached download.
        File.rm(save_as)
        raise "failed to download #{url}: HTTP status #{status}"
    end
  end

  save_as
end

# Remote assets used by this notebook, keyed by a short name.
remote_files = [
  movenet_model:
    "https://tfhub.dev/google/lite-model/movenet/singlepose/thunder/3?lite-format=tflite",
  test_img: "https://images.pexels.com/photos/4384679/pexels-photo-4384679.jpeg"
]

# Download every asset and keep a map of name => local file path.
data_files = Map.new(remote_files, fn {name, url} -> {name, download.(url)} end)

# Show where each file was saved.
file_rows = for {name, path} <- data_files, do: [name: name, location: path]
Kino.DataTable.new(file_rows, name: "Data files")

# Image upload widget; when nothing has been uploaded the downloaded sample
# image is used instead.
test_image_input = Kino.Input.image(" ")

uploaded_image = Kino.Input.read(test_image_input)

test_image_mat =
  case uploaded_image do
    nil ->
      Cv.imread(data_files.test_img)

    %{data: pixels, height: rows, width: cols} ->
      # Build an 8-bit, 3-channel mat from the raw bytes and swap the first and
      # third channels.
      # NOTE(review): presumably the upload is RGB and the swap puts it in the
      # same BGR order `Cv.imread/1` produces - confirm against the Kino docs.
      pixels
      |> Cv.Mat.from_binary({:u, 8}, rows, cols, 3)
      |> Cv.cvtColor(Cv.Constant.cv_COLOR_BGR2RGB())
  end

:ok

Preprocess input image

Input is a frame of video or an image, represented as a float32 tensor of shape 256x256x3. The channel order is RGB, with values in [0, 255].
See https://tfhub.dev/google/lite-model/movenet/singlepose/thunder/3

# Resizes `input_image_mat` so that its longer side equals `desired_size`
# (aspect ratio preserved), then pads the shorter side with a constant colour so
# the result is a `desired_size` x `desired_size` square with the image centred.
#
# `desired_size` must be an integer: the padding is split with `div/2`, which
# raises on floats.
resize_with_pad = fn %Cv.Mat{} = input_image_mat, desired_size when is_integer(desired_size) ->
  # The original size is in (height, width) format
  {original_height, original_width, _} = Cv.Mat.shape(input_image_mat)
  longest_side = max(original_height, original_width)

  # Scale with integer arithmetic. A float ratio can land just below the target
  # (49 * (256 / 49) is 255.99999999999997), which `trunc/1` would turn into a
  # longest side of 255 instead of 256. Clamp to 1 so an extremely thin image
  # never asks for a 0-pixel side.
  inner_height = max(div(original_height * desired_size, longest_side), 1)
  inner_width = max(div(original_width * desired_size, longest_side), 1)

  delta_w = desired_size - inner_width
  delta_h = desired_size - inner_height
  # When the padding is odd, the extra pixel goes to the bottom/right.
  {top, bottom} = {div(delta_h, 2), delta_h - div(delta_h, 2)}
  {left, right} = {div(delta_w, 2), delta_w - div(delta_w, 2)}
  padding_color = {64, 64, 64}

  input_image_mat
  # The new size should be in (width, height) format
  |> Cv.resize({inner_width, inner_height})
  |> Cv.copyMakeBorder(top, bottom, left, right, Cv.Constant.cv_BORDER_CONSTANT(),
    value: padding_color
  )
end

# Letterbox the image to the 256x256 input size.
test_image_resized_mat = resize_with_pad.(test_image_mat, 256)

# Build the model input: add a leading batch axis and cast to float32.
#
# `Cv.imread/1` yields channels in OpenCV's BGR order, while the model expects
# RGB, so the channels are swapped first. The BGR mat itself is left untouched
# because it is still used for display.
# NOTE(review): the upload branch also applies a channel swap, presumably to end
# up in the same BGR order - confirm.
input_image_tensor =
  test_image_resized_mat
  |> Cv.cvtColor(Cv.Constant.cv_COLOR_BGR2RGB())
  |> Cv.Mat.to_nx(Nx.BinaryBackend)
  |> Nx.new_axis(0)
  |> Nx.as_type({:f, 32})

# Show the original and the preprocessed image side by side, each with a caption.
[
  ["Input image", test_image_mat],
  ["Preprocessed", test_image_resized_mat]
]
|> Enum.map(fn [label, img] ->
  Kino.Layout.grid([img, Kino.Markdown.new("**#{label}**")], boxed: true)
end)
|> Kino.Layout.grid(columns: 2)

Run inference

Output is a float32 tensor of shape [1, 1, 17, 3]
See https://tfhub.dev/google/lite-model/movenet/singlepose/thunder/3

# Initialize the TFLite interpreter
{:ok, interpreter} = TFLite.Interpreter.new(data_files.movenet_model)

# Run the model.
# Match on `:ok` so a failed upload or invocation stops here instead of letting
# the cell read whatever is left in the output tensor.
:ok = TFLite.Interpreter.input_tensor(interpreter, 0, Nx.to_binary(input_image_tensor))
:ok = TFLite.Interpreter.invoke(interpreter)

# Get the model prediction
{:ok, output_data} = TFLite.Interpreter.output_tensor(interpreter, 0)
{:ok, [tflite_tensor_index]} = TFLite.Interpreter.outputs(interpreter)
tflite_tensor = TFLite.Interpreter.tensor(interpreter, tflite_tensor_index)

# Assert the output layout the rest of the notebook relies on:
# float32 values in a {1, 1, 17, 3} tensor.
{:f, 32} = output_type = TFLiteTensor.type(tflite_tensor)
{1, 1, 17, 3} = output_shape = TFLiteTensor.shape(tflite_tensor)

# Rebuild an Nx tensor from the raw output bytes.
keypoints_with_scores =
  output_data
  |> Nx.from_binary(output_type)
  |> Nx.reshape(output_shape)

Visualize predictions

# Names of the 17 keypoints; a keypoint's position in this list is the index
# used by the edge definitions below.
keypoint_names = ~w(
  nose
  left_eye right_eye
  left_ear right_ear
  left_shoulder right_shoulder
  left_elbow right_elbow
  left_wrist right_wrist
  left_hip right_hip
  left_knee right_knee
  left_ankle right_ankle
)a

edge_color1 = {255, 0, 255}
edge_color2 = {255, 255, 0}
edge_color3 = {0, 255, 255}

# Skeleton edges as {start_index, end_index} => colour.
# Going by `keypoint_names`, colour 1 edges end on left_* keypoints, colour 2
# edges end on right_* keypoints, and colour 3 joins a left/right pair.
keypoint_edge_to_color =
  [
    {edge_color1, [{0, 1}, {1, 3}, {0, 5}, {5, 7}, {7, 9}, {5, 11}, {11, 13}, {13, 15}]},
    {edge_color2, [{0, 2}, {2, 4}, {0, 6}, {6, 8}, {8, 10}, {6, 12}, {12, 14}, {14, 16}]},
    {edge_color3, [{5, 6}, {11, 12}]}
  ]
  |> Enum.flat_map(fn {color, edges} -> Enum.map(edges, &{&1, color}) end)
  |> Map.new()

# Side length, in pixels, of the square image the predictions are drawn on.
display_size = 1280
# Keypoints whose score is not above this value are treated as undetected.
keypoint_threshold = 0.11

# One entry per keypoint, in `keypoint_names` order. Each prediction row is
# matched as [y, x, score]; rows whose score is not above `keypoint_threshold`
# become `nil` so that list positions keep lining up with keypoint indices.
# NOTE(review): scaling by `display_size` assumes x and y are normalised to
# [0, 1] - confirm against the model card.
keypoints =
  keypoints_with_scores[0][0][0..16]
  |> Nx.to_list()
  |> Enum.zip(keypoint_names)
  |> Enum.with_index(fn
    {[y, x, score], name}, index when score > keypoint_threshold ->
      %{x: x * display_size, y: y * display_size, name: name, index: index, score: score}

    _keypoint, _index ->
      nil
  end)

# Keep only the edges whose two endpoints were both detected.
keypoint_edges =
  for {{edge_start_index, edge_end_index}, color} <- keypoint_edge_to_color, reduce: [] do
    acc ->
      edge_start = Enum.at(keypoints, edge_start_index)
      edge_end = Enum.at(keypoints, edge_end_index)

      if is_nil(edge_start) or is_nil(edge_end) do
        acc
      else
        [%{edge_start: edge_start, edge_end: edge_end, color: color} | acc]
      end
  end

# The table needs uniform rows, so drop the `nil` placeholders of undetected
# keypoints before rendering; `keypoints` itself keeps them for index lookups.
keypoints
|> Enum.reject(&is_nil/1)
|> Kino.DataTable.new(name: "Keypoints")

# Draws a marker at every detected keypoint and returns the resulting image.
# Entries without `:x` and `:y` (such as `nil`) are skipped.
draw_keypoints = fn %Cv.Mat{} = input_image_mat, keypoints ->
  marker_color = {0, 0, 255}

  Enum.reduce(keypoints, input_image_mat, fn
    %{x: x, y: y}, canvas ->
      Cv.drawMarker(canvas, {round(x), round(y)}, marker_color, markerSize: 10, thickness: 3)

    _undetected, canvas ->
      canvas
  end)
end

# Draws one line per edge, from its start keypoint to its end keypoint in the
# edge's colour, and returns the resulting image. Entries that do not have the
# expected edge shape are skipped.
draw_keypoint_edges = fn %Cv.Mat{} = input_image_mat, edges ->
  Enum.reduce(edges, input_image_mat, fn
    %{color: color, edge_start: from, edge_end: to}, canvas ->
      start_point = {round(from.x), round(from.y)}
      end_point = {round(to.x), round(to.y)}

      # NOTE(review): `lineType: 1` is not one of OpenCV's named line types
      # (LINE_4 / LINE_8 / LINE_AA) - confirm it is the intended value.
      Cv.line(canvas, start_point, end_point, color, thickness: 3, lineType: 1)

    _other, canvas ->
      canvas
  end)
end

# Letterbox the image to the display size, then draw the skeleton edges first
# and the keypoint markers on top of them.
display_canvas = resize_with_pad.(test_image_mat, display_size)
skeleton_canvas = draw_keypoint_edges.(display_canvas, keypoint_edges)
draw_keypoints.(skeleton_canvas, keypoints)

Other notebooks:

@andyl

elix_util

MNIST

mnist.livemd

req axon exla nx

2022-8-18
@TomBers

livebookNotes

Attractors

attractors.livemd

decimal vega_lite kino

2022-8-18
Wojtek Mach
@wojtekmach

notebooks

Playground

rss.livemd

req easyxml

2022-8-18
Wojtek Mach
@wojtekmach

notebooks

RSS

rss2.livemd

req easyxml

2022-8-18
@DockYard-Academy

curriculum

Weighted Voting

deprecated_weighted_voting.livemd

jason kino youtube hidden_cell

2023-6-5
Daniel Lauzon
@daneroo

elixir-garden

Chapter 5 - Traditional Machine Learning

TraditionalMachineLearning.livemd

scholar nx scidata vega_lite kino_vega_lite

2024-9-22
@alde103

Build_Large_Language_Mode...

Chapter 7: Fine-tuning to follow instructions

ch7.livemd

nx exla axon table_rex bumblebee explorer req kino_vega_lite httpoison

2024-11-25

Back