Human pose estimation on single image
Mix.install([
{:tflite_elixir, "~> 0.3.0"},
{:evision, "0.1.31"},
{:req, "~> 0.3.0"},
{:kino, "~> 0.9.0"}
])
Introduction
Pose estimation is the task of using an ML model to estimate the pose of a person from an image or a video by estimating the spatial locations of key body joints (keypoints).
This session demonstrates a minimum working example of running the model on a single image to predict the 17 human keypoints.
https://www.tensorflow.org/lite/examples/pose_estimation/overview
https://www.tensorflow.org/hub/tutorials/movenet
It’s useful to alias the module as something shorter when we make extensive use of the functions from certain modules.
alias Evision, as: Cv
alias TFLiteElixir, as: TFLite
alias TFLiteElixir.TFLiteTensor
Download data files
# /data is the writable portion of a Nerves system
downloads_dir =
  if Code.ensure_loaded?(Nerves.Runtime), do: "/data/livebook", else: System.tmp_dir!()

# Downloads `url` into `downloads_dir` unless it is already there and returns
# the local file path. The file name is the URL-encoded URL, so every URL gets
# its own cache entry.
#
# Raises if the server does not answer with HTTP 200.
download = fn url ->
  save_as = Path.join(downloads_dir, URI.encode_www_form(url))

  if not File.exists?(save_as) do
    # Make sure the target directory exists before writing into it
    File.mkdir_p!(downloads_dir)

    # Only a successful response is written to disk, so an HTTP error page is
    # never cached as if it were the requested data file.
    case Req.get!(url, decode_body: false) do
      %{status: 200, body: body} ->
        File.write!(save_as, body)

      %{status: status} ->
        raise "Failed to download #{url}: HTTP status #{status}"
    end
  end

  save_as
end
# Fetch every data file and keep a map from its key to the local file path.
data_files =
  Map.new(
    [
      movenet_model:
        "https://tfhub.dev/google/lite-model/movenet/singlepose/thunder/3?lite-format=tflite",
      test_img: "https://images.pexels.com/photos/4384679/pexels-photo-4384679.jpeg"
    ],
    fn {name, url} -> {name, download.(url)} end
  )
# List each data file with the location it was saved to.
data_file_rows = for {name, location} <- data_files, do: [name: name, location: location]
Kino.DataTable.new(data_file_rows, name: "Data files")
# Image input widget. Its value is read with Kino.Input.read/1 further down,
# where a nil value falls back to the downloaded test image.
test_image_input = Kino.Input.image(" ")
# Use the image from the input widget when there is one; otherwise read the
# downloaded test image from disk.
uploaded_image = Kino.Input.read(test_image_input)

test_image_mat =
  case uploaded_image do
    nil ->
      Cv.imread(data_files.test_img)

    %{data: pixels, height: rows, width: cols} ->
      # Raw 8-bit pixel data with 3 channels; the conversion swaps the first
      # and third channel.
      # NOTE(review): the widget data is presumably RGB, so the swap would yield
      # the BGR layout used by Cv.imread — confirm against the Kino docs.
      uploaded_mat = Cv.Mat.from_binary(pixels, {:u, 8}, rows, cols, 3)
      Cv.cvtColor(uploaded_mat, Cv.Constant.cv_COLOR_BGR2RGB())
  end

:ok
Preprocess input image
- Input is a frame of video or an image, represented as a float32 tensor of shape: 256x256x3. Channels order is RGB with values in [0, 255].
- See https://tfhub.dev/google/lite-model/movenet/singlepose/thunder/3
# Resizes `input_image_mat` so that its longer side equals `desired_size`
# (keeping the aspect ratio), then pads the shorter side with a constant colour
# to obtain a `desired_size` x `desired_size` square.
#
# `desired_size` must be a positive integer: the padding amounts are computed
# with `div/2`, which raises on floats.
#
# Returns the result of `Cv.copyMakeBorder`.
resize_with_pad = fn %Cv.Mat{} = input_image_mat, desired_size
                     when is_integer(desired_size) and desired_size > 0 ->
  # The original size is in (height, width) format
  {original_height, original_width, _channels} = Cv.Mat.shape(input_image_mat)
  ratio = desired_size / max(original_height, original_width)

  # Clamp to one pixel so an extreme aspect ratio never asks for a zero-sized resize
  inner_height = max(trunc(original_height * ratio), 1)
  inner_width = max(trunc(original_width * ratio), 1)

  # Split the padding evenly; an odd leftover pixel goes to the bottom/right
  delta_h = desired_size - inner_height
  delta_w = desired_size - inner_width
  top = div(delta_h, 2)
  left = div(delta_w, 2)
  bottom = delta_h - top
  right = delta_w - left

  padding_color = {64, 64, 64}

  input_image_mat
  # The new size should be in (width, height) format
  |> Cv.resize({inner_width, inner_height})
  |> Cv.copyMakeBorder(top, bottom, left, right, Cv.Constant.cv_BORDER_CONSTANT(),
    value: padding_color
  )
end
# Resize and pad the image to a 256x256 square.
test_image_resized_mat = resize_with_pad.(test_image_mat, 256)
# Build the model input: a float32 tensor with a leading batch axis.
# OpenCV stores colour images in BGR order, whereas the model expects RGB,
# so the channels are swapped before the matrix is converted to an Nx tensor.
input_image_tensor =
  test_image_resized_mat
  |> Cv.cvtColor(Cv.Constant.cv_COLOR_BGR2RGB())
  |> Cv.Mat.to_nx(Nx.BinaryBackend)
  |> Nx.new_axis(0)
  |> Nx.as_type({:f, 32})
# Show the original and the preprocessed image side by side, each in a boxed
# panel with a bold caption underneath.
captioned_images = [
  {"Input image", test_image_mat},
  {"Preprocessed", test_image_resized_mat}
]

preview_panels =
  for {label, img} <- captioned_images do
    Kino.Layout.grid([img, Kino.Markdown.new("**#{label}**")], boxed: true)
  end

Kino.Layout.grid(preview_panels, columns: 2)
Run inference
- Output is a float32 tensor of shape [1, 1, 17, 3]
- See https://tfhub.dev/google/lite-model/movenet/singlepose/thunder/3
# Initialize the TFLite interpreter
{:ok, interpreter} = TFLite.Interpreter.new(data_files.movenet_model)

# Run the model. Error tuples are raised instead of being discarded, so a
# failed step surfaces here rather than as a confusing result later on.
case TFLite.Interpreter.input_tensor(interpreter, 0, Nx.to_binary(input_image_tensor)) do
  {:error, reason} -> raise "Failed to set the input tensor: #{inspect(reason)}"
  _ok -> :ok
end

case TFLite.Interpreter.invoke(interpreter) do
  {:error, reason} -> raise "Failed to invoke the interpreter: #{inspect(reason)}"
  _ok -> :ok
end

# Get the model prediction
{:ok, output_data} = TFLite.Interpreter.output_tensor(interpreter, 0)
{:ok, [tflite_tensor_index]} = TFLite.Interpreter.outputs(interpreter)
tflite_tensor = TFLite.Interpreter.tensor(interpreter, tflite_tensor_index)

# Assert the output layout while capturing it: float32 values of shape {1, 1, 17, 3}
{:f, 32} = output_type = TFLiteTensor.type(tflite_tensor)
{1, 1, 17, 3} = output_shape = TFLiteTensor.shape(tflite_tensor)

# Turn the raw output binary into a tensor of the asserted type and shape
keypoints_with_scores =
  output_data
  |> Nx.from_binary(output_type)
  |> Nx.reshape(output_shape)
Visualize predictions
# Keypoint names; a name's position in this list is the keypoint index that the
# edge definitions below refer to.
keypoint_names = ~w(
  nose
  left_eye right_eye
  left_ear right_ear
  left_shoulder right_shoulder
  left_elbow right_elbow
  left_wrist right_wrist
  left_hip right_hip
  left_knee right_knee
  left_ankle right_ankle
)a

edge_color1 = {255, 0, 255}
edge_color2 = {255, 255, 0}
edge_color3 = {0, 255, 255}

# Edges between keypoint indices, grouped by the colour they are drawn in.
edges_by_color = [
  {edge_color1, [{0, 1}, {1, 3}, {0, 5}, {5, 7}, {7, 9}, {5, 11}, {11, 13}, {13, 15}]},
  {edge_color2, [{0, 2}, {2, 4}, {0, 6}, {6, 8}, {8, 10}, {6, 12}, {12, 14}, {14, 16}]},
  {edge_color3, [{5, 6}, {11, 12}]}
]

# Map from {start_index, end_index} to the colour of that edge.
keypoint_edge_to_color =
  for {color, edges} <- edges_by_color, edge <- edges, into: %{} do
    {edge, color}
  end
display_size = 1280
keypoint_threshold = 0.11

# One entry per keypoint, in index order: a map with the coordinates scaled to
# `display_size` when the score exceeds `keypoint_threshold`, otherwise nil.
# The nil placeholders keep list positions aligned with keypoint indices.
keypoints =
  keypoints_with_scores[0][0][0..16]
  |> Nx.to_list()
  |> Enum.zip(keypoint_names)
  |> Enum.with_index()
  |> Enum.map(fn
    {{[y, x, score], name}, index} when score > keypoint_threshold ->
      %{x: x * display_size, y: y * display_size, name: name, index: index, score: score}

    _below_threshold ->
      nil
  end)
# Collect the edges whose two end keypoints are both present (non-nil).
# Edges are prepended, so the result is in reverse iteration order of the map.
keypoint_edges =
  Enum.reduce(keypoint_edge_to_color, [], fn {{from_index, to_index}, color}, edges ->
    case {Enum.at(keypoints, from_index), Enum.at(keypoints, to_index)} do
      {nil, _} -> edges
      {_, nil} -> edges
      {from, to} -> [%{edge_start: from, edge_end: to, color: color} | edges]
    end
  end)
# `keypoints` holds nil for every keypoint below the score threshold; those
# placeholders are not table rows, so drop them before building the table.
keypoints
|> Enum.reject(&is_nil/1)
|> Kino.DataTable.new(name: "Keypoints")
# Draws a marker on `input_image_mat` for every entry of `keypoints` that has
# `:x` and `:y` coordinates; other entries (such as nil) are skipped.
# Returns the image after the last drawing call.
draw_keypoints = fn %Cv.Mat{} = input_image_mat, keypoints ->
  Enum.reduce(keypoints, input_image_mat, fn
    %{x: x, y: y}, canvas ->
      center = {round(x), round(y)}
      Cv.drawMarker(canvas, center, {0, 0, 255}, markerSize: 10, thickness: 3)

    _skipped, canvas ->
      canvas
  end)
end
# Draws a line on `input_image_mat` for every edge in `edges`, from the edge's
# start keypoint to its end keypoint, in the edge's colour. Entries that do not
# have the expected keys are skipped. Returns the image after the last drawing call.
draw_keypoint_edges = fn %Cv.Mat{} = input_image_mat, edges ->
  Enum.reduce(edges, input_image_mat, fn
    %{color: color, edge_start: from, edge_end: to}, canvas ->
      from_point = {round(from.x), round(from.y)}
      to_point = {round(to.x), round(to.y)}
      Cv.line(canvas, from_point, to_point, color, thickness: 3, lineType: 1)

    _skipped, canvas ->
      canvas
  end)
end
# Pad the image to the display size, then draw the edges first and the
# keypoint markers on top of them.
display_canvas = resize_with_pad.(test_image_mat, display_size)
canvas_with_edges = draw_keypoint_edges.(display_canvas, keypoint_edges)
draw_keypoints.(canvas_with_edges, keypoints)