Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

YOLOv7

livebooks/evision/yolov7.livemd

YOLOv7

# Notebook setup: install the numerical (Nx/EXLA), vision (Evision),
# HTTP (Req) and rendering (Kino) dependencies, and make EXLA the default
# Nx backend for every tensor created in this notebook.
Mix.install(
  [
    nx: "~> 0.8",
    exla: "~> 0.8",
    evision: "~> 0.2",
    req: "~> 0.5",
    kino: "~> 0.14"
  ],
  config: [
    nx: [default_backend: EXLA.Backend]
  ]
)

モデル読込

事前に以下のコンテナを使って ONNX 形式に変換した YOLOv7x モデルを /tmp/ に配置しているものとする

https://github.com/RyoWakabayashi/elixir-learning/tree/main/ml_model_conversion/yolov7

# Load the network from the ONNX file that is expected to already exist
# under /tmp/.
net = Evision.DNN.readNet("/tmp/yolov7x.onnx")
# Names of the network's unconnected output layers; they are passed to
# `forward/2` later to select which outputs to return.
out_names = Evision.DNN.Net.getUnconnectedOutLayersNames(net)

ラベル一覧の取得

# Download the label list (one class name per line) and turn it into a
# list of strings.
labels =
  "https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names"
  |> Req.get!()
  |> Map.get(:body)
  # Rebuild the body code point by code point so the result is a UTF-8 string.
  # NOTE(review): the binary pattern was lost when this page was exported
  # (only `for <>, do: <>` remained); `c::utf8` is the reconstructed form -
  # confirm against the original notebook.
  |> then(fn body -> for <<c::utf8 <- body>>, into: "", do: <<c::utf8>> end)
  # `trim: true` drops the empty entries, e.g. the one after the final newline.
  |> String.split("\n", trim: true)

# Number of labels, shown as the cell result.
length(labels)

推論の実行

画像ダウンロード

# Fetch the sample picture over HTTP and decode the raw bytes into a
# colour image matrix.
img =
  Evision.imdecode(
    Req.get!("https://raw.githubusercontent.com/pjreddie/darknet/master/data/dog.jpg").body,
    Evision.Constant.cv_IMREAD_COLOR()
  )

# The first two shape entries are bound as height and width; they are needed
# later to scale boxes from the 640x640 network input back to the image.
{img_height, img_width, _channels} = Evision.Mat.shape(img)

推論用の形式へ変換

# Convert the image into a network input blob: resized to 640x640 without
# cropping, with the red and blue channels swapped.
blob = Evision.DNN.blobFromImage(img, size: {640, 640}, swapRB: true, crop: false)

推論

# Feed the blob to the network (pixel values scaled by 1/255, zero mean
# subtraction) and run a forward pass for the requested output layers.
predictions =
  Evision.DNN.Net.forward(
    Evision.DNN.Net.setInput(net, blob, name: "", scalefactor: 1 / 255, mean: {0, 0, 0}),
    outBlobNames: out_names
  )

推論結果の整形

閾値を超える領域の取得

# Minimum box score a candidate needs in order to be kept.
score_threshold = 0.6

# Only the first network output is used; it is moved to an EXLA-backed tensor.
predictions_tensor = Evision.Mat.to_nx(Enum.at(predictions, 0), EXLA.Backend)

# For batch entry 0, take index 4 of every row; it is used as the box score.
all_bbox_score_tensor = predictions_tensor[[0, 0..-1//1, 4]]

# 1 where the score exceeds the threshold, 0 elsewhere.
greater_tensor = Nx.greater(all_bbox_score_tensor, score_threshold)
greater_count = Nx.to_number(Nx.sum(greater_tensor))

# Sorting the 0/1 mask in descending order moves the indices of the passing
# rows to the front; the first `greater_count` of them are the rows to keep.
# NOTE(review): when nothing passes the threshold the range becomes `0..-1`;
# confirm how Nx handles that case.
sorted_mask_indices = Nx.argsort(greater_tensor, direction: :desc)
greater_indices = sorted_mask_indices[[0..(greater_count - 1)]]

greater_predictions_tensor = Nx.take(predictions_tensor[0], greater_indices, axis: 0)
bbox_score_tensor = greater_predictions_tensor[[0..-1//1, 4]]

クラスの判定

# Everything from index 5 onwards in each row is used as per-class scores.
class_score_tensor = greater_predictions_tensor[[0..-1//1, 5..-1//1]]

# Best class per row, and that class's score.
class_index_tensor = class_score_tensor |> Nx.argmax(axis: 1)
top_class_score_tensor = class_score_tensor |> Nx.reduce_max(axes: [1])

# Final score = box score * best class score.
score_tensor = bbox_score_tensor |> Nx.multiply(top_class_score_tensor)

座標情報の変換

# The first four values of each row are treated as centre x, centre y,
# width and height, expressed in the 640x640 network input space.
coordinate_tensor = greater_predictions_tensor[[0..-1//1, 0..3]]

bbox_half_width = coordinate_tensor[[0..-1//1, 2]] |> Nx.divide(2)
bbox_half_height = coordinate_tensor[[0..-1//1, 3]] |> Nx.divide(2)

# Corner = centre -/+ half extent, then rescaled from the 640-pixel input
# back to the original image size.
min_x_tensor =
  Nx.multiply(
    Nx.subtract(coordinate_tensor[[0..-1//1, 0]], bbox_half_width),
    img_width / 640
  )

min_y_tensor =
  Nx.multiply(
    Nx.subtract(coordinate_tensor[[0..-1//1, 1]], bbox_half_height),
    img_height / 640
  )

max_x_tensor =
  Nx.multiply(
    Nx.add(coordinate_tensor[[0..-1//1, 0]], bbox_half_width),
    img_width / 640
  )

max_y_tensor =
  Nx.multiply(
    Nx.add(coordinate_tensor[[0..-1//1, 1]], bbox_half_height),
    img_height / 640
  )

# Stack the four vectors and transpose so each row is
# [min_x, min_y, max_x, max_y].
formed_coordinate_tensor =
  Nx.transpose(Nx.stack([min_x_tensor, min_y_tensor, max_x_tensor, max_y_tensor]))

Non-Maximum Suppression

score_list = Nx.to_list(score_tensor)

# Overlap (IoU) threshold above which the lower-scored box is suppressed.
nms_threshold = 0.7

# OpenCV's NMSBoxes takes rectangles as [x, y, width, height], while
# `formed_coordinate_tensor` holds [min_x, min_y, max_x, max_y]. Passing the
# corner form directly makes the overlap computation use wrong rectangles,
# so build a width/height view that is used for NMS only.
nms_box_tensor =
  [
    formed_coordinate_tensor[[0..-1//1, 0]],
    formed_coordinate_tensor[[0..-1//1, 1]],
    Nx.subtract(formed_coordinate_tensor[[0..-1//1, 2]], formed_coordinate_tensor[[0..-1//1, 0]]),
    Nx.subtract(formed_coordinate_tensor[[0..-1//1, 3]], formed_coordinate_tensor[[0..-1//1, 1]])
  ]
  |> Nx.stack()
  |> Nx.transpose()

# Indices of the boxes that survive NMS.
selected_index_tensor =
  nms_box_tensor
  |> Evision.DNN.nmsBoxes(score_list, score_threshold, nms_threshold)
  |> Nx.tensor()

# Gather the surviving rows; boxes stay in corner form for drawing.
selected_bboxes = Nx.take(formed_coordinate_tensor, selected_index_tensor)
selected_classes = Nx.take(class_index_tensor, selected_index_tensor) |> Nx.new_axis(1)
selected_scores = Nx.take(score_tensor, selected_index_tensor) |> Nx.new_axis(1)

# One row per detection: [min_x, min_y, max_x, max_y, class_index, score].
formed_tensor = Nx.concatenate([selected_bboxes, selected_classes, selected_scores], axis: 1)

推論結果の描画

# Draw every detection row onto the image, threading the image through the
# reduce so each step draws on top of the previous result.
formed_tensor
|> Nx.to_list()
|> Enum.reduce(img, fn prediction, drawed_mat ->
  # Coordinates and the class index are truncated to integers.
  [left, top, right, bottom, class_index] =
    prediction
    |> Enum.slice(0..4)
    |> Enum.map(&trunc/1)

  # The score is rendered as a string rounded to three decimal places.
  score =
    prediction
    |> Enum.at(5)
    |> Float.round(3)
    |> Float.to_string()

  # Look up the label that corresponds to the class index.
  label = Enum.at(labels, class_index)

  drawed_mat
  # Draw the bounding rectangle.
  |> Evision.rectangle(
    {left, top},
    {right, bottom},
    {255, 0, 0},
    thickness: 10
  )
  # Write "label:score" just inside the top-left corner of the box.
  |> Evision.putText(
    label <> ":" <> score,
    {left + 6, top + 26},
    Evision.Constant.cv_FONT_HERSHEY_SIMPLEX(),
    1,
    {0, 0, 255},
    thickness: 2
  )
end)

YOLOv7 のモジュール化

import Nx.Defn
defmodule YOLOv7 do
  @moduledoc """
  Object detection with a YOLOv7 network loaded through `Evision.DNN`.

  `detect/5` runs the network on an image and returns one row per detection,
  `[min_x, min_y, max_x, max_y, class_index, score]`, in image pixel
  coordinates. `draw_bbox/3` renders such rows onto an image.

  When nothing is detected, `detect/5`, `filter_predictions/2` and `nms/5`
  return `nil` instead of a tensor, and `draw_bbox/3` returns the image
  unchanged for `nil`.
  """

  @doc """
  Runs the network on `img` and returns the detections that survive both the
  score threshold and non-maximum suppression.

  `net` and `out_names` are the loaded network and the output layer names to
  forward. Returns a tensor with one `[min_x, min_y, max_x, max_y,
  class_index, score]` row per detection, or `nil` when there is none.
  """
  def detect(img, net, out_names, score_threshold, nms_threshold) do
    blob = Evision.DNN.blobFromImage(img, size: {640, 640}, swapRB: true, crop: false)

    predictions =
      net
      |> Evision.DNN.Net.setInput(
        blob,
        name: "",
        scalefactor: 1 / 255,
        mean: {0, 0, 0}
      )
      |> Evision.DNN.Net.forward(outBlobNames: out_names)
      # Only the first output is used.
      |> Enum.at(0)
      |> Evision.Mat.to_nx(EXLA.Backend)

    case filter_predictions(predictions, score_threshold) do
      nil ->
        nil

      selected_tensor ->
        {img_height, img_width, _channels} = Evision.Mat.shape(img)
        coordinate_tensor = selected_tensor[[0..-1//1, 0..3]]
        formed_coordinate_tensor = format_coordinates(coordinate_tensor, img_width, img_height)

        {class_index_tensor, score_tensor} = get_class_and_score(selected_tensor)

        nms(
          formed_coordinate_tensor,
          class_index_tensor,
          score_tensor,
          score_threshold,
          nms_threshold
        )
    end
  end

  @doc """
  Keeps the rows of batch entry 0 whose box score (index 4 of each row) is
  greater than `score_threshold`.

  Returns the kept rows as a tensor, or `nil` when no row passes.
  """
  def filter_predictions(predictions, score_threshold) do
    greater_tensor = Nx.greater(predictions[[0, 0..-1//1, 4]], score_threshold)

    greater_count =
      greater_tensor
      |> Nx.sum()
      |> Nx.to_number()

    if greater_count == 0 do
      # Without this guard the range below would be `0..-1`, which does not
      # describe an empty selection.
      nil
    else
      # Sorting the 0/1 mask in descending order moves the indices of the
      # passing rows to the front; take the first `greater_count` of them.
      greater_indices =
        Nx.argsort(greater_tensor, direction: :desc)[[0..(greater_count - 1)//1]]

      Nx.take(predictions[0], greater_indices, axis: 0)
    end
  end

  @doc """
  Converts `[center_x, center_y, width, height]` rows given in the 640x640
  network input space into `[min_x, min_y, max_x, max_y]` rows scaled to an
  image of the given `width` and `height`.
  """
  defn format_coordinates(coordinate_tensor, width, height) do
    bbox_half_width = coordinate_tensor[[0..-1//1, 2]] / 2
    bbox_half_height = coordinate_tensor[[0..-1//1, 3]] / 2
    width_ratio = width / 640
    height_ratio = height / 640

    min_x_tensor = (coordinate_tensor[[0..-1//1, 0]] - bbox_half_width) * width_ratio
    min_y_tensor = (coordinate_tensor[[0..-1//1, 1]] - bbox_half_height) * height_ratio
    max_x_tensor = (coordinate_tensor[[0..-1//1, 0]] + bbox_half_width) * width_ratio
    max_y_tensor = (coordinate_tensor[[0..-1//1, 1]] + bbox_half_height) * height_ratio

    [min_x_tensor, min_y_tensor, max_x_tensor, max_y_tensor]
    |> Nx.stack()
    |> Nx.transpose()
  end

  @doc """
  Returns `{class_index_tensor, score_tensor}` for the given prediction rows.

  The class index is the argmax over the values from index 5 onwards, and the
  score is the box score (index 4) multiplied by that best class score.
  """
  defn get_class_and_score(selected_tensor) do
    bbox_score_tensor = selected_tensor[[0..-1//1, 4]]
    class_score_tensor = selected_tensor[[0..-1//1, 5..-1//1]]
    class_index_tensor = Nx.argmax(class_score_tensor, axis: 1)
    top_class_score_tensor = Nx.reduce_max(class_score_tensor, axes: [1])
    score_tensor = bbox_score_tensor * top_class_score_tensor

    {class_index_tensor, score_tensor}
  end

  @doc """
  Applies non-maximum suppression to `[min_x, min_y, max_x, max_y]` boxes.

  Returns a tensor with one `[min_x, min_y, max_x, max_y, class_index, score]`
  row per surviving box, or `nil` when no box survives.
  """
  def nms(
        formed_coordinate_tensor,
        class_index_tensor,
        score_tensor,
        score_threshold,
        nms_threshold
      ) do
    score_list = Nx.to_list(score_tensor)

    # OpenCV's NMSBoxes takes rectangles as [x, y, width, height]; passing the
    # corner form directly would compute overlaps on wrong rectangles.
    selected_indices =
      formed_coordinate_tensor
      |> to_xywh()
      |> Evision.DNN.nmsBoxes(score_list, score_threshold, nms_threshold)

    case selected_indices do
      [] ->
        nil

      indices ->
        selected_index_tensor = Nx.tensor(indices)

        selected_bboxes = Nx.take(formed_coordinate_tensor, selected_index_tensor)

        selected_classes =
          class_index_tensor |> Nx.take(selected_index_tensor) |> Nx.new_axis(1)

        selected_scores =
          score_tensor |> Nx.take(selected_index_tensor) |> Nx.new_axis(1)

        Nx.concatenate([selected_bboxes, selected_classes, selected_scores], axis: 1)
    end
  end

  @doc """
  Draws each detection row of `bbox_tensor` onto `img` as a rectangle with a
  `"label:score"` caption, where the label is looked up in `labels` by class
  index.

  Returns the resulting image; `img` is returned unchanged when `bbox_tensor`
  is `nil`.
  """
  def draw_bbox(img, nil, _labels), do: img

  def draw_bbox(img, bbox_tensor, labels) do
    bbox_tensor
    |> Nx.to_list()
    |> Enum.reduce(img, fn prediction, drawed_mat ->
      # Coordinates and the class index are truncated to integers.
      [left, top, right, bottom, class_index] =
        prediction
        |> Enum.slice(0..4)
        |> Enum.map(&trunc/1)

      # The score is rendered rounded to three decimal places.
      score =
        prediction
        |> Enum.at(5)
        |> Float.round(3)
        |> Float.to_string()

      # Look up the label that corresponds to the class index.
      label = Enum.at(labels, class_index)

      drawed_mat
      # Draw the bounding rectangle.
      |> Evision.rectangle(
        {left, top},
        {right, bottom},
        {255, 0, 0},
        thickness: 10
      )
      # Write the caption just inside the top-left corner of the box.
      |> Evision.putText(
        label <> ":" <> score,
        {left + 6, top + 26},
        Evision.Constant.cv_FONT_HERSHEY_SIMPLEX(),
        1,
        {0, 0, 255},
        thickness: 2
      )
    end)
  end

  # Converts [min_x, min_y, max_x, max_y] rows to [x, y, width, height] rows.
  defnp to_xywh(corner_tensor) do
    left = corner_tensor[[0..-1//1, 0]]
    top = corner_tensor[[0..-1//1, 1]]
    box_width = corner_tensor[[0..-1//1, 2]] - left
    box_height = corner_tensor[[0..-1//1, 3]] - top

    [left, top, box_width, box_height]
    |> Nx.stack()
    |> Nx.transpose()
  end
end
# Run detection with a score threshold of 0.6 and an NMS threshold of 0.7.
bbox_tensor = YOLOv7.detect(img, net, out_names, 0.6, 0.7)
# Draw the detections on the image; the result is shown as the cell output.
YOLOv7.draw_bbox(img, bbox_tensor, labels)