YOLOv3 object detection with OpenCV DNN
Mix.install([
{:evision, "0.1.29"},
{:kino, "~> 0.8.0"},
{:req, "~> 0.3.0"}
])
Introduction
YOLO
- YOLO (You Only Look Once)
- a method for object detection in a single forward pass of the network
OpenCV DNN module
- DNN (Deep Neural Network)
- used for running inference on images and videos with pre-trained models
Download data files
default_image_url = "https://raw.githubusercontent.com/pjreddie/darknet/master/data/dog.jpg"
input_image_url_input = Kino.Input.textarea("Image URL", default: default_image_url)
downloads_dir = System.tmp_dir!()
download = fn url ->
save_as = Path.join(downloads_dir, URI.encode_www_form(url))
unless File.exists?(save_as), do: Req.get!(url, output: save_as)
save_as
end
data_files =
[
# a YOLO v3 weights file pre-trained on the COCO (COmmon Objects in COntext) dataset
yolov3_weights: "https://pjreddie.com/media/files/yolov3.weights",
# a YOLO v3 config file
yolov3_config: "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg",
# a text file containing 80 class names
coco_names: "https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names",
# the input image to run detection on
input_image: Kino.Input.read(input_image_url_input)
]
|> Enum.map(fn {key, url} -> {key, download.(url)} end)
|> Map.new()
Alias modules
alias Evision, as: Cv
Prepare input
Read input image
input_image_mat = Cv.imread(data_files.input_image)
{img_height, img_width, _} = Cv.Mat.shape(input_image_mat)
IO.puts("height: #{img_height}")
IO.puts("width: #{img_width}")
input_image_mat
Read class names
class_names =
data_files.coco_names
|> File.stream!()
|> Enum.map(&String.trim/1)
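COCO defines 80 classes, so the list should have exactly 80 entries; the match below fails loudly if the file was truncated:
# quick sanity check on the class list
80 = length(class_names)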
Create network with pre-trained model and configuration
net =
Cv.DNN.readNet(
data_files.yolov3_weights,
config: data_files.yolov3_config,
# an empty string lets OpenCV infer the framework (Darknet here) from the file names
framework: ""
)
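Optionally, the network can be told which compute backend and target to prefer. A minimal sketch, assuming your Evision build exposes the standard OpenCV DNN constants (plain OpenCV backend on CPU here):
# prefer the built-in OpenCV backend and run on the CPU (assumed constants)
net = Cv.DNN.Net.setPreferableBackend(net, Cv.Constant.cv_DNN_BACKEND_OPENCV())
net = Cv.DNN.Net.setPreferableTarget(net, Cv.Constant.cv_DNN_TARGET_CPU())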
Run inference
# convert the image into a 4-D blob, resized to the 608x608 network input, swapping BGR -> RGB
input_blob = Cv.DNN.blobFromImage(input_image_mat, size: {608, 608}, swapRB: true, crop: false)
output_layer_names = Cv.DNN.Net.getUnconnectedOutLayersNames(net)
# set the input blob for the network, scaling pixel values by 1/255 into [0, 1]
predictions =
net
|> Cv.DNN.Net.setInput(
input_blob,
name: "",
scalefactor: 1 / 255,
mean: {0, 0, 0}
)
|> Cv.DNN.Net.forward(outBlobNames: output_layer_names)
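YOLOv3 predicts at three scales, so predictions should hold three matrices, each with one row per candidate box and 85 columns (4 box coordinates, 1 objectness score, 80 class scores). For a 608x608 input the row counts come from 3 anchors at 19x19, 38x38 and 76x76 grid cells:
# expected: [{1083, 85}, {4332, 85}, {17328, 85}]
Enum.map(predictions, &Cv.Mat.shape/1)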
Get info from predictions
score_threshold = 0.8
predictions_info =
predictions
|> Enum.map(&Cv.Mat.to_nx(&1, Nx.BinaryBackend))
|> Nx.concatenate()
|> Nx.to_batched(1)
# ignore detections with low confidence
|> Enum.filter(fn t ->
score = Nx.to_number(t[0][4])
score > score_threshold
end)
|> Enum.map(fn t ->
# get class with top score
class_score_list = t[0][5..-1//1]
class_id = class_score_list |> Nx.argmax() |> Nx.to_number()
# calculate score
class_score = class_score_list[class_id] |> Nx.to_number()
score = Nx.to_number(t[0][4]) * class_score
# get bounding box
center_x = t[0][0] |> Nx.to_number()
center_y = t[0][1] |> Nx.to_number()
box_width = t[0][2] |> Nx.to_number()
box_height = t[0][3] |> Nx.to_number()
min_x = center_x - box_width / 2
min_y = center_y - box_height / 2
max_x = center_x + box_width / 2
max_y = center_y + box_height / 2
box = {min_x, min_y, max_x, max_y}
%{
box: box,
score: score,
class: class_id
}
end)
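As a quick sanity check before NMS, inspect the highest-scoring detection:
# highest-confidence candidate and its class name
best_prediction = Enum.max_by(predictions_info, & &1.score)
{best_prediction.score, Enum.at(class_names, best_prediction.class)}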
Non-max suppression (NMS)
- remove redundant bounding boxes whose overlap with a higher-scoring box exceeds a threshold
# Cv.DNN.nmsBoxes expects boxes as {x, y, width, height}, so convert
# the {min_x, min_y, max_x, max_y} corners computed above
box_list =
  Enum.map(predictions_info, fn %{box: {min_x, min_y, max_x, max_y}} ->
    {min_x, min_y, max_x - min_x, max_y - min_y}
  end)
score_list = Enum.map(predictions_info, & &1.score)
nms_threshold = 0.7
index_list = Cv.DNN.nmsBoxes(box_list, score_list, score_threshold, nms_threshold)
predictions_info_with_nms = Enum.map(index_list, &Enum.at(predictions_info, &1))
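Comparing the counts shows how many overlapping boxes NMS discarded:
IO.puts("boxes before NMS: #{length(predictions_info)}")
IO.puts("boxes after NMS: #{length(predictions_info_with_nms)}")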
Visualize predictions
- draw the detection results on the original image
# scale the normalized [0, 1] box corners back to pixel coordinates
calc_prediction_box = fn {min_x, min_y, max_x, max_y} ->
  {img_height, img_width, _} = Cv.Mat.shape(input_image_mat)
  left = trunc(min_x * img_width)
  top = trunc(min_y * img_height)
  right = trunc(max_x * img_width)
  bottom = trunc(max_y * img_height)
  {left, top, right, bottom}
end
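For example, a hypothetical normalized box spanning the central half of the image maps to pixel corners like this:
# made-up normalized corners, just to exercise the helper
calc_prediction_box.({0.25, 0.25, 0.75, 0.75})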
for prediction_info <- predictions_info_with_nms, reduce: input_image_mat do
acc_mat ->
{left, top, right, bottom} = calc_prediction_box.(prediction_info.box)
box_start_point = {left, top}
box_end_point = {right, bottom}
# colors are in BGR order: {255, 0, 0} is blue, {0, 0, 255} is red
box_color = {255, 0, 0}
label_text = Enum.at(class_names, prediction_info.class)
label_start_point = {left + 6, top + 30}
label_font_scale = 0.9
label_color = {0, 0, 255}
acc_mat
|> Cv.rectangle(
box_start_point,
box_end_point,
box_color,
thickness: 4
)
|> Cv.putText(
label_text,
label_start_point,
Cv.Constant.cv_FONT_HERSHEY_SIMPLEX(),
label_font_scale,
label_color,
thickness: 2
)
end
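To keep the annotated image, bind the result of the cell above to a variable (for example, result_mat = for prediction_info <- ..., reduce: ... do ... end) and write it to disk; the file name below is arbitrary:
# assumes the drawing result was bound as result_mat (hypothetical binding)
Cv.imwrite(Path.join(downloads_dir, "predictions.jpg"), result_mat)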