# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT

"""
This file contains shared functions used in the object detection scripts for
preprocessing data, preparing the network and postprocessing.
"""

import os
import cv2
import numpy as np
import pyarmnn as ann


def create_video_writer(video: cv2.VideoCapture, video_path: str, output_path: str):
    """
    Creates a video writer object to write processed frames to file.

    Args:
        video: Video capture object, contains information about data source.
        video_path: User-specified video file path.
        output_path: Optional path to save the processed video.

    Returns:
        Video writer object.
    """
    _, ext = os.path.splitext(video_path)

    if output_path is not None:
        assert os.path.isdir(output_path), f'Output directory not found: {output_path}'

    # Write to the output directory if provided, otherwise to the working directory
    output_dir = output_path if output_path is not None else str()

    # Pick a filename that does not clobber an existing output file
    i, filename = 0, os.path.join(output_dir, f'object_detection_demo{ext}')
    while os.path.exists(filename):
        i += 1
        filename = os.path.join(output_dir, f'object_detection_demo({i}){ext}')

    video_writer = cv2.VideoWriter(filename=filename,
                                   fourcc=cv2.VideoWriter_fourcc(*'mp4v'),
                                   fps=int(video.get(cv2.CAP_PROP_FPS)),
                                   frameSize=(int(video.get(cv2.CAP_PROP_FRAME_WIDTH)),
                                              int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))))
    return video_writer
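

# Example usage (illustrative sketch; 'input.mp4' and 'output_dir' are
# hypothetical paths that must exist on disk):
#
#   video = cv2.VideoCapture('input.mp4')
#   video_writer = create_video_writer(video, 'input.mp4', 'output_dir')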


def create_network(model_file: str, backends: list):
    """
    Creates a network based on the model file and a list of backends.

    Args:
        model_file: User-specified model file.
        backends: List of backends to optimize the network for.

    Returns:
        net_id: Unique ID of the network to run.
        runtime: Runtime context for executing inference.
        input_binding_info: Contains essential information about the model input.
        output_binding_info: Used to map output tensors and their memory.
    """
    if not os.path.exists(model_file):
        raise FileNotFoundError(f'Model file not found: {model_file}')

    # Determine which parser to create based on the model file extension
    parser = None
    _, ext = os.path.splitext(model_file)
    if ext == '.tflite':
        parser = ann.ITfLiteParser()
    elif ext == '.pb':
        parser = ann.ITfParser()
    elif ext == '.onnx':
        parser = ann.IOnnxParser()
    assert parser is not None, f'Unsupported model file format: {ext}'

    network = parser.CreateNetworkFromBinaryFile(model_file)

    # Specify backends to optimize the network for
    preferred_backends = [ann.BackendId(backend) for backend in backends]

    # Select the appropriate device context and optimize the network for that device
    options = ann.CreationOptions()
    runtime = ann.IRuntime(options)
    opt_network, messages = ann.Optimize(network, preferred_backends, runtime.GetDeviceSpec(),
                                         ann.OptimizerOptions())
    print(f'Preferred backends: {backends}\n{runtime.GetDeviceSpec()}\n'
          f'Optimization warnings: {messages}')

    # Load the optimized network onto the runtime device
    net_id, _ = runtime.LoadNetwork(opt_network)

    # Get input and output binding information
    graph_id = parser.GetSubgraphCount() - 1
    input_names = parser.GetSubgraphInputTensorNames(graph_id)
    input_binding_info = parser.GetNetworkInputBindingInfo(graph_id, input_names[0])
    output_names = parser.GetSubgraphOutputTensorNames(graph_id)
    output_binding_info = [parser.GetNetworkOutputBindingInfo(graph_id, output_name)
                           for output_name in output_names]
    return net_id, runtime, input_binding_info, output_binding_info
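

# Example usage (illustrative sketch; 'ssd_mobilenet_v1.tflite' is a
# hypothetical model path, and the backend list tries the accelerated CPU
# backend before the CPU reference backend):
#
#   net_id, runtime, input_binding_info, output_binding_info = \
#       create_network('ssd_mobilenet_v1.tflite', ['CpuAcc', 'CpuRef'])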


def dict_labels(labels_file: str):
    """
    Creates a labels dictionary from the input labels file.

    Args:
        labels_file: Default or user-specified file containing the model output labels.

    Returns:
        A dictionary keyed on the classification index, with values corresponding to
        labels and randomly generated RGB colors.
    """
    labels_dict = {}
    with open(labels_file, 'r') as labels:
        for index, line in enumerate(labels):
            # Map each class index to a (label, random color) pair
            labels_dict[index] = line.strip('\n'), tuple(np.random.random(size=3) * 255)
    return labels_dict
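

# For a labels file whose first lines are, for example:
#
#   person
#   bicycle
#
# dict_labels() returns {0: ('person', (R, G, B)), 1: ('bicycle', (R, G, B)), ...},
# where each color is a tuple of random floats in [0, 255).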


def resize_with_aspect_ratio(frame: np.ndarray, input_binding_info: tuple):
    """
    Resizes the frame while maintaining aspect ratio, padding any empty space.

    Args:
        frame: Captured frame.
        input_binding_info: Contains the shape of the model input layer.

    Returns:
        Frame resized to the size of the model input layer.
    """
    aspect_ratio = frame.shape[1] / frame.shape[0]
    model_height, model_width = list(input_binding_info[1].GetShape())[1:3]

    if aspect_ratio >= 1.0:
        new_height, new_width = int(model_width / aspect_ratio), model_width
        b_padding, r_padding = model_height - new_height, 0
    else:
        new_height, new_width = model_height, int(model_height * aspect_ratio)
        b_padding, r_padding = 0, model_width - new_width

    # Resize and pad any empty space
    frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    frame = cv2.copyMakeBorder(frame, top=0, bottom=b_padding, left=0, right=r_padding,
                               borderType=cv2.BORDER_CONSTANT, value=[0, 0, 0])
    return frame
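

# Worked example: for a 1280x720 frame and a 300x300 model input, the aspect
# ratio is ~1.78, so the frame is resized to 300x168 and padded with 132 black
# rows at the bottom to reach 300x300.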


def preprocess(frame: np.ndarray, input_binding_info: tuple):
    """
    Takes a frame, resizes, swaps channels and converts the data type to match
    the model input layer. The converted frame is wrapped in a const tensor
    and bound to the input tensor.

    Args:
        frame: Captured frame from video.
        input_binding_info: Contains the shape and data type of the model input layer.

    Returns:
        Input tensor.
    """
    # Swap channels and resize frame to model resolution
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    resized_frame = resize_with_aspect_ratio(frame, input_binding_info)

    # Expand dimensions and convert data type to match model input
    data_type = np.float32 if input_binding_info[1].GetDataType() == ann.DataType_Float32 else np.uint8
    resized_frame = np.expand_dims(np.asarray(resized_frame, dtype=data_type), axis=0)
    assert resized_frame.shape == tuple(input_binding_info[1].GetShape())

    input_tensors = ann.make_input_tensors([input_binding_info], [resized_frame])
    return input_tensors
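

# For example, with a 300x300 uint8 model input (an assumption typical of
# quantized SSD MobileNet models), the wrapped array has shape (1, 300, 300, 3),
# matching input_binding_info[1].GetShape().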


def execute_network(input_tensors: list, output_tensors: list, runtime, net_id: int) -> list:
    """
    Executes inference for the loaded network.

    Args:
        input_tensors: The input tensors bound to the captured frame.
        output_tensors: The output tensors to be filled with the inference results.
        runtime: Runtime context for executing inference.
        net_id: Unique ID of the network to run.

    Returns:
        Inference results as a list of ndarrays.
    """
    runtime.EnqueueWorkload(net_id, input_tensors, output_tensors)
    output = ann.workload_tensors_to_ndarray(output_tensors)
    return output
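

# Example inference step (illustrative sketch; assumes net_id, runtime,
# input_binding_info and output_binding_info came from create_network(), and
# uses pyarmnn's make_output_tensors() to allocate the outputs):
#
#   input_tensors = preprocess(frame, input_binding_info)
#   output_tensors = ann.make_output_tensors(output_binding_info)
#   results = execute_network(input_tensors, output_tensors, runtime, net_id)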


def draw_bounding_boxes(frame: np.ndarray, detections: list, resize_factor, labels: dict):
    """
    Draws bounding boxes around detected objects and adds a label and confidence score.

    Args:
        frame: The original captured frame from the video source.
        detections: A list of detected objects in the form [class, [box positions], confidence].
        resize_factor: Resizing factor to scale box coordinates to output frame size.
        labels: Dictionary of labels and colors keyed on the classification index.
    """
    # Obtain the frame size once; it does not change between detections
    frame_height, frame_width = frame.shape[:2]

    for detection in detections:
        class_idx, box, confidence = detection
        label, color = labels[class_idx][0].capitalize(), labels[class_idx][1]

        # Scale the bounding box positions to the output frame size
        x_min, y_min, x_max, y_max = [int(position * resize_factor) for position in box]

        # Ensure the box stays within the frame
        x_min, y_min = max(0, x_min), max(0, y_min)
        x_max, y_max = min(frame_width, x_max), min(frame_height, y_max)

        # Draw bounding box around detected object
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, 2)

        # Create label for detected object class
        label = f'{label} {confidence * 100:.1f}%'
        label_color = (0, 0, 0) if sum(color) > 200 else (255, 255, 255)

        # Make sure label always stays on-screen
        x_text, y_text = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, 1, 1)[0][:2]

        lbl_box_xy_min = (x_min, y_min if y_min < 25 else y_min - y_text)
        lbl_box_xy_max = (x_min + int(0.55 * x_text), y_min + y_text if y_min < 25 else y_min)
        lbl_text_pos = (x_min + 5, y_min + 16 if y_min < 25 else y_min - 5)

        # Add label and confidence value
        cv2.rectangle(frame, lbl_box_xy_min, lbl_box_xy_max, color, -1)
        cv2.putText(frame, label, lbl_text_pos, cv2.FONT_HERSHEY_DUPLEX, 0.50,
                    label_color, 1, cv2.LINE_AA)
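

# Putting it together (illustrative frame loop; decode_detections() is a
# hypothetical post-processing helper, since box decoding is model-specific
# and handled elsewhere in the demo scripts):
#
#   while True:
#       frame_present, frame = video.read()
#       if not frame_present:
#           break
#       input_tensors = preprocess(frame, input_binding_info)
#       output_tensors = ann.make_output_tensors(output_binding_info)
#       output = execute_network(input_tensors, output_tensors, runtime, net_id)
#       detections = decode_detections(output)
#       draw_bounding_boxes(frame, detections, resize_factor, labels)
#       video_writer.write(frame)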
|