//
// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <arm_compute/runtime/CL/functions/CLActivationLayer.h>

#include <cl/ClImportTensorHandle.hpp>
#include <cl/ClImportTensorHandleFactory.hpp>
#include <cl/test/ClContextControlFixture.hpp>

#include <doctest/doctest.h>

#include <armnn/IRuntime.hpp>
#include <armnn/INetwork.hpp>
#include "Network.hpp"

using namespace armnn;

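// These tests exercise zero-copy import of user-allocated (malloc'd) host memory into
// GPU-accessible CL tensors. "Import" here means the backend wraps the user's buffer
// instead of copying it; the profiler output is used to verify which path actually ran.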
TEST_SUITE("ClImportTensorHandleTests")
{
TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
    unsigned int numElements = info.GetNumElements();

    // create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info);

    // Get CLtensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Create and configure activation function
    const arm_compute::ActivationLayerInfo act_info(arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
    arm_compute::CLActivationLayer act_func;
    act_func.configure(&tensor, nullptr, act_info);

    // Allocate user memory
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
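    // Over-allocate by two cache lines so std::align below is guaranteed to find a
    // cache-line-aligned sub-buffer of totalBytes inside the unaligned allocation.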
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Import memory
    CHECK(handle->Import(alignedPtr, armnn::MemorySource::Malloc));
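    // A successful import means the CL tensor now references the user buffer directly,
    // so the writes below are visible to the GPU without an intermediate copy.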

    // Input with negative values
    auto* typedPtr = reinterpret_cast<float*>(alignedPtr);
    std::fill_n(typedPtr, numElements, -5.0f);

    // Execute function and sync
    act_func.run();
    arm_compute::CLScheduler::get().sync();
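    // run() only enqueues the kernel on the CL command queue; sync() blocks until the
    // queue has drained so the results can be read back through the imported pointer.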

    // Validate the result: ReLU clamps every -5.0f input to exactly 0
    for(unsigned int i = 0; i < numElements; ++i)
    {
        CHECK(typedPtr[i] == 0);
    }
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClIncorrectMemorySourceImport")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);

    // create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info);

    // Get CLtensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Allocate user memory
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Import memory
    CHECK_THROWS_AS(handle->Import(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
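    // The factory was configured with Malloc as its only supported source, so an
    // import request with MemorySource::Undefined is rejected.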
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClInvalidMemorySourceImport")
{
    MemorySource invalidMemSource = static_cast<MemorySource>(256);
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(invalidMemSource),
                                              static_cast<MemorySourceFlags>(invalidMemSource));

    TensorInfo info({ 1, 2, 2, 1 }, DataType::Float32);

    // create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info);

    // Allocate user memory
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    // Import non-supported memory
    CHECK_THROWS_AS(handle->Import(inputData.data(), invalidMemSource), MemoryImportException);
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClImportEndToEnd")
{
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0, "Input");

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::ReLu;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor, "Activation");

    IConnectableLayer* output = net->AddOutputLayer(0, "Output");

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo tensorInfo = TensorInfo({ 1, 24, 16, 3 }, DataType::Float32);
    unsigned int numElements = tensorInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    input->GetOutputSlot(0).SetTensorInfo(tensorInfo);
    activation->GetOutputSlot(0).SetTensorInfo(tensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = true;
    optOptions.m_ExportEnabled = true;
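    // With import and export enabled the optimizer plans zero-copy handling of inputs
    // and outputs, provided the buffers supplied at inference time are suitably aligned.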
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Enable Importing
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
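    // Declaring Malloc as the memory source for both inputs and outputs tells the
    // runtime that the user's buffers may be imported rather than copied.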
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Input with negative values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    std::fill_n(inputPtr, numElements, -5.0f);

    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ActivationWorkload
    std::size_t found = dump.find("ActivationWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);
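    // SyncMemGeneric is the lightweight workload that makes the imported output visible
    // to the host; the absence of CopyMemGeneric confirms no copy fallback occurred.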

    runtime->UnloadNetwork(netId);

    // Check output is as expected
    // Validate result by checking that the output has no negative values
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);
    for(unsigned int i = 0; i < numElements; ++i)
    {
        CHECK(outputResult[i] >= 0);
    }
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClCanBeImported")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);

    // create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);

    // Get CLtensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Allocate user memory
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Query importability with an unsupported memory source
    CHECK_THROWS_AS(handle->CanBeImported(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
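    // Note: CanBeImported performs the same memory-source validation as Import but
    // without binding the buffer, which is why the unsupported source throws here.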
}

TEST_CASE("ClCanBeImportedAlignedMemory")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 1, 1, 1 }, DataType::Float32);

    // create TensorHandle (Memory Managed status is irrelevant)
    auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);
    // Get CLtensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Create an aligned buffer
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Check aligned buffers return true
    CHECK(handle->CanBeImported(alignedPtr, MemorySource::Malloc) == true);

    // Due to the nature of how GPU memory is mapped, it is entirely possible for memory which is misaligned on the
    // CPU to be successfully imported on the GPU. As such there is no way to create a misaligned pointer that will
    // always fail: it will succeed on some devices and fail on others. As long as a correctly aligned buffer
    // returns true we can be confident that it will be successfully imported. All other cases will need to be
    // handled by the user.
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
{
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };
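    // Sanity check of the first element: the top kernel row lands in the zero padding
    // and the middle row is all zeros, so only (3, 2, 1) against (pad, 8, 7) from input
    // row 1 contributes: 2*8 + 1*7 = 23.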

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;

    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
    armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);

    ARMNN_ASSERT(convLayer);

    weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
    weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    optOptions.m_ExportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Import is not enabled at load time; it is forced later via ImportInputs/ImportOutputs
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
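    // "Force import": the network was optimized with import disabled, but pre-importing
    // the tensors here makes EnqueueWorkload bind the user buffers directly, which the
    // profiler checks below confirm.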

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);

    // Check the output is correct
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp16toFp32EndToEnd")
{
    using namespace half_float::literal;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    NetworkImpl network;

    armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float16);
    armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);

    std::vector<float> expectedOutput =
    {
        -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
        1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytesInput = numElements * sizeof(Half);
    size_t totalBytesOutput = numElements * sizeof(float);
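    // Input and output differ in element size (fp16 in, fp32 out), so each side gets
    // its own byte count and its own aligned allocation below.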

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::IConnectableLayer* const convLayer = network.AddConvertFp16ToFp32Layer("convert");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network.AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    optOptions.m_ExportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Import is not enabled at load time; it is forced later via ImportInputs/ImportOutputs
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t spaceInput = totalBytesInput + alignment + alignment;
    size_t spaceOutput = totalBytesOutput + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(spaceInput);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));

    // Input with negative values
    auto* inputPtr = reinterpret_cast<Half*>(alignedInputPtr);
    inputPtr[0] = -37.5_h;
    inputPtr[1] = -15.2_h;
    inputPtr[2] = -8.76_h;
    inputPtr[3] = -2.0_h;
    inputPtr[4] = -1.5_h;
    inputPtr[5] = -1.3_h;
    inputPtr[6] = -0.5_h;
    inputPtr[7] = -0.4_h;
    inputPtr[8] = 0.0_h;
    inputPtr[9] = 1.0_h;
    inputPtr[10] = 0.4_h;
    inputPtr[11] = 0.5_h;
    inputPtr[12] = 1.3_h;
    inputPtr[13] = 1.5_h;
    inputPtr[14] = 2.0_h;
    inputPtr[15] = 8.76_h;
    inputPtr[16] = 15.2_h;
    inputPtr[17] = 37.5_h;

    auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ConvertFp16ToFp32Workload
    std::size_t found = dump.find("ConvertFp16ToFp32Workload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);

    // Check the output is correct
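    // fp16 cannot represent values such as 8.76 exactly, so compare with a small
    // relative epsilon rather than exact equality.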
    for (size_t i = 0; i < numElements; ++i)
    {
        DOCTEST_CHECK_MESSAGE(outputResult[i] == doctest::Approx(expectedOutput[i]).epsilon(0.0004),
                              "outputValue[" << i << "]: " << outputResult[i] << " != " << expectedOutput[i]);
    }
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp32toFp16EndToEnd")
{
    using namespace half_float::literal;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    NetworkImpl network;

    armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float32);
    armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);

    std::vector<Half> expectedOutput =
    {
        -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
        1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytesInput = numElements * sizeof(float);
    size_t totalBytesOutput = numElements * sizeof(Half);

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network.AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    optOptions.m_ExportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Import is not enabled at load time; it is forced later via ImportInputs/ImportOutputs
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t spaceInput = totalBytesInput + alignment + alignment;
    size_t spaceOutput = totalBytesOutput + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(spaceInput);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));

    // Input with negative values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = -37.5f;
    inputPtr[1] = -15.2f;
    inputPtr[2] = -8.76f;
    inputPtr[3] = -2.0f;
    inputPtr[4] = -1.5f;
    inputPtr[5] = -1.3f;
    inputPtr[6] = -0.5f;
    inputPtr[7] = -0.4f;
    inputPtr[8] = 0.0f;
    inputPtr[9] = 1.0f;
    inputPtr[10] = 0.4f;
    inputPtr[11] = 0.5f;
    inputPtr[12] = 1.3f;
    inputPtr[13] = 1.5f;
    inputPtr[14] = 2.0f;
    inputPtr[15] = 8.76f;
    inputPtr[16] = 15.2f;
    inputPtr[17] = 37.5f;

    auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
    auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ConvertFp32ToFp16Workload
    std::size_t found = dump.find("ConvertFp32ToFp16Workload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check output is as expected
    auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
    CHECK(outputResult);

    // Check the output is correct
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportSimpleConvertFp32toFp16EndToEnd")
{
    using namespace half_float::literal;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    NetworkImpl network;

    armnn::TensorInfo inputInfo({1}, armnn::DataType::Float32);
    armnn::TensorInfo outputTensorInfo({1}, armnn::DataType::Float16);

    std::vector<Half> expectedOutput = { 1.0_h };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytesInput = numElements * sizeof(float);
    size_t totalBytesOutput = numElements * sizeof(Half);

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network.AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    optOptions.m_ExportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Import is not enabled at load time; it is forced later via ImportInputs/ImportOutputs
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t spaceInput = totalBytesInput + alignment + alignment;
    size_t spaceOutput = totalBytesOutput + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(spaceInput);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));

    // Fill input data
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1.0f;

    auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
    auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ConvertFp32ToFp16Workload
    std::size_t found = dump.find("ConvertFp32ToFp16Workload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check output is as expected
    auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
    CHECK(outputResult);

    // Check the output is correct
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesEndToEndTest")
{
/*
 * This test checks the Forced Import functionality across repeated inferences that switch from importing to
 * copying. For the first inference we create aligned pointers and check that they are imported correctly. For
 * the second we use similar pointers but skip pre-importing, forcing a fall back to copying.
 */
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;
    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
    ARMNN_ASSERT(convLayer);

    armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);

    weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
    weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    optOptions.m_ExportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Import is not enabled at load time; it is forced later via ImportInputs/ImportOutputs
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Repeat the inference, with new tensors and without using PreImporting to force it to fall back to copying

    // Creates structures for input & output
    auto inputDataCopy = std::make_unique<uint8_t[]>(space);
    void* copyInputPtr = inputDataCopy.get();

    // Fill input with values
    auto* inputCopyPtr = reinterpret_cast<float*>(copyInputPtr);
    inputCopyPtr[0] = 1;
    inputCopyPtr[1] = 5;
    inputCopyPtr[2] = 2;
    inputCopyPtr[3] = 3;
    inputCopyPtr[4] = 8;
    inputCopyPtr[5] = 7;
    inputCopyPtr[6] = 3;
    inputCopyPtr[7] = 6;
    inputCopyPtr[8] = 3;
    inputCopyPtr[9] = 3;
    inputCopyPtr[10] = 9;
    inputCopyPtr[11] = 1;

    // Output pre-filled with -10.0f
    auto outputDataCopy = std::make_unique<uint8_t[]>(space);
    void* copyOutputPtr = outputDataCopy.get();
    auto* outputCopyPtr = reinterpret_cast<float*>(copyOutputPtr);
    std::fill_n(outputCopyPtr, numElements, -10.0f);

    InputTensors inputTensorsCopy
    {
        {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
    };
    OutputTensors outputTensorsCopy
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
    };

    // Do the inference without any pre-imported input/output ids
    runtime->EnqueueWorkload(netId, inputTensorsCopy, outputTensorsCopy);
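    // Without pre-imported ids (and with import disabled at load time) the runtime
    // falls back to copying these buffers, which appears as CopyMemGeneric below.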
    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check the output is correct
    outputResult = reinterpret_cast<float*>(copyOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Query the profiler again, this will contain the results of both inferences
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    // Contains Convolution2dWorkload
    found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Should still contain the SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Should now also contain a CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);
    runtime->UnloadNetwork(netId);
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesInvertedEndToEndTest")
{
/*
 * This test is similar to the one above, but instead of importing first and then copying, we start with a copy
 * and then do the import.
 */
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // build up the structure of the network
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;

    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
    ARMNN_ASSERT(convLayer);

    armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);

    weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
    weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    optOptions.m_ExportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Import is not enabled at load time; it is forced later via ImportInputs/ImportOutputs
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* copyInputPtr = inputData.get();

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(copyInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    // Create output buffer and fill it with -10.0f
    auto outputData = std::make_unique<uint8_t[]>(space);
    void* copyOutputPtr = outputData.get();
    auto* outputPtr = reinterpret_cast<float*>(copyOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference without any pre-imported inputs/outputs
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Does not contain SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found == std::string::npos);

    // Does contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);
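    // With plain, non-imported tensors the runtime copies data in and out, so
    // CopyMemGeneric appears and the import-only SyncMemGeneric does not.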

    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(copyOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Repeat the inference, with new tensors and while using pre-importing to force it to import

    // Creates structures for input & output
    auto inputDataImport = std::make_unique<uint8_t[]>(space);
    void* alignedInputImportPtr = inputDataImport.get();
    CHECK(std::align(alignment, totalBytes, alignedInputImportPtr, space));

    // Fill input with values
    auto* inputImportPtr = reinterpret_cast<float*>(alignedInputImportPtr);
    inputImportPtr[0] = 1;
    inputImportPtr[1] = 5;
    inputImportPtr[2] = 2;
    inputImportPtr[3] = 3;
    inputImportPtr[4] = 8;
    inputImportPtr[5] = 7;
    inputImportPtr[6] = 3;
    inputImportPtr[7] = 6;
    inputImportPtr[8] = 3;
    inputImportPtr[9] = 3;
    inputImportPtr[10] = 9;
    inputImportPtr[11] = 1;

    // Output pre-filled with -10.0f
    auto outputDataImport = std::make_unique<uint8_t[]>(space);
    void* alignedOutputImportPtr = outputDataImport.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputImportPtr, space));
    auto* outputImportPtr = reinterpret_cast<float*>(alignedOutputImportPtr);
    std::fill_n(outputImportPtr, numElements, -10.0f);

    InputTensors inputTensorsImport
    {
        {0,armnn::ConstTensor(inputTensorInfo, alignedInputImportPtr)},
    };
    OutputTensors outputTensorsImport
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputImportPtr)}
    };

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensorsImport, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensorsImport, MemorySource::Malloc);

    // Do the inference with pre-imported inputs/outputs
    runtime->EnqueueWorkload(netId, inputTensorsImport, outputTensorsImport, importedInputIds, importedOutputIds);
    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check the output is correct
    outputResult = reinterpret_cast<float*>(alignedOutputImportPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Query the profiler again, this will contain the results of both inferences
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    // Contains Convolution2dWorkload
    found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Should now contain the SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Should still contain a CopyMemGeneric from the first inference
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);
    runtime->UnloadNetwork(netId);
}

}