blob: 20537b3c814c3b69e6f0f316795cfcaec8c81c97 [file] [log] [blame]
David Monahane4a41dc2021-04-14 16:55:36 +01001//
2// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
3// SPDX-License-Identifier: MIT
4//
5
6#include <arm_compute/runtime/CL/functions/CLActivationLayer.h>
7
8#include <cl/ClImportTensorHandle.hpp>
9#include <cl/ClImportTensorHandleFactory.hpp>
10#include <cl/test/ClContextControlFixture.hpp>
11
Sadik Armagan1625efc2021-06-10 18:24:34 +010012#include <doctest/doctest.h>
13
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +010014#include <armnn/IRuntime.hpp>
15#include <armnn/INetwork.hpp>
Cathal Corbetta3f4fba2022-03-21 09:27:08 +000016#include "Network.hpp"
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +010017
David Monahane4a41dc2021-04-14 16:55:36 +010018using namespace armnn;
19
Sadik Armagan1625efc2021-06-10 18:24:34 +010020TEST_SUITE("ClImportTensorHandleTests")
21{
22TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport")
David Monahane4a41dc2021-04-14 16:55:36 +010023{
24 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
25 static_cast<MemorySourceFlags>(MemorySource::Malloc));
26
27 TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
28 unsigned int numElements = info.GetNumElements();
29
30 // create TensorHandle for memory import
31 auto handle = handleFactory.CreateTensorHandle(info);
32
33 // Get CLtensor
34 arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
35
36 // Create and configure activation function
37 const arm_compute::ActivationLayerInfo act_info(arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
38 arm_compute::CLActivationLayer act_func;
39 act_func.configure(&tensor, nullptr, act_info);
40
41 // Allocate user memory
42 const size_t totalBytes = tensor.info()->total_size();
43 const size_t alignment =
44 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +010045 size_t space = totalBytes + alignment + alignment;
David Monahane4a41dc2021-04-14 16:55:36 +010046 auto testData = std::make_unique<uint8_t[]>(space);
47 void* alignedPtr = testData.get();
Sadik Armagan1625efc2021-06-10 18:24:34 +010048 CHECK(std::align(alignment, totalBytes, alignedPtr, space));
David Monahane4a41dc2021-04-14 16:55:36 +010049
50 // Import memory
Sadik Armagan1625efc2021-06-10 18:24:34 +010051 CHECK(handle->Import(alignedPtr, armnn::MemorySource::Malloc));
David Monahane4a41dc2021-04-14 16:55:36 +010052
53 // Input with negative values
54 auto* typedPtr = reinterpret_cast<float*>(alignedPtr);
55 std::fill_n(typedPtr, numElements, -5.0f);
56
57 // Execute function and sync
58 act_func.run();
59 arm_compute::CLScheduler::get().sync();
60
61 // Validate result by checking that the output has no negative values
62 for(unsigned int i = 0; i < numElements; ++i)
63 {
Jan Eilersc1c872f2021-07-22 13:17:04 +010064 CHECK(typedPtr[i] == 0);
David Monahane4a41dc2021-04-14 16:55:36 +010065 }
66}
67
Sadik Armagan1625efc2021-06-10 18:24:34 +010068TEST_CASE_FIXTURE(ClContextControlFixture, "ClIncorrectMemorySourceImport")
David Monahane4a41dc2021-04-14 16:55:36 +010069{
70 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
71 static_cast<MemorySourceFlags>(MemorySource::Malloc));
72
73 TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
74
75 // create TensorHandle for memory import
76 auto handle = handleFactory.CreateTensorHandle(info);
77
78 // Get CLtensor
79 arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
80
81 // Allocate user memory
82 const size_t totalBytes = tensor.info()->total_size();
83 const size_t alignment =
84 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +010085 size_t space = totalBytes + alignment + alignment;
David Monahane4a41dc2021-04-14 16:55:36 +010086 auto testData = std::make_unique<uint8_t[]>(space);
87 void* alignedPtr = testData.get();
Sadik Armagan1625efc2021-06-10 18:24:34 +010088 CHECK(std::align(alignment, totalBytes, alignedPtr, space));
David Monahane4a41dc2021-04-14 16:55:36 +010089
90 // Import memory
Sadik Armagan1625efc2021-06-10 18:24:34 +010091 CHECK_THROWS_AS(handle->Import(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
David Monahane4a41dc2021-04-14 16:55:36 +010092}
93
Sadik Armagan1625efc2021-06-10 18:24:34 +010094TEST_CASE_FIXTURE(ClContextControlFixture, "ClInvalidMemorySourceImport")
David Monahane4a41dc2021-04-14 16:55:36 +010095{
96 MemorySource invalidMemSource = static_cast<MemorySource>(256);
97 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(invalidMemSource),
98 static_cast<MemorySourceFlags>(invalidMemSource));
99
100 TensorInfo info({ 1, 2, 2, 1 }, DataType::Float32);
101
102 // create TensorHandle for memory import
103 auto handle = handleFactory.CreateTensorHandle(info);
104
105 // Allocate user memory
106 std::vector<float> inputData
107 {
108 1.0f, 2.0f, 3.0f, 4.0f
109 };
110
111 // Import non-support memory
Sadik Armagan1625efc2021-06-10 18:24:34 +0100112 CHECK_THROWS_AS(handle->Import(inputData.data(), invalidMemSource), MemoryImportException);
David Monahane4a41dc2021-04-14 16:55:36 +0100113}
114
Sadik Armagan1625efc2021-06-10 18:24:34 +0100115TEST_CASE_FIXTURE(ClContextControlFixture, "ClImportEndToEnd")
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100116{
117 // Create runtime in which test will run
118 IRuntime::CreationOptions options;
119 IRuntimePtr runtime(armnn::IRuntime::Create(options));
120
121 // build up the structure of the network
122 INetworkPtr net(INetwork::Create());
123
124 IConnectableLayer* input = net->AddInputLayer(0, "Input");
125
126 ActivationDescriptor descriptor;
127 descriptor.m_Function = ActivationFunction::ReLu;
128 IConnectableLayer* activation = net->AddActivationLayer(descriptor, "Activation");
129
130 IConnectableLayer* output = net->AddOutputLayer(0, "Output");
131
132 input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
133 activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
134
135 TensorInfo tensorInfo = TensorInfo({ 1, 24, 16, 3 }, DataType::Float32);
136 unsigned int numElements = tensorInfo.GetNumElements();
137 size_t totalBytes = numElements * sizeof(float);
138
139 input->GetOutputSlot(0).SetTensorInfo(tensorInfo);
140 activation->GetOutputSlot(0).SetTensorInfo(tensorInfo);
141
142 // Optimize the network
143 OptimizerOptions optOptions;
144 optOptions.m_ImportEnabled = true;
145 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
146 IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
Sadik Armagan1625efc2021-06-10 18:24:34 +0100147 CHECK(optNet);
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100148
149 // Loads it into the runtime.
150 NetworkId netId;
151 std::string ignoredErrorMessage;
152 // Enable Importing
153 INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
154 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
155
156 // Creates structures for input & output
157 const size_t alignment =
158 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
159 size_t space = totalBytes + alignment + alignment;
160 auto inputData = std::make_unique<uint8_t[]>(space);
161 void* alignedInputPtr = inputData.get();
Sadik Armagan1625efc2021-06-10 18:24:34 +0100162 CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100163
164 // Input with negative values
165 auto* intputPtr = reinterpret_cast<float*>(alignedInputPtr);
166 std::fill_n(intputPtr, numElements, -5.0f);
167
168 auto outputData = std::make_unique<uint8_t[]>(space);
169 void* alignedOutputPtr = outputData.get();
Sadik Armagan1625efc2021-06-10 18:24:34 +0100170 CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
Narumol Prangnawarat878e0f92021-05-11 19:51:14 +0100171 auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
172 std::fill_n(outputPtr, numElements, -10.0f);
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100173
Cathal Corbett5b8093c2021-10-22 11:12:07 +0100174 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
175 inputTensorInfo.SetConstant(true);
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100176 InputTensors inputTensors
177 {
Cathal Corbett5b8093c2021-10-22 11:12:07 +0100178 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100179 };
180 OutputTensors outputTensors
181 {
182 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
183 };
184
185 runtime->GetProfiler(netId)->EnableProfiling(true);
186
187 // Do the inference
188 runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
189
190 // Retrieve the Profiler.Print() output to get the workload execution
191 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
192 std::stringstream ss;
193 profilerManager.GetProfiler()->Print(ss);;
194 std::string dump = ss.str();
195
196 // Contains ActivationWorkload
197 std::size_t found = dump.find("ActivationWorkload");
Sadik Armagan1625efc2021-06-10 18:24:34 +0100198 CHECK(found != std::string::npos);
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100199
200 // Contains SyncMemGeneric
201 found = dump.find("SyncMemGeneric");
Sadik Armagan1625efc2021-06-10 18:24:34 +0100202 CHECK(found != std::string::npos);
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100203
204 // Does not contain CopyMemGeneric
205 found = dump.find("CopyMemGeneric");
Sadik Armagan1625efc2021-06-10 18:24:34 +0100206 CHECK(found == std::string::npos);
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100207
Narumol Prangnawarat878e0f92021-05-11 19:51:14 +0100208 runtime->UnloadNetwork(netId);
209
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100210 // Check output is as expected
211 // Validate result by checking that the output has no negative values
212 auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
Sadik Armagan1625efc2021-06-10 18:24:34 +0100213 CHECK(outputResult);
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100214 for(unsigned int i = 0; i < numElements; ++i)
215 {
Sadik Armagan1625efc2021-06-10 18:24:34 +0100216 CHECK(outputResult[i] >= 0);
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100217 }
Narumol Prangnawarate5f0b242021-05-07 17:52:36 +0100218}
219
Nikhil Raj60ab9762022-01-13 09:34:44 +0000220TEST_CASE_FIXTURE(ClContextControlFixture, "ClCanBeImported")
221{
222 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
223 static_cast<MemorySourceFlags>(MemorySource::Malloc));
224
225 TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
226
227 // create TensorHandle for memory import
David Monahan3826ab62022-02-21 12:26:16 +0000228 auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);
Nikhil Raj60ab9762022-01-13 09:34:44 +0000229
230 // Get CLtensor
231 arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
232
233 // Allocate user memory
234 const size_t totalBytes = tensor.info()->total_size();
235 const size_t alignment =
236 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
237 size_t space = totalBytes + alignment + alignment;
238 auto testData = std::make_unique<uint8_t[]>(space);
239 void* alignedPtr = testData.get();
240 CHECK(std::align(alignment, totalBytes, alignedPtr, space));
241
242 // Import memory
243 CHECK_THROWS_AS(handle->CanBeImported(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
244
245}
246
247TEST_CASE("ClCanBeImportedAlignedMemory")
248{
249 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
250 static_cast<MemorySourceFlags>(MemorySource::Malloc));
251
252 TensorInfo info({ 1, 1, 1, 1 }, DataType::Float32);
253
254 // create TensorHandle (Memory Managed status is irrelevant)
David Monahan3826ab62022-02-21 12:26:16 +0000255 auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);
Nikhil Raj60ab9762022-01-13 09:34:44 +0000256 // Get CLtensor
257 arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
258
259 // Create an aligned buffer
260 const size_t totalBytes = tensor.info()->total_size();
261 const size_t alignment =
262 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
263 size_t space = totalBytes + alignment + alignment;
264 auto testData = std::make_unique<uint8_t[]>(space);
265 void* alignedPtr = testData.get();
266 CHECK(std::align(alignment, totalBytes, alignedPtr, space));
267
268 // Check aligned buffers return true
269 CHECK(handle->CanBeImported(alignedPtr, MemorySource::Malloc) == true);
270
271 // Due to the nature of how GPU memory is mapped it is entirely possible for memory which is misaligned on cpu
272 // to be successfully import on GPU. As such there is no way to create a misaligned pointer that will always fail.
273 // Rather it will succeed on some devices and fail on others. As long as a correctly aligned buffer returns true
274 // we can be confident that it will be successfully imported. All other cases will need to be handled by the user.
275}
276
Narumol Prangnawarate2af6f42022-01-28 17:59:18 +0000277TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
278{
279 // Create runtime in which test will run
280 IRuntime::CreationOptions options;
281 IRuntimePtr runtime(armnn::IRuntime::Create(options));
282
283 // build up the structure of the network
284 INetworkPtr network(INetwork::Create());
285
286 armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
287 armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
288 armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
289
290 kernelInfo.SetConstant(true);
291
292 std::vector<float> kernel =
293 {
294 4, 5, 6,
295 0, 0, 0,
296 3, 2, 1
297 };
298
299 const std::vector<float> expectedOutput =
300 {
301 23, 41, 33, 21,
302 44, 65, 76, 52,
303 82, 85, 79, 42
304 };
305
306 unsigned int numElements = inputInfo.GetNumElements();
307 size_t totalBytes = numElements * sizeof(float);
308
309 IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
310 ARMNN_ASSERT(inputLayer);
311
312 armnn::ConstTensor weights(kernelInfo, kernel);
313
314 armnn::Convolution2dDescriptor convDesc2d;
315 convDesc2d.m_StrideX = 1;
316 convDesc2d.m_StrideY = 1;
317 convDesc2d.m_PadLeft = 1;
318 convDesc2d.m_PadRight = 1;
319 convDesc2d.m_PadTop = 1;
320 convDesc2d.m_PadBottom = 1;
321 convDesc2d.m_DataLayout = DataLayout::NHWC;
Keith Davisb4dd5cc2022-04-07 11:32:00 +0100322
Keith Davis721e6292022-05-17 10:06:53 +0100323 armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
324 armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);
325
Narumol Prangnawarate2af6f42022-01-28 17:59:18 +0000326 ARMNN_ASSERT(convLayer);
327
Keith Davis721e6292022-05-17 10:06:53 +0100328 weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
329 weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));
330
Narumol Prangnawarate2af6f42022-01-28 17:59:18 +0000331 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
332 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
333
334 IConnectableLayer* output = network->AddOutputLayer(0, "output");
335 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
336 convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
337
338 // Optimize the network
339 OptimizerOptions optOptions;
340 optOptions.m_ImportEnabled = false;
341 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
342 IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
343 CHECK(optNet);
344
345 // Loads it into the runtime.
346 NetworkId netId;
347 std::string ignoredErrorMessage;
348 // Enable Importing
349 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
350 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
351
352 // Creates structures for input & output
353 const size_t alignment =
354 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
355 size_t space = totalBytes + alignment + alignment;
356 auto inputData = std::make_unique<uint8_t[]>(space);
357 void* alignedInputPtr = inputData.get();
358 CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
359
360 // Input with negative values
361 auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
362 inputPtr[0] = 1;
363 inputPtr[1] = 5;
364 inputPtr[2] = 2;
365 inputPtr[3] = 3;
366 inputPtr[4] = 8;
367 inputPtr[5] = 7;
368 inputPtr[6] = 3;
369 inputPtr[7] = 6;
370 inputPtr[8] = 3;
371 inputPtr[9] = 3;
372 inputPtr[10] = 9;
373 inputPtr[11] = 1;
374
375
376 auto outputData = std::make_unique<uint8_t[]>(space);
377 void* alignedOutputPtr = outputData.get();
378 CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
379 auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
380 std::fill_n(outputPtr, numElements, -10.0f);
381
382 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
383 inputTensorInfo.SetConstant(true);
384 InputTensors inputTensors
385 {
386 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
387 };
388 OutputTensors outputTensors
389 {
390 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
391 };
392
393 runtime->GetProfiler(netId)->EnableProfiling(true);
394
395 INFO("Run ImportInputs");
396 std::vector<ImportedInputId> importedInputIds =
397 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
398 std::vector<ImportedOutputId> importedOutputIds =
399 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
400
401 // Do the inference
402 runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
403
404 // Retrieve the Profiler.Print() output to get the workload execution
405 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
406 std::stringstream ss;
407 profilerManager.GetProfiler()->Print(ss);;
408 std::string dump = ss.str();
409
410 // Contains Convolution2dWorkload
411 std::size_t found = dump.find("Convolution2dWorkload");
412 CHECK(found != std::string::npos);
413
414 // Contains SyncMemGeneric
415 found = dump.find("SyncMemGeneric");
416 CHECK(found != std::string::npos);
417
418 // Does not contain CopyMemGeneric
419 found = dump.find("CopyMemGeneric");
420 CHECK(found == std::string::npos);
421
422 runtime->UnloadNetwork(netId);
423
424 // Check output is as expected
425 // Validate result by checking that the output has no negative values
426 auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
427 CHECK(outputResult);
428
429 // Check the output is correct
430 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
431}
432
Cathal Corbetta3f4fba2022-03-21 09:27:08 +0000433TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp16toFp32EndToEnd")
434{
435 using namespace half_float::literal;
436
437 // Create runtime in which test will run
438 IRuntime::CreationOptions options;
439 IRuntimePtr runtime(armnn::IRuntime::Create(options));
440
441 // build up the structure of the network
442 NetworkImpl network;
443
444 armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float16);
445 armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);
446
447 std::vector<float> expectedOutput =
448 {
449 -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
450 1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f
451 };
452
453 unsigned int numElements = inputInfo.GetNumElements();
454 size_t totalBytesInput = numElements * sizeof(Half);
455 size_t totalBytesOutput = numElements * sizeof(float);
456
457 IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
458 ARMNN_ASSERT(inputLayer);
459
460 armnn::IConnectableLayer* const convLayer = network.AddConvertFp16ToFp32Layer("convert");
461 ARMNN_ASSERT(convLayer);
462
463 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
464 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
465
466 IConnectableLayer* output = network.AddOutputLayer(0, "output");
467 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
468 convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
469
470 // Optimize the network
471 OptimizerOptions optOptions;
472 optOptions.m_ImportEnabled = false;
473 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
474 IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
475 CHECK(optNet);
476
477 // Loads it into the runtime.
478 NetworkId netId;
479 std::string ignoredErrorMessage;
480 // Enable Importing
481 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
482 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
483
484 // Creates structures for input & output
485 const size_t alignment =
486 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
487 size_t spaceInput = totalBytesInput + alignment + alignment;
488 size_t spaceOutput = totalBytesOutput + alignment + alignment;
489 auto inputData = std::make_unique<uint8_t[]>(spaceInput);
490 void* alignedInputPtr = inputData.get();
491 CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
492
493 // Input with negative values
494 auto* inputPtr = reinterpret_cast<Half*>(alignedInputPtr);
495 inputPtr[0] = -37.5_h;
496 inputPtr[1] = -15.2_h;
497 inputPtr[2] = -8.76_h;
498 inputPtr[3] = -2.0_h;
499 inputPtr[4] = -1.5_h;
500 inputPtr[5] = -1.3_h;
501 inputPtr[6] = -0.5_h;
502 inputPtr[7] = -0.4_h;
503 inputPtr[8] = 0.0_h;
504 inputPtr[9] = 1.0_h;
505 inputPtr[10] = 0.4_h;
506 inputPtr[11] = 0.5_h;
507 inputPtr[12] = 1.3_h;
508 inputPtr[13] = 1.5_h;
509 inputPtr[14] = 2.0_h;
510 inputPtr[15] = 8.76_h;
511 inputPtr[16] = 15.2_h;
512 inputPtr[17] = 37.5_h;
513
514 auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
515 void* alignedOutputPtr = outputData.get();
516 CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
517 auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
518 std::fill_n(outputPtr, numElements, -10.0f);
519
520 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
521 inputTensorInfo.SetConstant(true);
522 InputTensors inputTensors
523 {
524 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
525 };
526 OutputTensors outputTensors
527 {
528 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
529 };
530
531 runtime->GetProfiler(netId)->EnableProfiling(true);
532
533 INFO("Run ImportInputs");
534 std::vector<ImportedInputId> importedInputIds =
535 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
536 std::vector<ImportedOutputId> importedOutputIds =
537 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
538
539 // Do the inference
540 runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
541
542 // Retrieve the Profiler.Print() output to get the workload execution
543 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
544 std::stringstream ss;
545 profilerManager.GetProfiler()->Print(ss);;
546 std::string dump = ss.str();
547
548 // Contains Convolution2dWorkload
549 std::size_t found = dump.find("ConvertFp16ToFp32Workload");
550 CHECK(found != std::string::npos);
551
552 // Contains SyncMemGeneric
553 found = dump.find("SyncMemGeneric");
554 CHECK(found != std::string::npos);
555
556 // Does not contain CopyMemGeneric
557 found = dump.find("CopyMemGeneric");
558 CHECK(found == std::string::npos);
559
560 runtime->UnloadNetwork(netId);
561
562 // Check output is as expected
563 // Validate result by checking that the output has no negative values
564 auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
565 CHECK(outputResult);
566
567 // Check the output is correct
568 for (size_t i = 0; i < numElements; ++i)
569 {
570 DOCTEST_CHECK_MESSAGE(outputResult[i] == doctest::Approx(expectedOutput[i]).epsilon(0.0004),
571 "outputValue[" << i << "]: " << outputResult[i] << " != " << expectedOutput[i]);
572 }
573}
574
575
576TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp32toFp16EndToEnd")
577{
578 using namespace half_float::literal;
579
580 // Create runtime in which test will run
581 IRuntime::CreationOptions options;
582 IRuntimePtr runtime(armnn::IRuntime::Create(options));
583
584 // build up the structure of the network
585 NetworkImpl network;
586
587 armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float32);
588 armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);
589
590 std::vector<Half> expectedOutput =
591 {
592 -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
593 1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h
594 };
595
596 unsigned int numElements = inputInfo.GetNumElements();
597 size_t totalBytesInput = numElements * sizeof(float);
598 size_t totalBytesOutput = numElements * sizeof(Half);
599
600 IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
601 ARMNN_ASSERT(inputLayer);
602
603 armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
604 ARMNN_ASSERT(convLayer);
605
606 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
607 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
608
609 IConnectableLayer* output = network.AddOutputLayer(0, "output");
610 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
611 convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
612
613 // Optimize the network
614 OptimizerOptions optOptions;
615 optOptions.m_ImportEnabled = false;
616 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
617 IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
618 CHECK(optNet);
619
620 // Loads it into the runtime.
621 NetworkId netId;
622 std::string ignoredErrorMessage;
623 // Enable Importing
624 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
625 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
626
627 // Creates structures for input & output
628 const size_t alignment =
629 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
630 size_t spaceInput = totalBytesInput + alignment + alignment;
631 size_t spaceOutput = totalBytesOutput + alignment + alignment;
632 auto inputData = std::make_unique<uint8_t[]>(spaceInput);
633 void* alignedInputPtr = inputData.get();
634 CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
635
636 // Input with negative values
637 auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
638 inputPtr[0] = -37.5f;
639 inputPtr[1] = -15.2f;
640 inputPtr[2] = -8.76f;
641 inputPtr[3] = -2.0f;
642 inputPtr[4] = -1.5f;
643 inputPtr[5] = -1.3f;
644 inputPtr[6] = -0.5f;
645 inputPtr[7] = -0.4f;
646 inputPtr[8] = 0.0f;
647 inputPtr[9] = 1.0f;
648 inputPtr[10] = 0.4f;
649 inputPtr[11] = 0.5f;
650 inputPtr[12] = 1.3f;
651 inputPtr[13] = 1.5f;
652 inputPtr[14] = 2.0f;
653 inputPtr[15] = 8.76f;
654 inputPtr[16] = 15.2f;
655 inputPtr[17] = 37.5f;
656
657 auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
658 void* alignedOutputPtr = outputData.get();
659 CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
660 auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
661 std::fill_n(outputPtr, numElements, -10.0f);
662
663 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
664 inputTensorInfo.SetConstant(true);
665 InputTensors inputTensors
666 {
667 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
668 };
669 OutputTensors outputTensors
670 {
671 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
672 };
673
674 runtime->GetProfiler(netId)->EnableProfiling(true);
675
676 INFO("Run ImportInputs");
677 std::vector<ImportedInputId> importedInputIds =
678 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
679 std::vector<ImportedOutputId> importedOutputIds =
680 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
681
682 // Do the inference
683 runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
684
685 // Retrieve the Profiler.Print() output to get the workload execution
686 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
687 std::stringstream ss;
688 profilerManager.GetProfiler()->Print(ss);;
689 std::string dump = ss.str();
690
691 // Contains Convolution2dWorkload
692 std::size_t found = dump.find("ConvertFp32ToFp16Workload");
693 CHECK(found != std::string::npos);
694
695 // Contains SyncMemGeneric
696 found = dump.find("SyncMemGeneric");
697 CHECK(found != std::string::npos);
698
699 // Does not contain CopyMemGeneric
700 found = dump.find("CopyMemGeneric");
701 CHECK(found == std::string::npos);
702
703 runtime->UnloadNetwork(netId);
704
705 // Check output is as expected
706 // Validate result by checking that the output has no negative values
707 auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
708 CHECK(outputResult);
709
710 // Check the output is correct
711 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
712}
713
714TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportSimpleConvertFp32toFp16EndToEnd")
715{
716 using namespace half_float::literal;
717
718 // Create runtime in which test will run
719 IRuntime::CreationOptions options;
720 IRuntimePtr runtime(armnn::IRuntime::Create(options));
721
722 // build up the structure of the network
723 NetworkImpl network;
724
725 armnn::TensorInfo inputInfo({1}, armnn::DataType::Float32);
726 armnn::TensorInfo outputTensorInfo({1}, armnn::DataType::Float16);
727
728 std::vector<Half> expectedOutput = { 1.0_h };
729
730 unsigned int numElements = inputInfo.GetNumElements();
731 size_t totalBytesInput = numElements * sizeof(float);
732 size_t totalBytesOutput = numElements * sizeof(Half);
733
734 IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
735 ARMNN_ASSERT(inputLayer);
736
737 armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
738 ARMNN_ASSERT(convLayer);
739
740 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
741 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
742
743 IConnectableLayer* output = network.AddOutputLayer(0, "output");
744 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
745 convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
746
747 // Optimize the network
748 OptimizerOptions optOptions;
749 optOptions.m_ImportEnabled = false;
750 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
751 IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
752 CHECK(optNet);
753
754 // Loads it into the runtime.
755 NetworkId netId;
756 std::string ignoredErrorMessage;
757 // Enable Importing
758 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
759 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
760
761 // Creates structures for input & output
762 const size_t alignment =
763 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
764 size_t spaceInput = totalBytesInput + alignment + alignment;
765 size_t spaceOutput = totalBytesOutput + alignment + alignment;
766 auto inputData = std::make_unique<uint8_t[]>(spaceInput);
767 void* alignedInputPtr = inputData.get();
768 CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
769
770 // Input with negative values
771 auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
772 inputPtr[0] = 1.0f;
773
774 auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
775 void* alignedOutputPtr = outputData.get();
776 CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
777 auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
778 std::fill_n(outputPtr, numElements, -10.0f);
779
780 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
781 inputTensorInfo.SetConstant(true);
782 InputTensors inputTensors
783 {
784 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
785 };
786 OutputTensors outputTensors
787 {
788 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
789 };
790
791 runtime->GetProfiler(netId)->EnableProfiling(true);
792
793 INFO("Run ImportInputs");
794 std::vector<ImportedInputId> importedInputIds =
795 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
796 std::vector<ImportedOutputId> importedOutputIds =
797 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
798
799 // Do the inference
800 runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
801
802 // Retrieve the Profiler.Print() output to get the workload execution
803 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
804 std::stringstream ss;
805 profilerManager.GetProfiler()->Print(ss);;
806 std::string dump = ss.str();
807
808 // Contains Convolution2dWorkload
809 std::size_t found = dump.find("ConvertFp32ToFp16Workload");
810 CHECK(found != std::string::npos);
811
812 // Contains SyncMemGeneric
813 found = dump.find("SyncMemGeneric");
814 CHECK(found != std::string::npos);
815
816 // Does not contain CopyMemGeneric
817 found = dump.find("CopyMemGeneric");
818 CHECK(found == std::string::npos);
819
820 runtime->UnloadNetwork(netId);
821
822 // Check output is as expected
823 // Validate result by checking that the output has no negative values
824 auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
825 CHECK(outputResult);
826
827 // Check the output is correct
828 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
829}
830
TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesEndToEndTest")
{
/*
 * This is a test to check the functionality of the Forced Import functionality when using repeated inferences that
 * require switching from importing to copy. For the first inference we create aligned Pointers and check they are
 * imported correctly. For the second we use similar pointers but don't use PreImporting to force fall back to copy.
 */
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network: input -> 3x3 NHWC convolution -> output
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    // Expected convolution result for the input values filled in below; used for
    // both the imported and the copied inference.
    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    // SAME padding, stride 1: output spatial size equals input spatial size
    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;
    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
    ARMNN_ASSERT(convLayer);

    // Weights are supplied through a constant layer wired into the conv's second input slot
    armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);

    weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
    weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network. Import is disabled at optimize time because this test
    // exercises the per-inference force-import path (ImportInputs/ImportOutputs).
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Load-time import is left Undefined; importing is forced per-inference below.
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output, cache-line-aligned so they can be imported
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;


    // Output buffer pre-filled with a sentinel so a missed write would be detected
    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference with the pre-imported input/output ids
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric (emitted when the output is imported)
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric (nothing should be copied when importing)
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Repeat the inference, with new tensors and without using PreImporting to force it to fall back to copying

    // Creates structures for input & output (note: not aligned this time, and not pre-imported)
    auto inputDataCopy = std::make_unique<uint8_t[]>(space);
    void* copyInputPtr = inputDataCopy.get();

    // Fill input with values
    auto* inputCopyPtr = reinterpret_cast<float*>(copyInputPtr);
    inputCopyPtr[0] = 1;
    inputCopyPtr[1] = 5;
    inputCopyPtr[2] = 2;
    inputCopyPtr[3] = 3;
    inputCopyPtr[4] = 8;
    inputCopyPtr[5] = 7;
    inputCopyPtr[6] = 3;
    inputCopyPtr[7] = 6;
    inputCopyPtr[8] = 3;
    inputCopyPtr[9] = 3;
    inputCopyPtr[10] = 9;
    inputCopyPtr[11] = 1;

    // Output pre-filled with -10.0f
    auto outputDataCopy = std::make_unique<uint8_t[]>(space);
    void* copyOutputPtr = outputDataCopy.get();
    auto* outputCopyPtr = reinterpret_cast<float*>(copyOutputPtr);
    std::fill_n(outputCopyPtr, numElements, -10.0f);

    InputTensors inputTensorsCopy
    {
        {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
    };
    OutputTensors outputTensorsCopy
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
    };

    // Do the inference without any pre-imported input/output ids
    runtime->EnqueueWorkload(netId, inputTensorsCopy, outputTensorsCopy);
    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check the output is correct
    outputResult = reinterpret_cast<float*>(copyOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Query the profiler again; ss already holds the first dump, so this will contain the results of both inferences
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    // Contains Convolution2dWorkload
    found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Should still contain the SyncMemGeneric from the first (imported) inference
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Should now also contain a CopyMemGeneric from the second (copied) inference
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);
    runtime->UnloadNetwork(netId);
}
1051
TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesInvertedEndToEndTest")
{
/*
 * This test is similar to the test above but instead of importing and then copying, we start by copying and then do
 * the import.
 */
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network: input -> 3x3 NHWC convolution -> output
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    // Expected convolution result for the input values filled in below; used for
    // both the copied and the imported inference.
    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    // SAME padding, stride 1: output spatial size equals input spatial size
    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;

    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
    ARMNN_ASSERT(convLayer);

    // Weights are supplied through a constant layer wired into the conv's second input slot
    armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);

    weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
    weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network. Import is disabled at optimize time because this test
    // exercises the per-inference force-import path (ImportInputs/ImportOutputs).
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Load-time import is left Undefined; importing is forced per-inference below.
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output (not aligned and not pre-imported,
    // so this first inference falls back to copying)
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* copyInputPtr = inputData.get();

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(copyInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    // Create output buffer and fill it with -10.0f as a sentinel so a missed write would be detected
    auto outputData = std::make_unique<uint8_t[]>(space);
    void* copyOutputPtr = outputData.get();
    auto* outputPtr = reinterpret_cast<float*>(copyOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference without any pre-imported inputs/outputs
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Does not contain SyncMemGeneric (only emitted when the output is imported)
    found = dump.find("SyncMemGeneric");
    CHECK(found == std::string::npos);

    // Does contain CopyMemGeneric (the copy fallback path)
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(copyOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Repeat the inference, with new tensors and while using pre-importing to force it to import

    // Creates structures for input & output, cache-line-aligned so they can be imported
    auto inputDataImport = std::make_unique<uint8_t[]>(space);
    void* alignedInputImportPtr = inputDataImport.get();
    CHECK(std::align(alignment, totalBytes, alignedInputImportPtr, space));

    // Fill input with values
    auto* inputImportPtr = reinterpret_cast<float*>(alignedInputImportPtr);
    inputImportPtr[0] = 1;
    inputImportPtr[1] = 5;
    inputImportPtr[2] = 2;
    inputImportPtr[3] = 3;
    inputImportPtr[4] = 8;
    inputImportPtr[5] = 7;
    inputImportPtr[6] = 3;
    inputImportPtr[7] = 6;
    inputImportPtr[8] = 3;
    inputImportPtr[9] = 3;
    inputImportPtr[10] = 9;
    inputImportPtr[11] = 1;

    // Output pre-filled with -10.0f
    auto outputDataImport = std::make_unique<uint8_t[]>(space);
    void* alignedOutputImportPtr = outputDataImport.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputImportPtr, space));
    auto* outputImportPtr = reinterpret_cast<float*>(alignedOutputImportPtr);
    std::fill_n(outputImportPtr, numElements, -10.0f);

    InputTensors inputTensorsImport
    {
        {0,armnn::ConstTensor(inputTensorInfo, alignedInputImportPtr)},
    };
    OutputTensors outputTensorsImport
    {
        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputImportPtr)}
    };

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensorsImport, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensorsImport, MemorySource::Malloc);

    // Do the inference with pre-imported inputs/outputs
    runtime->EnqueueWorkload(netId, inputTensorsImport, outputTensorsImport, importedInputIds, importedOutputIds);
    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check the output is correct
    outputResult = reinterpret_cast<float*>(alignedOutputImportPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));


    // Query the profiler again; ss already holds the first dump, so this will contain the results of both inferences
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    // Contains Convolution2dWorkload
    found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Should now contain the SyncMemGeneric from the second (imported) inference
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Should still contain a CopyMemGeneric from the first inference
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);
    runtime->UnloadNetwork(netId);
}
1273
Sadik Armagan1625efc2021-06-10 18:24:34 +01001274}