//
// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <arm_compute/runtime/CL/functions/CLActivationLayer.h>

#include <cl/ClImportTensorHandle.hpp>
#include <cl/ClImportTensorHandleFactory.hpp>
#include <cl/test/ClContextControlFixture.hpp>

#include <doctest/doctest.h>

#include <armnn/IRuntime.hpp>
#include <armnn/INetwork.hpp>
#include "Network.hpp"

using namespace armnn;

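// The tests below exercise zero-copy import of user-allocated buffers into GPU
// tensors, both via ClImportTensorHandle directly and via the runtime's import
// and forced-import (ImportInputs/ImportOutputs) paths. The core pattern, as a
// rough sketch rather than a complete test:
//
//     ClImportTensorHandleFactory factory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
//                                         static_cast<MemorySourceFlags>(MemorySource::Malloc));
//     auto handle = factory.CreateTensorHandle(info);
//     void* userBuffer = /* cache-line aligned allocation */;
//     handle->Import(userBuffer, MemorySource::Malloc);
//     // ... run a workload; after a CL sync the results are visible in userBuffer.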
TEST_SUITE("ClImportTensorHandleTests")
{
TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
    unsigned int numElements = info.GetNumElements();

    // Create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info);

    // Get the underlying CLTensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Create and configure activation function
    const arm_compute::ActivationLayerInfo act_info(arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
    arm_compute::CLActivationLayer act_func;
    act_func.configure(&tensor, nullptr, act_info);

    // Allocate user memory
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
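    // Over-allocate by two cache lines so std::align can always carve a
    // cache-line-aligned block of totalBytes out of the buffer, whatever
    // address make_unique happens to return.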
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Import memory
    CHECK(handle->Import(alignedPtr, armnn::MemorySource::Malloc));
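    // On success the CL tensor should be backed by the user buffer itself rather
    // than a fresh device allocation (zero-copy); the exact mapping mechanism is
    // driver-dependent, so this is an assumption about the import path.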

    // Fill the input with negative values
    auto* typedPtr = reinterpret_cast<float*>(alignedPtr);
    std::fill_n(typedPtr, numElements, -5.0f);

    // Execute function and sync
    act_func.run();
    arm_compute::CLScheduler::get().sync();

    // Validate the result: ReLU of an all-negative input must produce all zeros
    for(unsigned int i = 0; i < numElements; ++i)
    {
        CHECK(typedPtr[i] == 0);
    }
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClIncorrectMemorySourceImport")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);

    // Create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info);

    // Get the underlying CLTensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Allocate user memory
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Importing with a memory source the handle was not created for must throw
    CHECK_THROWS_AS(handle->Import(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClInvalidMemorySourceImport")
{
    MemorySource invalidMemSource = static_cast<MemorySource>(256);
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(invalidMemSource),
                                              static_cast<MemorySourceFlags>(invalidMemSource));

    TensorInfo info({ 1, 2, 2, 1 }, DataType::Float32);

    // Create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info);

    // Allocate user memory
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    // Importing from an unsupported memory source must throw
    CHECK_THROWS_AS(handle->Import(inputData.data(), invalidMemSource), MemoryImportException);
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClImportEndToEnd")
{
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0, "Input");

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::ReLu;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor, "Activation");

    IConnectableLayer* output = net->AddOutputLayer(0, "Output");

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo tensorInfo({ 1, 24, 16, 3 }, DataType::Float32);
    unsigned int numElements = tensorInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    input->GetOutputSlot(0).SetTensorInfo(tensorInfo);
    activation->GetOutputSlot(0).SetTensorInfo(tensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = true;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Load it into the runtime with importing enabled for both inputs and outputs
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Create structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Fill the input with negative values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    std::fill_n(inputPtr, numElements, -5.0f);

    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();
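
    // The profiler dump reveals which memory path was taken: SyncMemGeneric shows up
    // when imported buffers only need a sync around execution, while CopyMemGeneric
    // would indicate a fallback copy between user and device memory.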

    // Contains ActivationWorkload
    std::size_t found = dump.find("ActivationWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Validate the result by checking that the output has no negative values
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);
    for(unsigned int i = 0; i < numElements; ++i)
    {
        CHECK(outputResult[i] >= 0);
    }
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClCanBeImported")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);

    // Create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);

    // Get the underlying CLTensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Allocate user memory
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // CanBeImported with an undefined memory source must throw
    CHECK_THROWS_AS(handle->CanBeImported(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
}

TEST_CASE("ClCanBeImportedAlignedMemory")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 1, 1, 1 }, DataType::Float32);

    // Create TensorHandle (Memory Managed status is irrelevant)
    auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);

    // Get the underlying CLTensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Create an aligned buffer
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Check that aligned buffers return true
    CHECK(handle->CanBeImported(alignedPtr, MemorySource::Malloc) == true);

    // Due to the nature of how GPU memory is mapped, it is entirely possible for memory which is misaligned on the
    // CPU to be successfully imported on the GPU. As such there is no way to create a misaligned pointer that will
    // always fail; rather it will succeed on some devices and fail on others. As long as a correctly aligned buffer
    // returns true we can be confident that it will be successfully imported. All other cases must be handled by
    // the user.
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
{
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;
    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d,
                                                                               weights,
                                                                               armnn::EmptyOptional(),
                                                                               "conv");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Load it into the runtime. Import is left disabled here (MemorySource::Undefined);
    // it is forced per inference below via ImportInputs/ImportOutputs.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Create structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

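    // Force the import at inference time: the network was loaded with
    // MemorySource::Undefined, so the zero-copy path is taken only because the
    // buffers are pre-imported here and the returned ids passed to EnqueueWorkload.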
    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check the output buffer is valid and matches the expected result
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp16toFp32EndToEnd")
{
    using namespace half_float::literal;
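    // half_float's _h literal suffix constructs 16-bit Half values (e.g. -37.5_h below)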

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    NetworkImpl network;

    armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float16);
    armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);

    std::vector<float> expectedOutput =
    {
        -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
        1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytesInput = numElements * sizeof(Half);
    size_t totalBytesOutput = numElements * sizeof(float);
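    // The input is Fp16 and the output Fp32, so the two buffers differ in size and
    // are sized and over-allocated independently below.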

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::IConnectableLayer* const convLayer = network.AddConvertFp16ToFp32Layer("convert");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network.AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Load it into the runtime. Import is left disabled here (MemorySource::Undefined);
    // it is forced per inference below via ImportInputs/ImportOutputs.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Create structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t spaceInput = totalBytesInput + alignment + alignment;
    size_t spaceOutput = totalBytesOutput + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(spaceInput);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));

    // Fill the input with a range of negative and positive values
    auto* inputPtr = reinterpret_cast<Half*>(alignedInputPtr);
    inputPtr[0] = -37.5_h;
    inputPtr[1] = -15.2_h;
    inputPtr[2] = -8.76_h;
    inputPtr[3] = -2.0_h;
    inputPtr[4] = -1.5_h;
    inputPtr[5] = -1.3_h;
    inputPtr[6] = -0.5_h;
    inputPtr[7] = -0.4_h;
    inputPtr[8] = 0.0_h;
    inputPtr[9] = 1.0_h;
    inputPtr[10] = 0.4_h;
    inputPtr[11] = 0.5_h;
    inputPtr[12] = 1.3_h;
    inputPtr[13] = 1.5_h;
    inputPtr[14] = 2.0_h;
    inputPtr[15] = 8.76_h;
    inputPtr[16] = 15.2_h;
    inputPtr[17] = 37.5_h;

    auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ConvertFp16ToFp32Workload
    std::size_t found = dump.find("ConvertFp16ToFp32Workload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check the output buffer is valid and matches the expected result
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);
    for (size_t i = 0; i < numElements; ++i)
    {
        DOCTEST_CHECK_MESSAGE(outputResult[i] == doctest::Approx(expectedOutput[i]).epsilon(0.0004),
                              "outputValue[" << i << "]: " << outputResult[i] << " != " << expectedOutput[i]);
    }
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp32toFp16EndToEnd")
{
    using namespace half_float::literal;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    NetworkImpl network;

    armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float32);
    armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);

    std::vector<Half> expectedOutput =
    {
        -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
        1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytesInput = numElements * sizeof(float);
    size_t totalBytesOutput = numElements * sizeof(Half);

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network.AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Load it into the runtime. Import is left disabled here (MemorySource::Undefined);
    // it is forced per inference below via ImportInputs/ImportOutputs.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Create structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t spaceInput = totalBytesInput + alignment + alignment;
    size_t spaceOutput = totalBytesOutput + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(spaceInput);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));

    // Fill the input with a range of negative and positive values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = -37.5f;
    inputPtr[1] = -15.2f;
    inputPtr[2] = -8.76f;
    inputPtr[3] = -2.0f;
    inputPtr[4] = -1.5f;
    inputPtr[5] = -1.3f;
    inputPtr[6] = -0.5f;
    inputPtr[7] = -0.4f;
    inputPtr[8] = 0.0f;
    inputPtr[9] = 1.0f;
    inputPtr[10] = 0.4f;
    inputPtr[11] = 0.5f;
    inputPtr[12] = 1.3f;
    inputPtr[13] = 1.5f;
    inputPtr[14] = 2.0f;
    inputPtr[15] = 8.76f;
    inputPtr[16] = 15.2f;
    inputPtr[17] = 37.5f;

    auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
    auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ConvertFp32ToFp16Workload
    std::size_t found = dump.find("ConvertFp32ToFp16Workload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check the output buffer is valid and matches the expected result
    auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportSimpleConvertFp32toFp16EndToEnd")
{
    using namespace half_float::literal;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    NetworkImpl network;

    armnn::TensorInfo inputInfo({1}, armnn::DataType::Float32);
    armnn::TensorInfo outputTensorInfo({1}, armnn::DataType::Float16);

    std::vector<Half> expectedOutput = { 1.0_h };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytesInput = numElements * sizeof(float);
    size_t totalBytesOutput = numElements * sizeof(Half);

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network.AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Load it into the runtime. Import is left disabled here (MemorySource::Undefined);
    // it is forced per inference below via ImportInputs/ImportOutputs.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Create structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t spaceInput = totalBytesInput + alignment + alignment;
    size_t spaceOutput = totalBytesOutput + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(spaceInput);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));

    // Fill the single-element input
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1.0f;

    auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
    auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ConvertFp32ToFp16Workload
    std::size_t found = dump.find("ConvertFp32ToFp16Workload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check the output buffer is valid and matches the expected result
    auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesEndToEndTest")
{
/*
 * This test checks the forced-import functionality across repeated inferences, where the runtime has to switch
 * from importing to copying. For the first inference we create aligned pointers and check that they are imported
 * correctly. For the second we use similar pointers but skip pre-importing, forcing a fall back to copying.
 */
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;
    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d,
                                                                               weights,
                                                                               armnn::EmptyOptional(),
                                                                               "conv");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Load it into the runtime. Import is left disabled here (MemorySource::Undefined);
    // it is forced per inference below via ImportInputs/ImportOutputs.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Create structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    std::string dump = ss.str();
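    // Unlike Print(), AnalyzeEventsAndWriteResults() writes out every event recorded
    // so far, so querying the profiler again after the second inference shows both runs.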

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Repeat the inference, with new tensors and without pre-importing, to force a fall back to copying

    // Create structures for input & output
    auto inputDataCopy = std::make_unique<uint8_t[]>(space);
    void* copyInputPtr = inputDataCopy.get();
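    // Note: no std::align and no ImportInputs/ImportOutputs for these buffers; without
    // pre-imported ids EnqueueWorkload falls back to copying to and from device memory.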

    // Fill input with values
    auto* inputCopyPtr = reinterpret_cast<float*>(copyInputPtr);
    inputCopyPtr[0] = 1;
    inputCopyPtr[1] = 5;
    inputCopyPtr[2] = 2;
    inputCopyPtr[3] = 3;
    inputCopyPtr[4] = 8;
    inputCopyPtr[5] = 7;
    inputCopyPtr[6] = 3;
    inputCopyPtr[7] = 6;
    inputCopyPtr[8] = 3;
    inputCopyPtr[9] = 3;
    inputCopyPtr[10] = 9;
    inputCopyPtr[11] = 1;

    // Output pre-filled with -10.0f
    auto outputDataCopy = std::make_unique<uint8_t[]>(space);
    void* copyOutputPtr = outputDataCopy.get();
    auto* outputCopyPtr = reinterpret_cast<float*>(copyOutputPtr);
    std::fill_n(outputCopyPtr, numElements, -10.0f);

    InputTensors inputTensorsCopy
    {
        {0, armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
    };
    OutputTensors outputTensorsCopy
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
    };

    // Do the inference without any pre-imported input/output ids
    runtime->EnqueueWorkload(netId, inputTensorsCopy, outputTensorsCopy);
    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check the output is correct
    outputResult = reinterpret_cast<float*>(copyOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Query the profiler again; this will contain the results of both inferences
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    // Contains Convolution2dWorkload
    found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Should still contain the SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Should now also contain a CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    runtime->UnloadNetwork(netId);
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesInvertedEndToEndTest")
{
/*
 * This test is similar to the one above, but inverted: we start with a copying inference and then do the import.
 */
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;
    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d,
                                                                               weights,
                                                                               armnn::EmptyOptional(),
                                                                               "conv");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Load it into the runtime. Import is left disabled here (MemorySource::Undefined);
    // the second inference below forces it via ImportInputs/ImportOutputs.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Create structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* copyInputPtr = inputData.get();

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(copyInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    // Create output buffer and fill it with -10.0f
    auto outputData = std::make_unique<uint8_t[]>(space);
    void* copyOutputPtr = outputData.get();
    auto* outputPtr = reinterpret_cast<float*>(copyOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference without any pre-imported inputs/outputs
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Does not contain SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found == std::string::npos);

    // Does contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(copyOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Repeat the inference, with new tensors and with pre-importing to force the import path

    // Create structures for input & output
    auto inputDataImport = std::make_unique<uint8_t[]>(space);
    void* alignedInputImportPtr = inputDataImport.get();
    CHECK(std::align(alignment, totalBytes, alignedInputImportPtr, space));

    // Fill input with values
    auto* inputImportPtr = reinterpret_cast<float*>(alignedInputImportPtr);
    inputImportPtr[0] = 1;
    inputImportPtr[1] = 5;
    inputImportPtr[2] = 2;
    inputImportPtr[3] = 3;
    inputImportPtr[4] = 8;
    inputImportPtr[5] = 7;
    inputImportPtr[6] = 3;
    inputImportPtr[7] = 6;
    inputImportPtr[8] = 3;
    inputImportPtr[9] = 3;
    inputImportPtr[10] = 9;
    inputImportPtr[11] = 1;

    // Output pre-filled with -10.0f
    auto outputDataImport = std::make_unique<uint8_t[]>(space);
    void* alignedOutputImportPtr = outputDataImport.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputImportPtr, space));
    auto* outputImportPtr = reinterpret_cast<float*>(alignedOutputImportPtr);
    std::fill_n(outputImportPtr, numElements, -10.0f);

    InputTensors inputTensorsImport
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputImportPtr)},
    };
    OutputTensors outputTensorsImport
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputImportPtr)}
    };

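    // This time pre-import the aligned buffers so the inference takes the zero-copy
    // path; the profiler should now also record a SyncMemGeneric.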
    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensorsImport, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensorsImport, MemorySource::Malloc);

    // Do the inference with pre-imported inputs/outputs
    runtime->EnqueueWorkload(netId, inputTensorsImport, outputTensorsImport, importedInputIds, importedOutputIds);
    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check the output is correct
    outputResult = reinterpret_cast<float*>(alignedOutputImportPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Query the profiler again; this will contain the results of both inferences
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    // Contains Convolution2dWorkload
    found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Should now contain the SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Should still contain a CopyMemGeneric from the first inference
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    runtime->UnloadNetwork(netId);
}

}