Blame - applications/threadx_demo/main.cpp - ml/ethos-u/ethos-u-core-platform

blob: 94430eefd3b522f58ae0988839df0584b014e060 [file] [log] [blame]

Lior Dekel	489e40b	2021-08-02 12:03:55 +0300	[diff] [blame]	1	/*
				2	* Copyright (c) 2019-2021 Arm Limited. All rights reserved.
				3	*
				4	* SPDX-License-Identifier: Apache-2.0
				5	*
				6	* Licensed under the Apache License, Version 2.0 (the License); you may
				7	* not use this file except in compliance with the License.
				8	* You may obtain a copy of the License at
				9	*
				10	* www.apache.org/licenses/LICENSE-2.0
				11	*
				12	* Unless required by applicable law or agreed to in writing, software
				13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
				14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				15	* See the License for the specific language governing permissions and
				16	* limitations under the License.
				17	*/
				18
				19	/****************************************************************************
				20	* Includes
				21	****************************************************************************/
				22	#include "tx_api.h"
				23
				24	#include <inttypes.h>
				25	#include <stdio.h>
				26	#include <vector>
				27
				28	#include "inference_process.hpp"
				29
// Model data (defined and changeable by modifying the compile definitions in CMakeLists.txt)
				31	#include "input.h"
				32	#include "model.h"
				33	#include "output.h"
				34
				35	using namespace std;
				36	using namespace InferenceProcess;
				37
				38	/****************************************************************************
				39	* Defines
				40	****************************************************************************/
				41	// Nr. of threads to process inferences with. Thread reserves driver & runs inference (Normally 1 per NPU, but not a
				42	// must)
				43	#define NUM_INFERENCE_THREADS 1
				44	// Nr. of threads to create jobs and recieve responses
				45	#define NUM_JOB_THREADS 2
				46	// Nr. of jobs to create per job thread
				47	#define NUM_JOBS_PER_THREAD 1
				48
				49	#define PROCESS_THREAD_STACK_SIZE (16 * 1024)
				50	#define SENDER_THREAD_STACK_SIZE (2 * 1024)
				51	#define PROCESS_THREAD_CONTEXT_SIZE (sizeof(TX_THREAD))
				52	#define SENDER_THREAD_CONTEXT_SIZE (sizeof(TX_THREAD))
				53
				54	// Tensor arena size
				55	#ifdef TENSOR_ARENA_SIZE // If defined in model.h
				56	#define TENSOR_ARENA_SIZE_PER_INFERENCE TENSOR_ARENA_SIZE
				57	#else // If not defined, use maximum available - 2M
				58	#define TENSOR_ARENA_SIZE 2000000
				59	#define TENSOR_ARENA_SIZE_PER_INFERENCE (TENSOR_ARENA_SIZE / NUM_INFERENCE_THREADS)
				60	#endif
				61
				62	#define PROCESS_QUEUE_SIZE (NUM_JOBS_PER_THREAD * NUM_JOB_THREADS * sizeof(xInferenceJob *))
				63	#define SENDER_QUEUE_SIZE (NUM_JOBS_PER_THREAD * sizeof(xInferenceJob *))
				64
				65	/* BYTE_POOL_SIZE_OVERHEAD is used to increase the memory byte pool size, as the number of
				66	allocatable bytes in a memory byte pool is slightly less than what was specified during creation */
				67	#define BYTE_POOL_SIZE_OVERHEAD (512)
				68	#define BYTE_POOL_SIZE \
				69	(((PROCESS_THREAD_CONTEXT_SIZE + PROCESS_THREAD_STACK_SIZE) * NUM_INFERENCE_THREADS) + \
				70	(SENDER_THREAD_CONTEXT_SIZE + SENDER_THREAD_STACK_SIZE + SENDER_QUEUE_SIZE) * NUM_JOB_THREADS + \
				71	PROCESS_QUEUE_SIZE + BYTE_POOL_SIZE_OVERHEAD)
				72
				73	/****************************************************************************
				74	* Structures
				75	****************************************************************************/
				76	struct ProcessThreadParams {
				77	ProcessThreadParams() : queueHandle(nullptr), tensorArena(nullptr), arenaSize(0) {}
				78	ProcessThreadParams(TX_QUEUE _queue, uint8_t _tensorArena, size_t _arenaSize) :
				79	queueHandle(_queue), tensorArena(_tensorArena), arenaSize(_arenaSize) {}
				80
				81	TX_QUEUE *queueHandle;
				82	uint8_t *tensorArena;
				83	size_t arenaSize;
				84	};
				85
				86	// Wrapper around InferenceProcess::InferenceJob. Adds responseQueue and status for ThreadX multi-threaded purposes.
				87	struct xInferenceJob : public InferenceJob {
				88	TX_QUEUE *responseQueue;
				89	bool status;
				90
				91	xInferenceJob() : InferenceJob(), responseQueue(nullptr), status(false) {}
				92	xInferenceJob(const string &_name,
				93	const DataPtr &_networkModel,
				94	const vector<DataPtr> &_input,
				95	const vector<DataPtr> &_output,
				96	const vector<DataPtr> &_expectedOutput,
				97	const size_t _numBytesToPrint,
				98	const vector<uint8_t> &_pmuEventConfig,
				99	const uint32_t _pmuCycleCounterEnable,
				100	TX_QUEUE *_queue) :
				101	InferenceJob(_name,
				102	_networkModel,
				103	_input,
				104	_output,
				105	_expectedOutput,
				106	_numBytesToPrint,
				107	_pmuEventConfig,
				108	_pmuCycleCounterEnable),
				109	responseQueue(_queue), status(false) {}
				110	};
				111
				112	/****************************************************************************
				113	* Global and static variables
				114	****************************************************************************/
				115	namespace {
				116	// Number of total completed jobs, needed to exit application correctly if NUM_JOB_THREADS > 1
				117	int totalCompletedJobs = 0;
				118
				119	// TensorArena static initialisation
				120	const size_t arenaSize = TENSOR_ARENA_SIZE_PER_INFERENCE;
				121
				122	TX_QUEUE inferenceProcessQueue;
				123
				124	ProcessThreadParams threadParams[NUM_INFERENCE_THREADS];
				125
				126	TX_BYTE_POOL bytePool;
				127	ULONG memoryArea[BYTE_POOL_SIZE / sizeof(ULONG)];
				128	} // namespace
				129
				130	__attribute__((section(".bss.tensor_arena"), aligned(16)))
				131	uint8_t inferenceProcessTensorArena[NUM_INFERENCE_THREADS][arenaSize];
				132
				133	/****************************************************************************
				134	* Mutex & Semaphore
				135	* Overrides weak-linked symbols in ethosu_driver.c to implement thread handling
				136	****************************************************************************/
				137	extern "C" {
				138	void *ethosu_mutex_create(void) {
				139	UINT status;
				140	TX_MUTEX *mutex;
				141
				142	mutex = new TX_MUTEX;
				143	status = tx_mutex_create(mutex, "mutex 0", TX_NO_INHERIT);
				144	if (status != TX_SUCCESS) {
				145	printf("mutex create failed, error - %d\n", status);
				146	}
				147	return (void *)mutex;
				148	}
				149
				150	void ethosu_mutex_lock(void *mutex) {
				151	UINT status;
				152	status = tx_mutex_get(reinterpret_cast<TX_MUTEX *>(mutex), TX_WAIT_FOREVER);
				153	if (status != TX_SUCCESS) {
				154	printf("mutex get failed, error - %d\n", status);
				155	}
				156	return;
				157	}
				158
				159	void ethosu_mutex_unlock(void *mutex) {
				160	UINT status;
				161	status = tx_mutex_put(reinterpret_cast<TX_MUTEX *>(mutex));
				162	if (status != TX_SUCCESS) {
				163	printf("mutex put failed, error - %d\n", status);
				164	}
				165	return;
				166	}
				167
				168	void *ethosu_semaphore_create(void) {
				169	UINT status;
				170	TX_SEMAPHORE *semaphore;
				171
				172	semaphore = new TX_SEMAPHORE;
				173	status = tx_semaphore_create(semaphore, "semaphore", 1);
				174
				175	if (status != TX_SUCCESS) {
				176	printf("Semaphore create failed, error - %d\n", status);
				177	}
				178
				179	return (void *)semaphore;
				180	}
				181
				182	void ethosu_semaphore_take(void *sem) {
				183	UINT status;
				184
				185	status = tx_semaphore_get(reinterpret_cast<TX_SEMAPHORE *>(sem), TX_WAIT_FOREVER);
				186
				187	if (status != TX_SUCCESS) {
				188	printf("Semaphore get/take, error - %d\n", status);
				189	}
				190
				191	return;
				192	}
				193
				194	void ethosu_semaphore_give(void *sem) {
				195	UINT status;
				196
				197	status = tx_semaphore_put(reinterpret_cast<TX_SEMAPHORE *>(sem));
				198
				199	if (status != TX_SUCCESS) {
				200	printf("Semaphore put/give, error - %d\n", status);
				201	}
				202
				203	return;
				204	}
				205	}
				206
				207	/****************************************************************************
				208	* Functions
				209	****************************************************************************/
				210	// inferenceProcessThread - Run jobs from queue with available driver
				211	void inferenceProcessThread(ULONG pvParameters) {
				212	ProcessThreadParams params = reinterpret_cast<ProcessThreadParams >(pvParameters);
				213	UINT tx_status = TX_QUEUE_ERROR;
				214
				215	class InferenceProcess inferenceProcess(params.tensorArena, params.arenaSize);
				216
				217	for (;;) {
				218	xInferenceJob *xJob;
				219
				220	// Get the job details from the process queue
				221	tx_status = tx_queue_receive(params.queueHandle, &xJob, TX_WAIT_FOREVER);
				222	if (tx_status != TX_SUCCESS) {
				223	printf("process failed to receive from Queue, error - %d\n", tx_status);
				224	exit(1);
				225	}
				226
				227	// run the job
				228	bool status = inferenceProcess.runJob(*xJob);
				229	xJob->status = status;
				230
				231	// Send response for the job in the response queue
				232	tx_status = tx_queue_send(xJob->responseQueue, &xJob, TX_WAIT_FOREVER);
				233	if (tx_status != TX_SUCCESS) {
				234	printf("process inferenceProcessThread failed to send to Queue, error - %d\n", tx_status);
				235	exit(1);
				236	}
				237	}
				238
				239	tx_status = tx_thread_terminate(nullptr);
				240	if (tx_status != TX_SUCCESS) {
				241	printf("process inferenceProcessThread failed to terminate thread, error - %d\n", tx_status);
				242	exit(1);
				243	}
				244	}
				245
				246	// inferenceSenderThread - Creates NUM_INFERNECE_JOBS jobs, queues them, and then listens for completion status
				247	void inferenceSenderThread(ULONG pvParameters) {
				248	int ret = 0;
				249	TX_QUEUE senderQueue;
				250	UINT status = TX_QUEUE_ERROR;
				251	TX_QUEUE inferenceProcessQueueLocal = reinterpret_cast<TX_QUEUE >(pvParameters);
				252	xInferenceJob jobs[NUM_JOBS_PER_THREAD];
				253	CHAR *senderQueuePtr = nullptr;
				254
				255	/* Allocate memory for this inference sender thread responses queue */
				256	status = tx_byte_allocate(&bytePool, reinterpret_cast<VOID **>(&senderQueuePtr), SENDER_QUEUE_SIZE, TX_NO_WAIT);
				257	if (status != TX_SUCCESS) {
				258	printf("Sender thread failed to allocate bytes for Queue, error - %d\n", status);
				259	exit(1);
				260	}
				261
				262	/* Create responses queue for this inference sender thread */
				263	status = tx_queue_create(
				264	&senderQueue, "senderQueue", sizeof(xInferenceJob *) / sizeof(uint32_t), senderQueuePtr, SENDER_QUEUE_SIZE);
				265
				266	if (status != TX_SUCCESS) {
				267	printf("Sender thread failed to create Queue, error - %d\n", status);
				268	exit(1);
				269	}
				270
				271	/* Create the jobs and queue them in the inference process queue */
				272	for (int n = 0; n < NUM_JOBS_PER_THREAD; n++) {
				273
				274	// Create job
				275	xInferenceJob *job = &jobs[n];
				276	job->name = string(modelName);
				277	job->networkModel = DataPtr(networkModelData, sizeof(networkModelData));
				278	job->input.push_back(DataPtr(inputData, sizeof(inputData)));
				279	job->expectedOutput.push_back(DataPtr(expectedOutputData, sizeof(expectedOutputData)));
				280	job->responseQueue = &senderQueue;
				281
				282	// queue job
				283	status = tx_queue_send(inferenceProcessQueueLocal, &job, TX_WAIT_FOREVER);
				284	if (status != TX_SUCCESS) {
				285	printf("Sender thread failed to send to Queue, error - %d\n", status);
				286	exit(1);
				287	}
				288	}
				289
				290	/* Listen for completion status on the response queue */
				291	do {
				292	xInferenceJob *pSendJob;
				293
				294	status = tx_queue_receive(&senderQueue, &pSendJob, TX_WAIT_FOREVER);
				295	if (status != TX_SUCCESS) {
				296	printf("Sender thread failed to receive from Queue, error - %d\n", status);
				297	exit(1);
				298	}
				299
				300	totalCompletedJobs++;
				301	ret = (pSendJob->status);
				302	if (pSendJob->status != 0) {
				303	break;
				304	}
				305	} while (totalCompletedJobs < NUM_JOBS_PER_THREAD * NUM_JOB_THREADS);
				306
				307	/* delete the response queue */
				308	status = tx_queue_delete(&senderQueue);
				309	if (status != TX_SUCCESS) {
				310	printf("Sender thread failed to delete Queue, error - %d\n", status);
				311	exit(1);
				312	}
				313
				314	exit(ret);
				315	}
				316
				317	/****************************************************************************
				318	* Application
				319	****************************************************************************/
				320	int main() {
				321	/* Enter the ThreadX kernel. */
				322	tx_kernel_enter();
				323	return 0;
				324	}
				325
				326	void tx_application_define(void *first_unused_memory) {
				327	UINT status;
				328	CHAR *senderThreadStackPtr[NUM_JOB_THREADS] = {nullptr};
				329	CHAR *processThreadStackPtr[NUM_INFERENCE_THREADS] = {nullptr};
				330	CHAR *processQueuePtr = nullptr;
				331	CHAR *senderThreadPtr[NUM_JOB_THREADS] = {nullptr};
				332	CHAR *processThreadPtr[NUM_INFERENCE_THREADS] = {nullptr};
				333
				334	/* Create a byte memory pool from which to allocate the threads stacks and queues. */
				335	status = tx_byte_pool_create(&bytePool, "byte pool", memoryArea, BYTE_POOL_SIZE);
				336	if (status != TX_SUCCESS) {
				337	printf("Main failed to allocate pool of bytes, error - %d\n", status);
				338	exit(1);
				339	}
				340
				341	/* Allocate memory for the inference process queue */
				342	status = tx_byte_allocate(&bytePool, reinterpret_cast<VOID **>(&processQueuePtr), PROCESS_QUEUE_SIZE, TX_NO_WAIT);
				343	if (status != TX_SUCCESS) {
				344	printf("Main failed to allocate bytes for process queue, error - %d\n", status);
				345	exit(1);
				346	}
				347
				348	status = tx_queue_create(&inferenceProcessQueue,
				349	"inferenceProcessQueue",
				350	sizeof(xInferenceJob *) / sizeof(uint32_t),
				351	processQueuePtr,
				352	PROCESS_QUEUE_SIZE);
				353	if (status != TX_SUCCESS) {
				354	printf("Main failed to create Queue, error - %d\n", status);
				355	exit(1);
				356	}
				357
				358	/* inferenceSender threads to create and queue the jobs */
				359	for (int n = 0; n < NUM_JOB_THREADS; n++) {
				360
				361	/* Allocate the thread context for the inference sender thread. */
				362	status =
				363	tx_byte_allocate(&bytePool, reinterpret_cast<VOID **>(&senderThreadPtr[n]), sizeof(TX_THREAD), TX_NO_WAIT);
				364	if (status != TX_SUCCESS) {
				365	printf("Main failed to allocate bytes for sender tread, error - %d\n", status);
				366	exit(1);
				367	}
				368
				369	/* Allocate the stack for the inference sender thread. */
				370	status = tx_byte_allocate(
				371	&bytePool, reinterpret_cast<VOID **>(&senderThreadStackPtr[n]), SENDER_THREAD_STACK_SIZE, TX_NO_WAIT);
				372	if (status != TX_SUCCESS) {
				373	printf("Main failed to allocate bytes for sender tread stack, error - %d\n", status);
				374	exit(1);
				375	}
				376
				377	/* Create the inference sender thread. */
				378	status = tx_thread_create(reinterpret_cast<TX_THREAD *>(senderThreadPtr[n]),
				379	"senderThread",
				380	inferenceSenderThread,
				381	reinterpret_cast<ULONG>(&inferenceProcessQueue),
				382	senderThreadStackPtr[n],
				383	SENDER_THREAD_STACK_SIZE,
				384	1,
				385	1,
				386	TX_NO_TIME_SLICE,
				387	TX_AUTO_START);
				388	if (status != TX_SUCCESS) {
				389	printf("Main failed to create Thread, error - %d\n", status);
				390	exit(1);
				391	}
				392	}
				393
				394	/* Create inferenceProcess threads to process the queued jobs */
				395	for (int n = 0; n < NUM_INFERENCE_THREADS; n++) {
				396
				397	/* Allocate the thread context for the inference process thread. */
				398	status =
				399	tx_byte_allocate(&bytePool, reinterpret_cast<VOID **>(&processThreadPtr[n]), sizeof(TX_THREAD), TX_NO_WAIT);
				400	if (status != TX_SUCCESS) {
				401	printf("Main failed to allocate bytes for process tread, error - %d\n", status);
				402	exit(1);
				403	}
				404
				405	/* Allocate the stack for the inference process thread. */
				406	status = tx_byte_allocate(
				407	&bytePool, reinterpret_cast<VOID **>(&processThreadStackPtr[n]), PROCESS_THREAD_STACK_SIZE, TX_NO_WAIT);
				408	if (status != TX_SUCCESS) {
				409	printf("Main failed to allocate bytes for process stack, error - %d\n", status);
				410	exit(1);
				411	}
				412
				413	threadParams[n] = ProcessThreadParams(
				414	&inferenceProcessQueue, inferenceProcessTensorArena[n], reinterpret_cast<size_t>(arenaSize));
				415
				416	/* Create the inference process thread. */
				417	status = tx_thread_create(reinterpret_cast<TX_THREAD *>(processThreadPtr[n]),
				418	"processThread",
				419	inferenceProcessThread,
				420	reinterpret_cast<ULONG>(&threadParams[n]),
				421	processThreadStackPtr[n],
				422	PROCESS_THREAD_STACK_SIZE,
				423	1,
				424	1,
				425	TX_NO_TIME_SLICE,
				426	TX_AUTO_START);
				427	if (status != TX_SUCCESS) {
				428	printf("Main failed to create thread, error - %d\n", status);
				429	exit(1);
				430	}
				431	}
				432
				433	printf("ThreadX application initialisation - Done \n");
				434	return;
				435	}