Blame - reference_model/src/ops/tensor_ops.cc - tosa/reference_model

blob: a735334a7b99dc275ca107e434de1683281cacf7 [file] [log] [blame]

Eric Kunze	e5e2676	2020-10-13 16:11:07 -0700	[diff] [blame^]	1
				2	// Copyright (c) 2020, ARM Limited.
				3	//
				4	// Licensed under the Apache License, Version 2.0 (the "License");
				5	// you may not use this file except in compliance with the License.
				6	// You may obtain a copy of the License at
				7	//
				8	// http://www.apache.org/licenses/LICENSE-2.0
				9	//
				10	// Unless required by applicable law or agreed to in writing, software
				11	// distributed under the License is distributed on an "AS IS" BASIS,
				12	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	// See the License for the specific language governing permissions and
				14	// limitations under the License.
				15
				16	#include "tensor_ops.h"
				17	#include "quant_util.h"
				18	#include "template_types.h"
				19
				20	using namespace TosaReference;
				21	using namespace Eigen;
				22	using namespace tosa;
				23
				24	template <int Rank, DType Dtype>
				25	OpArgMax<Rank, Dtype>::OpArgMax(TosaAttributeBase* attribute_, TosaQuantInfoBase* qinfo_, uint64_t id_)
				26	: GraphNode(Op_ARGMAX, id_)
				27	{
				28	setRequiredOperands(1, 1);
				29	setRequiredRank(0, 6);
				30
				31	INIT_ATTRIBUTE(Axis);
				32	}
				33
				34	template <int Rank, DType Dtype>
				35	OpArgMax<Rank, Dtype>::~OpArgMax()
				36	{
				37	if (attribute)
				38	delete attribute;
				39	}
				40
				41	template <int Rank, DType Dtype>
				42	int OpArgMax<Rank, Dtype>::checkTensorAttributes()
				43	{
				44	if (validateRequiredOperands())
				45	return 1;
				46
				47	if (validateRequiredRank(inputs[0]) \|\| validateRequiredRank(outputs[0]))
				48	{
				49	return 1;
				50	}
				51
				52	input = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
				53	output = dynamic_cast<TosaReference::TensorTemplate<TOut>*>(outputs[0]);
				54
				55	return 0;
				56	}
				57
				58	template <int Rank, DType Dtype>
				59	int OpArgMax<Rank, Dtype>::eval()
				60	{
				61	Eigen::Tensor<DenseIndex, Rank - 1> index = this->input->getTensor().argmax(attribute->axis());
				62
				63	this->output->getTensor() = index.unaryExpr([](DenseIndex in) -> OutEigenType { return (OutEigenType)in; });
				64
				65	return GraphNode::eval();
				66	}
				67
				68	template <DType Dtype>
				69	OpAvgPool2d<Dtype>::OpAvgPool2d(TosaAttributeBase* attribute_, TosaQuantInfoBase* qinfo_, uint64_t id_)
				70	: GraphNode(Op_AVG_POOL2D, id_)
				71	{
				72	setRequiredOperands(1, 1);
				73	setRequiredRank(4);
				74
				75	INIT_ATTRIBUTE(Pool2d);
				76	INIT_QINFO(Unary);
				77	}
				78
				79	template <DType Dtype>
				80	OpAvgPool2d<Dtype>::~OpAvgPool2d()
				81	{
				82	if (attribute)
				83	delete attribute;
				84	}
				85
				86	template <DType Dtype>
				87	int OpAvgPool2d<Dtype>::checkTensorAttributes()
				88	{
				89	if (validateRequiredOperands())
				90	return 1;
				91
				92	if (validateRequiredRank(inputs[0]) \|\| validateRequiredRank(outputs[0]))
				93	{
				94	return 1;
				95	}
				96
				97	if (inputs[0]->matchType(*outputs[0]))
				98	{
				99	printNodeValidationError("OpAvgPool2d: input and output tensor type mismatch");
				100	return 1;
				101	}
				102
				103	in = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
				104	out = dynamic_cast<TosaReference::TensorTemplate<TOut>*>(outputs[0]);
				105
				106	if (!in->hasFormat(Format_NHWC))
				107	{
				108	printNodeValidationError("OpAvgPool2d: unsupported tensor format");
				109	return 1;
				110	}
				111
				112	if (attribute->padding().size() != 4)
				113	{
				114	printNodeValidationError("OpAvgPool2d: illegal size for attribute padding");
				115	return 1;
				116	}
				117
				118	if (attribute->kernel().size() != 2)
				119	{
				120	printNodeValidationError("OpAvgPool2d: illegal size for attribute kernel");
				121	return 1;
				122	}
				123
				124	if (attribute->stride().size() != 2)
				125	{
				126	printNodeValidationError("OpAvgPool2d: illegal size for attribute stride");
				127	return 1;
				128	}
				129
				130	return 0;
				131	}
				132
				133	template <DType Dtype>
				134	ETensor1<int32_t> OpAvgPool2d<Dtype>::calculate_div_map_1d(int in_size, int out_size, int kernel_size, int stride)
				135	{
				136	ETensor1<int32_t> result(out_size);
				137
				138	int32_t total_pad = (out_size - 1) * stride + kernel_size - in_size;
				139	total_pad = total_pad < 0 ? 0 : total_pad;
				140
				141	int32_t pad_left = total_pad >> 1;
				142	int32_t pad_right = total_pad - pad_left;
				143
				144	result.setConstant(kernel_size);
				145
				146	// the index left to 'left_index' and index right to 'right_index' indicates
				147	// the input window of this output covers a pad bit
				148	int32_t left_index = pad_left / stride;
				149	int32_t right_index = pad_right / stride;
				150
				151	// not handle ultra small activation yet
				152	ASSERT_MSG_NODE((out_size - 1 - right_index) >= left_index, "AvgPool2d: Small activations not supported yet");
				153
				154	// minus the number of pad bit this index cover
				155	while (left_index >= 0)
				156	{
				157	result(left_index) -= (pad_left - left_index * stride);
				158	left_index--;
				159	}
				160
				161	while (right_index >= 0)
				162	{
				163	result(out_size - 1 - right_index) -= (pad_right - right_index * stride);
				164	right_index--;
				165	}
				166
				167	return result;
				168	}
				169
				170	// assuming input and output tensor have same scales like tflite reference
				171	// so no need to scale input and output
				172	template <DType Dtype>
				173	int OpAvgPool2d<Dtype>::eval()
				174	{
				175	int in_batch = this->in->getShape()[0];
				176	int in_height = this->in->getShape()[1];
				177	int in_width = this->in->getShape()[2];
				178	int in_channels = this->in->getShape()[3];
				179
				180	int out_batch = this->out->getShape()[0];
				181	int out_height = this->out->getShape()[1];
				182	int out_width = this->out->getShape()[2];
				183	int out_channels = this->out->getShape()[3];
				184
				185	ASSERT_MSG_NODE(in_batch == out_batch, "OpAvgPool2d: tensor batch mismatch %d != %d", in_batch, out_batch);
				186
				187	int padding_top = this->attribute->padding()[0];
				188	int padding_bottom = this->attribute->padding()[1];
				189	int padding_left = this->attribute->padding()[2];
				190	int padding_right = this->attribute->padding()[3];
				191	int kernel_h = this->attribute->kernel()[0];
				192	int kernel_w = this->attribute->kernel()[1];
				193	int stride_h = this->attribute->stride()[0];
				194	int stride_w = this->attribute->stride()[1];
				195
				196	DEBUG_INFO(OP,
				197	"perform AvgPool2d, input.shape=[%d,%d,%d,%d], output.shape=[%d,%d,%d,%d], kernel=[%d,%d], "
				198	"stride=[%d,%d], padding=[%d,%d,%d,%d]",
				199	in_batch, in_height, in_width, in_channels, out_batch, out_height, out_width, out_channels, kernel_h,
				200	kernel_w, stride_h, stride_w, padding_top, padding_bottom, padding_left, padding_right);
				201
				202	Eigen::array<Eigen::Index, 2> im2col_input_dims;
				203	im2col_input_dims[0] = kernel_h * kernel_w;
				204	im2col_input_dims[1] = out_batch * out_height * out_width * out_channels;
				205
				206	Eigen::array<Eigen::Index, 4> col2im_output_dims;
				207	col2im_output_dims[0] = out_batch;
				208	col2im_output_dims[1] = out_height;
				209	col2im_output_dims[2] = out_width;
				210	col2im_output_dims[3] = out_channels;
				211
				212	Eigen::array<std::pair<int32_t, int32_t>, 4> padding;
				213	padding[0] = std::make_pair(0, 0);
				214	padding[1] = std::make_pair(padding_top, padding_bottom);
				215	padding[2] = std::make_pair(padding_left, padding_right);
				216	padding[3] = std::make_pair(0, 0);
				217
				218	ETensor4<InEigenType> input_val = this->in->getTensor();
				219	if (this->qinfo)
				220	{
				221	input_val = input_val - (InEigenType)this->qinfo->input_zp();
				222	}
				223
				224	ETensor4<InEigenType> input_padded = input_val.pad(padding);
				225
				226	// assuming input and output have same scales
				227	// so input and output scaling is not required
				228	// TODO: check if this assumption TOSA made
				229
				230	// extract_image_patches() output [N, KH, KW, H * W, C]
				231	// transpose to [KH, KW, N, H * W, C]
				232	// reshape to [KH * KW, N * H * W * C]
				233	ETensor2<InEigenType> input_extract_patches =
				234	input_padded.extract_image_patches(kernel_h, kernel_w, stride_h, stride_w, 1, 1, Eigen::PADDING_VALID)
				235	.shuffle(Eigen::array<Eigen::Index, 5>{ 1, 2, 0, 3, 4 })
				236	.reshape(im2col_input_dims);
				237
				238	// 1D result with [N * H * W * C]
				239	ETensor1<AccEigenType> out_1d(this->out->getElementCount());
				240	out_1d.setZero();
				241
				242	// sum pool
				243	for (size_t i = 0; i < this->out->getElementCount(); i++)
				244	{
				245	for (int32_t j = 0; j < kernel_h * kernel_w; j++)
				246	{
				247	out_1d(i) += (AccEigenType)input_extract_patches(j, i);
				248	}
				249	}
				250
				251	// reshape result to [N, H, W, C] and divide with div_map
				252	ETensor4<AccEigenType> sum = out_1d.reshape(col2im_output_dims);
				253
				254	// calculate 1d height/width div_map (number of elements this pooling window covers)
				255	// and outer product to get 2d div_map, then reshape/broadcast to [N, H, W, C]
				256	ETensor1<int32_t> div_map_h = calculate_div_map_1d(in_height, out_height, kernel_h, stride_h);
				257	ETensor1<int32_t> div_map_w = calculate_div_map_1d(in_width, out_width, kernel_w, stride_w);
				258	Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims = { Eigen::IndexPair<Eigen::Index>(1, 0) };
				259	Eigen::array<Eigen::Index, 4> bcast{ out_batch, 1, 1, out_channels };
				260
				261	ETensor4<int32_t> div_map =
				262	div_map_h.reshape(Eigen::array<Eigen::Index, 2>{ out_height, 1 })
				263	.contract(div_map_w.reshape(Eigen::array<Eigen::Index, 2>{ 1, out_width }), contract_dims)
				264	.reshape(Eigen::array<Eigen::Index, 4>{ 1, out_height, out_width, 1 })
				265	.broadcast(bcast);
				266
				267	if (Dtype != DType_FLOAT)
				268	{
				269	this->out->getTensor() = sum.binaryExpr(div_map, [](AccEigenType value, int32_t div) -> OutEigenType {
				270	int32_t multiplier, shift;
				271	TosaReference::QuantUtil<AccDtype>::reciprocal_scale(div, multiplier, shift);
				272
				273	return (OutEigenType)TosaReference::QuantUtil<AccDtype>::apply_scale(value, multiplier, shift, false);
				274	});
				275	this->out->getTensor() = this->out->getTensor() + (OutEigenType)(this->qinfo->output_zp());
				276	this->out->getTensor() = this->out->getTensor().cwiseMax((OutEigenType)QMin);
				277	this->out->getTensor() = this->out->getTensor().cwiseMin((OutEigenType)QMax);
				278	}
				279	else
				280	{
				281	this->out->getTensor() = (sum / div_map.template cast<AccEigenType>()).template cast<OutEigenType>();
				282	}
				283
				284	return GraphNode::eval();
				285	}
				286
				287	template <DType InDtype, DType WeightDtype>
				288	OpConv2d<InDtype, WeightDtype>::OpConv2d(TosaAttributeBase* attribute_, TosaQuantInfoBase* qinfo_, uint64_t id_)
				289	: GraphNode(Op_CONV2D, id_)
				290	{
				291	setRequiredOperands(3, 1);
				292	setRequiredRank(4);
				293
				294	INIT_ATTRIBUTE(Conv2d);
				295	INIT_QINFO(Conv);
				296	}
				297
				298	template <DType InDtype, DType WeightDtype>
				299	OpConv2d<InDtype, WeightDtype>::~OpConv2d()
				300	{
				301	if (attribute)
				302	delete attribute;
				303	if (qinfo)
				304	delete qinfo;
				305	}
				306
				307	template <DType InDtype, DType WeightDtype>
				308	int OpConv2d<InDtype, WeightDtype>::checkTensorAttributes()
				309	{
				310	if (validateRequiredOperands())
				311	return 1;
				312
				313	if (validateRequiredRank(inputs[0]) \|\| validateRequiredRank(inputs[1]) \|\| validateRequiredRank(outputs[0]))
				314	{
				315	return 1;
				316	}
				317
				318	// 'bias' checked separatedly since it doens't make sense to make required rank ranging from 1 to 4
				319	if (inputs[2]->getRank() != 1)
				320	{
				321	printNodeValidationError("OpConv2d: bias tensor must be rank 1");
				322	}
				323
				324	if (inputs[1]->getIsConst() == 0)
				325	{
				326	printNodeValidationError("OpConv2d: weight tensor is not const typed");
				327	}
				328
				329	input = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
				330	weight = dynamic_cast<TosaReference::TensorTemplate<TWeight>*>(inputs[1]);
				331	bias = dynamic_cast<TosaReference::TensorTemplate<TBias>*>(inputs[2]);
				332	output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
				333
				334	if (!input->hasFormat(Format_NHWC))
				335	{
				336	printNodeValidationError("OpConv2d: unsupported input tensor format");
				337	return 1;
				338	}
				339
				340	if (!weight->hasFormat(Format_OHWI))
				341	{
				342	printNodeValidationError("OpConv2d: unsupported weight tensor format");
				343	return 1;
				344	}
				345
				346	if (attribute->padding().size() != 4)
				347	{
				348	printNodeValidationError("OpConv2d: illegal size for attribute padding");
				349	return 1;
				350	}
				351
				352	if (attribute->stride().size() != 2)
				353	{
				354	printNodeValidationError("OpConv2d: illegal size for attribute stride");
				355	return 1;
				356	}
				357
				358	if (attribute->dilation().size() != 2)
				359	{
				360	printNodeValidationError("OpConv2d: illegal size for attribute dilation");
				361	return 1;
				362	}
				363
				364	return 0;
				365	}
				366
				367	template <DType InDtype, DType WeightDtype>
				368	int OpConv2d<InDtype, WeightDtype>::eval()
				369	{
				370	int in_batch = this->input->getShape()[0];
				371	int in_height = this->input->getShape()[1];
				372	int in_width = this->input->getShape()[2];
				373	int in_channels = this->input->getShape()[3];
				374
				375	int f_out_channels = this->weight->getShape()[0];
				376	int f_height = this->weight->getShape()[1];
				377	int f_width = this->weight->getShape()[2];
				378	int f_in_channels = this->weight->getShape()[3];
				379
				380	int b_out_channels = this->bias->getShape()[0];
				381
				382	int out_batch = this->output->getShape()[0];
				383	int out_height = this->output->getShape()[1];
				384	int out_width = this->output->getShape()[2];
				385	int out_channels = this->output->getShape()[3];
				386
				387	ASSERT_MSG_NODE(in_batch == out_batch, "OpConv2d: tensor batch mismatch %d != %d", in_batch, out_batch);
				388	ASSERT_MSG_NODE(f_in_channels == in_channels, "OpConv2d: tensor input channel mismatch %d != %d", f_in_channels,
				389	in_channels);
				390	ASSERT_MSG_NODE(f_out_channels == out_channels, "OpConv2d: tensor output channel mismatch %d != %d", f_out_channels,
				391	out_channels);
				392	ASSERT_MSG_NODE(b_out_channels == out_channels, "OpConv2d: tensor output channel mismatch %d != %d", b_out_channels,
				393	out_channels);
				394
				395	int padding_top = this->attribute->padding()[0];
				396	int padding_bottom = this->attribute->padding()[1];
				397	int padding_left = this->attribute->padding()[2];
				398	int padding_right = this->attribute->padding()[3];
				399	int stride_h = this->attribute->stride()[0];
				400	int stride_w = this->attribute->stride()[1];
				401	int dilation_h = this->attribute->dilation()[0];
				402	int dilation_w = this->attribute->dilation()[1];
				403
				404	DEBUG_INFO(OP,
				405	"perform OpConv2d, input.shape=[%d,%d,%d,%d], weight.shape=[%d,%d,%d,%d], output.shape=[%d,%d,%d,%d], "
				406	"stride=[%d,%d], dilation=[%d,%d], padding=[%d,%d,%d,%d]",
				407	in_batch, in_height, in_width, in_channels, f_height, f_width, f_in_channels, f_out_channels, out_batch,
				408	out_height, out_width, out_channels, stride_h, stride_w, dilation_h, dilation_w, padding_top,
				409	padding_bottom, padding_left, padding_right);
				410
				411	// GEMM-conv2d, left matrix is input, right matrix is weight
				412	Eigen::array<Eigen::Index, 2> im2col_input_dims;
				413	im2col_input_dims[0] = out_batch * out_height * out_width;
				414	im2col_input_dims[1] = f_height * f_width * f_in_channels;
				415
				416	Eigen::array<Eigen::Index, 2> im2col_weight_dims;
				417	im2col_weight_dims[0] = f_height * f_width * f_in_channels;
				418	im2col_weight_dims[1] = f_out_channels;
				419
				420	Eigen::array<Eigen::Index, 2> bias_reshaped_dims;
				421	bias_reshaped_dims[0] = 1;
				422	bias_reshaped_dims[1] = b_out_channels;
				423
				424	Eigen::array<Eigen::Index, 4> weight_zp_bcast_dims;
				425	weight_zp_bcast_dims[0] = f_height;
				426	weight_zp_bcast_dims[1] = f_width;
				427	weight_zp_bcast_dims[2] = f_in_channels;
				428
				429	Eigen::array<Eigen::Index, 2> bias_bcast_dims;
				430	bias_bcast_dims[0] = out_batch * out_height * out_width;
				431	bias_bcast_dims[1] = 1;
				432
				433	Eigen::array<Eigen::Index, 4> col2im_output_dims;
				434	col2im_output_dims[0] = out_batch;
				435	col2im_output_dims[1] = out_height;
				436	col2im_output_dims[2] = out_width;
				437	col2im_output_dims[3] = out_channels;
				438
				439	Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims = { Eigen::IndexPair<Eigen::Index>(1, 0) };
				440
				441	Eigen::array<std::pair<int32_t, int32_t>, 4> padding;
				442	padding[0] = std::make_pair(0, 0);
				443	padding[1] = std::make_pair(padding_top, padding_bottom);
				444	padding[2] = std::make_pair(padding_left, padding_right);
				445	padding[3] = std::make_pair(0, 0);
				446
				447	TIn input_val = this->input->getTensor();
				448	TWeight weight_val = this->weight->getTensor();
				449	if (this->qinfo)
				450	{
				451	input_val = input_val - (InEigenType)this->qinfo->input_zp();
				452	weight_val = weight_val - (WeightEigenType)this->qinfo->weight_zp();
				453	}
				454
				455	ETensor4<InEigenType> input_padded = input_val.pad(padding);
				456
				457	// extract_image_patches() output [N, KH, KW, H * W, C]
				458	// need to transpose to [N, H * W, KH, KW, C]
				459	ETensor5<InEigenType> input_extract_patches =
				460	input_padded
				461	.extract_image_patches(f_height, f_width, stride_h, stride_w, dilation_h, dilation_w, Eigen::PADDING_VALID)
				462	.shuffle(Eigen::array<Eigen::Index, 5>{ 0, 3, 1, 2, 4 });
				463
				464	// reshape input to [N * H * W, KH * KW * C]
				465	ETensor2<InEigenType> im2col_input = input_extract_patches.reshape(im2col_input_dims);
				466
				467	// transpose and reshape weight from [OC, H, W, IC] to [H * W * IC, OC]
				468	ETensor2<WeightEigenType> im2col_weight =
				469	weight_val.shuffle(Eigen::array<Eigen::Index, 4>({ 1, 2, 3, 0 })).reshape(im2col_weight_dims);
				470
				471	// don't need to apply bias_multiplier ( * bias_scale and >> bias_shift) since tflite already scale it
				472	// and reshaped from [C] to [1, C], and broadcast to [N * H * W, C]
				473	ETensor2<AccEigenType> bias_2d = this->bias->getTensor().reshape(bias_reshaped_dims).broadcast(bias_bcast_dims);
				474
				475	// output matrix is [N * H * W, C]
				476	ETensor2<AccEigenType> contracted_result =
				477	im2col_input.template cast<AccEigenType>().contract(im2col_weight.template cast<AccEigenType>(), contract_dims);
				478
				479	// adding bias
				480	ETensor2<AccEigenType> biased_output = contracted_result + bias_2d.template cast<AccEigenType>();
				481
				482	// reshape back to [N, H, W, C]
				483	this->output->getTensor() = biased_output.reshape(col2im_output_dims);
				484
				485	if (AccDtype == DType_INT48)
				486	{
				487	this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
				488	this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
				489	}
				490
				491	return GraphNode::eval();
				492	}
				493
				494	template <DType InDtype, DType WeightDtype>
				495	OpDepthwiseConv2d<InDtype, WeightDtype>::OpDepthwiseConv2d(TosaAttributeBase* attribute_,
				496	TosaQuantInfoBase* qinfo_,
				497	uint64_t id_)
				498	: GraphNode(Op_DEPTHWISE_CONV2D, id_)
				499	{
				500	setRequiredOperands(3, 1);
				501	setRequiredRank(4);
				502
				503	INIT_ATTRIBUTE(Conv2d);
				504	INIT_QINFO(Conv);
				505	}
				506
				507	template <DType InDtype, DType WeightDtype>
				508	OpDepthwiseConv2d<InDtype, WeightDtype>::~OpDepthwiseConv2d()
				509	{
				510	if (attribute)
				511	delete attribute;
				512	if (qinfo)
				513	delete qinfo;
				514	}
				515
				516	template <DType InDtype, DType WeightDtype>
				517	int OpDepthwiseConv2d<InDtype, WeightDtype>::checkTensorAttributes()
				518	{
				519	if (validateRequiredOperands())
				520	return 1;
				521
				522	if (validateRequiredRank(inputs[0]) \|\| validateRequiredRank(inputs[1]) \|\| validateRequiredRank(outputs[0]))
				523	{
				524	return 1;
				525	}
				526
				527	// 'bias' checked separatedly since it doens't make sense to make required rank ranging from 1 to 4
				528	if (inputs[2]->getRank() != 1)
				529	{
				530	printNodeValidationError("OpDepthwiseConv2d: bias tensor must be rank 1");
				531	}
				532
				533	if (inputs[1]->getIsConst() == 0)
				534	{
				535	printNodeValidationError("OpDepthwiseConv2d: weight tensor is not const typed");
				536	}
				537
				538	input = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
				539	weight = dynamic_cast<TosaReference::TensorTemplate<TWeight>*>(inputs[1]);
				540	bias = dynamic_cast<TosaReference::TensorTemplate<TBias>*>(inputs[2]);
				541	output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
				542
				543	if (!input->hasFormat(Format_NHWC))
				544	{
				545	printNodeValidationError("OpDepthwiseConv2d: unsupported input tensor format");
				546	return 1;
				547	}
				548
				549	if (!weight->hasFormat(Format_HWIM))
				550	{
				551	printNodeValidationError("OpDepthwiseConv2d: unsupported weight tensor format");
				552	return 1;
				553	}
				554
				555	if (attribute->padding().size() != 4)
				556	{
				557	printNodeValidationError("OpDepthwiseConv2d: illegal size for attribute padding");
				558	return 1;
				559	}
				560
				561	if (attribute->stride().size() != 2)
				562	{
				563	printNodeValidationError("OpDepthwiseConv2d: illegal size for attribute stride");
				564	return 1;
				565	}
				566
				567	if (attribute->dilation().size() != 2)
				568	{
				569	printNodeValidationError("OpDepthwiseConv2d: illegal size for attribute dilation");
				570	return 1;
				571	}
				572
				573	return 0;
				574	}
				575
				576	template <DType InDtype, DType WeightDtype>
				577	int OpDepthwiseConv2d<InDtype, WeightDtype>::eval()
				578	{
				579	int in_batch = this->input->getShape()[0];
				580	int in_height = this->input->getShape()[1];
				581	int in_width = this->input->getShape()[2];
				582	int in_channels = this->input->getShape()[3];
				583
				584	int f_height = this->weight->getShape()[0];
				585	int f_width = this->weight->getShape()[1];
				586	int f_in_channels = this->weight->getShape()[2];
				587	int f_multiplier = this->weight->getShape()[3];
				588
				589	int b_out_channels = this->bias->getShape()[0];
				590
				591	int out_batch = this->output->getShape()[0];
				592	int out_height = this->output->getShape()[1];
				593	int out_width = this->output->getShape()[2];
				594	int out_channels = this->output->getShape()[3];
				595
				596	ASSERT_MSG_NODE(in_batch == out_batch, "OpDepthwiseConv2d: tensor batch mismatch %d != %d", in_batch, out_batch);
				597	ASSERT_MSG_NODE(f_in_channels == in_channels, "OpDepthwiseConv2d: tensor input channel mismatch %d != %d",
				598	f_in_channels, in_channels);
				599	ASSERT_MSG_NODE(in_channels * f_multiplier == out_channels,
				600	"OpDepthwiseConv2d: tensor output channel mismatch %d != %d", in_channels * f_multiplier,
				601	out_channels);
				602	ASSERT_MSG_NODE(b_out_channels == out_channels, "OpDepthwiseConv2d: tensor b_out_channels mismatch %d != %d",
				603	b_out_channels, out_channels);
				604
				605	int padding_top = this->attribute->padding()[0];
				606	int padding_bottom = this->attribute->padding()[1];
				607	int padding_left = this->attribute->padding()[2];
				608	int padding_right = this->attribute->padding()[3];
				609	int stride_h = this->attribute->stride()[0];
				610	int stride_w = this->attribute->stride()[1];
				611	int dilation_h = this->attribute->dilation()[0];
				612	int dilation_w = this->attribute->dilation()[1];
				613
				614	DEBUG_INFO(OP,
				615	"perform OpDepthwiseConv2d, input.shape=[%d,%d,%d,%d], weight.shape=[%d,%d,%d,%d], "
				616	"output.shape=[%d,%d,%d,%d], stride=[%d,%d], dilation=[%d,%d], padding=[%d,%d,%d,%d]",
				617	in_batch, in_height, in_width, in_channels, f_height, f_width, f_in_channels, f_multiplier, out_batch,
				618	out_height, out_width, out_channels, stride_h, stride_w, dilation_h, dilation_w, padding_top,
				619	padding_bottom, padding_left, padding_right);
				620
				621	Eigen::array<std::pair<int32_t, int32_t>, 4> padding;
				622	padding[0] = std::make_pair(0, 0);
				623	padding[1] = std::make_pair(padding_top, padding_bottom);
				624	padding[2] = std::make_pair(padding_left, padding_right);
				625	padding[3] = std::make_pair(0, 0);
				626
				627	TIn input_val = this->input->getTensor();
				628	TWeight weight_val = this->weight->getTensor();
				629	if (this->qinfo)
				630	{
				631	input_val = input_val - (InEigenType)this->qinfo->input_zp();
				632	weight_val = weight_val - (WeightEigenType)this->qinfo->weight_zp();
				633	}
				634
				635	ETensor4<InEigenType> input_padded = input_val.pad(padding);
				636
				637	// GEMM doesn't fit well with DepthwiseConv2d
				638	// 1. use extract_image_patches() to handle stride/dilation/padding
				639	// 2. perform direct convolution
				640
				641	// 1. extract_image_patches() output [N, KH, KW, OH * OW, IC]
				642	ETensor5<InEigenType> input_extract_patches = input_padded.extract_image_patches(
				643	f_height, f_width, stride_h, stride_w, dilation_h, dilation_w, Eigen::PADDING_VALID);
				644
				645	Eigen::array<Eigen::Index, 4> reshape_dim;
				646	reshape_dim.fill(1);
				647	reshape_dim[3] = b_out_channels;
				648
				649	Eigen::array<Eigen::Index, 4> bcast;
				650	bcast[0] = out_batch;
				651	bcast[1] = out_height;
				652	bcast[2] = out_width;
				653	bcast[3] = 1;
				654
				655	// initialize with bias
				656	this->output->getTensor() = this->bias->getTensor().reshape(reshape_dim).broadcast(bcast);
				657
				658	// 2. direct depthwise convolution
				659	for (int ob = 0; ob < out_batch; ob++)
				660	{
				661	for (int oh = 0; oh < out_height; oh++)
				662	{
				663	for (int ow = 0; ow < out_width; ow++)
				664	{
				665	for (int ic = 0; ic < in_channels; ic++)
				666	{
				667	for (int cm = 0; cm < f_multiplier; cm++)
				668	{
				669	for (int fh = 0; fh < f_height; fh++)
				670	{
				671	for (int fw = 0; fw < f_width; fw++)
				672	{
				673	this->output->getTensor()(ob, oh, ow, ic * f_multiplier + cm) +=
				674	((AccEigenType)input_extract_patches(ob, fh, fw, ow * out_height + oh, ic) *
				675	(AccEigenType)weight_val(fh, fw, ic, cm));
				676	}
				677	}
				678	}
				679	}
				680	}
				681	}
				682	}
				683
				684	if (AccDtype == DType_INT48)
				685	{
				686	this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
				687	this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
				688	}
				689
				690	return GraphNode::eval();
				691	}
				692
				693	template <DType InDtype, DType WeightDtype>
				694	OpFullyConnected<InDtype, WeightDtype>::OpFullyConnected(TosaAttributeBase* attribute_,
				695	TosaQuantInfoBase* qinfo_,
				696	uint64_t id_)
				697	: GraphNode(Op_FULLY_CONNECTED, id_)
				698	{
				699	setRequiredOperands(3, 1);
				700	setRequiredRank(2);
				701
				702	INIT_QINFO(Conv);
				703	}
				704
				705	template <DType InDtype, DType WeightDtype>
				706	OpFullyConnected<InDtype, WeightDtype>::~OpFullyConnected()
				707	{
				708	if (qinfo)
				709	delete qinfo;
				710	}
				711
				712	template <DType InDtype, DType WeightDtype>
				713	int OpFullyConnected<InDtype, WeightDtype>::checkTensorAttributes()
				714	{
				715	if (validateRequiredOperands())
				716	return 1;
				717
				718	if (validateRequiredRank(inputs[0]) \|\| validateRequiredRank(inputs[1]) \|\| validateRequiredRank(outputs[0]))
				719	{
				720	return 1;
				721	}
				722
				723	input = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
				724	weight = dynamic_cast<TosaReference::TensorTemplate<TWeight>*>(inputs[1]);
				725	bias = dynamic_cast<TosaReference::TensorTemplate<TBias>*>(inputs[2]);
				726
				727	if (input->getShape()[1] != weight->getShape()[1])
				728	{
				729	printNodeValidationError("OpFullyConnected operator input.shape[1] should match weight.shape[1]");
				730	return 1;
				731	}
				732
				733	if (weight->getShape()[0] != bias->getShape()[0])
				734	{
				735	printNodeValidationError("OpFullyConnected operator bias.shape[0] should match weight.shape[0]");
				736	return 1;
				737	}
				738
				739	output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
				740
				741	return 0;
				742	}
				743
				744	template <DType InDtype, DType WeightDtype>
				745	int OpFullyConnected<InDtype, WeightDtype>::eval()
				746	{
				747	typedef Eigen::Tensor<int, 1>::DimensionPair DimPair;
				748	Eigen::array<DimPair, 1> dims{ { DimPair(1, 0) } };
				749
				750	Eigen::array<Eigen::Index, 2> weight_shuffle{ 1, 0 };
				751
				752	Eigen::array<Eigen::Index, 2> bias_reshape;
				753	bias_reshape[0] = 1;
				754	bias_reshape[1] = this->bias->getShape()[0];
				755
				756	Eigen::array<Eigen::Index, 2> bias_bcast;
				757	bias_bcast[0] = this->input->getShape()[0];
				758	bias_bcast[1] = 1;
				759
				760	TIn input_val = this->input->getTensor();
				761	TWeight weight_val = this->weight->getTensor().shuffle(weight_shuffle);
				762	if (this->qinfo)
				763	{
				764	input_val = input_val - (InEigenType)this->qinfo->input_zp();
				765	weight_val = weight_val - (WeightEigenType)this->qinfo->weight_zp();
				766	}
				767
				768	this->output->getTensor() =
				769	input_val.template cast<AccEigenType>().contract(weight_val.template cast<AccEigenType>(), dims) +
				770	this->bias->getTensor().reshape(bias_reshape).broadcast(bias_bcast);
				771
				772	if (AccDtype == DType_INT48)
				773	{
				774	this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
				775	this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
				776	}
				777	return GraphNode::eval();
				778	}
				779
				780	template <DType Dtype>
				781	OpMatMul<Dtype>::OpMatMul(TosaAttributeBase* attribute_, TosaQuantInfoBase* qinfo_, uint64_t id_)
				782	: GraphNode(Op_MATMUL, id_)
				783	{
				784	setRequiredOperands(2, 1);
				785	setRequiredRank(2);
				786
				787	INIT_QINFO(MatMul);
				788	}
				789
				790	template <DType Dtype>
				791	OpMatMul<Dtype>::~OpMatMul()
				792	{
				793	if (qinfo)
				794	delete qinfo;
				795	}
				796
				797	template <DType Dtype>
				798	int OpMatMul<Dtype>::checkTensorAttributes()
				799	{
				800	if (validateRequiredOperands())
				801	return 1;
				802
				803	if (validateRequiredRank(inputs[0]) \|\| validateRequiredRank(inputs[1]) \|\| validateRequiredRank(outputs[0]))
				804	{
				805	return 1;
				806	}
				807
				808	a = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
				809	b = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[1]);
				810
				811	if (a->getShape()[1] != b->getShape()[0])
				812	{
				813	printNodeValidationError("OpMatMul operator a.shape[1] should match b.shape[0]");
				814	return 1;
				815	}
				816
				817	c = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
				818
				819	return 0;
				820	}
				821
				822	template <DType Dtype>
				823	int OpMatMul<Dtype>::eval()
				824	{
				825	typedef Eigen::Tensor<int, 1>::DimensionPair DimPair;
				826	Eigen::array<DimPair, 1> dims{ { DimPair(1, 0) } };
				827
				828	TIn a_val = this->a->getTensor();
				829	TIn b_val = this->b->getTensor();
				830	if (this->qinfo)
				831	{
				832	a_val = a_val - (InEigenType)this->qinfo->a_zp();
				833	b_val = b_val - (InEigenType)this->qinfo->b_zp();
				834	}
				835
				836	this->c->getTensor() = a_val.template cast<AccEigenType>().contract(b_val.template cast<AccEigenType>(), dims);
				837
				838	if (AccDtype == DType_INT48)
				839	{
				840	this->c->getTensor() = this->c->getTensor().cwiseMax((AccEigenType)AccQMin);
				841	this->c->getTensor() = this->c->getTensor().cwiseMin((AccEigenType)AccQMax);
				842	}
				843
				844	return GraphNode::eval();
				845	}
				846
				847	template <DType Dtype>
				848	OpMaxPool2d<Dtype>::OpMaxPool2d(TosaAttributeBase* attribute_, TosaQuantInfoBase* qinfo_, uint64_t id_)
				849	: GraphNode(Op_MAX_POOL2D, id_)
				850	{
				851	setRequiredOperands(1, 1);
				852	setRequiredRank(4);
				853
				854	INIT_ATTRIBUTE(Pool2d);
				855	}
				856
				857	template <DType Dtype>
				858	OpMaxPool2d<Dtype>::~OpMaxPool2d()
				859	{
				860	if (attribute)
				861	delete attribute;
				862	}
				863
				864	template <DType Dtype>
				865	int OpMaxPool2d<Dtype>::checkTensorAttributes()
				866	{
				867	if (validateRequiredOperands())
				868	return 1;
				869
				870	if (validateRequiredRank(inputs[0]) \|\| validateRequiredRank(outputs[0]))
				871	{
				872	return 1;
				873	}
				874
				875	if (inputs[0]->matchType(*outputs[0]))
				876	{
				877	printNodeValidationError("OpMaxPool2d: input and output tensor type mismatch");
				878	return 1;
				879	}
				880
				881	in = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
				882	out = dynamic_cast<TosaReference::TensorTemplate<TOut>*>(outputs[0]);
				883
				884	if (!in->hasFormat(Format_NHWC))
				885	{
				886	printNodeValidationError("OpMaxPool2d: unsupported tensor format");
				887	return 1;
				888	}
				889
				890	if (attribute->padding().size() != 4)
				891	{
				892	printNodeValidationError("OpMaxPool2d: illegal size for attribute padding");
				893	return 1;
				894	}
				895
				896	if (attribute->kernel().size() != 2)
				897	{
				898	printNodeValidationError("OpMaxPool2d: illegal size for attribute kernel");
				899	return 1;
				900	}
				901
				902	if (attribute->stride().size() != 2)
				903	{
				904	printNodeValidationError("OpMaxPool2d: illegal size for attribute stride");
				905	return 1;
				906	}
				907
				908	return 0;
				909	}
				910
				911	template <DType Dtype>
				912	int OpMaxPool2d<Dtype>::eval()
				913	{
				914	int in_batch = this->in->getShape()[0];
				915	int in_height = this->in->getShape()[1];
				916	int in_width = this->in->getShape()[2];
				917	int in_channels = this->in->getShape()[3];
				918
				919	int out_batch = this->out->getShape()[0];
				920	int out_height = this->out->getShape()[1];
				921	int out_width = this->out->getShape()[2];
				922	int out_channels = this->out->getShape()[3];
				923
				924	ASSERT_MSG_NODE(in_batch == out_batch, "OpMaxPool2d: tensor batch mismatch %d != %d", in_batch, out_batch);
				925
				926	int padding_top = this->attribute->padding()[0];
				927	int padding_bottom = this->attribute->padding()[1];
				928	int padding_left = this->attribute->padding()[2];
				929	int padding_right = this->attribute->padding()[3];
				930	int kernel_h = this->attribute->kernel()[0];
				931	int kernel_w = this->attribute->kernel()[1];
				932	int stride_h = this->attribute->stride()[0];
				933	int stride_w = this->attribute->stride()[1];
				934
				935	DEBUG_INFO(OP,
				936	"perform MaxPool2d, input.shape=[%d,%d,%d,%d], output.shape=[%d,%d,%d,%d], kernel=[%d,%d], "
				937	"stride=[%d,%d], padding=[%d,%d,%d,%d]",
				938	in_batch, in_height, in_width, in_channels, out_batch, out_height, out_width, out_channels, kernel_h,
				939	kernel_w, stride_h, stride_w, padding_top, padding_bottom, padding_left, padding_right);
				940
				941	Eigen::array<Eigen::Index, 2> im2col_input_dims;
				942	im2col_input_dims[0] = kernel_h * kernel_w;
				943	im2col_input_dims[1] = out_batch * out_height * out_width * out_channels;
				944
				945	Eigen::array<Eigen::Index, 4> col2im_output_dims;
				946	col2im_output_dims[0] = out_batch;
				947	col2im_output_dims[1] = out_height;
				948	col2im_output_dims[2] = out_width;
				949	col2im_output_dims[3] = out_channels;
				950
				951	Eigen::array<std::pair<int32_t, int32_t>, 4> padding;
				952	padding[0] = std::make_pair(0, 0);
				953	padding[1] = std::make_pair(padding_top, padding_bottom);
				954	padding[2] = std::make_pair(padding_left, padding_right);
				955	padding[3] = std::make_pair(0, 0);
				956
				957	ETensor4<InEigenType> input_padded = this->in->getTensor().pad(padding, std::numeric_limits<InEigenType>::lowest());
				958
				959	// extract_image_patches() output [N, KH, KW, H * W, C]
				960	// transpose to [KH, KW, N, H * W, C]
				961	// reshape to [KH * KW, N * H * W * C]
				962	//
				963	// Set the padding value to be the most negative value that can be
				964	// represented by the datatype to ensure that any padding values will be equal
				965	// to or smaller than the actual maximum in the KH x KW patch.
				966	ETensor2<InEigenType> input_extract_patches =
				967	input_padded
				968	.extract_image_patches(kernel_h, kernel_w, stride_h, stride_w, 1, 1, Eigen::PADDING_VALID,
				969	std::numeric_limits<InEigenType>::lowest())
				970	.shuffle(Eigen::array<Eigen::Index, 5>{ 1, 2, 0, 3, 4 })
				971	.reshape(im2col_input_dims);
				972
				973	// Get the maximum of the KHxHW patches along axis 0
				974	Eigen::Tensor<DenseIndex, 1> tensor_argmax = input_extract_patches.argmax(0);
				975
				976	// 1D result with [N * H * W * C]
				977	ETensor1<OutEigenType> out_1d(this->out->getElementCount());
				978
				979	// index input_patches with argmax array should give the result
				980	for (size_t i = 0; i < this->out->getElementCount(); i++)
				981	{
				982	out_1d(i) = (OutEigenType)input_extract_patches(tensor_argmax(i), i);
				983	}
				984
				985	// reshape result to [N, H, W, C]
				986	this->out->getTensor() = out_1d.reshape(col2im_output_dims);
				987
				988	return GraphNode::eval();
				989	}
				990
				991	template <DType InDtype, DType OutDtype>
				992	OpTransposeConv2d<InDtype, OutDtype>::OpTransposeConv2d(TosaAttributeBase* attribute_,
				993	TosaQuantInfoBase* qinfo_,
				994	uint64_t id_)
				995	: GraphNode(Op_TRANSPOSE_CONV2D, id_)
				996	{
				997	setRequiredOperands(3, 1);
				998	setRequiredRank(4);
				999
				1000	INIT_ATTRIBUTE(TransposeConv2d);
				1001	INIT_QINFO(Conv);
				1002	}
				1003
				1004	template <DType InDtype, DType OutDtype>
				1005	OpTransposeConv2d<InDtype, OutDtype>::~OpTransposeConv2d()
				1006	{
				1007	if (attribute)
				1008	delete attribute;
				1009	if (qinfo)
				1010	delete qinfo;
				1011	}
				1012
				1013	template <DType InDtype, DType OutDtype>
				1014	int OpTransposeConv2d<InDtype, OutDtype>::checkTensorAttributes()
				1015	{
				1016	if (validateRequiredOperands())
				1017	return 1;
				1018
				1019	if (validateRequiredRank(inputs[0]) \|\| validateRequiredRank(inputs[1]) \|\| validateRequiredRank(outputs[0]))
				1020	{
				1021	return 1;
				1022	}
				1023
				1024	if (inputs[1]->getIsConst() == 0)
				1025	{
				1026	printNodeValidationError("OpTransposeConv2d: weight tensor is not const typed");
				1027	}
				1028
				1029	input = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
				1030	weight = dynamic_cast<TosaReference::TensorTemplate<TWeight>*>(inputs[1]);
				1031	bias = dynamic_cast<TosaReference::TensorTemplate<TBias>*>(inputs[2]);
				1032	output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
				1033
				1034	if (!input->hasFormat(Format_NHWC))
				1035	{
				1036	printNodeValidationError("OpTransposeConv2d: unsupported input tensor format");
				1037	return 1;
				1038	}
				1039
				1040	if (!weight->hasFormat(Format_OHWI))
				1041	{
				1042	printNodeValidationError("OpTransposeConv2d: unsupported weight tensor format");
				1043	return 1;
				1044	}
				1045
				1046	if (attribute->outpad().size() != 2)
				1047	{
				1048	printNodeValidationError("OpTransposeConv2d: illegal size for attribute outpad");
				1049	return 1;
				1050	}
				1051
				1052	if (attribute->stride().size() != 2)
				1053	{
				1054	printNodeValidationError("OpTransposeConv2d: illegal size for attribute stride");
				1055	return 1;
				1056	}
				1057
				1058	if (attribute->dilation().size() != 2)
				1059	{
				1060	printNodeValidationError("OpTransposeConv2d: illegal size for attribute dilation");
				1061	return 1;
				1062	}
				1063
				1064	if (attribute->output_shape().size() != 4)
				1065	{
				1066	printNodeValidationError("OpTransposeConv2d: illegal size for attribute output_shape");
				1067	return 1;
				1068	}
				1069
				1070	for (int d = 0; d < 4; d++)
				1071	{
				1072	if (attribute->output_shape()[d] != this->output->getShape()[d])
				1073	{
				1074	printNodeValidationError("OpTransposeConv2d: illegal size for attribute output_shape");
				1075	return 1;
				1076	}
				1077	}
				1078
				1079	return 0;
				1080	}
				1081
				1082	template <DType InDtype, DType OutDtype>
				1083	int OpTransposeConv2d<InDtype, OutDtype>::eval()
				1084	{
				1085	int in_batch = this->input->getShape()[0];
				1086	int in_height = this->input->getShape()[1];
				1087	int in_width = this->input->getShape()[2];
				1088	int in_channels = this->input->getShape()[3];
				1089
				1090	int f_out_channels = this->weight->getShape()[0];
				1091	int f_height = this->weight->getShape()[1];
				1092	int f_width = this->weight->getShape()[2];
				1093	int f_in_channels = this->weight->getShape()[3];
				1094
				1095	int b_out_channels = this->bias->getShape()[0];
				1096
				1097	int out_batch = this->output->getShape()[0];
				1098	int out_height = this->output->getShape()[1];
				1099	int out_width = this->output->getShape()[2];
				1100	int out_channels = this->output->getShape()[3];
				1101
				1102	int padding_top = this->attribute->outpad()[0];
				1103	int padding_left = this->attribute->outpad()[1];
				1104	int stride_h = this->attribute->stride()[0];
				1105	int stride_w = this->attribute->stride()[1];
				1106	int dilation_h = this->attribute->dilation()[0];
				1107	int dilation_w = this->attribute->dilation()[1];
				1108
				1109	ASSERT_MSG_NODE(in_batch == out_batch, "OpTransposeConv2d: tensor batch mismatch %d != %d", in_batch, out_batch);
				1110	ASSERT_MSG_NODE(f_in_channels == in_channels, "OpTransposeConv2d: tensor input channel mismatch %d != %d",
				1111	f_in_channels, in_channels);
				1112	ASSERT_MSG_NODE(f_out_channels == out_channels, "OpTransposeConv2d: tensor output channel mismatch %d != %d",
				1113	f_out_channels, out_channels);
				1114	ASSERT_MSG_NODE(b_out_channels == out_channels, "OpDepthwiseConv2d: tensor b_out_channels mismatch %d != %d",
				1115	b_out_channels, out_channels);
				1116
				1117	DEBUG_INFO(OP,
				1118	"perform OpTransposeConv2d, input.shape=[%d,%d,%d,%d], weight.shape=[%d,%d,%d,%d], "
				1119	"output.shape=[%d,%d,%d,%d], stride=[%d,%d], dilation=[%d,%d], padding=[%d,%d]",
				1120	in_batch, in_height, in_width, in_channels, f_height, f_width, f_out_channels, f_in_channels, out_batch,
				1121	out_height, out_width, out_channels, stride_h, stride_w, dilation_h, dilation_w, padding_top,
				1122	padding_left);
				1123
				1124	TIn input_val = this->input->getTensor();
				1125	TWeight weight_val = this->weight->getTensor();
				1126	if (this->qinfo)
				1127	{
				1128	input_val = input_val - (InEigenType)this->qinfo->input_zp();
				1129	weight_val = weight_val - (WeightEigenType)this->qinfo->weight_zp();
				1130	}
				1131
				1132	Eigen::array<Eigen::Index, 4> reshape_dim;
				1133	reshape_dim.fill(1);
				1134	reshape_dim[3] = b_out_channels;
				1135
				1136	Eigen::array<Eigen::Index, 4> bcast;
				1137	bcast[0] = out_batch;
				1138	bcast[1] = out_height;
				1139	bcast[2] = out_width;
				1140	bcast[3] = 1;
				1141
				1142	// initialize with bias
				1143	this->output->getTensor() = this->bias->getTensor().reshape(reshape_dim).broadcast(bcast);
				1144
				1145	int out_x_origin, out_y_origin;
				1146	int out_x, out_y;
				1147
				1148	// reference implementation from: tensorflow/tensorflow/lite/kernels/internal/reference/reference_ops.h
				1149	for (int ob = 0; ob < out_batch; ob++)
				1150	{
				1151	for (int ih = 0; ih < in_height; ih++)
				1152	{
				1153	for (int iw = 0; iw < in_width; iw++)
				1154	{
				1155	out_x_origin = iw * stride_w - padding_left;
				1156	out_y_origin = ih * stride_h - padding_top;
				1157	for (int ic = 0; ic < in_channels; ic++)
				1158	{
				1159	for (int fh = 0; fh < f_height; fh++)
				1160	{
				1161	for (int fw = 0; fw < f_width; fw++)
				1162	{
				1163	out_x = out_x_origin + fw * dilation_w;
				1164	out_y = out_y_origin + fh * dilation_h;
				1165	for (int oc = 0; oc < out_channels; oc++)
				1166	{
				1167	if ((out_x >= 0 && out_x < out_width) && (out_y >= 0 && out_y < out_height))
				1168	{
				1169	this->output->getTensor()(ob, out_y, out_x, oc) +=
				1170	((AccEigenType)input_val(ob, ih, iw, ic) *
				1171	(AccEigenType)weight_val(oc, fh, fw, ic));
				1172	}
				1173	}
				1174	}
				1175	}
				1176	}
				1177	}
				1178	}
				1179	}
				1180
				1181	if (AccDtype == DType_INT48)
				1182	{
				1183	this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
				1184	this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
				1185	}
				1186
				1187	return GraphNode::eval();
				1188	}
				1189
				1190	// Explicit template instantiations for the operator classes in this file.
					// The DEF_INSTANTIATE_* macros are defined elsewhere; presumably each one
					// expands to explicit instantiation(s) of the named operator for the listed
					// data type(s) (and, for the RANK1_6 variant, for several ranks) -- confirm
					// against the macro definitions.
				1191	DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, FLOAT);
				1192	DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, AINT8);
				1193	DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, INT16);
				1194
					// NOTE(review): unlike every other group, these three invocations carry no
					// trailing ';' -- harmless only if the macro supplies its own terminator.
				1195	DEF_INSTANTIATE_ONE_TYPE(OpAvgPool2d, FLOAT)
				1196	DEF_INSTANTIATE_ONE_TYPE(OpAvgPool2d, AINT8)
				1197	DEF_INSTANTIATE_ONE_TYPE(OpAvgPool2d, INT16)
				1198
					// Two-type operators: the convolution-style operators and OpFullyConnected
					// below all use the same five type pairs.
				1199	DEF_INSTANTIATE_TWO_TYPE(OpConv2d, FLOAT, FLOAT);
				1200	DEF_INSTANTIATE_TWO_TYPE(OpConv2d, AINT8, INT4);
				1201	DEF_INSTANTIATE_TWO_TYPE(OpConv2d, AINT8, INT8);
				1202	DEF_INSTANTIATE_TWO_TYPE(OpConv2d, AINT8, AINT8);
				1203	DEF_INSTANTIATE_TWO_TYPE(OpConv2d, INT16, INT8);
				1204
				1205	DEF_INSTANTIATE_TWO_TYPE(OpDepthwiseConv2d, FLOAT, FLOAT);
				1206	DEF_INSTANTIATE_TWO_TYPE(OpDepthwiseConv2d, AINT8, INT4);
				1207	DEF_INSTANTIATE_TWO_TYPE(OpDepthwiseConv2d, AINT8, INT8);
				1208	DEF_INSTANTIATE_TWO_TYPE(OpDepthwiseConv2d, AINT8, AINT8);
				1209	DEF_INSTANTIATE_TWO_TYPE(OpDepthwiseConv2d, INT16, INT8);
				1210
				1211	DEF_INSTANTIATE_TWO_TYPE(OpFullyConnected, FLOAT, FLOAT);
				1212	DEF_INSTANTIATE_TWO_TYPE(OpFullyConnected, AINT8, INT4);
				1213	DEF_INSTANTIATE_TWO_TYPE(OpFullyConnected, AINT8, INT8);
				1214	DEF_INSTANTIATE_TWO_TYPE(OpFullyConnected, AINT8, AINT8);
				1215	DEF_INSTANTIATE_TWO_TYPE(OpFullyConnected, INT16, INT8);
				1216
				1217	DEF_INSTANTIATE_ONE_TYPE(OpMatMul, AINT8);
				1218	DEF_INSTANTIATE_ONE_TYPE(OpMatMul, INT16);
				1219	DEF_INSTANTIATE_ONE_TYPE(OpMatMul, FLOAT);
				1220
				1221	DEF_INSTANTIATE_ONE_TYPE(OpMaxPool2d, FLOAT);
				1222	DEF_INSTANTIATE_ONE_TYPE(OpMaxPool2d, AINT8);
				1223	DEF_INSTANTIATE_ONE_TYPE(OpMaxPool2d, INT16);
				1224
				1225	DEF_INSTANTIATE_TWO_TYPE(OpTransposeConv2d, FLOAT, FLOAT);
				1226	DEF_INSTANTIATE_TWO_TYPE(OpTransposeConv2d, AINT8, INT4);
				1227	DEF_INSTANTIATE_TWO_TYPE(OpTransposeConv2d, AINT8, INT8);
				1228	DEF_INSTANTIATE_TWO_TYPE(OpTransposeConv2d, AINT8, AINT8);
				1229	DEF_INSTANTIATE_TWO_TYPE(OpTransposeConv2d, INT16, INT8);