//
// This confidential and proprietary software may be used only as
// authorised by a licensing agreement from ARM Limited
// (C) COPYRIGHT 2020-2021 ARM Limited
// ALL RIGHTS RESERVED
// The entire notice above must be reproduced on all authorised
// copies and copies may only be made to the extent permitted
// by a licensing agreement from ARM Limited.
=== Elementwise Binary Operators
==== ADD
Elementwise addition of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = apply_add<in_t>(value1, value2);
    tensor_write<in_t>(output, shape, index, result);
}
----
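The apply_broadcast helper used above is one of the specification's pseudocode helpers. As a non-normative sketch of what the broadcasting rule implies (an input axis of size 1 always reads element 0; the signature and array representation here are illustrative assumptions):
[source,c++]
----
// Non-normative sketch: map an output index to an input index,
// collapsing any input axis of size 1 onto position 0.
// Assumes shapes and indices are integer arrays of length rank.
void apply_broadcast_sketch(int rank, const int* in_shape,
                            const int* out_index, int* in_index) {
    for (int axis = 0; axis < rank; axis++) {
        // A size-1 input axis is broadcast: always read element 0
        in_index[axis] = (in_shape[axis] == 1) ? 0 : out_index[axis];
    }
}
----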
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 32|int32_t
|MI, MT|floating-point|float_t
|===
==== ARITHMETIC_RIGHT_SHIFT
Elementwise arithmetic right shift of input1 by the amount specified in input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Input|bool_t|round|-|If true then the shift is rounded
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    // Ensure that the shift amount is appropriate for the data type
    REQUIRE((in_t == int32_t && 0 <= value2 && value2 <= 31) ||
            (in_t == int16_t && 0 <= value2 && value2 <= 15) ||
            (in_t == int8_t  && 0 <= value2 && value2 <=  7));
    in_t result = value1 >> value2;
    // With rounding enabled, add one if the most significant shifted-out bit was set
    if (round == true && value2 > 0 && ((value1 >> (value2 - 1)) & 1) != 0) {
        result = result + 1;
    }
    result = apply_clip<in_t>(result, minimum<in_t>, maximum<in_t>);
    tensor_write<in_t>(output, shape, index, result);
}
----
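As a worked illustration of the rounding rule (not part of the specification), consider value1 = -7, value2 = 1, round = true:
[source,c++]
----
#include <cassert>
#include <cstdint>

int main() {
    int32_t value1 = -7, value2 = 1;
    // Arithmetic shift of a negative value sign-fills: -7 >> 1 == -4
    // (two's-complement behaviour, guaranteed by C++ since C++20)
    int32_t result = value1 >> value2;
    if (value2 > 0 && ((value1 >> (value2 - 1)) & 1) != 0) {
        result = result + 1;  // the shifted-out bit was 1
    }
    assert(result == -3);  // -7/2 = -3.5, rounded up to -3
    return 0;
}
----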
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 8|int8_t
|Any|signed 16|int16_t
|Any|signed 32|int32_t
|===
==== BITWISE_AND
Elementwise bitwise AND of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = value1 & value2;
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 8|int8_t
|Any|signed 16|int16_t
|Any|signed 32|int32_t
|===
==== BITWISE_OR
Elementwise bitwise OR of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = value1 | value2;
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 8|int8_t
|Any|signed 16|int16_t
|Any|signed 32|int32_t
|===
==== BITWISE_XOR
Elementwise bitwise XOR of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = value1 ^ value2;
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 8|int8_t
|Any|signed 16|int16_t
|Any|signed 32|int32_t
|===
==== INTDIV
Elementwise integer divide of input1 by input2.
The result of the divide is truncated towards zero.
Expected use is for operations on non-scaled integers.
Floating-point divide should use RECIPROCAL and MUL.
Quantized integer divide should use TABLE (for 1/x) and MUL.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    REQUIRE(value2 != 0);
    // This catches the case where we divide minimum<in_t> by -1,
    // which is not representable in two's complement
    REQUIRE((int64_t)value1 / value2 <= maximum<in_t>);
    in_t result = value1 / value2;
    tensor_write<in_t>(output, shape, index, result);
}
----
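The second REQUIRE exists because the single overflowing case, minimum<int32_t> divided by -1, yields +2^31, which int32_t cannot represent. A small standalone check (not part of the specification):
[source,c++]
----
#include <cstdint>
#include <iostream>

int main() {
    int32_t value1 = INT32_MIN;  // -2147483648
    int32_t value2 = -1;
    // Widen before dividing, exactly as the REQUIRE above does
    int64_t quotient = (int64_t)value1 / value2;  // +2147483648
    // The check fails, so a conforming implementation rejects this input
    std::cout << (quotient <= INT32_MAX ? "allowed" : "rejected") << "\n";
    return 0;
}
----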
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 32|int32_t
|===
==== LOGICAL_AND
Elementwise logical AND of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = value1 && value2;
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|Bool|bool_t
|===
==== LOGICAL_LEFT_SHIFT
Elementwise logical left shift of input1 by the amount specified in input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    REQUIRE(0 <= value2 && value2 <= 31);
    in_t result = value1 << value2;
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 8|int8_t
|Any|signed 16|int16_t
|Any|signed 32|int32_t
|===
==== LOGICAL_RIGHT_SHIFT
Elementwise logical right shift of input1 by the amount specified in input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    REQUIRE(0 <= value2 && value2 <= 31);
    // Cast to the unsigned type of the same width so the shift zero-fills
    in_t result = (in_t)((unsigned in_t)value1 >> value2);
    tensor_write<in_t>(output, shape, index, result);
}
----
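The cast through the unsigned type of the same width is what makes the shift zero-fill rather than sign-fill. A small int8_t illustration (not part of the specification):
[source,c++]
----
#include <cassert>
#include <cstdint>

int main() {
    int8_t value1 = -128;  // bit pattern 0x80
    int8_t value2 = 7;
    // Logical shift: zero-fill from the left via the unsigned type
    int8_t logical = (int8_t)((uint8_t)value1 >> value2);  // 0x01 == 1
    // Arithmetic shift for contrast: sign-fill from the left
    // (two's-complement behaviour, guaranteed by C++ since C++20)
    int8_t arithmetic = (int8_t)(value1 >> value2);        // -1
    assert(logical == 1 && arithmetic == -1);
    return 0;
}
----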
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 8|int8_t
|Any|signed 16|int16_t
|Any|signed 32|int32_t
|===
==== LOGICAL_OR
Elementwise logical OR of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = value1 || value2;
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|Bool|bool_t
|===
==== LOGICAL_XOR
Elementwise logical XOR of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = value1 != value2;
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|Bool|bool_t
|===
==== MAXIMUM
Elementwise max of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = apply_max(value1, value2);
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 32|int32_t
|MI, MT|floating-point|float_t
|===
==== MINIMUM
Elementwise minimum of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = apply_min(value1, value2);
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 32|int32_t
|MI, MT|floating-point|float_t
|===
==== MUL
Elementwise multiplication (Hadamard product) of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Input (MT profile) Attribute (BI/MI profiles)|uint6_t|shift|-|Result right shift (int32_t data type only)
|Output|out_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    out_t result;
    if (in_t == int32_t && shift > 0) {
        result = apply_scale_32(value1, value2, shift);
    } else {
        result = value1 * value2;  // low 32 bits of result for int32_t
    }
    tensor_write<out_t>(output, shape, index, result);
}
----
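apply_scale_32 is defined normatively elsewhere in the specification. The following non-normative sketch shows the general shape of the rounded right-shift scaling consistent with its use here; the function name is illustrative and the normative version adds further checks and an optional double round:
[source,c++]
----
#include <cstdint>

// Non-normative sketch only; see the specification's scaling helpers
// for the authoritative definition of apply_scale_32.
int32_t apply_scale_32_sketch(int32_t value1, int32_t value2, uint32_t shift) {
    // Keep the full 32x32 product by multiplying in 64 bits...
    int64_t product = (int64_t)value1 * (int64_t)value2;
    // ...then round to nearest on the bits shifted out (shift > 0 here)
    int64_t round = (int64_t)1 << (shift - 1);
    return (int32_t)((product + round) >> shift);
}
----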
*Supported Data Types:*
|===
|Profile|Mode|in_t|out_t
|Any|signed 8|int8_t|int32_t
|Any|signed 16|int16_t|int32_t
|Any|signed 32|int32_t|int32_t
|MI, MT|floating-point|float_t|float_t
|===
==== POW
Elementwise input1 value raised to the power of input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor of rank 1 to 4
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = apply_pow<in_t>(value1, value2);
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|MI, MT|floating-point|float_t
|===
==== SUB
Elementwise subtraction of input1 and input2.
Axes of size 1 will be broadcast as necessary. Rank of input tensors must match.
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
*Operation Function:*
[source,c++]
----
for_each(index in shape) {
    index1 = apply_broadcast(shape, shape1, index);
    index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
    in_t result = apply_sub<in_t>(value1, value2);
    tensor_write<in_t>(output, shape, index, result);
}
----
*Supported Data Types:*
|===
|Profile|Mode|in_t
|Any|signed 32|int32_t
|MI, MT|floating-point|float_t
|===
==== TABLE
Table lookup operation.
For the int8_t TABLE operation, perform a 256-entry table lookup returning an int8_t value.
For int16_t tables, the int16_t input is treated as a fixed-point 9.7 value.
The most significant 9 bits are used to index into the table.
The fractional 7 bits are used to interpolate based on table[index] and table[index+1].
For int16_t inputs, the TABLE operator returns a 16.7 interpolated value in an int32_t.
This value can then be input to the RESCALE operator to scale it to the required output data type.
Note that the int16_t table has 513 values to handle table[index+1] when index=511.
An int16_t to int16_t table lookup can be constructed in TOSA as follows:
* Use the TABLE operator to produce a fixed-point 16.7 interpolated result
* Use RESCALE (in_t=int32_t, out_t=int16_t, scale=1<<14, shift=21) to scale the output to int16_t range (or an alternate scale as required)
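In this construction, scale=1<<14 with shift=21 multiplies by 1<<14 and then shifts right by 21, a net division by 1<<7 that removes the seven fractional bits of the 16.7 value.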
*Arguments:*
|===
|Argument|Type|Name|Shape|Description
|Input|in_t*|input|shape|Input tensor
|Input (MT profile) Attribute (BI/MI profiles)|table_t*|table|[TABLE_SIZE]|Lookup table tensor
|Output|out_t*|output|shape|Output tensor
|===
*Operation Function:*
[source,c++]
----
REQUIRE(length(table) == TABLE_SIZE);
for_each(index in shape) {
    in_t value = tensor_read<in_t>(input, shape, index);
    out_t result;
    if (in_t == int8_t) {
        // value is a signed int, convert to a zero-based index
        result = table[value + 128];
    } else {
        result = apply_lookup(table, value);
    }
    tensor_write<out_t>(output, shape, index, result);
}
----
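apply_lookup is defined normatively elsewhere in the specification. A non-normative sketch for the int16_t case, following the 9.7 fixed-point description above (the function name is illustrative):
[source,c++]
----
#include <cstdint>

// Non-normative sketch of the int16_t lookup described above.
int32_t apply_lookup_sketch(const int16_t table[513], int16_t value) {
    // Shift the signed input to an unsigned position, then split it into
    // a 9-bit table index and a 7-bit interpolation fraction
    int32_t position = (int32_t)value + 32768;  // 0 .. 65535
    int32_t index    = position >> 7;           // 0 .. 511
    int32_t fraction = position & 0x7F;         // 0 .. 127
    int32_t base = table[index];
    int32_t next = table[index + 1];            // uses the 513th entry
    // Linear interpolation producing a 16.7 fixed-point result
    return (base << 7) + (next - base) * fraction;
}
----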
*Supported Data Types:*
|===
|Profile|Mode|in_t|table_t|TABLE_SIZE|out_t
|Any|signed 8|int8_t|int8_t|256|int8_t
|Any|signed 16|int16_t|int16_t|513|int32_t
|===