/*
* Copyright (c) 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "src/cl/helpers/CLMemoryOpBufferHelper.h"
#include "ckw/Error.h"
#include "ckw/TensorSampler.h"
#include "ckw/types/MemoryOperation.h"
#include "ckw/types/TensorStorageType.h"
#include "src/cl/CLHelpers.h"
#include "src/cl/CLKernelWriter.h"
#include "src/cl/CLTensorArgument.h"
#include "src/cl/CLTile.h"
#include "src/ITensor.h"
#include "src/Tensor3dMapper.h"
#include "src/TileView.h"
namespace ckw
{
bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer,
const ITensor *tensor,
const TensorSampler *sampler,
const Tensor3dMapper *mapper,
MemoryOperation op,
const TileView<CLTile> &dst)
{
CKW_UNUSED(writer, tensor, mapper, op, dst);
    // This helper only supports plain buffer storage (uint8_t pointer).
    return sampler->storage() == TensorStorageType::BufferUint8Ptr;
}
/** Initialization and finalization logic
 *
 * What the if/else blocks emitted for each dimension mean and how they're constructed:
 * - x: the if branch performs the full-width load/store, the else branch the partial (leftover) one
 * - y: the else branch performs no load/store (ClampToBorderMaxOnly writes the border value instead)
 * - z: the else branch performs no load/store
* if(x)
* {
* if(z)
* {
* if(y)
* {
* // full load/store width
* }
* else
* {
* // no load/store
* }
* }
* else
* {
* // no load/store
* }
* }
* else
* {
* if(z)
* {
* if(y)
* {
* // partial load/store width
* }
* else
* {
* // no load/store
* }
* }
* else
* {
* // no load/store
* }
* }
*
 * In general, initialize() writes the if conditions and finalize() writes the else conditions.
 * The outermost block covers x, then z, then y. This is why the if/else blocks covering y are
 * initialized and finalized at each row write. In some addressing modes, such as None, no
 * if/else conditions are written at all.
 */
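/* As an illustrative sketch (not verbatim output), a load with OverlappingMin on x and
 * ClampToBorderMaxOnly on y could expand to OpenCL C along these lines, where tile_0,
 * x, y0, H, ptr and the vector width are hypothetical names and values:
 *
 *   if(x > 0)                // full-width path, opened by initialize()
 *   {
 *       if(y0 < H)
 *       {
 *           tile_0 = vload8(0, (__global float*)(ptr + ...));
 *       }
 *       else
 *       {
 *           tile_0 = 0.0f;
 *       }
 *   }
 *   else                     // leftover path, emitted by finalize()
 *   {
 *       // the same y guard around one or more partial-width vloads
 *   }
 */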
void CLMemoryOpBufferHelper::initialize(const CLTile *x, const CLTile *z, const CLTile *b)
{
CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, _dst));
_coord_x = x->scalar(0, 0).str;
_coord_z = z->scalar(0, 0).str;
_coord_b = b->scalar(0, 0).str;
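    // Keep the original Z coordinate: out_of_bound_finalize_x() re-opens the Z bounds
    // check inside the leftover (else) branch and needs the unmodified value.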
_coord_orig_z = _coord_z;
out_of_bound_initialize_x(_coord_x);
out_of_bound_initialize_z(_coord_z);
}
void CLMemoryOpBufferHelper::write_row(int32_t row_id, const std::string &coord_y)
{
// The only check required is on Y.
out_of_bound_initialize_y(coord_y);
const std::string dst = _dst.vector(row_id).str;
const std::string address = to_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
const std::string ls_buf = to_statement(_op, _ls_width_full, dst, address);
_writer->op_write_raw_code(ls_buf);
_writer->op_write_raw_code(";\n");
out_of_bound_finalize_y(dst);
    // The leftover (partial-width) load/store statements are only recorded here;
    // they are emitted in the finalize stage.
    if (!_ls_width_part.empty())
{
int32_t col_start = 0;
const TileArea original_area = _dst.area();
for (int32_t partial_width : _ls_width_part)
{
// Set the active area
const TileArea area(original_area.row_start(), original_area.row_end(), col_start,
col_start + partial_width);
_dst.area(area);
const std::string dst = _dst.vector(row_id).str;
const std::string coord_x = _coord_x + " + " + std::to_string(col_start);
const std::string address = to_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
const std::string statement = to_statement(_op, partial_width, dst, address);
_leftovers_x.emplace_back(dst, coord_y, statement);
col_start += partial_width;
}
// Restore the original area
_dst.area(original_area);
}
}
void CLMemoryOpBufferHelper::finalize()
{
out_of_bound_finalize_z();
out_of_bound_finalize_x();
}
void CLMemoryOpBufferHelper::out_of_bound_initialize_x(const std::string &coord)
{
if (_sampler->address_mode_x() == TensorSamplerAddressModeX::OverlappingMin)
{
TensorInfo tensor_info = _tensor->info();
TensorShape shape = tensor_info.shape();
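        // Decompose the leftover width (tensor width modulo the full load/store width)
        // into valid CL vector widths; for instance, a remainder of 7 could be split
        // into partial widths such as {4, 3} (the exact split is up to
        // cl_decompose_vector_width).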
_ls_width_part = cl_decompose_vector_width(shape[0] % _ls_width_full);
        if (!_ls_width_part.empty())
{
_writer->op_write_raw_code("if(" + coord + " > 0)\n{\n");
}
}
}
void CLMemoryOpBufferHelper::out_of_bound_finalize_x()
{
if (_sampler->address_mode_x() == TensorSamplerAddressModeX::OverlappingMin)
{
        if (!_ls_width_part.empty())
{
_writer->op_write_raw_code("}\nelse\n{\n");
out_of_bound_initialize_z(_coord_orig_z);
            for (const LeftoverDescriptor &leftover_desc : _leftovers_x)
{
out_of_bound_initialize_y(leftover_desc.coord);
_writer->op_write_raw_code(leftover_desc.statement);
_writer->op_write_raw_code(";\n");
out_of_bound_finalize_y(leftover_desc.dst);
}
out_of_bound_finalize_z();
_writer->op_write_raw_code("}\n");
}
}
}
void CLMemoryOpBufferHelper::out_of_bound_initialize_y(const std::string &coord)
{
    std::string max;
const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y();
switch (address_mode_y)
{
        case TensorSamplerAddressModeY::ClampToBorderMaxOnly:
            // Do not hoist this out of the case: querying dim_y() marks the relevant
            // tensor component as used even when the variable itself ends up unused.
            max = _mapper->dim_y().str;
_writer->op_write_raw_code("if(" + coord + " < " + max + ")\n{\n");
break;
case TensorSamplerAddressModeY::SkipLessThanZero:
_writer->op_write_raw_code("if(" + coord + " >= 0)\n{\n");
break;
case TensorSamplerAddressModeY::None:
break;
default:
CKW_THROW_MSG("Unsupported address mode for Y dimension");
}
}
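/* For example, with SkipLessThanZero the pair out_of_bound_initialize_y() /
 * out_of_bound_finalize_y() brackets each row access as:
 *   if(y0 >= 0)
 *   {
 *       <load/store statement>;
 *   }
 * where y0 stands for the row's Y coordinate expression.
 */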
void CLMemoryOpBufferHelper::out_of_bound_finalize_y(const std::string &dst)
{
const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y();
switch (address_mode_y)
{
case TensorSamplerAddressModeY::ClampToBorderMaxOnly:
_writer->op_write_raw_code("}\nelse\n{\n");
_writer->op_write_raw_code(dst);
_writer->op_write_raw_code(" = 0.0f;\n}\n");
break;
case TensorSamplerAddressModeY::SkipLessThanZero:
_writer->op_write_raw_code("}\n");
break;
case TensorSamplerAddressModeY::None:
break;
default:
CKW_THROW_MSG("Unsupported address mode for Y dimension");
}
}
void CLMemoryOpBufferHelper::out_of_bound_initialize_z(const std::string &coord)
{
CKW_UNUSED(coord);
const TensorSamplerAddressModeZ address_mode_z = _sampler->address_mode_z();
switch (address_mode_z)
{
case TensorSamplerAddressModeZ::None:
break;
default:
CKW_THROW_MSG("Unsupported address mode for Z dimension");
}
}
void CLMemoryOpBufferHelper::out_of_bound_finalize_z()
{
const TensorSamplerAddressModeZ address_mode_z = _sampler->address_mode_z();
switch (address_mode_z)
{
case TensorSamplerAddressModeZ::None:
break;
default:
CKW_THROW_MSG("Unsupported address mode for Z dimension");
}
}
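/** Build a single load/store statement (without the trailing semicolon).
 *
 * For illustration, assuming a destination tile variable tile_0 and an address
 * expression addr (both hypothetical names):
 *   to_statement(MemoryOperation::Load, 4, "tile_0", addr)  yields "tile_0 = vload4(0, <addr>)"
 *   to_statement(MemoryOperation::Store, 1, "tile_0", addr) yields "*(<addr>) = tile_0"
 */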
std::string CLMemoryOpBufferHelper::to_statement(MemoryOperation op,
int32_t vector_width,
const std::string &data,
const std::string &address) const
{
switch (op)
{
case MemoryOperation::Load:
if (vector_width != 1)
{
return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
}
else
{
return data + " = *(" + address + ")";
}
case MemoryOperation::Store:
if (vector_width != 1)
{
return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
}
else
{
return "*(" + address + ") = " + data;
}
default:
CKW_THROW_MSG("Unsupported MemoryOperation");
}
return "";
}
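/** Build the buffer address expression for the given coordinate strings.
 *
 * As a sketch, for a float tensor with hypothetical names t_ptr and t_stride_y, and
 * coordinates x = "4", y = "y0", z = "0", b = "0", the result would look like:
 *   (__global float*)(t_ptr + (4) * sizeof(float) + (y0) * t_stride_y)
 * A term is omitted when its coordinate is the literal "0" (and, for x/z/batch, when
 * the corresponding dimension is 1).
 */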
std::string CLMemoryOpBufferHelper::to_buffer_address(const std::string &x,
const std::string &y,
const std::string &z,
const std::string &b) const
{
    const TensorStorageType tensor_storage = _sampler->storage();
CKW_ASSERT(tensor_storage == TensorStorageType::BufferUint8Ptr);
const std::string ptr_buf = _tensor->storage(tensor_storage).val;
const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst.data_type(), 1);
std::string address;
address += "(__global ";
address += dst_type;
address += "*)(";
address += ptr_buf;
if (x != "0" && (_mapper->dim_x().str != "1"))
{
address += " + (";
address += x + ") * sizeof(" + dst_type + ")";
}
if (y != "0")
{
const std::string stride_y = _mapper->stride_y().str;
address += " + (";
address += y + ")";
address += " * ";
address += stride_y;
}
if (z != "0" && (_mapper->dim_z().str != "1"))
{
const std::string stride_z = _mapper->stride_z().str;
address += " + (";
address += z + ")";
address += " * ";
address += stride_z;
}
if (b != "0" && (_mapper->dim_batch().str != "1"))
{
const std::string stride_b = _mapper->stride_batch().str;
address += " + (";
address += b + ")";
address += " * ";
address += stride_b;
}
address += ")";
return address;
}
} // namespace ckw