| /* |
| * Copyright (c) 2022 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| /// @example dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp |
| /// @copybrief example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add Dynamic Fusion Example: Conv2d + Elementwise Addition (OpenCL target) |
| /// This example demonstrates how to fuse a Conv2d with an elementwise Addition using the new OperatorGraph API, and how to run the fused operator with the Async Composite Operator ClCompositeOperator |
| |
| #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION |
| #ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */ |
| #error "This example needs to be built with -DARM_COMPUTE_CL" |
| #endif /* ARM_COMPUTE_CL */ |
| |
| #include "arm_compute/core/CL/CLKernelLibrary.h" |
| #include "arm_compute/core/Types.h" |
| #include "arm_compute/core/experimental/ClWorkload.h" |
| #include "arm_compute/core/experimental/OperatorGraph.h" |
| #include "arm_compute/runtime/CL/CLScheduler.h" |
| #include "arm_compute/runtime/CL/CLTensor.h" |
| #include "arm_compute/runtime/CL/CLTuner.h" |
| #include "arm_compute/runtime/experimental/ClCompositeOperator.h" |
| |
| #include "arm_compute/core/utils/misc/ShapeCalculator.h" |
| #include "utils/TypePrinter.h" |
| |
| #include "utils/Utils.h" |
| |
| #include <cstdlib> |
| |
| using namespace arm_compute; |
| using namespace utils; |
| using namespace arm_compute::experimental::dynamic_fusion; |
| |
| #define TICK(clock_name) \ |
| auto clock_name##_tick = std::chrono::high_resolution_clock::now(); |
| #define TOCK(clock_name, measurement_map) \ |
| auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ |
| measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick); |
| #define TOCK_AVG(clock_name, measurement_map, num_iterations) \ |
| auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ |
| measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations)); |
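| // Simple timing helpers: TICK(name) records a start timestamp; TOCK(name, map) records the elapsed |
| // time since the matching TICK, in microseconds, under the quoted clock name in the given map; |
| // TOCK_AVG additionally divides the elapsed time by the given number of iterations. |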
| |
| using std::chrono::duration_cast; |
| using std::chrono::microseconds; |
| |
| class ClFusedConv2dEltwiseAddExample : public Example |
| { |
| public: |
| bool do_setup(int argc, char **argv) override |
| { |
| size_t ih; |
| size_t iw; |
| size_t ifm; |
| size_t wh; |
| size_t ww; |
| size_t ofm; |
| size_t tuner_choice; |
| unsigned int pad_x; |
| unsigned int pad_y; |
| if(argc < 10) |
| { |
| // Print help |
| std::cout << "Usage: ./cl_fused_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n"; |
| std::cout << "Too few or no input_matrices provided. Using shape config = SRGAN_0, tuner_choice=2\n\n"; |
| ih = 512; |
| iw = 512; |
| ifm = 64; |
| wh = 1; |
| ww = 1; |
| ofm = 3; |
| tuner_choice = 2; |
| pad_x = 0; |
| pad_y = 0; |
| } |
| else |
| { |
| ih = strtol(argv[1], nullptr, 10); |
| iw = strtol(argv[2], nullptr, 10); |
| ifm = strtol(argv[3], nullptr, 10); |
| wh = strtol(argv[4], nullptr, 10); |
| ww = strtol(argv[5], nullptr, 10); |
| ofm = strtol(argv[6], nullptr, 10); |
| tuner_choice = strtol(argv[7], nullptr, 10); |
| pad_x = strtol(argv[8], nullptr, 10); |
| pad_y = strtol(argv[9], nullptr, 10); |
| } |
| |
| CLTuner *tuner_to_use; |
| switch(tuner_choice) |
| { |
| case 0: |
| { |
| tuner_to_use = nullptr; |
| break; |
| } |
| case 1: |
| { |
| tuner.set_tuner_mode(CLTunerMode::RAPID); |
| tuner_to_use = &tuner; |
| break; |
| } |
| case 3: |
| { |
| tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE); |
| tuner_to_use = &tuner; |
| break; |
| } |
| case 2: |
| default: |
| { |
| tuner.set_tuner_mode(CLTunerMode::NORMAL); |
| tuner_to_use = &tuner; |
| break; |
| } |
| } |
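| // default_init() initializes the CLScheduler with a default OpenCL context and command queue and attaches the selected tuner (if any) |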
| CLScheduler::get().default_init(tuner_to_use); |
| |
| TICK(startup_time); |
| TICK(configure); |
| /* Computation: |
| * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias)) |
| */ |
| const auto data_type = DataType::F32; |
| const auto data_layout = DataLayout::NHWC; |
| |
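| // Note: TensorShape lists dimensions starting from the fastest-changing one, so with the NHWC |
| // data layout the input shape below reads as (C, W, H) and the weight shape as (C, W, H, OFM) |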
| const auto t_input_shape = TensorShape(ifm, iw, ih); |
| const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm); |
| const auto t_bias_shape = TensorShape(ofm); |
| const auto t_l1_addend_shape = TensorShape(ofm, iw); |
| |
| std::cout << "input_shape: " << t_input_shape << std::endl; |
| std::cout << "weight_shape: " << t_weight_shape << std::endl; |
| std::cout << "bias_shape: " << t_bias_shape << std::endl; |
| std::cout << "addend_shape: " << t_l1_addend_shape << std::endl; |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// @section describe_workload_using_operator_graph Describe the workload to run using OperatorGraph |
| /// OperatorGraph is a graph of Tensors and Operators. Let's first default-construct it |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct OperatorGraph |
| // [Construct OperatorGraph] |
| OperatorGraph op_graph; |
| // [Construct OperatorGraph] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// @subsection add_conv2d Add the first operator (root operator) Conv2d |
| /// The first operator to be added to the graph is called the "root operator" of the entire graph. |
| /// @note As of now, operators need to be inserted according to their dependency order. This is because output tensor auto-initialization occurs during construction time. |
| /// Later this might be changed to allow out-of-order insertion. |
| |
| /// Before we insert the operator, we need to initialize the required TensorInfo objects. |
| /// We can choose not to initialize an output TensorInfo; if so, it will be auto-initialized during the construction of the OperatorGraph |
| /// The "t_acc_info" is the TensorInfo of the accumulator tensor, which is the output tensor of our first operator conv2d |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize Conv2d TensorInfo |
| // [Initialize Conv2d TensorInfo] |
| auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); |
| auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); |
| auto t_bias_info = TensorInfo(t_bias_shape, 1, data_type, data_layout); |
| auto t_acc_info = TensorInfo(); |
| // [Initialize Conv2d TensorInfo] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// Next we create the OpTensor s in the op_graph and associate them with the TensorInfo objects. |
| /// @note The associated TensorInfo objects must be in scope and remain valid until the ClWorkload building is completed |
| |
| /// @note The associated TensorInfo objects must be declared as non-const, since they may be updated during the OperatorGraph construction |
| |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add OpTensors |
| // [Add OpTensors] |
| const auto op_t_input = add_tensor(op_graph, t_input_info); |
| const auto op_t_weight = add_tensor(op_graph, t_weight_info); |
| const auto op_t_bias = add_tensor(op_graph, t_bias_info); |
| const auto op_t_acc = add_tensor(op_graph, t_acc_info); |
| // [Add OpTensors] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// Finally we add the Conv2d operator to op_graph. The Conv2dDescriptor contains all the TOSA-compliant attribute parameters |
| /// The add_op... group of functions accepts the OpTensors created by the add_tensor function, and returns an Operator handle. |
| /// This handle can be used to further query and modify the operator inside the OperatorGraph after its creation |
| /// For example, here we use the handle to force the ConvolutionMethod to be Direct Convolution |
| /// @note The force_conv2d_method is only for debug purposes for now, as the end user is not expected to decide on the ConvolutionMethod |
| |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Conv2d Operator |
| // [Add Conv2d Operator] |
| Conv2dDescriptor conv2d_desc{ Padding2D{ pad_x, pad_x, pad_y, pad_y } }; |
| auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_bias, op_t_acc); |
| force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); // Only for debug purposes |
| // [Add Conv2d Operator] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// @subsection add_elementwise_add Add the second operator Elementwise Add |
| /// This is similar to adding the first operator to op_graph, except that we link the two operators together by their common tensor, |
| /// namely the accumulator tensor op_t_acc, which is the output of conv2d and the input (lhs) of the addition |
| /// @note At the moment, it is recommended to always declare a separate TensorInfo (even if empty) for each OpTensor. |
| /// For example, here op_t_dst could have been associated with the same TensorInfo as op_t_acc, since they are identical, |
| /// but we still recommend creating a separate object. |
| |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Elementwise Add Operator |
| // [Add Elementwise Add Operator] |
| auto t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout); |
| auto t_dst_info = TensorInfo(); |
| const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info); |
| const auto op_t_dst = add_tensor(op_graph, t_dst_info); |
| ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD }; |
| add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst); |
| // [Add Elementwise Add Operator] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// @section build_clworkload Build ClWorkload |
| /// ClWorkload is an intermediate object which contains all the built kernel codes and all other descriptors on how to schedule them |
| /// We build ClWorkload from the op_graph object that we just described |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Build ClWorkload |
| // [Build ClWorkload] |
| const ClWorkloadContext workload_ctx |
| { |
| GpuInfo{ CLScheduler::get().target() } |
| }; |
| ClWorkload workload; |
| build(workload, op_graph, workload_ctx); |
| // [Build ClWorkload] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// @section run_fused_op_with_clcompositeoperator Run the fused operator workload with ClCompositeOperator |
| /// @subsection configure_and_validate_clcompositeoperator Validate ClWorkload and Configure ClCompositeOperator |
| /// After the ClWorkload is built, we configure the Compute Library runtime ClCompositeOperator with it in order to run the workload. |
| /// Optionally we can explicitly validate the workload to check whether it has been built successfully. |
| /// The validation is also run automatically inside configure and throws an error if it fails. |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClCompositeOperator |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Validate and configure ClCompositeOperator |
| // [Validate and configure ClCompositeOperator] |
| const auto success = ClCompositeOperator::validate(workload); // Optional |
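| // A minimal sketch of an explicit check: the Status returned by validate() converts to bool and |
| // carries an error description, so the optional validation result can be inspected before configure |
| if(!success) |
| { |
| std::cout << "ClWorkload validation failed: " << success.error_description() << std::endl; |
| return false; |
| } |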
| op.configure(CLKernelLibrary::get().get_compile_context(), workload); |
| // [Validate and configure ClCompositeOperator] |
| TOCK(configure, measurements); |
| |
| TICK(tensor_allocation); |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// @subsection run_clcompositeoperator Run ClCompositeOperator |
| /// Construct the runtime CLTensor s with backing memory |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct CLTensor objects |
| |
| /// Initialize, allocate and fill the CLTensor objects |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize, Allocate and Fill CLTensor objects |
| // [Initialize, Allocate and Fill CLTensor objects] |
| t_input.allocator()->init(t_input_info); |
| t_weight.allocator()->init(t_weight_info); |
| t_bias.allocator()->init(t_bias_info); |
| t_l1_addend.allocator()->init(t_l1_addend_info); |
| t_dst.allocator()->init(t_dst_info); |
| |
| t_input.allocator()->allocate(); |
| t_weight.allocator()->allocate(); |
| t_bias.allocator()->allocate(); |
| t_l1_addend.allocator()->allocate(); |
| t_dst.allocator()->allocate(); |
| |
| fill_random_tensor(t_input, -1.f, 1.f); |
| fill_random_tensor(t_weight, -1.f, 1.f); |
| fill_random_tensor(t_l1_addend, -1.f, 1.f); |
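| fill_random_tensor(t_bias, -1.f, 1.f); // Fill the bias as well so that all inputs hold defined values |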
| // [Initialize, Allocate and Fill CLTensor objects] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// The OpTensorBinding creates a mapping from the OpTensor handles that we created earlier to the real CLTensor objects |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Create OpTensorBinding |
| // [Create OpTensorBinding] |
| OpTensorBinding op_tensors({ { op_t_input, &t_input }, |
| { op_t_weight, &t_weight }, |
| { op_t_bias, &t_bias }, |
| { op_t_l1_addend, &t_l1_addend }, |
| { op_t_dst, &t_dst } |
| }); |
| // [Create OpTensorBinding] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// Bind the CLTensor objects to the prepare_pack_map and run_pack_map, which are used to prepare and run the op |
| /// This step additionally creates any required auxiliary CLTensor objects and stores them inside the ClAuxTensorData object aux_tensor_data |
| /// @note This step associates all the CLTensors contained in op_tensors and aux_tensor_data with prepare_pack_map and run_pack_map. |
| /// Make sure these CLTensors remain valid as long as the two pack_maps are still in use |
| |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClAuxTensorData |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct TensorPackMaps |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Bind Tensors |
| // [Bind Tensors] |
| bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, op_tensors); |
| // [Bind Tensors] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// Initialize and Allocate Auxiliary CLTensor objects. |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize and Allocate Auxiliary CLTensor objects |
| // [Initialize and Allocate Auxiliary CLTensor objects] |
| for(auto tensor_data : aux_tensor_data.get_tensors()) |
| { |
| tensor_data.tensor->allocator()->init(tensor_data.tensor_info); |
| tensor_data.tensor->allocator()->allocate(); |
| } |
| // [Initialize and Allocate Auxiliary CLTensor objects] |
| TOCK(tensor_allocation, measurements); |
| |
| TICK(dummy_run); |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// Run the ClCompositeOperator prepare job. This performs any jobs that are required for the first run, such as |
| /// reshaping tensors into a more performant layout. |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Prepare ClCompositeOperator |
| // [Prepare ClCompositeOperator] |
| op.prepare(prepare_pack_map); |
| // [Prepare ClCompositeOperator] |
| |
| /// @page example_dynamic_fusion_cl_conv2d_elementwise_add |
| /// At last, we run our operator |
| /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Run ClCompositeOperator |
| // [Run ClCompositeOperator] |
| op.run(run_pack_map); |
| // [Run ClCompositeOperator] |
| CLScheduler::get().sync(); |
| TOCK(dummy_run, measurements); |
| TOCK(startup_time, measurements); |
| return true; |
| } |
| void do_run() override |
| { |
| // Run the fused op |
| op.run(run_pack_map); |
| |
| // Make sure all the OpenCL jobs are done executing: |
| CLScheduler::get().sync(); |
| } |
| |
| void do_teardown() override |
| { |
| for(const auto &m : measurements) |
| { |
| std::cout << m.first << ": " << m.second.count() << "us" << std::endl; |
| } |
| } |
| |
| private: |
| // [Construct CLTensor objects] |
| CLTensor t_input{}; |
| CLTensor t_weight{}; |
| CLTensor t_bias{}; |
| CLTensor t_l1_addend{}; |
| CLTensor t_dst{}; |
| // [Construct CLTensor objects] |
| // [Construct ClAuxTensorData] |
| ClAuxTensorData aux_tensor_data{}; |
| // [Construct ClAuxTensorData] |
| // [Construct TensorPackMaps] |
| TensorPackMap prepare_pack_map{}; |
| TensorPackMap run_pack_map{}; |
| // [Construct TensorPackMaps] |
| // [Construct ClCompositeOperator] |
| ClCompositeOperator op{}; |
| // [Construct ClCompositeOperator] |
| CLTuner tuner{}; |
| std::map<std::string, std::chrono::microseconds> measurements{}; |
| }; |
| |
| /** Main program for the fused Conv2d + elementwise add example |
| * |
| * @param[in] argc Number of arguments |
| * @param[in] argv Arguments ( [optional] ih, [optional] iw, [optional] ifm, [optional] wh, [optional] ww, [optional] ofm, [optional] tuner_choice, [optional] pad_x, [optional] pad_y ) |
| */ |
| int main(int argc, char **argv) |
| { |
| return utils::run_example<ClFusedConv2dEltwiseAddExample>(argc, argv); |
| } |
| |
| #undef TICK |
| #undef TOCK |
| #undef TOCK_AVG |
| #endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ |