/*
 * Copyright (c) 2016-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IDistribution1D.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"

#include <algorithm>
#include <arm_neon.h>
#include <array>

namespace arm_compute
{
class Coordinates;

inline void NEHistogramKernel::merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins)
{
    arm_compute::lock_guard<arm_compute::Mutex> lock(_hist_mtx);

    const unsigned int v_end = (bins / 4) * 4;

    for(unsigned int b = 0; b < v_end; b += 4)
    {
        const uint32x4_t tmp_global = vld1q_u32(global_hist + b);
        const uint32x4_t tmp_local  = vld1q_u32(local_hist + b);
        vst1q_u32(global_hist + b, vaddq_u32(tmp_global, tmp_local));
    }

    for(unsigned int b = v_end; b < bins; ++b)
    {
        global_hist[b] += local_hist[b];
    }
}

/** Default constructor: no run function selected and no input, output or scratch buffers attached. */
NEHistogramKernel::NEHistogramKernel()
    : _func(nullptr),
      _input(nullptr),
      _output(nullptr),
      _local_hist(nullptr),
      _window_lut(nullptr),
      _hist_mtx()
{
}

/** Compute the histogram of the window using the output's offset, range and bin count.
 *
 * Counts are accumulated in the slice of the local histogram buffer selected by
 * info.thread_id and then merged into the output buffer.
 *
 * @param[in] win  Region of the input to process. Taken by value because its X dimension is collapsed locally.
 * @param[in] info Thread information; thread_id selects the local histogram slice.
 */
void NEHistogramKernel::histogram_U8(Window win, const ThreadInfo &info)
{
    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);

    const size_t          num_bins    = _output->num_bins();
    const int32_t         lower_bound = _output->offset();
    const uint32_t        upper_bound = lower_bound + _output->range();
    const uint32_t *const bin_lut     = _window_lut;
    uint32_t *const       thread_hist = _local_hist + info.thread_id * num_bins;

    // Start from an empty per-thread histogram
    std::fill_n(thread_hist, num_bins, 0);

    // Count a single pixel, skipping values outside [lower_bound, upper_bound)
    auto count_pixel = [thread_hist, bin_lut, lower_bound, upper_bound](uint8_t value)
    {
        if(lower_bound > value || value >= upper_bound)
        {
            return;
        }
        ++thread_hist[bin_lut[value]];
    };

    const int first_x = win.x().start();
    const int last_x  = win.x().end();

    // The X dimension is walked by hand below (blocks of eight pixels, then the
    // remainder), so collapse it in the window handed to the iterator
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, win);

    // Accumulate the per-thread histogram
    execute_window_loop(win, [&](const Coordinates &)
    {
        const uint8_t *const row = input.ptr();

        int x = first_x;

        // Blocks of eight pixels
        while(x <= last_x - 8)
        {
            const uint8x8_t block = vld1_u8(row + x);

            count_pixel(vget_lane_u8(block, 0));
            count_pixel(vget_lane_u8(block, 1));
            count_pixel(vget_lane_u8(block, 2));
            count_pixel(vget_lane_u8(block, 3));
            count_pixel(vget_lane_u8(block, 4));
            count_pixel(vget_lane_u8(block, 5));
            count_pixel(vget_lane_u8(block, 6));
            count_pixel(vget_lane_u8(block, 7));

            x += 8;
        }

        // Pixels that did not fill a whole block
        while(x < last_x)
        {
            count_pixel(row[x]);
            ++x;
        }
    },
    input);

    // Fold the per-thread result into the output distribution
    merge_histogram(_output->buffer(), thread_hist, num_bins);
}

/** Compute a histogram with one bin per pixel value and merge it into the output.
 *
 * Counts are accumulated in a function-local array of _max_range_size bins, and
 * exactly _max_range_size bins are merged into the output buffer.
 *
 * @param[in] win  Region of the input to process. Taken by value because its X dimension is collapsed locally.
 * @param[in] info Thread information (unused).
 */
void NEHistogramKernel::histogram_fixed_U8(Window win, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);

    // Zero-initialised scratch histogram, indexed directly by pixel value
    std::array<uint32_t, _max_range_size> counts{ { 0 } };

    auto count_pixel = [&counts](uint8_t value)
    {
        ++counts[value];
    };

    const int first_x = win.x().start();
    const int last_x  = win.x().end();

    // The X dimension is walked by hand below (blocks of eight pixels, then the
    // remainder), so collapse it in the window handed to the iterator
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, win);

    // Accumulate the scratch histogram
    execute_window_loop(win, [&](const Coordinates &)
    {
        const uint8_t *const row = input.ptr();

        int x = first_x;

        // Blocks of eight pixels
        while(x <= last_x - 8)
        {
            const uint8x8_t block = vld1_u8(row + x);

            count_pixel(vget_lane_u8(block, 0));
            count_pixel(vget_lane_u8(block, 1));
            count_pixel(vget_lane_u8(block, 2));
            count_pixel(vget_lane_u8(block, 3));
            count_pixel(vget_lane_u8(block, 4));
            count_pixel(vget_lane_u8(block, 5));
            count_pixel(vget_lane_u8(block, 6));
            count_pixel(vget_lane_u8(block, 7));

            x += 8;
        }

        // Pixels that did not fill a whole block
        while(x < last_x)
        {
            count_pixel(row[x]);
            ++x;
        }
    },
    input);

    // Fold the scratch result into the output distribution
    merge_histogram(_output->buffer(), counts.data(), _max_range_size);
}

/** Fill the look-up table that maps a pixel value to its histogram bin.
 *
 * Entries below the output's offset are set to 0. Every entry p from the offset
 * up to _max_range_size - 1 is set to ((p - offset) * bins) / range.
 *
 * If the output's range is 0 the whole table is zeroed instead, since the
 * formula above would divide by zero.
 */
void NEHistogramKernel::calculate_window_lut() const
{
    const int32_t  offset = _output->offset();
    const size_t   bins   = _output->num_bins();
    const uint32_t range  = _output->range();

    // An empty range accepts no pixel value, so histogram_U8 never reads the
    // table; zero it and return rather than dividing by zero below.
    if(range == 0)
    {
        std::fill_n(_window_lut, _max_range_size, 0);
        return;
    }

    // Values below the offset are never counted; give them a defined entry anyway
    std::fill_n(_window_lut, offset, 0);

    for(unsigned int p = offset; p < _max_range_size; ++p)
    {
        _window_lut[p] = ((p - offset) * bins) / range;
    }
}

void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut)
{
    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON(nullptr == output);
    ARM_COMPUTE_ERROR_ON(nullptr == local_hist);
    ARM_COMPUTE_ERROR_ON(nullptr == window_lut);

    _input      = input;
    _output     = output;
    _local_hist = local_hist;
    _window_lut = window_lut;

    //Check offset
    ARM_COMPUTE_ERROR_ON_MSG(0 > _output->offset() || _output->offset() > static_cast<int32_t>(_max_range_size), "Offset is larger than the image value range.");

    //Check range
    ARM_COMPUTE_ERROR_ON_MSG(static_cast<int32_t>(_output->range()) > static_cast<int32_t>(_max_range_size) /* max range */, "Range larger than the image value range.");

    // Calculate LUT
    calculate_window_lut();

    // Set appropriate function
    _func = &NEHistogramKernel::histogram_U8;

    Window win = calculate_max_window(*input->info(), Steps());

    INEKernel::configure(win);
}

void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output)
{
    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON(nullptr == output);

    _input  = input;
    _output = output;

    // Set appropriate function
    _func = &NEHistogramKernel::histogram_fixed_U8;

    Window win = calculate_max_window(*input->info(), Steps());

    INEKernel::configure(win);
}

/** Execute the histogram function selected by configure() on the given window.
 *
 * @param[in] window Region to process; asserted to be a valid sub-window of the configured window.
 * @param[in] info   Thread information forwarded to the selected function.
 */
void NEHistogramKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(nullptr == _func);

    // Dispatch through the member-function pointer chosen at configure time
    const auto selected = _func;
    (this->*selected)(window, info);
}
} // namespace arm_compute
