//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "Profiling.hpp"
#include "JsonPrinter.hpp"
#if ARMNN_STREAMLINE_ENABLED
#include <streamline_annotate.h>
#endif
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <fstream>
#include <map>
#include <stack>
#include <boost/algorithm/string.hpp>
#include <boost/core/ignore_unused.hpp>
namespace armnn
{
// Controls the amount of memory initially allocated to store profiling events.
// If chosen carefully, the profiling system will not make any additional allocations, thus minimizing its impact on
// measured times.
constexpr std::size_t g_ProfilingEventCountHint = 1024;
// Whether profiling reports should include the sequence of events together with their timings.
constexpr bool g_WriteProfilingEventSequence = true;
// Whether profiling reports should also report detailed information on events grouped by inference.
// This can spam the output stream, so use carefully (or adapt the code to just output information
// of interest).
constexpr bool g_AggregateProfilingEventsByInference = true;
// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler is destroyed.
// It can be convenient for local tests.
constexpr bool g_WriteReportToStdOutOnProfilerDestruction = false;
Measurement FindMeasurement(const std::string& name, const Event* event)
{
BOOST_ASSERT(event != nullptr);
    // Search through the measurements.
for (const auto& measurement : event->GetMeasurements())
{
if (measurement.m_Name == name)
{
// Measurement found.
return measurement;
}
}
// Measurement not found.
return Measurement{ "", 0.f, Measurement::Unit::TIME_MS };
}
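// A minimal usage sketch (not taken from the ArmNN call sites; "eventPtr" is a hypothetical variable):
// callers can detect that a named measurement is missing by checking for the empty-name sentinel
// returned above.
//
//     Measurement wallClock = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, eventPtr);
//     if (wallClock.m_Name.empty())
//     {
//         // No wall-clock measurement was recorded for this event.
//     }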
std::vector<Measurement> FindKernelMeasurements(const Event* event)
{
BOOST_ASSERT(event != nullptr);
std::vector<Measurement> measurements;
// Search through the measurements.
for (const auto& measurement : event->GetMeasurements())
{
if (measurement.m_Name.rfind("OpenClKernelTimer", 0) == 0
|| measurement.m_Name.rfind("NeonKernelTimer", 0) == 0)
{
// Measurement found.
measurements.push_back(measurement);
}
}
return measurements;
}
std::map<std::string, Profiler::ProfilingEventStats> Profiler::CalculateProfilingEventStats() const
{
std::map<std::string, ProfilingEventStats> nameToStatsMap;
for (const auto& event : m_EventSequence)
{
Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, event.get());
double durationMs = measurement.m_Value;
auto it = nameToStatsMap.find(event->GetName());
if (it != nameToStatsMap.end())
{
ProfilingEventStats& stats = it->second;
stats.m_TotalMs += durationMs;
stats.m_MinMs = std::min(stats.m_MinMs, durationMs);
stats.m_MaxMs = std::max(stats.m_MaxMs, durationMs);
++stats.m_Count;
}
else
{
nameToStatsMap.emplace(event->GetName(), ProfilingEventStats{ durationMs, durationMs, durationMs, 1 });
}
}
return nameToStatsMap;
}
const Event* GetEventPtr(const Event* ptr) { return ptr; }
const Event* GetEventPtr(const std::unique_ptr<Event>& ptr) { return ptr.get(); }
template<typename EventIterType>
void Profiler::AnalyzeEventSequenceAndWriteResults(EventIterType first, EventIterType last, std::ostream& outStream) const
{
// Outputs event sequence, if needed.
if (g_WriteProfilingEventSequence)
{
        // Makes sure timestamps are output with 6 decimals, and saves the old stream settings.
std::streamsize oldPrecision = outStream.precision();
outStream.precision(6);
std::ios_base::fmtflags oldFlags = outStream.flags();
outStream.setf(std::ios::fixed);
// Outputs fields.
outStream << "Event Sequence - Name | Duration (ms) | Start (ms) | Stop (ms) | Device" << std::endl;
for (auto event = first; event != last; ++event)
{
const Event* eventPtr = GetEventPtr((*event));
double startTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_START, eventPtr).m_Value;
double stopTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_STOP, eventPtr).m_Value;
// Find the WallClock measurement if there is one.
double durationMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, eventPtr).m_Value;
outStream << std::setw(50) << eventPtr->GetName() << " "
<< std::setw(20) << durationMs
<< std::setw(20) << startTimeMs
<< std::setw(20) << stopTimeMs
<< std::setw(20) << GetComputeDeviceAsCString(eventPtr->GetComputeDevice())
<< std::endl;
}
outStream << std::endl;
        // Restores the previous format flags and precision settings.
outStream.flags(oldFlags);
outStream.precision(oldPrecision);
}
// Aggregates results per event name.
std::map<std::string, ProfilingEventStats> nameToStatsMap = CalculateProfilingEventStats();
// Outputs aggregated stats.
outStream << "Event Stats - Name | Avg (ms) | Min (ms) | Max (ms) | Total (ms) | Count" << std::endl;
for (const auto& pair : nameToStatsMap)
{
const std::string& eventLabel = pair.first;
const ProfilingEventStats& eventStats = pair.second;
const double avgMs = eventStats.m_TotalMs / double(eventStats.m_Count);
outStream << "\t" << std::setw(50) << eventLabel << " " << std::setw(9) << avgMs << " "
<< std::setw(9) << eventStats.m_MinMs << " " << std::setw(9) << eventStats.m_MaxMs << " "
<< std::setw(9) << eventStats.m_TotalMs << " " << std::setw(9) << eventStats.m_Count << std::endl;
}
outStream << std::endl;
}
Profiler::Profiler()
: m_ProfilingEnabled(false)
{
m_EventSequence.reserve(g_ProfilingEventCountHint);
#if ARMNN_STREAMLINE_ENABLED
    // Initialises Streamline annotations.
ANNOTATE_SETUP;
#endif
}
Profiler::~Profiler()
{
if (m_ProfilingEnabled)
{
if (g_WriteReportToStdOutOnProfilerDestruction)
{
Print(std::cout);
}
}
    // Un-registers this profiler from the current thread.
ProfilerManager::GetInstance().RegisterProfiler(nullptr);
}
bool Profiler::IsProfilingEnabled()
{
return m_ProfilingEnabled;
}
void Profiler::EnableProfiling(bool enableProfiling)
{
m_ProfilingEnabled = enableProfiling;
}
Event* Profiler::BeginEvent(Compute compute, const std::string& label, std::vector<InstrumentPtr>&& instruments)
{
Event* parent = m_Parents.empty() ? nullptr : m_Parents.top();
m_EventSequence.push_back(std::make_unique<Event>(label, this, parent, compute, std::move(instruments)));
Event* event = m_EventSequence.back().get();
event->Start();
#if ARMNN_STREAMLINE_ENABLED
ANNOTATE_CHANNEL_COLOR(m_Parents.size(), GetEventColor(compute), label.c_str());
#endif
m_Parents.push(event);
return event;
}
void Profiler::EndEvent(Event* event)
{
event->Stop();
BOOST_ASSERT(!m_Parents.empty());
BOOST_ASSERT(event == m_Parents.top());
m_Parents.pop();
Event* parent = m_Parents.empty() ? nullptr : m_Parents.top();
boost::ignore_unused(parent);
BOOST_ASSERT(event->GetParentEvent() == parent);
#if ARMNN_STREAMLINE_ENABLED
ANNOTATE_CHANNEL_END(m_Parents.size());
#endif
}
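// A usage sketch, assuming the caller holds a Profiler* for the current thread (the real call sites
// live elsewhere in ArmNN, and the labels below are illustrative): BeginEvent/EndEvent must be
// strictly nested, because EndEvent asserts that the event being closed is on top of the parent stack.
//
//     Event* outer = profiler->BeginEvent(Compute::CpuAcc, "Execute", {});
//     Event* inner = profiler->BeginEvent(Compute::CpuAcc, "Kernel", {});
//     profiler->EndEvent(inner); // The innermost event must be closed first.
//     profiler->EndEvent(outer);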
int CalcLevel(const Event* eventPtr)
{
    int level = 0;
while (eventPtr != nullptr)
{
eventPtr = eventPtr->GetParentEvent();
level++;
}
return level;
}
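// For example, a root event whose parent is nullptr has level 1 and each nesting step adds one;
// PopulateInferences below records the level of the "EnqueueWorkload" events as the base level that
// the per-inference analysis uses to stop descending past workload executions.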
void Profiler::PopulateInferences(std::vector<const Event*>& outInferences, int& outBaseLevel) const
{
outInferences.reserve(m_EventSequence.size());
for (const auto& event : m_EventSequence)
{
const Event* eventPtrRaw = event.get();
if (eventPtrRaw->GetName() == "EnqueueWorkload")
{
outBaseLevel = (outBaseLevel == -1) ? CalcLevel(eventPtrRaw) : outBaseLevel;
outInferences.push_back(eventPtrRaw);
}
}
}
void Profiler::PopulateDescendants(std::map<const Event*, std::vector<const Event*>>& outDescendantsMap) const
{
for (const auto& event : m_EventSequence)
{
const Event* eventPtrRaw = event.get();
const Event* parent = eventPtrRaw->GetParentEvent();
if (!parent)
{
continue;
}
auto it = outDescendantsMap.find(parent);
if (it == outDescendantsMap.end())
{
outDescendantsMap.emplace(parent, std::vector<const Event*>({eventPtrRaw}));
}
else
{
it->second.push_back(eventPtrRaw);
}
}
}
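// A rough sketch of the JSON hierarchy that Print() emits through JsonPrinter (the exact field layout
// is owned by JsonPrinter, so this is illustrative only):
//
//     {
//         "ArmNN": {
//             "inference_measurements": {
//                 "layer_measurements": {
//                     <workload event name>: {
//                         <kernel measurement name>: { ... }
//                     }
//                 }
//             }
//         }
//     }
//
// Each object accumulates one measurement value per profiled inference, in the unit recorded on the
// corresponding event.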
void Profiler::Print(std::ostream& outStream) const
{
    // Makes sure timestamps are output with 6 decimals, and saves the old stream settings.
std::streamsize oldPrecision = outStream.precision();
outStream.precision(6);
std::ios_base::fmtflags oldFlags = outStream.flags();
outStream.setf(std::ios::fixed);
JsonPrinter printer(outStream);
// First find all the "inference" Events and print out duration measurements.
int baseLevel = -1;
std::vector<const Event*> inferences;
PopulateInferences(inferences, baseLevel);
    // Second, map out the descendants hierarchy.
std::map<const Event*, std::vector<const Event*>> descendantsMap;
PopulateDescendants(descendantsMap);
JsonChildObject inferenceObject{"inference_measurements"};
JsonChildObject layerObject{"layer_measurements"};
std::vector<JsonChildObject> workloadObjects;
std::map<unsigned int, std::vector<JsonChildObject>> workloadToKernelObjects;
for (unsigned int inferenceIndex = 0; inferenceIndex < inferences.size(); ++inferenceIndex)
{
auto inference = inferences[inferenceIndex];
Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, inference);
inferenceObject.SetUnit(measurement.m_Unit);
inferenceObject.AddMeasurement(measurement.m_Value);
auto layerEventsIt = descendantsMap.find(inference);
// Assuming 1 Execute per inference
if (layerEventsIt != descendantsMap.end())
{
auto layerEvent = layerEventsIt->second[0];
Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, layerEvent);
layerObject.SetUnit(measurement.m_Unit);
layerObject.AddMeasurement(measurement.m_Value);
            // Gets the descendant events for Execute.
            auto workloadEventsIt = descendantsMap.find(layerEvent);
            if (workloadEventsIt == descendantsMap.end())
            {
                // No workload events were recorded under this Execute event; skip to the next inference.
                continue;
            }
            for (unsigned int workloadIndex = 0; workloadIndex < workloadEventsIt->second.size(); ++workloadIndex)
{
auto workloadEvent = workloadEventsIt->second[workloadIndex];
Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, workloadEvent);
std::vector<Measurement> kernelMeasurements = FindKernelMeasurements(workloadEvent);
if (inferenceIndex == 0)
{
// Only add second level once, in case of multiple inferences
JsonChildObject workloadObject{workloadEvent->GetName()};
workloadObject.SetUnit(measurement.m_Unit);
workloadObjects.push_back(workloadObject);
}
workloadObjects[workloadIndex].AddMeasurement(measurement.m_Value);
                for (unsigned int kernelIndex = 0; kernelIndex < kernelMeasurements.size(); ++kernelIndex)
{
if (inferenceIndex == 0)
{
// Only add kernel measurement once, in case of multiple inferences
JsonChildObject kernelObject{kernelMeasurements[kernelIndex].m_Name};
kernelObject.SetUnit(kernelMeasurements[kernelIndex].m_Unit);
workloadToKernelObjects[workloadIndex].push_back(kernelObject);
}
workloadToKernelObjects[workloadIndex][kernelIndex].
AddMeasurement(kernelMeasurements[kernelIndex].m_Value);
}
}
}
}
    for (auto& workloadToKernelPair : workloadToKernelObjects)
{
for (auto kernelObject : workloadToKernelPair.second)
{
workloadObjects[workloadToKernelPair.first].AddChild(kernelObject);
}
}
for (auto workloadObject : workloadObjects)
{
layerObject.AddChild(workloadObject);
}
inferenceObject.AddChild(layerObject);
printer.PrintHeader();
printer.PrintArmNNHeader();
    // Prints the inference object; this also prints the child layer and kernel measurements.
    printer.PrintJsonChildObject(inferenceObject);
    // End of the ArmNN object.
    printer.PrintNewLine();
    printer.PrintFooter();
    // End of the main JSON object.
printer.PrintNewLine();
printer.PrintFooter();
printer.PrintNewLine();
    // Restores the previous format flags and precision settings.
outStream.flags(oldFlags);
outStream.precision(oldPrecision);
}
void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const
{
// Stack should be empty now.
const bool saneMarkerSequence = m_Parents.empty();
// Abort if the sequence of markers was found to have incorrect information:
// The stats cannot be trusted.
if (!saneMarkerSequence)
{
outStream << "Cannot write profiling stats. "
"Unexpected errors were found when analyzing the sequence of logged events, which may lead to plainly "
"wrong stats. The profiling system may contain implementation issues or could have been used in an "
"unsafe manner." << std::endl;
return;
}
// Analyzes the full sequence of events.
AnalyzeEventSequenceAndWriteResults(m_EventSequence.cbegin(),
m_EventSequence.cend(),
outStream);
    // Aggregates events by inference if requested (this can spam the output stream).
if (g_AggregateProfilingEventsByInference)
{
outStream << std::endl;
outStream << "***" << std::endl;
outStream << "*** Per Inference Stats" << std::endl;
outStream << "***" << std::endl;
outStream << std::endl;
int baseLevel = -1;
std::vector<const Event*> inferences;
PopulateInferences(inferences, baseLevel);
        // Second, map out the descendants hierarchy.
std::map<const Event*, std::vector<const Event*>> descendantsMap;
PopulateDescendants(descendantsMap);
std::function<void (const Event*, std::vector<const Event*>&)>
FindDescendantEvents = [&](const Event* eventPtr,
std::vector<const Event*>& sequence)
{
sequence.push_back(eventPtr);
            if (CalcLevel(eventPtr) > baseLevel + 2) // We only care about levels as deep as workload executions.
{
return;
}
auto children = descendantsMap.find(eventPtr);
if (children == descendantsMap.end())
{
return;
}
            for (const Event* child : children->second)
            {
                // Recurse into every child so the whole subtree is collected, depth first.
                FindDescendantEvents(child, sequence);
            }
};
// Third, find events belonging to each inference
int inferenceIdx = 0;
for (auto inference : inferences)
{
std::vector<const Event*> sequence;
            // Builds the event sequence, depth first.
FindDescendantEvents(inference, sequence);
outStream << "> Begin Inference: " << inferenceIdx << std::endl;
outStream << std::endl;
AnalyzeEventSequenceAndWriteResults(sequence.cbegin(),
sequence.cend(),
outStream);
outStream << std::endl;
outStream << "> End Inference: " << inferenceIdx << std::endl;
inferenceIdx++;
}
}
}
std::uint32_t Profiler::GetEventColor(Compute compute) const
{
switch(compute)
{
case Compute::CpuRef:
// Cyan
return 0xffff001b;
case Compute::CpuAcc:
// Green
return 0x00ff001b;
case Compute::GpuAcc:
// Purple
return 0xff007f1b;
default:
// Dark gray
return 0x5555551b;
}
}
// The thread_local pointer to the profiler instance.
thread_local Profiler* tl_Profiler = nullptr;
ProfilerManager& ProfilerManager::GetInstance()
{
    // Global reference to the single ProfilerManager instance allowed.
static ProfilerManager s_ProfilerManager;
return s_ProfilerManager;
}
void ProfilerManager::RegisterProfiler(Profiler* profiler)
{
tl_Profiler = profiler;
}
Profiler* ProfilerManager::GetProfiler()
{
return tl_Profiler;
}
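// A minimal sketch of the per-thread registration pattern (registration is done elsewhere in ArmNN;
// the code below is illustrative only):
//
//     Profiler profiler;
//     profiler.EnableProfiling(true);
//     ProfilerManager::GetInstance().RegisterProfiler(&profiler);
//     // ... run profiled work on this thread ...
//     // The Profiler destructor un-registers itself by registering nullptr for the current thread.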
} // namespace armnn