DataSeriesUtils.h
317 lines
| 13.8 KiB
| text/x-c
|
CLexer
r0 | #ifndef SCIQLOP_DATASERIESUTILS_H | |||
#define SCIQLOP_DATASERIESUTILS_H | ||||
#include "CoreGlobal.h" | ||||
#include <Common/SortUtils.h> | ||||
#include <Data/DataSeriesIterator.h> | ||||
#include <QLoggingCategory> | ||||
#include <cmath> | ||||
Q_DECLARE_LOGGING_CATEGORY(LOG_DataSeriesUtils) | ||||
/** | ||||
* Utility class with methods for data series | ||||
*/ | ||||
struct SCIQLOP_CORE_EXPORT DataSeriesUtils { | ||||
/** | ||||
* Define a meshs. | ||||
* | ||||
* A mesh is a regular grid representing cells of the same width (in x) and of the same height | ||||
* (in y). At each mesh point is associated a value. | ||||
* | ||||
* Each axis of the mesh is defined by a minimum value, a number of values is a mesh step. | ||||
* For example: if min = 1, nbValues = 5 and step = 2 => the axis of the mesh will be [1, 3, 5, | ||||
* 7, 9]. | ||||
* | ||||
* The values are defined in an array of size {nbX * nbY}. The data is stored along the X axis. | ||||
* | ||||
* For example, the mesh: | ||||
* Y = 2 [ 7 ; 8 ; 9 | ||||
* Y = 1 4 ; 5 ; 6 | ||||
* Y = 0 1 ; 2 ; 3 ] | ||||
* X = 0 X = 1 X = 2 | ||||
* | ||||
* will be represented by data [1, 2, 3, 4, 5, 6, 7, 8, 9] | ||||
*/ | ||||
struct Mesh { | ||||
explicit Mesh() = default; | ||||
explicit Mesh(int nbX, double xMin, double xStep, int nbY, double yMin, double yStep) | ||||
: m_NbX{nbX}, | ||||
m_XMin{xMin}, | ||||
m_XStep{xStep}, | ||||
m_NbY{nbY}, | ||||
m_YMin{yMin}, | ||||
m_YStep{yStep}, | ||||
m_Data(nbX * nbY) | ||||
{ | ||||
} | ||||
inline bool isEmpty() const { return m_Data.size() == 0; } | ||||
inline double xMax() const { return m_XMin + (m_NbX - 1) * m_XStep; } | ||||
inline double yMax() const { return m_YMin + (m_NbY - 1) * m_YStep; } | ||||
int m_NbX{0}; | ||||
double m_XMin{}; | ||||
double m_XStep{}; | ||||
int m_NbY{0}; | ||||
double m_YMin{}; | ||||
double m_YStep{}; | ||||
std::vector<double> m_Data{}; | ||||
}; | ||||
/** | ||||
* Represents a resolution used to generate the data of a mesh on the x-axis or in Y. | ||||
* | ||||
* A resolution is represented by a value and flag indicating if it's in the logarithmic scale | ||||
* @sa Mesh | ||||
*/ | ||||
struct Resolution { | ||||
double m_Val{std::numeric_limits<double>::quiet_NaN()}; | ||||
bool m_Logarithmic{false}; | ||||
}; | ||||
/** | ||||
* Processes data from a data series to complete the data holes with a fill value. | ||||
* | ||||
* A data hole is determined by the resolution passed in parameter: if, between two continuous | ||||
* data on the x-axis, the difference between these data is greater than the resolution, then | ||||
* there is one or more holes between them. The holes are filled by adding: | ||||
* - for the x-axis, new data corresponding to the 'step resolution' starting from the first | ||||
* data; | ||||
* - for values, a default value (fill value) for each new data added on the x-axis. | ||||
* | ||||
* For example, with : | ||||
* - xAxisData = [0, 1, 5, 7, 14 ] | ||||
* - valuesData = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] (two components per x-axis data) | ||||
* - fillValue = NaN | ||||
* - and resolution = 2; | ||||
* | ||||
* For the x axis, we calculate as data holes: [3, 9, 11, 13]. These holes are added to the | ||||
* x-axis data, and NaNs (two per x-axis data) are added to the values: | ||||
* => xAxisData = [0, 1, 3, 5, 7, 9, 11, 13, 14 ] | ||||
* => valuesData = [0, 1, 2, 3, NaN, NaN, 4, 5, 6, 7, NaN, NaN, NaN, NaN, NaN, NaN, 8, 9] | ||||
* | ||||
* It is also possible to set bounds for the data series. If these bounds are defined and exceed | ||||
* the limits of the data series, data holes are added to the series at the beginning and/or the | ||||
* end. | ||||
* | ||||
* The generation of data holes at the beginning/end of the data series is performed starting | ||||
* from the x-axis series limit and adding data holes at each 'resolution step' as long as the | ||||
* new bound is not reached. | ||||
* | ||||
* For example, with : | ||||
* - xAxisData = [3, 4, 5, 6, 7 ] | ||||
* - valuesData = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] | ||||
* - fillValue = NaN | ||||
* - minBound = 0 | ||||
* - maxBound = 12 | ||||
* - and resolution = 2; | ||||
* | ||||
* => Starting from 3 and decreasing 2 by 2 until reaching 0 : a data hole at value 1 will be | ||||
* added to the beginning of the series | ||||
* => Starting from 7 and increasing 2 by 2 until reaching 12 : data holes at values 9 and 11 | ||||
* will be added to the end of the series | ||||
* | ||||
* So : | ||||
* => xAxisData = [1, 3, 4, 5, 6, 7, 9, 11 ] | ||||
* => valuesData = [NaN, NaN, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, NaN, NaN, NaN, NaN] | ||||
* | ||||
* @param xAxisData the x-axis data of the data series | ||||
* @param valuesData the values data of the data series | ||||
* @param resolution the resoultion (on x-axis) used to determinate data holes | ||||
* @param fillValue the fill value used for data holes in the values data | ||||
* @param minBound the limit at which to start filling data holes for the series. If set to NaN, | ||||
* the limit is not used | ||||
* @param maxBound the limit at which to end filling data holes for the series. If set to NaN, | ||||
* the limit is not used | ||||
* | ||||
* @remarks There is no control over the consistency between x-axis data and values data. The | ||||
* method considers that the data is well formed (the total number of values data is a multiple | ||||
* of the number of x-axis data) | ||||
*/ | ||||
static void fillDataHoles(std::vector<double> &xAxisData, std::vector<double> &valuesData, | ||||
double resolution, | ||||
double fillValue = std::numeric_limits<double>::quiet_NaN(), | ||||
double minBound = std::numeric_limits<double>::quiet_NaN(), | ||||
double maxBound = std::numeric_limits<double>::quiet_NaN()); | ||||
/** | ||||
* Computes the resolution of a dataset passed as a parameter. | ||||
* | ||||
* The resolution of a dataset is the minimum difference between two values that follow in the | ||||
* set. | ||||
* For example: | ||||
* - for the set [0, 2, 4, 8, 10, 11, 13] => the resolution is 1 (difference between 10 and 11). | ||||
* | ||||
* A resolution can be calculated on the logarithmic scale (base of 10). In this case, the | ||||
* dataset is first converted to logarithmic values. | ||||
* For example: | ||||
* - for the set [10, 100, 10000, 1000000], the values are converted to [1, 2, 4, 6] => the | ||||
* logarithmic resolution is 1 (difference between 1 and 2). | ||||
* | ||||
* @param begin the iterator pointing to the beginning of the dataset | ||||
* @param end the iterator pointing to the end of the dataset | ||||
* @param logarithmic computes a logarithmic resolution or not | ||||
* @return the resolution computed | ||||
* @warning the method considers the dataset as sorted and doesn't control it. | ||||
*/ | ||||
template <typename Iterator> | ||||
static Resolution resolution(Iterator begin, Iterator end, bool logarithmic = false); | ||||
/** | ||||
* Computes a regular mesh for a data series, according to resolutions for x-axis and y-axis | ||||
* passed as parameters. | ||||
* | ||||
* The mesh is created from the resolutions in x and y and the boundaries delimiting the data | ||||
* series. If the resolutions do not allow to obtain a regular mesh, they are recalculated. | ||||
* | ||||
* For example : | ||||
* Let x-axis data = [0, 1, 3, 5, 9], its associated values ββ= [0, 10, 30, 50, 90] and | ||||
* xResolution = 2. | ||||
* Based on the resolution, the mesh would be [0, 2, 4, 6, 8, 10] and would be invalid because | ||||
* it exceeds the maximum bound of the data. The resolution is thus recalculated so that the | ||||
* mesh holds between the data terminals. | ||||
* So => resolution is 1.8 and the mesh is [0, 1.8, 3.6, 5.4, 7.2, 9]. | ||||
* | ||||
* Once the mesh is generated in x and y, the values ββare associated with each mesh point, | ||||
* based on the data in the series, finding the existing data at which the mesh point would be | ||||
* or would be closest to, without exceeding it. | ||||
* | ||||
* In the example, we determine the value of each mesh point: | ||||
* - x = 0 => value = 0 (existing x in the data series) | ||||
* - x = 1.8 => value = 10 (the closest existing x: 1) | ||||
* - x = 3.6 => value = 30 (the closest existing x: 3) | ||||
* - x = 5.4 => value = 50 (the closest existing x: 5) | ||||
* - x = 7.2 => value = 50 (the closest existing x: 5) | ||||
* - x = 9 => value = 90 (existing x in the data series) | ||||
* | ||||
* Same algorithm is applied for y-axis. | ||||
* | ||||
* @param begin the iterator pointing to the beginning of the data series | ||||
* @param end the iterator pointing to the end of the data series | ||||
* @param xResolution the resolution expected for the mesh's x-axis | ||||
* @param yResolution the resolution expected for the mesh's y-axis | ||||
* @return the mesh created, an empty mesh if the input data do not allow to generate a regular | ||||
* mesh (empty data, null resolutions, logarithmic x-axis) | ||||
* @warning the method considers the dataset as sorted and doesn't control it. | ||||
*/ | ||||
static Mesh regularMesh(DataSeriesIterator begin, DataSeriesIterator end, | ||||
Resolution xResolution, Resolution yResolution); | ||||
/** | ||||
* Calculates the min and max thresholds of a dataset. | ||||
* | ||||
* The thresholds of a dataset correspond to the min and max limits of the set to which the | ||||
* outliers are exluded (values distant from the others) For example, for the set [1, 2, 3, 4, | ||||
* 5, 10000], 10000 is an outlier and will be excluded from the thresholds. | ||||
* | ||||
* Bounds determining the thresholds is calculated according to the mean and the standard | ||||
* deviation of the defined data. The thresholds are limited to the min / max values of the | ||||
* dataset: if for example the calculated min threshold is 2 but the min value of the datasetset | ||||
* is 4, 4 is returned as the min threshold. | ||||
* | ||||
* @param begin the beginning of the dataset | ||||
* @param end the end of the dataset | ||||
* @param logarithmic computes threshold with a logarithmic scale or not | ||||
* @return the thresholds computed, a couple of nan values if it couldn't be computed | ||||
*/ | ||||
template <typename Iterator> | ||||
static std::pair<double, double> thresholds(Iterator begin, Iterator end, | ||||
bool logarithmic = false); | ||||
}; | ||||
template <typename Iterator> | ||||
DataSeriesUtils::Resolution DataSeriesUtils::resolution(Iterator begin, Iterator end, | ||||
bool logarithmic) | ||||
{ | ||||
// Retrieves data into a work dataset | ||||
using ValueType = typename Iterator::value_type; | ||||
std::vector<ValueType> values{}; | ||||
std::copy(begin, end, std::back_inserter(values)); | ||||
// Converts data if logarithmic flag is activated | ||||
if (logarithmic) { | ||||
std::for_each(values.begin(), values.end(), | ||||
[logarithmic](auto &val) { val = std::log10(val); }); | ||||
} | ||||
// Computes the differences between the values in the dataset | ||||
std::adjacent_difference(values.begin(), values.end(), values.begin()); | ||||
// Retrieves the smallest difference | ||||
auto resolutionIt = std::min_element(values.begin(), values.end()); | ||||
auto resolution | ||||
= resolutionIt != values.end() ? *resolutionIt : std::numeric_limits<double>::quiet_NaN(); | ||||
return Resolution{resolution, logarithmic}; | ||||
} | ||||
template <typename Iterator> | ||||
std::pair<double, double> DataSeriesUtils::thresholds(Iterator begin, Iterator end, | ||||
bool logarithmic) | ||||
{ | ||||
/// Lambda that converts values in case of logaritmic scale | ||||
auto toLog = [logarithmic](const auto &value) { | ||||
if (logarithmic) { | ||||
// Logaritmic scale doesn't include zero value | ||||
return !(std::isnan(value) || value < std::numeric_limits<double>::epsilon()) | ||||
? std::log10(value) | ||||
: std::numeric_limits<double>::quiet_NaN(); | ||||
} | ||||
else { | ||||
return value; | ||||
} | ||||
}; | ||||
/// Lambda that converts values to linear scale | ||||
auto fromLog | ||||
= [logarithmic](const auto &value) { return logarithmic ? std::pow(10, value) : value; }; | ||||
/// Lambda used to sum data and divide the sum by the number of data. It is used to calculate | ||||
/// the mean and standard deviation | ||||
/// @param fun the data addition function | ||||
auto accumulate = [begin, end](auto fun) { | ||||
double sum; | ||||
int nbValues; | ||||
std::tie(sum, nbValues) = std::accumulate( | ||||
begin, end, std::make_pair(0., 0), [fun](const auto &input, const auto &value) { | ||||
auto computedValue = fun(value); | ||||
// NaN values are excluded from the sum | ||||
return !std::isnan(computedValue) | ||||
? std::make_pair(input.first + computedValue, input.second + 1) | ||||
: input; | ||||
}); | ||||
return nbValues != 0 ? sum / nbValues : std::numeric_limits<double>::quiet_NaN(); | ||||
}; | ||||
// Computes mean | ||||
auto mean = accumulate([toLog](const auto &val) { return toLog(val); }); | ||||
if (std::isnan(mean)) { | ||||
return {std::numeric_limits<double>::quiet_NaN(), std::numeric_limits<double>::quiet_NaN()}; | ||||
} | ||||
// Computes standard deviation | ||||
auto variance | ||||
= accumulate([mean, toLog](const auto &val) { return std::pow(toLog(val) - mean, 2); }); | ||||
auto sigma = std::sqrt(variance); | ||||
// Computes thresholds | ||||
auto minThreshold = fromLog(mean - 3 * sigma); | ||||
auto maxThreshold = fromLog(mean + 3 * sigma); | ||||
// Finds min/max values | ||||
auto minIt = std::min_element(begin, end, [toLog](const auto &it1, const auto &it2) { | ||||
return SortUtils::minCompareWithNaN(toLog(it1), toLog(it2)); | ||||
}); | ||||
auto maxIt = std::max_element(begin, end, [toLog](const auto &it1, const auto &it2) { | ||||
return SortUtils::maxCompareWithNaN(toLog(it1), toLog(it2)); | ||||
}); | ||||
// Returns thresholds (bounded to min/max values) | ||||
return {std::max(*minIt, minThreshold), std::min(*maxIt, maxThreshold)}; | ||||
} | ||||
#endif // SCIQLOP_DATASERIESUTILS_H | ||||