##// END OF EJS Templates
Creates method to compute automatic thresholds
Alexandre Leroux -
r1015:bd0d48a48271
parent child
Show More
@@ -1,228 +1,317
1 #ifndef SCIQLOP_DATASERIESUTILS_H
1 #ifndef SCIQLOP_DATASERIESUTILS_H
2 #define SCIQLOP_DATASERIESUTILS_H
2 #define SCIQLOP_DATASERIESUTILS_H
3
3
4 #include "CoreGlobal.h"
4 #include "CoreGlobal.h"
5
5
6 #include <Common/SortUtils.h>
6 #include <Data/DataSeriesIterator.h>
7 #include <Data/DataSeriesIterator.h>
7
8
8 #include <QLoggingCategory>
9 #include <QLoggingCategory>
9 #include <cmath>
10 #include <cmath>
10
11
11 Q_DECLARE_LOGGING_CATEGORY(LOG_DataSeriesUtils)
12 Q_DECLARE_LOGGING_CATEGORY(LOG_DataSeriesUtils)
12
13
13 /**
14 /**
14 * Utility class with methods for data series
15 * Utility class with methods for data series
15 */
16 */
16 struct SCIQLOP_CORE_EXPORT DataSeriesUtils {
17 struct SCIQLOP_CORE_EXPORT DataSeriesUtils {
/**
 * Defines a mesh.
 *
 * A mesh is a regular grid representing cells of the same width (in x) and of the same height
 * (in y). A value is associated with each mesh point.
 *
 * Each axis of the mesh is defined by a minimum value, a number of values and a mesh step.
 * For example: if min = 1, nbValues = 5 and step = 2 => the axis of the mesh will be [1, 3, 5,
 * 7, 9].
 *
 * The values are defined in an array of size {nbX * nbY}. The data is stored along the X axis.
 *
 * For example, the mesh:
 * Y = 2 [ 7 ; 8 ; 9
 * Y = 1   4 ; 5 ; 6
 * Y = 0   1 ; 2 ; 3 ]
 *       X = 0 X = 1 X = 2
 *
 * will be represented by data [1, 2, 3, 4, 5, 6, 7, 8, 9]
 */
struct Mesh {
    /// Creates an empty mesh (no value on either axis, no data)
    explicit Mesh() = default;

    /**
     * Creates a mesh whose data array holds {nbX * nbY} value-initialized (0.) values.
     * @param nbX the number of values on the x-axis
     * @param xMin the first value of the x-axis
     * @param xStep the step between two values of the x-axis
     * @param nbY the number of values on the y-axis
     * @param yMin the first value of the y-axis
     * @param yStep the step between two values of the y-axis
     * @remarks if one of the numbers of values is not strictly positive, the data array is
     * empty
     */
    explicit Mesh(int nbX, double xMin, double xStep, int nbY, double yMin, double yStep)
            : m_NbX{nbX},
              m_XMin{xMin},
              m_XStep{xStep},
              m_NbY{nbY},
              m_YMin{yMin},
              m_YStep{yStep},
              // The size is computed in the vector's size type: multiplying the two ints first
              // could overflow, and a negative count would be converted to a huge size
              m_Data((nbX > 0 && nbY > 0)
                         ? static_cast<std::vector<double>::size_type>(nbX)
                               * static_cast<std::vector<double>::size_type>(nbY)
                         : 0)
    {
    }

    /// @return true if the mesh holds no data, false otherwise
    inline bool isEmpty() const { return m_Data.empty(); }
    /// @return the last value of the x-axis (m_XMin - m_XStep when m_NbX is 0)
    inline double xMax() const { return m_XMin + (m_NbX - 1) * m_XStep; }
    /// @return the last value of the y-axis (m_YMin - m_YStep when m_NbY is 0)
    inline double yMax() const { return m_YMin + (m_NbY - 1) * m_YStep; }

    int m_NbX{0};                 ///< Number of values on the x-axis
    double m_XMin{};              ///< First value of the x-axis
    double m_XStep{};             ///< Step between two values of the x-axis
    int m_NbY{0};                 ///< Number of values on the y-axis
    double m_YMin{};              ///< First value of the y-axis
    double m_YStep{};             ///< Step between two values of the y-axis
    std::vector<double> m_Data{}; ///< Values of the mesh points, stored along the x-axis
};
62
63
/**
 * Resolution used to generate the data of a mesh along one axis (x or y).
 *
 * A resolution is a value paired with a flag telling whether that value is expressed on the
 * logarithmic scale.
 * @sa Mesh
 */
struct Resolution {
    /// Value of the resolution; NaN by default
    double m_Val = std::numeric_limits<double>::quiet_NaN();
    /// True when the resolution is expressed on the logarithmic scale
    bool m_Logarithmic = false;
};
73
74
74 /**
75 /**
75 * Processes data from a data series to complete the data holes with a fill value.
76 * Processes data from a data series to complete the data holes with a fill value.
76 *
77 *
77 * A data hole is determined by the resolution passed in parameter: if, between two continuous
78 * A data hole is determined by the resolution passed in parameter: if, between two continuous
78 * data on the x-axis, the difference between these data is greater than the resolution, then
79 * data on the x-axis, the difference between these data is greater than the resolution, then
79 * there is one or more holes between them. The holes are filled by adding:
80 * there is one or more holes between them. The holes are filled by adding:
80 * - for the x-axis, new data corresponding to the 'step resolution' starting from the first
81 * - for the x-axis, new data corresponding to the 'step resolution' starting from the first
81 * data;
82 * data;
82 * - for values, a default value (fill value) for each new data added on the x-axis.
83 * - for values, a default value (fill value) for each new data added on the x-axis.
83 *
84 *
84 * For example, with :
85 * For example, with :
85 * - xAxisData = [0, 1, 5, 7, 14 ]
86 * - xAxisData = [0, 1, 5, 7, 14 ]
86 * - valuesData = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] (two components per x-axis data)
87 * - valuesData = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] (two components per x-axis data)
87 * - fillValue = NaN
88 * - fillValue = NaN
88 * - and resolution = 2;
89 * - and resolution = 2;
89 *
90 *
90 * For the x axis, we calculate as data holes: [3, 9, 11, 13]. These holes are added to the
91 * For the x axis, we calculate as data holes: [3, 9, 11, 13]. These holes are added to the
91 * x-axis data, and NaNs (two per x-axis data) are added to the values:
92 * x-axis data, and NaNs (two per x-axis data) are added to the values:
92 * => xAxisData = [0, 1, 3, 5, 7, 9, 11, 13, 14 ]
93 * => xAxisData = [0, 1, 3, 5, 7, 9, 11, 13, 14 ]
93 * => valuesData = [0, 1, 2, 3, NaN, NaN, 4, 5, 6, 7, NaN, NaN, NaN, NaN, NaN, NaN, 8, 9]
94 * => valuesData = [0, 1, 2, 3, NaN, NaN, 4, 5, 6, 7, NaN, NaN, NaN, NaN, NaN, NaN, 8, 9]
94 *
95 *
95 * It is also possible to set bounds for the data series. If these bounds are defined and exceed
96 * It is also possible to set bounds for the data series. If these bounds are defined and exceed
96 * the limits of the data series, data holes are added to the series at the beginning and/or the
97 * the limits of the data series, data holes are added to the series at the beginning and/or the
97 * end.
98 * end.
98 *
99 *
99 * The generation of data holes at the beginning/end of the data series is performed starting
100 * The generation of data holes at the beginning/end of the data series is performed starting
100 * from the x-axis series limit and adding data holes at each 'resolution step' as long as the
101 * from the x-axis series limit and adding data holes at each 'resolution step' as long as the
101 * new bound is not reached.
102 * new bound is not reached.
102 *
103 *
103 * For example, with :
104 * For example, with :
104 * - xAxisData = [3, 4, 5, 6, 7 ]
105 * - xAxisData = [3, 4, 5, 6, 7 ]
105 * - valuesData = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
106 * - valuesData = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
106 * - fillValue = NaN
107 * - fillValue = NaN
107 * - minBound = 0
108 * - minBound = 0
108 * - maxBound = 12
109 * - maxBound = 12
109 * - and resolution = 2;
110 * - and resolution = 2;
110 *
111 *
111 * => Starting from 3 and decreasing 2 by 2 until reaching 0 : a data hole at value 1 will be
112 * => Starting from 3 and decreasing 2 by 2 until reaching 0 : a data hole at value 1 will be
112 * added to the beginning of the series
113 * added to the beginning of the series
113 * => Starting from 7 and increasing 2 by 2 until reaching 12 : data holes at values 9 and 11
114 * => Starting from 7 and increasing 2 by 2 until reaching 12 : data holes at values 9 and 11
114 * will be added to the end of the series
115 * will be added to the end of the series
115 *
116 *
116 * So :
117 * So :
117 * => xAxisData = [1, 3, 4, 5, 6, 7, 9, 11 ]
118 * => xAxisData = [1, 3, 4, 5, 6, 7, 9, 11 ]
118 * => valuesData = [NaN, NaN, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, NaN, NaN, NaN, NaN]
119 * => valuesData = [NaN, NaN, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, NaN, NaN, NaN, NaN]
119 *
120 *
120 * @param xAxisData the x-axis data of the data series
121 * @param xAxisData the x-axis data of the data series
121 * @param valuesData the values data of the data series
122 * @param valuesData the values data of the data series
122 * @param resolution the resolution (on x-axis) used to determine data holes
123 * @param resolution the resolution (on x-axis) used to determine data holes
123 * @param fillValue the fill value used for data holes in the values data
124 * @param fillValue the fill value used for data holes in the values data
124 * @param minBound the limit at which to start filling data holes for the series. If set to NaN,
125 * @param minBound the limit at which to start filling data holes for the series. If set to NaN,
125 * the limit is not used
126 * the limit is not used
126 * @param maxBound the limit at which to end filling data holes for the series. If set to NaN,
127 * @param maxBound the limit at which to end filling data holes for the series. If set to NaN,
127 * the limit is not used
128 * the limit is not used
128 *
129 *
129 * @remarks There is no control over the consistency between x-axis data and values data. The
130 * @remarks There is no control over the consistency between x-axis data and values data. The
130 * method considers that the data is well formed (the total number of values data is a multiple
131 * method considers that the data is well formed (the total number of values data is a multiple
131 * of the number of x-axis data)
132 * of the number of x-axis data)
132 */
133 */
133 static void fillDataHoles(std::vector<double> &xAxisData, std::vector<double> &valuesData,
134 static void fillDataHoles(std::vector<double> &xAxisData, std::vector<double> &valuesData,
134 double resolution,
135 double resolution,
135 double fillValue = std::numeric_limits<double>::quiet_NaN(),
136 double fillValue = std::numeric_limits<double>::quiet_NaN(),
136 double minBound = std::numeric_limits<double>::quiet_NaN(),
137 double minBound = std::numeric_limits<double>::quiet_NaN(),
137 double maxBound = std::numeric_limits<double>::quiet_NaN());
138 double maxBound = std::numeric_limits<double>::quiet_NaN());
138 /**
139 /**
139 * Computes the resolution of a dataset passed as a parameter.
140 * Computes the resolution of a dataset passed as a parameter.
140 *
141 *
141 * The resolution of a dataset is the minimum difference between two values that follow in the
142 * The resolution of a dataset is the minimum difference between two values that follow in the
142 * set.
143 * set.
143 * For example:
144 * For example:
144 * - for the set [0, 2, 4, 8, 10, 11, 13] => the resolution is 1 (difference between 10 and 11).
145 * - for the set [0, 2, 4, 8, 10, 11, 13] => the resolution is 1 (difference between 10 and 11).
145 *
146 *
146 * A resolution can be calculated on the logarithmic scale (base of 10). In this case, the
147 * A resolution can be calculated on the logarithmic scale (base of 10). In this case, the
147 * dataset is first converted to logarithmic values.
148 * dataset is first converted to logarithmic values.
148 * For example:
149 * For example:
149 * - for the set [10, 100, 10000, 1000000], the values are converted to [1, 2, 4, 6] => the
150 * - for the set [10, 100, 10000, 1000000], the values are converted to [1, 2, 4, 6] => the
150 * logarithmic resolution is 1 (difference between 1 and 2).
151 * logarithmic resolution is 1 (difference between 1 and 2).
151 *
152 *
152 * @param begin the iterator pointing to the beginning of the dataset
153 * @param begin the iterator pointing to the beginning of the dataset
153 * @param end the iterator pointing to the end of the dataset
154 * @param end the iterator pointing to the end of the dataset
154 * @param logarithmic computes a logarithmic resolution or not
155 * @param logarithmic computes a logarithmic resolution or not
155 * @return the resolution computed
156 * @return the resolution computed
156 * @warning the method considers the dataset as sorted and doesn't control it.
157 * @warning the method considers the dataset as sorted and doesn't control it.
157 */
158 */
158 template <typename Iterator>
159 template <typename Iterator>
159 static Resolution resolution(Iterator begin, Iterator end, bool logarithmic = false);
160 static Resolution resolution(Iterator begin, Iterator end, bool logarithmic = false);
160
161
161 /**
162 /**
162 * Computes a regular mesh for a data series, according to resolutions for x-axis and y-axis
163 * Computes a regular mesh for a data series, according to resolutions for x-axis and y-axis
163 * passed as parameters.
164 * passed as parameters.
164 *
165 *
165 * The mesh is created from the resolutions in x and y and the boundaries delimiting the data
166 * The mesh is created from the resolutions in x and y and the boundaries delimiting the data
166 * series. If the resolutions do not allow to obtain a regular mesh, they are recalculated.
167 * series. If the resolutions do not allow to obtain a regular mesh, they are recalculated.
167 *
168 *
168 * For example :
169 * For example :
169 * Let x-axis data = [0, 1, 3, 5, 9], its associated values = [0, 10, 30, 50, 90] and
170 * Let x-axis data = [0, 1, 3, 5, 9], its associated values = [0, 10, 30, 50, 90] and
170 * xResolution = 2.
171 * xResolution = 2.
171 * Based on the resolution, the mesh would be [0, 2, 4, 6, 8, 10] and would be invalid because
172 * Based on the resolution, the mesh would be [0, 2, 4, 6, 8, 10] and would be invalid because
172 * it exceeds the maximum bound of the data. The resolution is thus recalculated so that the
173 * it exceeds the maximum bound of the data. The resolution is thus recalculated so that the
173 * mesh fits within the data bounds.
174 * mesh fits within the data bounds.
174 * So => resolution is 1.8 and the mesh is [0, 1.8, 3.6, 5.4, 7.2, 9].
175 * So => resolution is 1.8 and the mesh is [0, 1.8, 3.6, 5.4, 7.2, 9].
175 *
176 *
176 * Once the mesh is generated in x and y, the values are associated with each mesh point,
177 * Once the mesh is generated in x and y, the values are associated with each mesh point,
177 * based on the data in the series, finding the existing data at which the mesh point would be
178 * based on the data in the series, finding the existing data at which the mesh point would be
178 * or would be closest to, without exceeding it.
179 * or would be closest to, without exceeding it.
179 *
180 *
180 * In the example, we determine the value of each mesh point:
181 * In the example, we determine the value of each mesh point:
181 * - x = 0 => value = 0 (existing x in the data series)
182 * - x = 0 => value = 0 (existing x in the data series)
182 * - x = 1.8 => value = 10 (the closest existing x: 1)
183 * - x = 1.8 => value = 10 (the closest existing x: 1)
183 * - x = 3.6 => value = 30 (the closest existing x: 3)
184 * - x = 3.6 => value = 30 (the closest existing x: 3)
184 * - x = 5.4 => value = 50 (the closest existing x: 5)
185 * - x = 5.4 => value = 50 (the closest existing x: 5)
185 * - x = 7.2 => value = 50 (the closest existing x: 5)
186 * - x = 7.2 => value = 50 (the closest existing x: 5)
186 * - x = 9 => value = 90 (existing x in the data series)
187 * - x = 9 => value = 90 (existing x in the data series)
187 *
188 *
188 * Same algorithm is applied for y-axis.
189 * Same algorithm is applied for y-axis.
189 *
190 *
190 * @param begin the iterator pointing to the beginning of the data series
191 * @param begin the iterator pointing to the beginning of the data series
191 * @param end the iterator pointing to the end of the data series
192 * @param end the iterator pointing to the end of the data series
192 * @param xResolution the resolution expected for the mesh's x-axis
193 * @param xResolution the resolution expected for the mesh's x-axis
193 * @param yResolution the resolution expected for the mesh's y-axis
194 * @param yResolution the resolution expected for the mesh's y-axis
194 * @return the mesh created, an empty mesh if the input data do not allow to generate a regular
195 * @return the mesh created, an empty mesh if the input data do not allow to generate a regular
195 * mesh (empty data, null resolutions, logarithmic x-axis)
196 * mesh (empty data, null resolutions, logarithmic x-axis)
196 * @warning the method considers the dataset as sorted and doesn't control it.
197 * @warning the method considers the dataset as sorted and doesn't control it.
197 */
198 */
198 static Mesh regularMesh(DataSeriesIterator begin, DataSeriesIterator end,
199 static Mesh regularMesh(DataSeriesIterator begin, DataSeriesIterator end,
199 Resolution xResolution, Resolution yResolution);
200 Resolution xResolution, Resolution yResolution);
201
202 /**
203 * Calculates the min and max thresholds of a dataset.
204 *
205 * The thresholds of a dataset correspond to the min and max limits of the set from which the
206 * outliers (values distant from the others) are excluded. For example, for the set [1, 2, 3, 4,
207 * 5, 10000], 10000 is an outlier and will be excluded from the thresholds.
208 *
209 * The bounds determining the thresholds are calculated according to the mean and the standard
210 * deviation of the defined data. The thresholds are limited to the min / max values of the
211 * dataset: if for example the calculated min threshold is 2 but the min value of the dataset
212 * is 4, 4 is returned as the min threshold.
213 *
214 * @param begin the beginning of the dataset
215 * @param end the end of the dataset
216 * @param logarithmic computes threshold with a logarithmic scale or not
217 * @return the thresholds computed, a couple of nan values if it couldn't be computed
218 */
219 template <typename Iterator>
220 static std::pair<double, double> thresholds(Iterator begin, Iterator end,
221 bool logarithmic = false);
200 };
222 };
201
223
202 template <typename Iterator>
224 template <typename Iterator>
203 DataSeriesUtils::Resolution DataSeriesUtils::resolution(Iterator begin, Iterator end,
225 DataSeriesUtils::Resolution DataSeriesUtils::resolution(Iterator begin, Iterator end,
204 bool logarithmic)
226 bool logarithmic)
205 {
227 {
206 // Retrieves data into a work dataset
228 // Retrieves data into a work dataset
207 using ValueType = typename Iterator::value_type;
229 using ValueType = typename Iterator::value_type;
208 std::vector<ValueType> values{};
230 std::vector<ValueType> values{};
209 std::copy(begin, end, std::back_inserter(values));
231 std::copy(begin, end, std::back_inserter(values));
210
232
211 // Converts data if logarithmic flag is activated
233 // Converts data if logarithmic flag is activated
212 if (logarithmic) {
234 if (logarithmic) {
213 std::for_each(values.begin(), values.end(),
235 std::for_each(values.begin(), values.end(),
214 [logarithmic](auto &val) { val = std::log10(val); });
236 [logarithmic](auto &val) { val = std::log10(val); });
215 }
237 }
216
238
217 // Computes the differences between the values in the dataset
239 // Computes the differences between the values in the dataset
218 std::adjacent_difference(values.begin(), values.end(), values.begin());
240 std::adjacent_difference(values.begin(), values.end(), values.begin());
219
241
220 // Retrieves the smallest difference
242 // Retrieves the smallest difference
221 auto resolutionIt = std::min_element(values.begin(), values.end());
243 auto resolutionIt = std::min_element(values.begin(), values.end());
222 auto resolution
244 auto resolution
223 = resolutionIt != values.end() ? *resolutionIt : std::numeric_limits<double>::quiet_NaN();
245 = resolutionIt != values.end() ? *resolutionIt : std::numeric_limits<double>::quiet_NaN();
224
246
225 return Resolution{resolution, logarithmic};
247 return Resolution{resolution, logarithmic};
226 }
248 }
227
249
250 template <typename Iterator>
251 std::pair<double, double> DataSeriesUtils::thresholds(Iterator begin, Iterator end,
252 bool logarithmic)
253 {
254 /// Lambda that converts values in case of logaritmic scale
255 auto toLog = [logarithmic](const auto &value) {
256 if (logarithmic) {
257 // Logaritmic scale doesn't include zero value
258 return !(std::isnan(value) || value < std::numeric_limits<double>::epsilon())
259 ? std::log10(value)
260 : std::numeric_limits<double>::quiet_NaN();
261 }
262 else {
263 return value;
264 }
265 };
266
267 /// Lambda that converts values to linear scale
268 auto fromLog
269 = [logarithmic](const auto &value) { return logarithmic ? std::pow(10, value) : value; };
270
271 /// Lambda used to sum data and divide the sum by the number of data. It is used to calculate
272 /// the mean and standard deviation
273 /// @param fun the data addition function
274 auto accumulate = [begin, end](auto fun) {
275 double sum;
276 int nbValues;
277 std::tie(sum, nbValues) = std::accumulate(
278 begin, end, std::make_pair(0., 0), [fun](const auto &input, const auto &value) {
279 auto computedValue = fun(value);
280
281 // NaN values are excluded from the sum
282 return !std::isnan(computedValue)
283 ? std::make_pair(input.first + computedValue, input.second + 1)
284 : input;
285 });
286
287 return nbValues != 0 ? sum / nbValues : std::numeric_limits<double>::quiet_NaN();
288 };
289
290 // Computes mean
291 auto mean = accumulate([toLog](const auto &val) { return toLog(val); });
292 if (std::isnan(mean)) {
293 return {std::numeric_limits<double>::quiet_NaN(), std::numeric_limits<double>::quiet_NaN()};
294 }
295
296 // Computes standard deviation
297 auto variance
298 = accumulate([mean, toLog](const auto &val) { return std::pow(toLog(val) - mean, 2); });
299 auto sigma = std::sqrt(variance);
300
301 // Computes thresholds
302 auto minThreshold = fromLog(mean - 3 * sigma);
303 auto maxThreshold = fromLog(mean + 3 * sigma);
304
305 // Finds min/max values
306 auto minIt = std::min_element(begin, end, [toLog](const auto &it1, const auto &it2) {
307 return SortUtils::minCompareWithNaN(toLog(it1), toLog(it2));
308 });
309 auto maxIt = std::max_element(begin, end, [toLog](const auto &it1, const auto &it2) {
310 return SortUtils::maxCompareWithNaN(toLog(it1), toLog(it2));
311 });
312
313 // Returns thresholds (bounded to min/max values)
314 return {std::max(*minIt, minThreshold), std::min(*maxIt, maxThreshold)};
315 }
316
228 #endif // SCIQLOP_DATASERIESUTILS_H
317 #endif // SCIQLOP_DATASERIESUTILS_H
General Comments 0
You need to be logged in to leave comments. Login now