@@ -1,228 +1,317 | |||
|
1 | 1 | #ifndef SCIQLOP_DATASERIESUTILS_H |
|
2 | 2 | #define SCIQLOP_DATASERIESUTILS_H |
|
3 | 3 | |
|
#include "CoreGlobal.h"

#include <Common/SortUtils.h>
#include <Data/DataSeriesIterator.h>

#include <QLoggingCategory>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <limits>
#include <numeric>
#include <tuple>
#include <utility>
#include <vector>
|
10 | 11 | |
|
11 | 12 | Q_DECLARE_LOGGING_CATEGORY(LOG_DataSeriesUtils) |
|
12 | 13 | |
|
13 | 14 | /** |
|
14 | 15 | * Utility class with methods for data series |
|
15 | 16 | */ |
|
16 | 17 | struct SCIQLOP_CORE_EXPORT DataSeriesUtils { |
|
/**
 * Defines a mesh.
 *
 * A mesh is a regular grid representing cells of the same width (in x) and of the same height
 * (in y). A value is associated with each mesh point.
 *
 * Each axis of the mesh is defined by a minimum value, a number of values and a mesh step.
 * For example: if min = 1, nbValues = 5 and step = 2 => the axis of the mesh will be [1, 3, 5,
 * 7, 9].
 *
 * The values are defined in an array of size {nbX * nbY}. The data is stored along the X axis.
 *
 * For example, the mesh:
 * Y = 2 [ 7 ; 8 ; 9
 * Y = 1   4 ; 5 ; 6
 * Y = 0   1 ; 2 ; 3 ]
 *     X = 0  X = 1  X = 2
 *
 * will be represented by data [1, 2, 3, 4, 5, 6, 7, 8, 9]
 */
struct Mesh {
    /// Creates an empty mesh (no cells, all axis parameters set to zero)
    explicit Mesh() = default;

    /**
     * Creates a mesh whose values are all initialized to 0.
     * @param nbX the number of mesh points on the x-axis
     * @param xMin the first value of the x-axis
     * @param xStep the distance between two consecutive points of the x-axis
     * @param nbY the number of mesh points on the y-axis
     * @param yMin the first value of the y-axis
     * @param yStep the distance between two consecutive points of the y-axis
     * @remarks if nbX or nbY is zero or negative, no data is allocated and the mesh is empty
     */
    explicit Mesh(int nbX, double xMin, double xStep, int nbY, double yMin, double yStep)
            : m_NbX{nbX},
              m_XMin{xMin},
              m_XStep{xStep},
              m_NbY{nbY},
              m_YMin{yMin},
              m_YStep{yStep},
              // The cell count is computed in std::size_t: multiplying the two ints directly
              // could overflow (undefined behaviour), and a negative product would be converted
              // to a huge unsigned size
              m_Data((nbX > 0 && nbY > 0)
                         ? static_cast<std::size_t>(nbX) * static_cast<std::size_t>(nbY)
                         : std::size_t{0})
    {
    }

    /// @return true if the mesh holds no value
    inline bool isEmpty() const { return m_Data.empty(); }
    /// @return the last value of the x-axis, i.e. xMin + (nbX - 1) * xStep
    inline double xMax() const { return m_XMin + (m_NbX - 1) * m_XStep; }
    /// @return the last value of the y-axis, i.e. yMin + (nbY - 1) * yStep
    inline double yMax() const { return m_YMin + (m_NbY - 1) * m_YStep; }

    int m_NbX{0};
    double m_XMin{};
    double m_XStep{};
    int m_NbY{0};
    double m_YMin{};
    double m_YStep{};
    /// Values of the mesh points, stored along the X axis (nbX * nbY entries)
    std::vector<double> m_Data{};
};
|
62 | 63 | |
|
/**
 * Resolution used to generate the data of a mesh along one of its axes (x or y).
 *
 * It is made of a value and of a flag telling whether that value is expressed on the
 * logarithmic scale.
 * @sa Mesh
 */
struct Resolution {
    /// Value of the resolution; NaN by default
    double m_Val = std::numeric_limits<double>::quiet_NaN();
    /// Whether the value is expressed on the logarithmic scale; false by default
    bool m_Logarithmic = false;
};
|
73 | 74 | |
|
74 | 75 | /** |
|
75 | 76 | * Processes data from a data series to complete the data holes with a fill value. |
|
76 | 77 | * |
|
77 | 78 | * A data hole is determined by the resolution passed in parameter: if, between two continuous |
|
78 | 79 | * data on the x-axis, the difference between these data is greater than the resolution, then |
|
79 | 80 | * there is one or more holes between them. The holes are filled by adding: |
|
80 | 81 | * - for the x-axis, new data corresponding to the 'step resolution' starting from the first |
|
81 | 82 | * data; |
|
82 | 83 | * - for values, a default value (fill value) for each new data added on the x-axis. |
|
83 | 84 | * |
|
84 | 85 | * For example, with : |
|
85 | 86 | * - xAxisData = [0, 1, 5, 7, 14 ] |
|
86 | 87 | * - valuesData = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] (two components per x-axis data) |
|
87 | 88 | * - fillValue = NaN |
|
88 | 89 | * - and resolution = 2; |
|
89 | 90 | * |
|
90 | 91 | * For the x axis, we calculate as data holes: [3, 9, 11, 13]. These holes are added to the |
|
91 | 92 | * x-axis data, and NaNs (two per x-axis data) are added to the values: |
|
92 | 93 | * => xAxisData = [0, 1, 3, 5, 7, 9, 11, 13, 14 ] |
|
93 | 94 | * => valuesData = [0, 1, 2, 3, NaN, NaN, 4, 5, 6, 7, NaN, NaN, NaN, NaN, NaN, NaN, 8, 9] |
|
94 | 95 | * |
|
95 | 96 | * It is also possible to set bounds for the data series. If these bounds are defined and exceed |
|
96 | 97 | * the limits of the data series, data holes are added to the series at the beginning and/or the |
|
97 | 98 | * end. |
|
98 | 99 | * |
|
99 | 100 | * The generation of data holes at the beginning/end of the data series is performed starting |
|
100 | 101 | * from the x-axis series limit and adding data holes at each 'resolution step' as long as the |
|
101 | 102 | * new bound is not reached. |
|
102 | 103 | * |
|
103 | 104 | * For example, with : |
|
104 | 105 | * - xAxisData = [3, 4, 5, 6, 7 ] |
|
105 | 106 | * - valuesData = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] |
|
106 | 107 | * - fillValue = NaN |
|
107 | 108 | * - minBound = 0 |
|
108 | 109 | * - maxBound = 12 |
|
109 | 110 | * - and resolution = 2; |
|
110 | 111 | * |
|
111 | 112 | * => Starting from 3 and decreasing 2 by 2 until reaching 0 : a data hole at value 1 will be |
|
112 | 113 | * added to the beginning of the series |
|
113 | 114 | * => Starting from 7 and increasing 2 by 2 until reaching 12 : data holes at values 9 and 11 |
|
114 | 115 | * will be added to the end of the series |
|
115 | 116 | * |
|
116 | 117 | * So : |
|
117 | 118 | * => xAxisData = [1, 3, 4, 5, 6, 7, 9, 11 ] |
|
118 | 119 | * => valuesData = [NaN, NaN, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, NaN, NaN, NaN, NaN] |
|
119 | 120 | * |
|
120 | 121 | * @param xAxisData the x-axis data of the data series |
|
121 | 122 | * @param valuesData the values data of the data series |
|
     * @param resolution the resolution (on x-axis) used to determine data holes
|
123 | 124 | * @param fillValue the fill value used for data holes in the values data |
|
124 | 125 | * @param minBound the limit at which to start filling data holes for the series. If set to NaN, |
|
125 | 126 | * the limit is not used |
|
126 | 127 | * @param maxBound the limit at which to end filling data holes for the series. If set to NaN, |
|
127 | 128 | * the limit is not used |
|
128 | 129 | * |
|
129 | 130 | * @remarks There is no control over the consistency between x-axis data and values data. The |
|
130 | 131 | * method considers that the data is well formed (the total number of values data is a multiple |
|
131 | 132 | * of the number of x-axis data) |
|
132 | 133 | */ |
|
133 | 134 | static void fillDataHoles(std::vector<double> &xAxisData, std::vector<double> &valuesData, |
|
134 | 135 | double resolution, |
|
135 | 136 | double fillValue = std::numeric_limits<double>::quiet_NaN(), |
|
136 | 137 | double minBound = std::numeric_limits<double>::quiet_NaN(), |
|
137 | 138 | double maxBound = std::numeric_limits<double>::quiet_NaN()); |
|
138 | 139 | /** |
|
139 | 140 | * Computes the resolution of a dataset passed as a parameter. |
|
140 | 141 | * |
|
141 | 142 | * The resolution of a dataset is the minimum difference between two values that follow in the |
|
142 | 143 | * set. |
|
143 | 144 | * For example: |
|
144 | 145 | * - for the set [0, 2, 4, 8, 10, 11, 13] => the resolution is 1 (difference between 10 and 11). |
|
145 | 146 | * |
|
146 | 147 | * A resolution can be calculated on the logarithmic scale (base of 10). In this case, the |
|
147 | 148 | * dataset is first converted to logarithmic values. |
|
148 | 149 | * For example: |
|
149 | 150 | * - for the set [10, 100, 10000, 1000000], the values are converted to [1, 2, 4, 6] => the |
|
150 | 151 | * logarithmic resolution is 1 (difference between 1 and 2). |
|
151 | 152 | * |
|
152 | 153 | * @param begin the iterator pointing to the beginning of the dataset |
|
153 | 154 | * @param end the iterator pointing to the end of the dataset |
|
154 | 155 | * @param logarithmic computes a logarithmic resolution or not |
|
155 | 156 | * @return the resolution computed |
|
156 | 157 | * @warning the method considers the dataset as sorted and doesn't control it. |
|
157 | 158 | */ |
|
158 | 159 | template <typename Iterator> |
|
159 | 160 | static Resolution resolution(Iterator begin, Iterator end, bool logarithmic = false); |
|
160 | 161 | |
|
161 | 162 | /** |
|
162 | 163 | * Computes a regular mesh for a data series, according to resolutions for x-axis and y-axis |
|
163 | 164 | * passed as parameters. |
|
164 | 165 | * |
|
165 | 166 | * The mesh is created from the resolutions in x and y and the boundaries delimiting the data |
|
166 | 167 | * series. If the resolutions do not allow to obtain a regular mesh, they are recalculated. |
|
167 | 168 | * |
|
168 | 169 | * For example : |
|
     * Let x-axis data = [0, 1, 3, 5, 9], its associated values = [0, 10, 30, 50, 90] and
|
170 | 171 | * xResolution = 2. |
|
171 | 172 | * Based on the resolution, the mesh would be [0, 2, 4, 6, 8, 10] and would be invalid because |
|
172 | 173 | * it exceeds the maximum bound of the data. The resolution is thus recalculated so that the |
|
     * mesh fits within the bounds of the data.
|
174 | 175 | * So => resolution is 1.8 and the mesh is [0, 1.8, 3.6, 5.4, 7.2, 9]. |
|
175 | 176 | * |
|
     * Once the mesh is generated in x and y, the values are associated with each mesh point,
|
177 | 178 | * based on the data in the series, finding the existing data at which the mesh point would be |
|
178 | 179 | * or would be closest to, without exceeding it. |
|
179 | 180 | * |
|
180 | 181 | * In the example, we determine the value of each mesh point: |
|
181 | 182 | * - x = 0 => value = 0 (existing x in the data series) |
|
182 | 183 | * - x = 1.8 => value = 10 (the closest existing x: 1) |
|
183 | 184 | * - x = 3.6 => value = 30 (the closest existing x: 3) |
|
184 | 185 | * - x = 5.4 => value = 50 (the closest existing x: 5) |
|
185 | 186 | * - x = 7.2 => value = 50 (the closest existing x: 5) |
|
186 | 187 | * - x = 9 => value = 90 (existing x in the data series) |
|
187 | 188 | * |
|
188 | 189 | * Same algorithm is applied for y-axis. |
|
189 | 190 | * |
|
190 | 191 | * @param begin the iterator pointing to the beginning of the data series |
|
191 | 192 | * @param end the iterator pointing to the end of the data series |
|
192 | 193 | * @param xResolution the resolution expected for the mesh's x-axis |
|
193 | 194 | * @param yResolution the resolution expected for the mesh's y-axis |
|
194 | 195 | * @return the mesh created, an empty mesh if the input data do not allow to generate a regular |
|
195 | 196 | * mesh (empty data, null resolutions, logarithmic x-axis) |
|
196 | 197 | * @warning the method considers the dataset as sorted and doesn't control it. |
|
197 | 198 | */ |
|
198 | 199 | static Mesh regularMesh(DataSeriesIterator begin, DataSeriesIterator end, |
|
199 | 200 | Resolution xResolution, Resolution yResolution); |
|
201 | ||
|
202 | /** | |
|
203 | * Calculates the min and max thresholds of a dataset. | |
|
204 | * | |
|
     * The thresholds of a dataset correspond to the min and max limits of the set from which the
     * outliers (values distant from the others) are excluded. For example, for the set [1, 2, 3,
     * 4, 5, 10000], 10000 is an outlier and will be excluded from the thresholds.
|
208 | * | |
|
     * The bounds determining the thresholds are calculated according to the mean and the standard
     * deviation of the defined data. The thresholds are limited to the min / max values of the
     * dataset: if for example the calculated min threshold is 2 but the min value of the dataset
     * is 4, 4 is returned as the min threshold.
|
213 | * | |
|
214 | * @param begin the beginning of the dataset | |
|
215 | * @param end the end of the dataset | |
|
216 | * @param logarithmic computes threshold with a logarithmic scale or not | |
|
217 | * @return the thresholds computed, a couple of nan values if it couldn't be computed | |
|
218 | */ | |
|
219 | template <typename Iterator> | |
|
220 | static std::pair<double, double> thresholds(Iterator begin, Iterator end, | |
|
221 | bool logarithmic = false); | |
|
200 | 222 | }; |
|
201 | 223 | |
|
202 | 224 | template <typename Iterator> |
|
203 | 225 | DataSeriesUtils::Resolution DataSeriesUtils::resolution(Iterator begin, Iterator end, |
|
204 | 226 | bool logarithmic) |
|
205 | 227 | { |
|
206 | 228 | // Retrieves data into a work dataset |
|
207 | 229 | using ValueType = typename Iterator::value_type; |
|
208 | 230 | std::vector<ValueType> values{}; |
|
209 | 231 | std::copy(begin, end, std::back_inserter(values)); |
|
210 | 232 | |
|
211 | 233 | // Converts data if logarithmic flag is activated |
|
212 | 234 | if (logarithmic) { |
|
213 | 235 | std::for_each(values.begin(), values.end(), |
|
214 | 236 | [logarithmic](auto &val) { val = std::log10(val); }); |
|
215 | 237 | } |
|
216 | 238 | |
|
217 | 239 | // Computes the differences between the values in the dataset |
|
218 | 240 | std::adjacent_difference(values.begin(), values.end(), values.begin()); |
|
219 | 241 | |
|
220 | 242 | // Retrieves the smallest difference |
|
221 | 243 | auto resolutionIt = std::min_element(values.begin(), values.end()); |
|
222 | 244 | auto resolution |
|
223 | 245 | = resolutionIt != values.end() ? *resolutionIt : std::numeric_limits<double>::quiet_NaN(); |
|
224 | 246 | |
|
225 | 247 | return Resolution{resolution, logarithmic}; |
|
226 | 248 | } |
|
227 | 249 | |
|
250 | template <typename Iterator> | |
|
251 | std::pair<double, double> DataSeriesUtils::thresholds(Iterator begin, Iterator end, | |
|
252 | bool logarithmic) | |
|
253 | { | |
|
254 | /// Lambda that converts values in case of logaritmic scale | |
|
255 | auto toLog = [logarithmic](const auto &value) { | |
|
256 | if (logarithmic) { | |
|
257 | // Logaritmic scale doesn't include zero value | |
|
258 | return !(std::isnan(value) || value < std::numeric_limits<double>::epsilon()) | |
|
259 | ? std::log10(value) | |
|
260 | : std::numeric_limits<double>::quiet_NaN(); | |
|
261 | } | |
|
262 | else { | |
|
263 | return value; | |
|
264 | } | |
|
265 | }; | |
|
266 | ||
|
267 | /// Lambda that converts values to linear scale | |
|
268 | auto fromLog | |
|
269 | = [logarithmic](const auto &value) { return logarithmic ? std::pow(10, value) : value; }; | |
|
270 | ||
|
271 | /// Lambda used to sum data and divide the sum by the number of data. It is used to calculate | |
|
272 | /// the mean and standard deviation | |
|
273 | /// @param fun the data addition function | |
|
274 | auto accumulate = [begin, end](auto fun) { | |
|
275 | double sum; | |
|
276 | int nbValues; | |
|
277 | std::tie(sum, nbValues) = std::accumulate( | |
|
278 | begin, end, std::make_pair(0., 0), [fun](const auto &input, const auto &value) { | |
|
279 | auto computedValue = fun(value); | |
|
280 | ||
|
281 | // NaN values are excluded from the sum | |
|
282 | return !std::isnan(computedValue) | |
|
283 | ? std::make_pair(input.first + computedValue, input.second + 1) | |
|
284 | : input; | |
|
285 | }); | |
|
286 | ||
|
287 | return nbValues != 0 ? sum / nbValues : std::numeric_limits<double>::quiet_NaN(); | |
|
288 | }; | |
|
289 | ||
|
290 | // Computes mean | |
|
291 | auto mean = accumulate([toLog](const auto &val) { return toLog(val); }); | |
|
292 | if (std::isnan(mean)) { | |
|
293 | return {std::numeric_limits<double>::quiet_NaN(), std::numeric_limits<double>::quiet_NaN()}; | |
|
294 | } | |
|
295 | ||
|
296 | // Computes standard deviation | |
|
297 | auto variance | |
|
298 | = accumulate([mean, toLog](const auto &val) { return std::pow(toLog(val) - mean, 2); }); | |
|
299 | auto sigma = std::sqrt(variance); | |
|
300 | ||
|
301 | // Computes thresholds | |
|
302 | auto minThreshold = fromLog(mean - 3 * sigma); | |
|
303 | auto maxThreshold = fromLog(mean + 3 * sigma); | |
|
304 | ||
|
305 | // Finds min/max values | |
|
306 | auto minIt = std::min_element(begin, end, [toLog](const auto &it1, const auto &it2) { | |
|
307 | return SortUtils::minCompareWithNaN(toLog(it1), toLog(it2)); | |
|
308 | }); | |
|
309 | auto maxIt = std::max_element(begin, end, [toLog](const auto &it1, const auto &it2) { | |
|
310 | return SortUtils::maxCompareWithNaN(toLog(it1), toLog(it2)); | |
|
311 | }); | |
|
312 | ||
|
313 | // Returns thresholds (bounded to min/max values) | |
|
314 | return {std::max(*minIt, minThreshold), std::min(*maxIt, maxThreshold)}; | |
|
315 | } | |
|
316 | ||
|
228 | 317 | #endif // SCIQLOP_DATASERIESUTILS_H |
General Comments 0
You need to be logged in to leave comments.
Login now