From 0b428e98a0c5f5d92fea17011690176c51929600 Mon Sep 17 00:00:00 2001 From: Peter Luenenschloss <peter.luenenschloss@ufz.de> Date: Tue, 4 Aug 2020 08:04:10 +0200 Subject: [PATCH] soil moisture flagging module documented (all but random forest) --- saqc/funcs/soil_moisture_tests.py | 307 ++++++++++++++++++++++++------ 1 file changed, 245 insertions(+), 62 deletions(-) diff --git a/saqc/funcs/soil_moisture_tests.py b/saqc/funcs/soil_moisture_tests.py index bff6344c5..cf529debd 100644 --- a/saqc/funcs/soil_moisture_tests.py +++ b/saqc/funcs/soil_moisture_tests.py @@ -30,10 +30,55 @@ def sm_flagSpikes( ): """ - The Function provides just a call to flagSpikes_spektrumBased, with parameter defaults, that refer to: + The Function provides just a call to ``flagSpikes_spektrumBased``, with parameter defaults, + that refer to References [1]. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + raise_factor : float, default 0.15 + Minimum relative value difference between two values to consider the latter as a spike candidate. + See condition (1) (or reference [2]). + deriv_factor : float, default 0.2 + See condition (2) (or reference [2]). + noise_func : {'CoVar', 'rVar'}, default 'CoVar' + Function to calculate noisiness of the data surrounding potential spikes. + ``'CoVar'``: Coefficient of Variation + ``'rVar'``: Relative Variance + noise_window : str, default '12h' + An offset string that determines the range of the time window of the "surrounding" data of a potential spike. + See condition (3) (or reference [2]). + noise_thresh : float, default 1 + Upper threshold for noisiness of data surrounding potential spikes. See condition (3) (or reference [2]). 
+ smooth_window : {None, str}, default None + Size of the smoothing window of the Savitsky-Golay filter. + The default value ``None`` results in a window of two times the sampling rate (i.e. containing three values). + smooth_poly_deg : int, default 2 + Degree of the polynomial used for fitting with the Savitsky-Golay filter. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. + + References + ---------- + This Function is a generalization of the Spectrum based Spike flagging mechanism as presented in: + + [1] Dorigo, W. et al: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadoze Zone J. + doi:10.2136/vzj2012.0097. + + [2] https://git.ufz.de/rdm-software/saqc/-/blob/testfuncDocs/docs/funcs/FormalDescriptions.md#spikes_flagspektrumbased - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. """ return spikes_flagSpektrumBased( @@ -69,10 +114,57 @@ def sm_flagBreaks( ): """ - The Function provides just a call to flagBreaks_spektrumBased, with parameter defaults that refer to: - - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. + The Function provides just a call to flagBreaks_spektrumBased, with parameter defaults that refer to references [1]. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. 
+ thresh_rel : float, default 0.1 + Float in [0,1]. See condition (1) in reference [2] to learn more. + thresh_abs : float, default 0.01 + Float > 0. See condition (2) in reference [2] to learn more. + first_der_factor : float, default 10 + Float > 0. See condition (3) in reference [2] to learn more. + first_der_window_range : str, default '12h' + Offset string. See condition (3) in reference [2] to learn more. + scnd_der_ratio_margin_1 : float, default 0.05 + Float in [0,1]. See condition (4) in reference [2] to learn more. + scnd_der_ratio_margin_2 : float, default 10 + Float in [0,1]. See condition (5) in reference [2] to learn more. + smooth : bool, default True + Method for obtaining dataseries' derivatives. + * False: Just take series step differences (default) + * True: Smooth data with a Savitzky Golay Filter before differentiating. + smooth_window : {None, str}, default 2 + Effective only if `smooth` = True + Offset string. Size of the filter window, used to calculate the derivatives. + smooth_poly_deg : int, default 2 + Effective only, if `smooth` = True + Polynomial order, used for smoothing with the Savitzky-Golay filter. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional information related to `data`. + Flag values may have changed relative to the flagger input. + + References + ---------- + [1] Dorigo, W. et al.: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadose Zone J. + doi:10.2136/vzj2012.0097.
+ + Find a brief mathematical description of the function here: + + [2] https://git.ufz.de/rdm-software/saqc/-/blob/testfuncDocs/docs/funcs + /FormalDescriptions.md#breaks_flagspektrumbased """ return breaks_flagSpektrumBased( @@ -95,28 +187,45 @@ def sm_flagBreaks( @register def sm_flagFrost(data, field, flagger, soil_temp_variable, window="1h", frost_thresh=0, **kwargs): - """This Function is an implementation of the soil temperature based Soil Moisture flagging, as presented in: - - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. + """ + This Function is an implementation of the soil temperature based Soil Moisture flagging, as presented in + references [1]: All parameters default to the values, suggested in this publication. Function flags Soil moisture measurements by evaluating the soil-frost-level in the moment of measurement. Soil temperatures below "frost_level" are regarded as denoting frozen soil state. - :param data: The pandas dataframe holding the data-to-be flagged, as well as the reference - series. Data must be indexed by a datetime series. - :param field: Fieldname of the Soil moisture measurements field in data. - :param flagger: A flagger - object. - like thingies that refer to the data(including datestrings). - :param tolerated_deviation: Offset String. Denoting the maximal temporal deviation, - the soil frost states timestamp is allowed to have, relative to the - data point to-be-flagged. - :param soil_temp_reference: A STRING, denoting the fields name in data, - that holds the data series of soil temperature values, - the to-be-flagged values shall be checked against. - :param frost_level: Value level, the flagger shall check against, when evaluating soil frost level. + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. 
+ field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional information related to `data`. + soil_temp_variable : str + The name of the field in `data` that holds the series of soil temperature values, + the to-be-flagged values shall be checked against. + window : str, default '1h' + An offset string denoting the maximal temporal deviation, the soil frost state's timestamp is allowed to have, + relative to the data point to-be-flagged. + frost_thresh : float, default 0 + Value level, the flagger shall check against, when evaluating soil frost level. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional information related to `data`. + Flag values may have changed relative to the flagger input. + + References + ---------- + [1] Dorigo, W. et al.: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadose Zone J. + doi:10.2136/vzj2012.0097. """ # retrieve reference series @@ -155,10 +264,9 @@ def sm_flagPrecipitation( **kwargs, ): - """This Function is an implementation of the precipitation based Soil Moisture flagging, as presented in: - - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. + """ + This Function is an implementation of the precipitation based Soil Moisture flagging, as presented in + reference [1]. All parameters default to the values, suggested in this publication.
(excluding porosity,sensor accuracy and sensor depth) @@ -172,9 +280,9 @@ def sm_flagPrecipitation( A data point y_t is flagged an invalid soil moisture raise, if: - (1) y_t > y_(t-raise_window) - (2) y_t - y_(t-"std_factor_range") > "std_factor" * std(y_(t-"std_factor_range"),...,y_t) - (3) sum(prec(t-24h),...,prec(t)) > sensor_depth * sensor_accuracy * soil_porosity + (1) y_t > y_(t-`raise_window`) + (2) y_t - y_(t-`std_window`) > `std_factor` * std(y_(t-`std_window`),...,y_t) + (3) sum(prec(t-24h),...,prec(t)) > `sensor_depth` * `sensor_accuracy` * `soil_porosity` NOTE1: np.nan entries in the input precipitation series will be regarded as susipicious and the test will be omited for every 24h interval including a np.nan entrie in the original precipitation sampling rate. @@ -183,27 +291,57 @@ def sm_flagPrecipitation( NOTE2: The function wont test any values that are flagged suspicious anyway - this may change in a future version. - :param data: The pandas dataframe holding the data-to-be flagged, as well as the reference - series. Data must be indexed by a datetime series and be harmonized onto a - time raster with seconds precision. - :param field: Fieldname of the Soil moisture measurements field in data. - :param flagger: A flagger - object. (saqc.flagger.X) - :param prec_variable: Fieldname of the precipitation meassurements column in data. - :param sensor_depth: Measurement depth of the soil moisture sensor, [m]. - :param sensor_accuracy: Accuracy of the soil moisture sensor, [-]. - :param soil_porosity: Porosity of moisture sensors surrounding soil, [-]. - :param std_factor: The value determines by which rule it is decided, weather a raise in soil - moisture is significant enough to trigger the flag test or not: - Significants is assumed, if the raise is greater then "std_factor" multiplied - with the last 24 hours standart deviation. - :param std_factor_range: Offset String.
Denotes the range over witch the standart deviation is obtained, - to test condition [2]. (Should be a multiple of the sampling rate) - :param raise_window: Offset String. Denotes the distance to the datapoint, relatively to witch - it is decided if the current datapoint is a raise or not. Equation [1]. - It defaults to None. When None is passed, raise_window is just the sample - rate of the data. Any raise reference must be a multiple of the (intended) - sample rate and below std_factor_range. - :param ignore_missing: + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional information related to `data`. + prec_variable : str + Fieldname of the precipitation measurements column in data. + raise_window : {None, str}, default None + Denotes the distance to the datapoint, relative to which + it is decided if the current datapoint is a raise or not. See condition (1). + It defaults to None. When None is passed, raise_window is just the sample + rate of the data. Any raise reference must be a multiple of the (intended) + sample rate and below `std_window`. + sensor_depth : float, default 0 + Measurement depth of the soil moisture sensor, [m]. + sensor_accuracy : float, default 0 + Accuracy of the soil moisture sensor, [-]. + soil_porosity : float, default 0 + Porosity of moisture sensors surrounding soil, [-]. + std_factor : int, default 2 + The value determines by which rule it is decided, whether a raise in soil + moisture is significant enough to trigger the flag test or not: + Significance is assumed, if the raise is greater than "std_factor" multiplied + with the last 24 hours' standard deviation. + std_window : str, default '24h' + An offset string that denotes the range over which the standard deviation is obtained, + to test condition (2).
(Should be a multiple of the sampling rate) + raise_window : {None, str}, default None + Denotes the distance to the datapoint, relative to which + it is decided if the current datapoint is a raise or not. See condition (1). + It defaults to None. When None is passed, raise_window is just the sample + rate of the data. Any raise reference must be a multiple of the (intended) + sample rate and below `std_window`. + ignore_missing : bool, default False + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional information related to `data`. + Flag values may have changed relative to the flagger input. + + References + ---------- + [1] Dorigo, W. et al.: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadose Zone J. + doi:10.2136/vzj2012.0097. """ dataseries, moist_rate = retrieveTrustworthyOriginal(data, field, flagger) @@ -245,7 +383,6 @@ def sm_flagPrecipitation( flagger = flagger.setFlags(field, loc=invalid_indices.index, **kwargs) return data, flagger - @register def sm_flagConstants( data, @@ -265,16 +402,62 @@ def sm_flagConstants( ): """ - - Note, function has to be harmonized to equidistant freq_grid - - Note, in current implementation, it has to hold that: (rainfall_window_range >= plateau_window_min) - - :param data: The pandas dataframe holding the data-to-be flagged. - Data must be indexed by a datetime series and be harmonized onto a - time raster with seconds precision (skips allowed). - :param field: Fieldname of the Soil moisture measurements field in data. - :param flagger: A flagger - object. (saqc.flagger.X) + This function flags plateaus/series of constant values in soil moisture data. + + Mentions of "conditions" in the following explanations refer to reference [2]. + + The function represents a stricter version of + constants_flagVarianceBased.
+ + The additional constraints (3)-(5) are designed to match the special cases of constant + values in soil moisture measurements and basically check for preceding precipitation events + (conditions (3) and (4)) and a certain plateau level (condition (5)). + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional information related to `data`. + window : str, default '12h' + Minimum duration during which values need to be identical to become plateau candidates. See condition (1) + thresh : float, default 0.0005 + Maximum variance of a group of values to still consider them constant. See condition (2) + precipitation_window : str, default '12h' + See condition (3) and (4) + tolerance : float, default 0.95 + Tolerance factor, see condition (5) + deriv_max : float, default 0 + See condition (4) + deriv_min : float, default 0.0025 + See condition (3) + max_missing : {None, int}, default None + Maximum number of missing values allowed in window, by default this condition is ignored + max_consec_missing : {None, int}, default None + Maximum number of consecutive missing values allowed in window, by default this condition is ignored + smooth_window : {None, str}, default None + Size of the smoothing window of the Savitzky-Golay filter. The default value None results in a window of two + times the sampling rate (i.e. three values) + smooth_poly_deg : int, default 2 + Degree of the polynomial used for smoothing with the Savitzky-Golay filter + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional information related to `data`. + Flag values may have changed relative to the flagger input. + + References + ---------- + [1] Dorigo, W.
et al.: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadose Zone J. + doi:10.2136/vzj2012.0097. + + [2] https://git.ufz.de/rdm-software/saqc/-/blob/testfuncDocs/docs/funcs/FormalDescriptions.md#sm_flagconstants """ # get plateaus: -- GitLab