diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py index cb1b7ea430646c0642e95ea5f43d1e85949880ff..a2ae5932d6b71e9d47d390877706bdf7451d4c5d 100644 --- a/saqc/funcs/functions.py +++ b/saqc/funcs/functions.py @@ -189,8 +189,9 @@ def flagSoilMoistureByPrecipitationEvents(data, flags, field, flagger, prec_refe :param soil_porosity: Porosity of moisture sensors surrounding soil, [-]. """ - # retrieve input sampling rate: + # retrieve input sampling rate (needed to translate ref and data rates into each other): input_rate = estimateSamplingRate(data.index) + # retrieve data series input: dataseries = data[field] # "nan" suspicious values (neither "unflagged" nor "min-flagged") @@ -199,26 +200,30 @@ def flagSoilMoistureByPrecipitationEvents(data, flags, field, flagger, prec_refe flagger.isFlagged(data_flags, flag=flagger.flags.unflagged()) # drop suspicious values dataseries = dataseries[data_use.values] - # additionally drop the nan values that result from any preceeding upsampling of the + # additionally, drop the nan values that result from any preceeding upsampling of the # measurements: dataseries = dataseries.dropna() + # eventually, after dropping all nans, there is nothing left: if dataseries.empty: return (data, flags) - # estimate moisture sampling frequencie (the original series sampling rate may not match data-input sample rate): + # estimate original data sampling frequencie (the original series sampling rate may not match data-input sample + # rate): moist_rate = estimateSamplingRate(dataseries.index) - # resample dataseries to its original sampling rate + # resample dataseries to its original sampling rate (now certain, to only get nans, indeed denoting "missing" data) dataseries = dataseries.resample(moist_rate).asfreq() # retrieve reference series input refseries = data[prec_reference] # "nan" suspicious values (neither "unflagged" nor "min-flagged") - # NOTE: suspicious values wont be dropped from reference series, because they make 
def retrieveTrustworthyOriginal(dataseries, dataflags=None, flagger=None):
    """Reduce a series to its trustworthy values at its original sampling rate.

    Columns of data passed to the saqc runner may not be sampled at their original
    sampling rate - thus differentiating between "missing value" nans and "fill value"
    nans is impossible. If both flags and a flagger object are passed, this function
    first drops every value that is neither "unflagged" nor "min-flagged". It then drops
    all nans and resamples the remainder to the sampling rate estimated from the
    remaining timestamps, so that any nan left in the result denotes a gap at that rate.

    :param dataseries:  The pd.Series object that you want to sample to original rate. It has to have a
                        harmonic timestamp.
    :param dataflags:   The flags series, referring to the passed dataseries. Only evaluated if
                        ``flagger`` is passed as well.
    :param flagger:     A flagger object, to apply the passed flags onto the dataseries. Only evaluated
                        if ``dataflags`` is passed as well.
    :return:            The filtered series, resampled to its estimated original sampling rate. If
                        nothing is left after filtering, the empty series is returned as is.
    """
    if (dataflags is not None) and (flagger is not None):
        # keep only values that are either "min-flagged" or "unflagged"
        data_use = (
            flagger.isFlagged(dataflags, flag=flagger.flags.min())
            | flagger.isFlagged(dataflags, flag=flagger.flags.unflagged())
        )
        # drop suspicious values
        dataseries = dataseries[data_use.values]
    # additionally, drop the nan values that result from any preceding upsampling of the
    # measurements:
    dataseries = dataseries.dropna()
    # eventually, after dropping all nans, there is nothing left (and no rate to estimate):
    if dataseries.empty:
        return dataseries
    # estimate original data sampling frequency (the original series sampling rate may not match
    # the data-input sample rate):
    orig_rate = estimateSamplingRate(dataseries.index)
    # resample dataseries to its original sampling rate (now the only nans left are the ones
    # introduced here, i.e. timestamps without a value at the estimated rate)
    return dataseries.resample(orig_rate).asfreq()