diff --git a/saqc/funcs/constants_detection.py b/saqc/funcs/constants_detection.py index 639aa1801a28435586742f750ed169bcb6f9db5f..7bcaf1698fb1c7465432ff463838d09f2a93c3b5 100644 --- a/saqc/funcs/constants_detection.py +++ b/saqc/funcs/constants_detection.py @@ -9,8 +9,7 @@ from saqc.lib.statistic_functions import varQC from saqc.lib.tools import ( valueRange, slidingWindowIndices, - retrieveTrustworthyOriginal, - offset2periods, + retrieveTrustworthyOriginal ) # todo: flagConstant does not flag the constant plateau data from test_constants properly diff --git a/saqc/funcs/harm_functions.py b/saqc/funcs/harm_functions.py index 43b1d6b793bcbf4a0c7a2f3ef0bdd0b94e5a529c..704d07b3997ea0ef5eae941a1d08852562046062 100644 --- a/saqc/funcs/harm_functions.py +++ b/saqc/funcs/harm_functions.py @@ -62,7 +62,8 @@ def harmWrapper(heap={}): ) # and dropped for harmonization: if drop_flags is not None: - drop_flags.append(flagger.BAD) + if flagger.BAD not in drop_flags: + drop_flags.append(flagger.BAD) # before sending the current flags and data frame to the future (for backtracking reasons), we clear it # from merge-nans that just resulted from harmonization of other variables! diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index c2ca0764423eaa3746df31a5bf8c0450bc07df76..a3950cc91c6137145c5c4d8e28ea283c3069942d 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -90,62 +90,57 @@ def inferFrequency(data): return pd.tseries.frequencies.to_offset(pd.infer_freq(data.index)) -def estimateSamplingRate(index): - """The function estimates the sampling rate of a datetime index. - The estimation basically evaluates a histogram of bins with seconds-accuracy. This means, that the - result may be contra intuitive or trashy very likely, if the input series is not rastered (harmonized with skips) - to an interval divisible by seconds. - - :param index: A DatetimeIndex or array like Datetime listing, of wich you want the sampling rate to be - estimated. 
- """ - if index.empty: - return pd.tseries.frequencies.to_offset("0s") - scnds_series = (pd.Series(index).diff().dt.total_seconds()).dropna() - max_scnds = scnds_series.max() - min_scnds = scnds_series.min() - hist = np.histogram( - scnds_series, - range=(min_scnds, max_scnds + 1), - bins=int(max_scnds - min_scnds + 1), - ) - # return smallest non zero sample difference (this works, because input is expected to be at least - # harmonized with skips) - return pd.tseries.frequencies.to_offset( - str(int(hist[1][:-1][hist[0] > 0].min())) + "s" - ) - - -def retrieveTrustworthyOriginal(data, field, flagger=None): +def retrieveTrustworthyOriginal(data, field, flagger=None, level=None): """Columns of data passed to the saqc runner may not be sampled to its original sampling rate - thus - differenciating between missng value - nans und fillvalue nans is impossible. This function evaluates flags for a - passed series, if flags and flagger object are passed and downsamples the input series to its original sampling - rate and sparsity. - - :param dataseries: The pd.dataseries object that you want to sample to original rate. It has to have a harmonic - timestamp. - :param dataflags: A flagger object, to apply the passed flags onto the dataseries. + differenciating between missng value - nans und fillvalue nans is impossible. + + This function: + (1) if flagger is None: + (a) estimates the sampling rate of the input dataseries by dropping all nans and then returns the series at the + estimated samplng rate. + + (2) if "flagger" is not None but "level" is None: + (a) all values are dropped, that are flagged worse then flagger.GOOD. (so unflagged values wont be dropped) + (b) estimates the sampling rate of the input dataseries by dropping all nans and then returns the series at the + estimated samplng rate. + (3) if "flagger" is not None and "level" is not None: + (a) all values are dropped, that are flagged worse then level. 
(so unflagged values won't be dropped) + (b) estimates the sampling rate of the input dataseries by dropping all nans and then returns the series at the + estimated sampling rate. + + Note that the passed dataseries should be harmonized to an equidistant + frequency grid (maybe including blow up entries). + + :param data: DataFrame. The Data frame holding the data containing 'field'. + :param field: String. Fieldname of the column in data, that you want to sample to original sampling rate. + It has to have a harmonic timestamp. + :param flagger: None or a flagger object. + :param level: Lower bound of flags that are accepted for data. Must be a flag the flagger can handle. """ dataseries = data[field] + if flagger is not None: - data_use = flagger.isFlagged(field, flag=flagger.GOOD, comparator="<=") + if level is not None: + data_use = flagger.isFlagged(field, flag=level, comparator="<=") + else: + data_use = flagger.isFlagged(field, flag=flagger.GOOD, comparator="<=") # drop all flags that are suspicious or worse dataseries = dataseries[data_use] # drop the nan values that may result from any preceeding upsampling of the measurements: dataseries = dataseries.dropna() + if dataseries.empty: return dataseries, np.nan # estimate original data sampling frequencie # (the original series sampling rate may not match data-input sample rate): - data_rate = estimateSamplingRate(dataseries.index) + seconds_rate = (dataseries.index - dataseries.index.shift(-1)).to_series().min().seconds + data_rate = pd.tseries.frequencies.to_offset(str(seconds_rate) + 's') - # resample dataseries to its original sampling rate - # (now certain, to only get nans, indeed denoting "missing" data) - return dataseries.resample(data_rate).asfreq(), data_rate + return dataseries.asfreq(data_rate), data_rate def offset2seconds(offset):