diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py index a2ae5932d6b71e9d47d390877706bdf7451d4c5d..12643e26e9e42fda2c8746552ed69b2bea0c5f98 100644 --- a/saqc/funcs/functions.py +++ b/saqc/funcs/functions.py @@ -4,7 +4,8 @@ import numpy as np import pandas as pd -from ..lib.tools import valueRange, slidingWindowIndices, inferFrequency, estimateSamplingRate +from ..lib.tools import valueRange, slidingWindowIndices, inferFrequency, estimateSamplingRate, \ + retrieveTrustworthyOriginal from ..dsl import evalExpression from ..core.config import Params @@ -191,47 +192,18 @@ def flagSoilMoistureByPrecipitationEvents(data, flags, field, flagger, prec_refe # retrieve input sampling rate (needed to translate ref and data rates into each other): input_rate = estimateSamplingRate(data.index) - - # retrieve data series input: - dataseries = data[field] - # "nan" suspicious values (neither "unflagged" nor "min-flagged") - data_flags = flags[field] - data_use = flagger.isFlagged(data_flags, flag=flagger.flags.min()) | \ - flagger.isFlagged(data_flags, flag=flagger.flags.unflagged()) - # drop suspicious values - dataseries = dataseries[data_use.values] - # additionally, drop the nan values that result from any preceeding upsampling of the - # measurements: - dataseries = dataseries.dropna() - # eventually, after dropping all nans, there is nothing left: - if dataseries.empty: - return (data, flags) - # estimate original data sampling frequencie (the original series sampling rate may not match data-input sample - # rate): - moist_rate = estimateSamplingRate(dataseries.index) - # resample dataseries to its original sampling rate (now certain, to only get nans, indeed denoting "missing" data) - dataseries = dataseries.resample(moist_rate).asfreq() - - # retrieve reference series input - refseries = data[prec_reference] - # "nan" suspicious values (neither "unflagged" nor "min-flagged") - ref_flags = flags[prec_reference] - ref_use = flagger.isFlagged(ref_flags, flag=flagger.flags.min()) | \ - flagger.isFlagged(ref_flags, flag=flagger.flags.unflagged()) - # drop suspicious values - refseries = refseries[ref_use.values] - # additionally, drop the nan values that result from any preceeding upsampling of the - # measurements: - refseries = refseries.dropna() - # eventually after dropping all nans, there is nothing left: - if refseries.empty: - return (data,flags) - prec_rate = estimateSamplingRate(refseries.index) - refseries.resample(prec_rate).asfreq() + dataseries, moist_rate = retrieveTrustworthyOriginal(data[field], flags[field], flagger) + refseries, ref_rate = retrieveTrustworthyOriginal(data[prec_reference], flags[field], flagger) + # abort processing if any of the measurement series has no valid entries! + if moist_rate is np.nan: + return data, flags + if ref_rate is np.nan: + return data, flags # get 24 h prec. monitor (this makes last-24h-rainfall-evaluation independent from preceeding entries) prec_count = refseries.rolling(window='1D').apply(lambda x: x.sum(skipna=False), raw=False) - # upsample with zeros to input data sampling rate: + # upsample with zeros to input data sampling rate (we want to project the daysums onto the dataseries grid to + # prepare for use of rolling:): prec_count = prec_count.resample(input_rate).pad() # now we can: project precipitation onto dataseries sampling (and stack result to be able to apply df.rolling()) @@ -255,14 +227,13 @@ def flagSoilMoistureByPrecipitationEvents(data, flags, field, flagger, prec_refe else: return True - # get valid moisture raises: # rolling.apply should only get active every second entrie of the stacked frame, # so periods per window have to be calculated, # (this gives sufficiant conditian since window size controlls daterange:) periods = 2*int(24*60*60/moist_rate.n) invalid_raises = ~ef.rolling(window='1D', closed='both', min_periods=periods)\ .apply(prec_test, raw=False).astype(bool) - # undo stacking heritage (only every second entrie actually is holding an information: + # undo stacking (only every second entrie actually is holding an information: invalid_raises = invalid_raises[1::2] # retrieve indices referring to values-to-be-flagged-bad invalid_indices = invalid_raises.index[invalid_raises] diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index bc6828efcf37dc6f6d8a9ea3f3940e6c1fa16cf8..f908ba4342a098b2c0e71fa967fc9de7ad7ae952 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -133,8 +133,8 @@ def retrieveTrustworthyOriginal(dataseries, dataflags=None, flagger=None): """ if (dataflags is not None) and (flagger is not None): - data_use = flagger.isFlagged(data_flags, flag=flagger.flags.min()) | \ - flagger.isFlagged(data_flags, flag=flagger.flags.unflagged()) + data_use = flagger.isFlagged(dataflags, flag=flagger.flags.min()) | \ + flagger.isFlagged(dataflags, flag=flagger.flags.unflagged()) # drop suspicious values dataseries = dataseries[data_use.values] # additionally, drop the nan values that result from any preceeding upsampling of the @@ -142,9 +142,9 @@ def retrieveTrustworthyOriginal(dataseries, dataflags=None, flagger=None): dataseries = dataseries.dropna() # eventually, after dropping all nans, there is nothing left: if dataseries.empty: - return dataseries + return dataseries, np.nan # estimate original data sampling frequencie (the original series sampling rate may not match data-input sample # rate): - moist_rate = estimateSamplingRate(dataseries.index) + data_rate = estimateSamplingRate(dataseries.index) # resample dataseries to its original sampling rate (now certain, to only get nans, indeed denoting "missing" data) - return dataseries.resample(moist_rate).asfreq() + return dataseries.resample(data_rate).asfreq(), data_rate