From 531dde4941163b19508b0dafe0795d21a015afbf Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Thu, 4 Jul 2019 11:12:23 +0200
Subject: [PATCH] helper function added to lib.tools: retrieveTrustworthyOriginal

---
 saqc/funcs/functions.py | 17 +++++++++++------
 saqc/lib/tools.py       | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py
index cb1b7ea43..a2ae5932d 100644
--- a/saqc/funcs/functions.py
+++ b/saqc/funcs/functions.py
@@ -189,8 +189,9 @@ def flagSoilMoistureByPrecipitationEvents(data, flags, field, flagger, prec_refe
     :param soil_porosity:               Porosity of moisture sensors surrounding soil, [-].
     """
 
-    # retrieve input sampling rate:
+    # retrieve input sampling rate (needed to translate ref and data rates into each other):
     input_rate = estimateSamplingRate(data.index)
+
     # retrieve data series input:
     dataseries = data[field]
     # "nan" suspicious values (neither "unflagged" nor "min-flagged")
@@ -199,26 +200,30 @@ def flagSoilMoistureByPrecipitationEvents(data, flags, field, flagger, prec_refe
                flagger.isFlagged(data_flags, flag=flagger.flags.unflagged())
     # drop suspicious values
     dataseries = dataseries[data_use.values]
-    # additionally drop the nan values that result from any preceeding upsampling of the
+    # additionally, drop the nan values that result from any preceding upsampling of the
     # measurements:
     dataseries = dataseries.dropna()
+    # eventually, after dropping all nans, there is nothing left:
     if dataseries.empty:
         return (data, flags)
-    # estimate moisture sampling frequencie (the original series sampling rate may not match data-input sample rate):
+    # estimate original data sampling frequency (the original series sampling rate may not match data-input sample
+    # rate):
     moist_rate = estimateSamplingRate(dataseries.index)
-    # resample dataseries to its original sampling rate
+    # resample dataseries to its original sampling rate (now certain, to only get nans, indeed denoting "missing" data)
     dataseries = dataseries.resample(moist_rate).asfreq()
 
     # retrieve reference series input
     refseries = data[prec_reference]
     # "nan" suspicious values (neither "unflagged" nor "min-flagged")
-    # NOTE: suspicious values wont be dropped from reference series, because they make suspicious the entire
-    # 24h aggregation intervall, that is computed later on.
     ref_flags = flags[prec_reference]
     ref_use = flagger.isFlagged(ref_flags, flag=flagger.flags.min()) | \
               flagger.isFlagged(ref_flags, flag=flagger.flags.unflagged())
+    # drop suspicious values
     refseries = refseries[ref_use.values]
+    # additionally, drop the nan values that result from any preceding upsampling of the
+    # measurements:
     refseries = refseries.dropna()
+    # eventually after dropping all nans, there is nothing left:
     if refseries.empty:
         return (data,flags)
     prec_rate = estimateSamplingRate(refseries.index)
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 063bfe7b6..bc6828efc 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -103,7 +103,7 @@ def inferFrequency(data):
 def estimateSamplingRate(index):
     """The function estimates the sampling rate of a datetime index.
     The estimation basically evaluates a histogram of bins with seconds-accuracy. This means, that the
-    result may be contra intuitive very likely, if the input series is not rastered (harmonized with skips)
+    result is very likely to be counterintuitive or meaningless if the input series is not rastered (harmonized with skips)
     to an interval divisible by seconds.
 
     :param index: A DatetimeIndex or array like Datetime listing, of wich you want the sampling rate to be
@@ -119,3 +119,32 @@ def estimateSamplingRate(index):
     # return smallest non zero sample difference (this works, because input is expected to be at least
     # harmonized with skips)
     return pd.tseries.frequencies.to_offset(str(int(hist[1][:-1][hist[0] > 0].min())) + 's')
+
+def retrieveTrustworthyOriginal(dataseries, dataflags=None, flagger=None):
+    """Reduce a series to its trustworthy values and resample it to its estimated original sampling rate.
+
+    Columns of data passed to the saqc runner may not be sampled at their original rate, so NaNs denoting missing
+    values cannot be told apart from fill-value NaNs. If both flags and a flagger are passed, values that are neither
+    "unflagged" nor "min-flagged" are dropped first. Remaining NaNs are dropped and the rest is resampled.
+
+    :param dataseries:  The pd.Series that you want to sample to its original rate. It has to have a harmonic timestamp.
+    :param dataflags:   The flags series referring to the passed dataseries (optional).
+    :param flagger:     A flagger object, used to apply the passed flags onto the dataseries (optional).
+    :return:            The filtered series at its estimated original rate; an empty series if nothing is left.
+    """
+    if (dataflags is not None) and (flagger is not None):
+        data_use = flagger.isFlagged(dataflags, flag=flagger.flags.min()) | \
+                   flagger.isFlagged(dataflags, flag=flagger.flags.unflagged())
+        # drop suspicious values (only possible if both flags and flagger were passed)
+        dataseries = dataseries[data_use.values]
+    # additionally, drop the nan values that result from any preceding upsampling of the
+    # measurements:
+    dataseries = dataseries.dropna()
+    # after dropping all nans, there may be nothing left:
+    if dataseries.empty:
+        return dataseries
+    # estimate original data sampling frequency (the original series sampling rate may not match data-input sample
+    # rate):
+    data_rate = estimateSamplingRate(dataseries.index)
+    # resample dataseries to its original sampling rate (now certain, to only get nans, indeed denoting "missing" data)
+    return dataseries.resample(data_rate).asfreq()
-- 
GitLab