From 5d665fadc3549bd04746fa8ad926027cb64fdcad Mon Sep 17 00:00:00 2001
From: Bert Palm <bert.palm@ufz.de>
Date: Thu, 19 Aug 2021 13:35:54 +0200
Subject: [PATCH] added a calibration plot to flag_pattern, because it's hard to
 find a suitable threshold without any feedback.

---
 saqc/funcs/pattern.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/saqc/funcs/pattern.py b/saqc/funcs/pattern.py
index 89ac4b354..070974288 100644
--- a/saqc/funcs/pattern.py
+++ b/saqc/funcs/pattern.py
@@ -182,7 +182,15 @@ def calculateDistanceByDTW(
 
 @flagging(masking="field", module="pattern")
 def flagPatternByDTW(
-    data, field, flags, ref_field, max_distance=0.0, normalize=True, flag=BAD, **kwargs
+    data,
+    field,
+    flags,
+    ref_field,
+    max_distance=0.0,
+    normalize=True,
+    plot=False,
+    flag=BAD,
+    **kwargs
 ):
     """Pattern Recognition via Dynamic Time Warping.
 
@@ -219,6 +227,15 @@ def flagPatternByDTW(
         processing. The distances then refer to the mean distance per datapoint,
         expressed in the datas units.
 
+    plot: bool, default False
+        Show a calibration plot, which can be quite helpful to find the right threshold
+        for `max_distance`. It works best with `normalize=True`. Do not use in automatic
+        setups / pipelines. The plot shows three lines:
+            - data: the data the function was called on
+            - distances: the distances calculated by the algorithm
+            - indicator: has two distinct levels: `0` and the value of `max_distance`.
+              If `max_distance` is `0.0` it defaults to `1`. Wherever the
+              indicator is not `0`, the data will be flagged.
 
     Returns
     -------
@@ -260,5 +277,12 @@ def flagPatternByDTW(
     rolling = customRoller(minima, window=winsz)
     mask = rolling.sum() > 0
 
+    if plot:
+        df = pd.DataFrame()
+        df["data"] = dat
+        df["distances"] = distances
+        df["indicator"] = mask.astype(float) * (max_distance or 1)
+        df.plot()
+
     flags[mask, field] = flag
     return data, flags
-- 
GitLab