From 1a418f096ad3ec5f63590fdf6b58c44078191af4 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Tue, 6 Oct 2020 13:12:30 +0200
Subject: [PATCH] doc doc doc

---
 saqc/funcs/breaks_detection.py | 16 +++++++++-------
 saqc/funcs/modelling.py        |  3 +++
 saqc/lib/tools.py              |  8 +++++---
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/saqc/funcs/breaks_detection.py b/saqc/funcs/breaks_detection.py
index 41cd5dbd9..437367d29 100644
--- a/saqc/funcs/breaks_detection.py
+++ b/saqc/funcs/breaks_detection.py
@@ -14,12 +14,12 @@ from saqc.lib.tools import retrieveTrustworthyOriginal, detectDeviants
 @register(masking='all')
 def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread,
                      metric=lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y)),
-                     norm_frac=0.5, **kwargs):
+                     norm_frac=0.5, recluster=False, **kwargs):
     """
-    A function to flag values belonging to an anomalous regimes of field.
+    A function to flag values belonging to an anomalous regime of field.
 
     "Normality" is determined in terms of a maximum spreading distance, regimes must not exceed in respect
-    to a certain metric.
+    to a certain metric and linkage method.
 
     In addition, only a range of regimes is considered "normal", if it models more then `norm_frac` percentage of
     the valid samples in "field".
@@ -27,7 +27,7 @@ def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread,
     Note, that you must detect the regime changepoints prior to calling this function.
 
     Note, that it is possible to perform hypothesis tests for regime equality by passing the metric
-    a function for p-value calculation.
+    a function for p-value calculation and selecting linkage method "complete".
 
     Parameters
     ----------
@@ -41,13 +41,15 @@ def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread,
         The name of the column in data, holding the cluster labels for the samples in field. (has to be indexed
         equal to field)
     norm_spread : float
-        A threshold denoting the distance, members of the "normal" group must not exceed to each other (in terms of the
-        metric passed) to qualify their group as the "normal" group.
+        A threshold denoting the value level, up to which clusters are agglomerated.
     metric : Callable[[numpy.array, numpy.array], float], default lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y))
         A metric function for calculating the dissimilarity between 2 regimes. Defaults to just the difference in mean.
     norm_frac : float
         Has to be in [0,1]. Determines the minimum percentage of samples,
         the "normal" group has to comprise to be the normal group actually.
+    recluster : bool, default False
+        If True,
+
     kwargs
 
     Returns
@@ -67,7 +69,7 @@ def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread,
     plateaus = detectDeviants(cluster_dios, metric, norm_spread, norm_frac, 'single', 'samples')
 
     for p in plateaus:
-        flagger = flagger.setFlags(data.iloc[:, p].index, **kwargs)
+        flagger = flagger.setFlags(field, loc=cluster_dios.iloc[:, p].index, **kwargs)
 
     return data, flagger
 
diff --git a/saqc/funcs/modelling.py b/saqc/funcs/modelling.py
index 53d82b5df..3c48457f8 100644
--- a/saqc/funcs/modelling.py
+++ b/saqc/funcs/modelling.py
@@ -438,6 +438,9 @@ def modelling_clusterByChangePoints(data, field, flagger, stat_func, thresh_func
     generated by.
     The regime change points detection is based on a sliding window search.
 
+    Note, that the cluster labels will be stored to the `field` field of the input data, so that the data that is
+    clustered gets overwritten.
+
     Parameters
     ----------
     data : dios.DictOfSeries
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 350d935b7..48a27f4a3 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -532,7 +532,9 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single'
     Helper function for carrying out the repeatedly upcoming task,
     of detecting variables a group of variables.
 
-    "Normality" is determined in terms of a maximum spreading distance, that members of a normal group must not exceed.
+    "Normality" is determined in terms of a maximum spreading distance, that members of a normal group must not exceed
+    with respect to a certain metric and linkage method.
+
     In addition, only a group is considered "normal" if it contains more then `norm_frac` percent of the
     variables in "fields".
 
@@ -560,7 +562,7 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single'
     Returns
     -------
     deviants : List
-        A list containing the the column positions of deviant variables in the input frame/dios.
+        A list containing the column positions of deviant variables in the input frame/dios.
 
     """
     var_num = len(data.columns)
@@ -582,7 +584,7 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single'
             counts[cluster[c]] += data.iloc[:, c].dropna().shape[0]
         pop_num = np.sum(list(counts.values()))
     else:
-        raise ValueError("Not a valid normality criteria keyword passed. pass either 'variables' or 'population'.")
+        raise ValueError("Not a valid normality criteria keyword passed. Pass either 'variables' or 'population'.")
     norm_cluster = -1
 
     for item in counts.items():
-- 
GitLab