doc doc doc

1a418f09 · Peter Lünenschloß · a0076e67 · 1a418f09 · 1a418f09 · 1a418f09
Commit 1a418f09 authored 4 years ago by Peter Lünenschloß
--- a/saqc/funcs/breaks_detection.py
+++ b/saqc/funcs/breaks_detection.py
@@ -14,12 +14,12 @@ from saqc.lib.tools import retrieveTrustworthyOriginal, detectDeviants
 @register(masking='all')
 def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread,
                     metric=lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y)),
-                     norm_frac=0.5, **kwargs):
+                     norm_frac=0.5, recluster=False, **kwargs):
    """
-    A function to flag values belonging to an anomalous regimes of field.
+    A function to flag values belonging to an anomalous regime of field.
    "Normality" is determined in terms of a maximum spreading distance, regimes must not exceed in respect
-    to a certain metric.
+    to a certain metric and linkage method.
    In addition, only a range of regimes is considered "normal", if it models more than `norm_frac` percentage of
    the valid samples in "field".
@@ -27,7 +27,7 @@ def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread,
    Note, that you must detect the regime changepoints prior to calling this function.
    Note, that it is possible to perform hypothesis tests for regime equality by passing the metric
-    a function for p-value calculation.
+    a function for p-value calculation and selecting linkage method "complete".
    Parameters
    ----------
@@ -41,13 +41,15 @@ def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread,
        The name of the column in data, holding the cluster labels for the samples in field. (has to be indexed
        equal to field)
    norm_spread : float
-        A threshold denoting the distance, members of the "normal" group must not exceed to each other (in terms of the
+        A threshold denoting the value level, up to which clusters are agglomerated.
-        metric passed) to qualify their group as the "normal" group.
    metric : Callable[[numpy.array, numpy.array], float], default lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y))
        A metric function for calculating the dissimilarity between 2 regimes. Defaults to just the difference in mean.
    norm_frac : float
        Has to be in [0,1]. Determines the minimum percentage of samples,
        the "normal" group has to comprise to be the normal group actually.
+    recluster : bool, default False
+        If True,
    kwargs
    Returns
@@ -67,7 +69,7 @@ def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread,
    plateaus = detectDeviants(cluster_dios, metric, norm_spread, norm_frac, 'single', 'samples')
    for p in plateaus:
-        flagger = flagger.setFlags(data.iloc[:, p].index, **kwargs)
+        flagger = flagger.setFlags(field, loc=cluster_dios.iloc[:, p].index, **kwargs)
    return data, flagger

--- a/saqc/funcs/modelling.py
+++ b/saqc/funcs/modelling.py
@@ -438,6 +438,9 @@ def modelling_clusterByChangePoints(data, field, flagger, stat_func, thresh_func
    generated by.
    The regime change points detection is based on a sliding window search.
+    Note, that the cluster labels will be stored to the `field` field of the input data, so that the data that is
+    clustered gets overridden.
    Parameters
    ----------
    data : dios.DictOfSeries

--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -532,7 +532,9 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single'
    Helper function for carrying out the repeatedly upcoming task,
    of detecting deviant variables in a group of variables.
-    "Normality" is determined in terms of a maximum spreading distance, that members of a normal group must not exceed.
+    "Normality" is determined in terms of a maximum spreading distance, that members of a normal group must not exceed
+    in respect to a certain metric and linkage method.
    In addition, only a group is considered "normal" if it contains more than `norm_frac` percent of the
    variables in "fields".
@@ -560,7 +562,7 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single'
    Returns
    -------
    deviants : List
-        A list containing the the column positions of deviant variables in the input frame/dios.
+        A list containing the column positions of deviant variables in the input frame/dios.
    """
    var_num = len(data.columns)
@@ -582,7 +584,7 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single'
            counts[cluster[c]] += data.iloc[:, c].dropna().shape[0]
        pop_num = np.sum(list(counts.values()))
    else:
-        raise ValueError("Not a valid normality criteria keyword passed. pass either 'variables' or 'population'.")
+        raise ValueError("Not a valid normality criteria keyword passed. Pass either 'variables' or 'population'.")
    norm_cluster = -1
    for item in counts.items():