diff --git a/saqc/funcs/breaks_detection.py b/saqc/funcs/breaks_detection.py index 41cd5dbd920be8c20a7581e2eb39acd381f83c8f..437367d29ea7a5ade8785348ec61f0dc8748be8b 100644 --- a/saqc/funcs/breaks_detection.py +++ b/saqc/funcs/breaks_detection.py @@ -14,12 +14,12 @@ from saqc.lib.tools import retrieveTrustworthyOriginal, detectDeviants @register(masking='all') def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread, metric=lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y)), - norm_frac=0.5, **kwargs): + norm_frac=0.5, recluster=False, **kwargs): """ - A function to flag values belonging to an anomalous regimes of field. + A function to flag values belonging to an anomalous regime of field. "Normality" is determined in terms of a maximum spreading distance, regimes must not exceed in respect - to a certain metric. + to a certain metric and linkage method. In addition, only a range of regimes is considered "normal", if it models more then `norm_frac` percentage of the valid samples in "field". @@ -27,7 +27,7 @@ def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread, Note, that you must detect the regime changepoints prior to calling this function. Note, that it is possible to perform hypothesis tests for regime equality by passing the metric - a function for p-value calculation. + a function for p-value calculation and selecting linkage method "complete". Parameters ---------- @@ -41,13 +41,15 @@ def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread, The name of the column in data, holding the cluster labels for the samples in field. (has to be indexed equal to field) norm_spread : float - A threshold denoting the distance, members of the "normal" group must not exceed to each other (in terms of the - metric passed) to qualify their group as the "normal" group. + A threshold denoting the value level, up to which clusters are agglomerated. 
metric : Callable[[numpy.array, numpy.array], float], default lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y)) A metric function for calculating the dissimilarity between 2 regimes. Defaults to just the difference in mean. norm_frac : float Has to be in [0,1]. Determines the minimum percentage of samples, the "normal" group has to comprise to be the normal group actually. + recluster : bool, default False + If True, + kwargs Returns @@ -67,7 +69,7 @@ def breaks_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread, plateaus = detectDeviants(cluster_dios, metric, norm_spread, norm_frac, 'single', 'samples') for p in plateaus: - flagger = flagger.setFlags(data.iloc[:, p].index, **kwargs) + flagger = flagger.setFlags(field, loc=cluster_dios.iloc[:, p].index, **kwargs) return data, flagger diff --git a/saqc/funcs/modelling.py b/saqc/funcs/modelling.py index 53d82b5dfd0c3478bee1620e0d0dafa9c6b74454..3c48457f8c2463c891f854579233e4fe0650e4cb 100644 --- a/saqc/funcs/modelling.py +++ b/saqc/funcs/modelling.py @@ -438,6 +438,9 @@ def modelling_clusterByChangePoints(data, field, flagger, stat_func, thresh_func generated by. The regime change points detection is based on a sliding window search. + Note, that the cluster labels will be stored in the `field` field of the input data, so that the data that is + clustered gets overwritten. + Parameters ---------- data : dios.DictOfSeries diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index 350d935b7b38dad1fa5ae87237760566fac122bf..48a27f4a3c5e13184cf7daf4ea526f40243afd7a 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -532,7 +532,9 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single' Helper function for carrying out the repeatedly upcoming task, of detecting variables a group of variables. - "Normality" is determined in terms of a maximum spreading distance, that members of a normal group must not exceed. 
+ "Normality" is determined in terms of a maximum spreading distance, that members of a normal group must not exceed + in respect to a certain metric and linkage method. + In addition, only a group is considered "normal" if it contains more then `norm_frac` percent of the variables in "fields". @@ -560,7 +562,7 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single' Returns ------- deviants : List - A list containing the the column positions of deviant variables in the input frame/dios. + A list containing the column positions of deviant variables in the input frame/dios. """ var_num = len(data.columns) @@ -582,7 +584,7 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single' counts[cluster[c]] += data.iloc[:, c].dropna().shape[0] pop_num = np.sum(list(counts.values())) else: - raise ValueError("Not a valid normality criteria keyword passed. pass either 'variables' or 'population'.") + raise ValueError("Not a valid normality criteria keyword passed. Pass either 'variables' or 'population'.") norm_cluster = -1 for item in counts.items():