From 94a492063eefd9b6bf78b7f6542ca6f74ef94691 Mon Sep 17 00:00:00 2001 From: Peter Luenenschloss <peter.luenenschloss@ufz.de> Date: Tue, 6 Oct 2020 11:28:24 +0200 Subject: [PATCH] added function for flagging of regime anomalies --- saqc/funcs/functions.py | 2 +- saqc/funcs/proc_functions.py | 13 +++++++++---- saqc/lib/tools.py | 17 +++++++++++++---- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py index 855e423ed..afad72f7d 100644 --- a/saqc/funcs/functions.py +++ b/saqc/funcs/functions.py @@ -955,7 +955,7 @@ def flagDriftFromNorm(data, field, flagger, fields, segment_freq, norm_spread, n for segment in segments: if segment[1].shape[0] <= 1: continue - drifters = detectDeviants(data, metric, norm_spread, norm_frac, linkage_method) + drifters = detectDeviants(data, metric, norm_spread, norm_frac, linkage_method, 'variables') for var in drifters: flagger = flagger.setFlags(fields[var], loc=segment[1].index, **kwargs) diff --git a/saqc/funcs/proc_functions.py b/saqc/funcs/proc_functions.py index 42754e384..78e89f37a 100644 --- a/saqc/funcs/proc_functions.py +++ b/saqc/funcs/proc_functions.py @@ -971,14 +971,19 @@ def proc_seefoExpDriftCorrecture(data, field, flagger, maint_data_field, cal_mea @register(masking='all') -def proc_flagOffsets(data, field, flagger, cluster_field, norm_spread, metric=lambda x,y: np.abs(np.nanmean(x) - np.nanmean(y)), - norm_frac=0.5): +def proc_flagRegimeAnomaly(data, field, flagger, cluster_field, norm_spread, + metric=lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y)), + norm_frac=0.5, **kwargs): + clusterser = data[cluster_field] cluster_num = clusterser.max() + 1 - cluster_dios = dios.DictOfSeries({i : data[field][clusterser == i] for i in range(cluster_num)}) - plateaus = detectDeviants(cluster_dios, metric, norm_spread, norm_frac) + cluster_dios = dios.DictOfSeries({i: data[field][clusterser == i] for i in range(cluster_num)}) + plateaus = detectDeviants(cluster_dios, 
metric, norm_spread, norm_frac, 'single', 'samples') + for p in plateaus: + flagger = flagger.setFlags(data.iloc[:, p].index, **kwargs) + return data, flagger @register diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index b0d58f17a..daf91464f 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -527,7 +527,7 @@ def customRolling(to_roll, winsz, func, roll_mask=None, min_periods=1, center=Fa return pd.Series(i_roll.values, index=to_roll.index) -def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single'): +def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single', population='variables'): """Helper function for carrying out the repeatedly upcoming task, to detect variables that significantly differ from the 'Norm'. @@ -547,15 +547,24 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single' condensed = np.abs(dist_mat[tuple(zip(*combs))]) Z = linkage(condensed, method=linkage_method) cluster = fcluster(Z, norm_spread, criterion='distance') - counts = collections.Counter(cluster) + if population == 'variables': + counts = collections.Counter(cluster) + pop_num = var_num + elif population == 'samples': + counts = {cluster[j]: 0 for j in range(0,var_num)} + for c in range(var_num): + counts[cluster[c]] += data.iloc[:, c].dropna().shape[0] + pop_num = np.sum(list(counts.values())) + else: + raise ValueError("Not a valid normality criteria keyword passed. pass either 'variables' or 'samples'.") norm_cluster = -1 for item in counts.items(): - if item[1] > norm_frac * var_num: + if item[1] > norm_frac * pop_num: norm_cluster = item[0] break - if norm_cluster == -1 or counts[norm_cluster] == var_num: + if norm_cluster == -1 or counts[norm_cluster] == pop_num: return [] else: return [i for i, x in enumerate(cluster) if x != norm_cluster] -- GitLab