From a282f80210262e560844bc7531bbeab5b1a64876 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 3 Aug 2020 13:29:11 +0200
Subject: [PATCH] flag cross scoring documented

---
 saqc/funcs/functions.py | 45 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py
index eb1df8b6c..2b46b7f97 100644
--- a/saqc/funcs/functions.py
+++ b/saqc/funcs/functions.py
@@ -589,7 +589,50 @@ def flagManual(data, field, flagger, mdata, mflag: Any = 1, method="plain", **kw
 
 
 @register
-def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat=np.median, **kwargs):
+def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat='modZscore', **kwargs):
+    """
+    Function checks for outliers relative to the "horizontal" input data axis.
+
+    For fields=[f_1,f_2,...,f_N] and timestamps [t_1,t_2,...,t_K], the following steps are taken for outlier detection:
+
+    1. All timestamps t_i, where there is one f_k, with data[f_k] having no entry at t_i, are excluded from the
+        following process (inner merge of the f_k fields).
+    2. for every 1 <= i <= K, the value m_i = median({data[f_1][t_i], data[f_2][t_i], ..., data[f_N][t_i]}) is
+        calculated
+    3. for every 1 <= i <= K, the set {data[f_1][t_i] - m_i, data[f_2][t_i] - m_i, ..., data[f_N][t_i] - m_i} is tested
+        for outliers with the specified method (`cross_stat` parameter)
+
+    Parameters
+    ----------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    field : str
+        A dummy parameter.
+    flagger : saqc.flagger
+        A flagger object, holding flags and additional information related to `data`.
+    fields : list of str
+        List of fieldnames in data, determining which variables are to be included into the flagging process.
+    thresh : float
+        Threshold which the outlier score of a value must exceed, for being flagged an outlier.
+    cross_stat : {'modZscore', 'Zscore'}, default 'modZscore'
+        Method used for calculating the outlier scores.
+        * 'modZscore': Median based "sigma"-ish approach. See References [1].
+        * 'Zscore': Score values by how many times the standard deviation they differ from the median.
+            See References [1].
+
+    Returns
+    -------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    flagger : saqc.flagger
+        The flagger object, holding flags and additional information related to `data`.
+        Flag values may have changed relative to the input flagger.
+
+    References
+    ----------
+    [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
+    """
+
     df = data[fields].loc[data[fields].index_of('shared')].to_df()
 
     if isinstance(cross_stat, str):
-- 
GitLab