From 6d260cfcef9ff2dec3d90a629256c6ce638ceef1 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Wed, 5 Jun 2019 22:23:08 +0200 Subject: [PATCH] rewrote MAD --- funcs/functions.py | 54 ++++++++++++++-------------------------------- 1 file changed, 16 insertions(+), 38 deletions(-) diff --git a/funcs/functions.py b/funcs/functions.py index 6f0cf24ea..241b4fb3f 100644 --- a/funcs/functions.py +++ b/funcs/functions.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd + from lib.tools import valueRange, slidingWindowIndices from dsl import evalExpression from config import Params @@ -23,7 +24,6 @@ def flagDispatch(func_name, *args, **kwargs): def flagGeneric(data, flags, field, flagger, nodata=np.nan, **flag_params): - expression = flag_params[Params.FUNC] result = evalExpression(expression, flagger, data, flags, field, @@ -46,7 +46,6 @@ def flagGeneric(data, flags, field, flagger, nodata=np.nan, **flag_params): def flagConstant(data, flags, field, flagger, eps, length, thmin=None, **kwargs): - datacol = data[field] flagcol = flags[field] @@ -86,46 +85,26 @@ def flagManual(data, flags, field, flagger, **kwargs): def flagRange(data, flags, field, flagger, min, max, **kwargs): - datacol = data[field].values mask = (datacol < min) | (datacol >= max) flags.loc[mask, field] = flagger.setFlag(flags.loc[mask, field], **kwargs) return data, flags -def flagMad(data, flags, field, flagger, length, z, deriv, **kwargs): - - def _flagMad(data: np.ndarray, z: int, deriv: int) -> np.ndarray: - # NOTE: numpy is at least twice as fast as numba.jit(nopython) - # median absolute deviation - for i in range(deriv): - data[i+1:] = np.diff(data[i:]) - data[i] = np.nan - median = np.nanmedian(data) - mad = np.nanmedian(np.abs(data-median)) - tresh = mad * (z/0.6745) - with np.errstate(invalid="ignore"): - return (data < (median - tresh)) | (data > (median + tresh)) - - datacol = data[field] - flagcol = flags[field] - - values = (datacol - .mask(flagger.isFlagged(flagcol)) - .values) - - window = pd.to_timedelta(length) - pd.to_timedelta(data.index.freq) - mask = np.zeros_like(values, dtype=bool) - - for start_idx, end_idx in slidingWindowIndices(datacol.index, window, "1D"): - mad_flags = _flagMad(values[start_idx:end_idx], z, deriv) - # reset the mask - mask[:] = False - mask[start_idx:end_idx] = mad_flags - flagcol[mask] = flagger.setFlag(flagcol[mask], **kwargs) - - flags[field] = flagcol - +def flagMad(data, flags, field, flagger, length, z, freq=None, **kwargs): + # late import because of cyclic import problem + # see core -> from import functions import flagDispatch + from core import inferFrequency + d = data[field].copy() + freq = inferFrequency(d) if freq is None else freq + if freq is None: + raise ValueError("freqency cannot inferred, provide `freq` as a param to mad().") + winsz = int(pd.to_timedelta(length) / freq) + median = d.rolling(window=winsz, center=True, closed='both').median() + diff = abs(d - median) + mad = diff.rolling(window=winsz, center=True, closed='both').median() + mask = (mad > 0) & (0.6745 * diff > z * mad) + flags.loc[mask, field] = flagger.setFlag(flags.loc[mask, field], **kwargs) return data, flags @@ -156,9 +135,8 @@ def flagSoilMoistureBySoilFrost(data, flags, field, flagger, soil_temp_reference # retrieve data series input: dataseries = pd.Series(data[field].values, index=pd.to_datetime(data.index)) - # retrieve reference input: - #if reference is a string, it refers to data field + # if reference is a string, it refers to data field # if reference series is part of input data frame, evaluate input data flags: flag_mask = flagger.isFlagged(flags)[soil_temp_reference] -- GitLab