diff --git a/saqc/core/core.py b/saqc/core/core.py index c21782f8c110a13693936bed10d08c906423bbc8..6c8d33dc77ea2ee84bd35b3cc03ae098514c6eaf 100644 --- a/saqc/core/core.py +++ b/saqc/core/core.py @@ -21,6 +21,8 @@ def _collectVariables(meta, data): for idx, configrow in meta.iterrows(): varname = configrow[Fields.VARNAME] assign = configrow[Fields.ASSIGN] + if varname in flags: + continue if varname in data: flags.append(varname) elif varname not in flags and assign is True: @@ -73,14 +75,14 @@ def runner(metafname, flagger, data, flags=None, nodata=np.nan, error_policy="ra meta = config[config.columns.difference(tests.columns)] # # prepapre the flags - # varnames = collectVariables(meta, data) - # fresh = flagger.initFlags(pd.DataFrame(index=data.index, columns=varnames)) - # flags = fresh if flags is None else flags.join(fresh) - if flags is None: - flag_cols = _collectVariables(meta, data) - flagger = flagger.initFlags(pd.DataFrame(index=data.index, columns=flag_cols)) - else: - flagger = flagger.initFlags(flags=flags) + varnames = _collectVariables(meta, data) + fresh = flagger.initFlags(pd.DataFrame(index=data.index, columns=varnames)) + flagger = fresh if flags is None else flags._flags.join(fresh._flags) + # if flags is None: + # flag_cols = _collectVariables(meta, data) + # flagger = flagger.initFlags(pd.DataFrame(index=data.index, columns=flag_cols)) + # else: + # flagger = flagger.initFlags(flags=flags) # this checks comes late, but the compiling of the user-test need fully prepared flags checkConfig(config, data, flagger, nodata) @@ -129,8 +131,10 @@ def runner(metafname, flagger, data, flags=None, nodata=np.nan, error_policy="ra continue flagger = flagger.setFlagger(flagger_chunk_result) - # plotHook(dchunk, fchunk, ffchunk, varname, configrow[Fields.PLOT], flag_test, flagger) - # plotAllHook(data, flags, flagger) + + plotHook(dchunk, flagger_chunk, flagger_chunk_result, varname, configrow[Fields.PLOT], flag_test) + + plotAllHook(data, flagger) return data, flagger diff --git a/saqc/flagger/baseflagger.py b/saqc/flagger/baseflagger.py index 83a8f33cfd5afa88e47f6cbda2475974adb26af2..a119d05e3da2d4c9f7941a6eb8753412b4d283cf 100644 --- a/saqc/flagger/baseflagger.py +++ b/saqc/flagger/baseflagger.py @@ -14,6 +14,7 @@ from saqc.lib.tools import toSequence, assertScalar, assertDataFrame COMPARATOR_MAP = { + "!=": op.ne, "==": op.eq, ">=": op.ge, ">": op.gt, diff --git a/saqc/funcs/harm_functions.py b/saqc/funcs/harm_functions.py index 5fd514ca6baf567f9a6d924f7863f7c15144d19d..9dd31852bff1a4ce8650cc5d242cba859560ef86 100644 --- a/saqc/funcs/harm_functions.py +++ b/saqc/funcs/harm_functions.py @@ -69,6 +69,9 @@ def harmWrapper(heap={}): data, flagger = flagMissing( data, field, flagger, nodata=data_missing_value, **kwargs ) + # and dropped for harmonization: + if drop_flags is not None: + drop_flags.append(flagger.BAD) # before sending the current flags and data frame to the future (for backtracking reasons), we clear it # from merge-nans that just resulted from harmonization of other variables! diff --git a/saqc/funcs/soil_moisture_tests.py b/saqc/funcs/soil_moisture_tests.py index 2c58a5206da7158435388420b4b42232392675bf..f9c232940d2310b722185fcff06b31ac0bf8bad9 100644 --- a/saqc/funcs/soil_moisture_tests.py +++ b/saqc/funcs/soil_moisture_tests.py @@ -129,7 +129,7 @@ def flagSoilMoistureBySoilFrost( """ # retrieve reference series - refseries = data[soil_temp_reference] + refseries = data[soil_temp_reference].copy() ref_use = flagger.isFlagged( soil_temp_reference, flag=flagger.GOOD, comparator="==" ) | flagger.isFlagged(soil_temp_reference, flag=flagger.UNFLAGGED, comparator="==") @@ -137,35 +137,14 @@ def flagSoilMoistureBySoilFrost( refseries = refseries[ref_use.values] # drop nan values from reference series, since those are values you dont want to refer to. refseries = refseries.dropna() - # skip further processing if reference series is empty: if refseries.empty: return data, flagger - # wrap around df.index.get_loc method, to catch key error in case of empty tolerance window: - def _checkNearestForFrost(ref_date, ref_series, tolerance, check_level): - - try: - # if there is no reference value within tolerance margin, following line will raise key error and - # trigger the exception - ref_pos = ref_series.index.get_loc( - ref_date, method="nearest", tolerance=tolerance - ) - except KeyError: - # since test is not applicable: make no change to flag state - return False - - # if reference value index is available, return comparison result (to determine flag) - return ref_series[ref_pos] <= check_level - - # make temporal frame holding date index, since df.apply cant access index - temp_frame = pd.Series(data.index) - # get flagging mask ("True" denotes "bad"="test succesfull") - mask = temp_frame.apply( - _checkNearestForFrost, args=(refseries, tolerated_deviation, frost_level) - ) - # apply calculated flags - flagger = flagger.setFlags(field, mask.values, **kwargs) + refseries = refseries.reindex(data[field].dropna().index, method="nearest", tolerance=tolerated_deviation) + refseries = refseries[refseries < frost_level].index + + flagger = flagger.setFlags(field, refseries, **kwargs) return data, flagger @@ -236,9 +215,8 @@ def flagSoilMoistureByPrecipitationEvents( :param ignore_missing: """ - dataseries, moist_rate = retrieveTrustworthyOriginal(data, field, flagger) - # data harmonized: refseries, ref_rate = retrieveTrustworthyOriginal(data, prec_reference, flagger) + dataseries, moist_rate = retrieveTrustworthyOriginal(data, field, flagger) # data not hamronized: refseries = data[prec_reference].dropna() @@ -249,7 +227,7 @@ def flagSoilMoistureByPrecipitationEvents( return data, flagger refseries = refseries.reindex(refseries.index.join(dataseries.index, how='outer')) - # get 24 h prec. monitor (this makes last-24h-rainfall-evaluation independent from preceeding entries) + # get 24 h prec. monitor prec_count = refseries.rolling(window="1D").sum() # exclude data not signifying a raise:: if raise_reference is None: diff --git a/saqc/lib/plotting.py b/saqc/lib/plotting.py index ed394d4e58515a5cec3b608cc5a1dc13e4b7efae..3ea8dea32475651bcbfaf9f12347c873b55b404e 100644 --- a/saqc/lib/plotting.py +++ b/saqc/lib/plotting.py @@ -2,37 +2,43 @@ # -*- coding: utf-8 -*- # TODO: use the logging module +import logging +import pandas as pd +import numpy as np from warnings import warn __plotvars = [] +_colors = dict(unflagged='silver', good='seagreen', bad='firebrick', suspicious='gold') -def plotAllHook(data, flags, flagger): + +def plotAllHook(data, flagger): if len(__plotvars) > 1: - _plot(data, flags, True, __plotvars, flagger) + _plot(data, flagger, True, __plotvars) -def plotHook(data, old, new, varname, do_plot, flag_test, flagger): +def plotHook(data, old, new, varname, do_plot, flag_test, plot_nans=True): + # old/new: flagger if do_plot: __plotvars.append(varname) # cannot use getFlags here, because if a flag was set (e.g. with force) the # flag may be the same, but any additional row (e.g. comment-field) would differ - mask = (old[varname] == new[varname]).any(axis=1) - _plot(data, new, mask, varname, flagger, title=flag_test) + mask = (old._flags[varname] != new._flags[varname]).any(axis=1) + _plot(data, new, mask, varname, title=flag_test, plot_nans=plot_nans) def _plot( data, - flags, + flagger, flagmask, varname, - flagger, interactive_backend=True, title="Data Plot", - show_nans=True, + plot_nans=True, ): + # todo: try catch warn (once) return # only import if plotting is requested by the user import matplotlib as mpl @@ -47,22 +53,19 @@ def _plot( # needed for datetime conversion from pandas.plotting import register_matplotlib_converters - register_matplotlib_converters() if not isinstance(varname, (list, set)): varname = [varname] varname = set(varname) - # filter out variables to which no data is associated + # filter out variables to which no data is associated (e.g. freshly assigned vars) tmp = [] for var in varname: if var in data.columns: tmp.append(var) else: - warn( - f"Cannot plot column '{var}' that is not present in data.", UserWarning - ) + logging.warning(f"Cannot plot column '{var}', because it is not present in data.") if not tmp: return varname = tmp @@ -72,62 +75,84 @@ def _plot( fig, axes = plt.subplots(plots, 1, sharex=True) axes[0].set_title(title) for i, v in enumerate(varname): - _plotQflags(data, v, flagger, flagmask, axes[i], show_nans) + _plotByQualtyFlag(data, v, flagger, flagmask, axes[i], plot_nans) else: fig, ax = plt.subplots() plt.title(title) - _plotQflags(data, varname.pop(), flagger, flagmask, ax, show_nans) + _plotByQualtyFlag(data, varname.pop(), flagger, flagmask, ax, plot_nans) - plt.xlabel("time") # dummy plot for the label `missing` see plot_vline for more info plt.plot([], [], ":", color="silver", label="missing data") + + plt.xlabel("time") plt.legend() + if interactive_backend: plt.show() -def _plotQflags(data, varname, flagger, flagmask, ax, show_nans): +def _plotByQualtyFlag(data, varname, flagger, flagmask, ax, plot_nans): ax.set_ylabel(varname) x = data.index y = data[varname] - ax.plot(x, y, "-", markersize=1, color="silver") - # plot all data in silver (NaNs as vertical lines) + # base plot: show all(!) data ax.plot(x, y, "-", color="silver", label="data") - flagged = flagger.isFlagged(varname) - if show_nans: - nans = y.isna() - idx = y.index[nans & ~flagged] - _plotVline(ax, idx, color="silver") - - # plot all data (and nans) that are already flagged in black - ax.plot(x[flagged], y[flagged], ".", color="black", label="flagged by other test") - if show_nans: - idx = y.index[nans & flagged & ~flagmask] - _plotVline(ax, idx, color="black") - - -# # plot flags in the color corresponding to the flag -# # BAD red, GOOD green, all in between aka SUSPISIOUS in yellow -# <<<<<<< HEAD -# for i, f in enumerate(flagger.categories): -# if i == 0: -# continue -# flagged = flagger.isFlagged(varname, flag=f, comparator='==') & flagmask -# ======= -# bads = flagger.isFlagged(flags, varname, flag=flagger.BAD, comparator='==') & flagmask -# good = flagger.isFlagged(flags, varname, flag=flagger.GOOD, comparator='==') & flagmask -# susp = flagger.isFlagged(flags, varname, flag=flagger.GOOD, comparator='>') & flagmask & ~bads -# flaglist = [flagger.GOOD, flagger.BAD, 'Suspicious'] -# for f, flagged in zip(flaglist, [good, bads, susp]): -# >>>>>>> master -# label = f"flag: {f}" -# color = _getColor(f, flagger) -# ax.plot(x[flagged], y[flagged], '.', color=color, label=label) -# if show_nans: -# idx = y.index[nans & flagged] -# _plotVline(ax, idx, color=color) + + # ANY OLD FLAG + # plot all(!) data that are already flagged in black + flagged = flagger.isFlagged(varname, flag=flagger.GOOD, comparator='>=') + oldflags = flagged & ~flagmask + ax.plot(x[oldflags], y[oldflags], ".", color="black", label="flagged by other test") + if plot_nans: + _plot_nans(y[oldflags], 'black', ax) + + # now we just want to show data that was flagged + if flagmask is not True: + x = x[flagmask] + y = y[flagmask] + flagger = flagger.getFlagger(varname, flagmask) + + if x.empty: + return + + suspicious = pd.Series(data=np.ones(len(y), dtype=bool), index=y.index) + # flag by categories + + # plot UNFLAGGED (only nans are needed) + flag, color = flagger.UNFLAGGED, _colors['unflagged'] + flagged = flagger.isFlagged(varname, flag=flag, comparator='==') + ax.plot(x[flagged], y[flagged], '.', color=color, label=f"flag: {flag}") + if plot_nans: + _plot_nans(y[flagged], color, ax) + + # plot GOOD + flag, color = flagger.GOOD, _colors['good'] + flagged = flagger.isFlagged(varname, flag=flag, comparator='==') + ax.plot(x[flagged], y[flagged], '.', color=color, label=f"flag: {flag}") + if plot_nans: + _plot_nans(y[flagged], color, ax) + + # plot BAD + flag, color = flagger.BAD, _colors['bad'] + flagged = flagger.isFlagged(varname, flag=flag, comparator='==') + ax.plot(x[flagged], y[flagged], '.', color=color, label=f"flag: {flag}") + if plot_nans: + _plot_nans(y[flagged], color, ax) + + # plot SUSPICIOS + color = _colors['suspicious'] + flagged = flagger.isFlagged(varname, flag=flagger.GOOD, comparator='>') + flagged &= flagger.isFlagged(varname, flag=flagger.BAD, comparator='<') + ax.plot(x[flagged], y[flagged], '.', color=color, label=f"{flagger.GOOD} < flag < {flagger.BAD}") + if plot_nans: + _plot_nans(y[flagged], color, ax) + + +def _plot_nans(y, color, ax): + nans = y.isna() + _plotVline(ax, y[nans].index, color=color) def _plotVline(plt, points, color="blue"): @@ -137,14 +162,3 @@ def _plotVline(plt, points, color="blue"): for point in points: plt.axvline(point, color=color, linestyle=":") - -def _getColor(flag, flagger): - if flag == flagger.UNFLAGGED: - return "silver" - elif flag == flagger.GOOD: - return "green" - elif flag == flagger.BAD: - return "red" - else: - # suspicios - return "yellow" diff --git a/test/funcs/test_soil_moisture_tests.py b/test/funcs/test_soil_moisture_tests.py index 2f5c4f06211a204b7a3eb7bcb12296a7f40c29c1..e9834f2bf990d262fc3109015aceca48d12befa7 100644 --- a/test/funcs/test_soil_moisture_tests.py +++ b/test/funcs/test_soil_moisture_tests.py @@ -29,10 +29,10 @@ def test_flagSoilMoistureBySoilFrost(flagger): data, flagger_result = flagSoilMoistureBySoilFrost( data, "soil_moisture", flagger, "soil_temperature" ) - flag_assertion = np.arange(18, 37) + flag_assertion = np.arange(19, 37) flag_result = flagger_result.getFlags("soil_moisture") # .iloc[:, 0] - test_sum = (flag_result[flag_assertion] == flagger.BAD).sum() - assert test_sum == len(flag_assertion) + assert (flag_result[flag_assertion] == flagger.BAD).all() + @pytest.mark.parametrize("flagger", TESTFLAGGER) @@ -60,4 +60,4 @@ def test_flagSoilMoisturePrecipitationEvents(flagger): if __name__ == "__main__": flagger = TESTFLAGGER[2] - test_flagSoilMoisturePrecipitationEvents(flagger) + test_flagSoilMoistureBySoilFrost(flagger)