diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index ed64f7ca0401d17d6b2669f9c2d9b4bf04c3160a..862b3b98171f6fababe8d5bd6cfd19367cb125d6 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -21,13 +21,33 @@ _OP = {'<': op.lt, '<=': op.le, '==': op.eq, '!=': op.ne, '>': op.gt, '>=': op.g def _dslIsFlagged( - flagger: Flagger, var: pd.Series, flag: float = UNFLAGGED, comparator: str = ">" + flagger: Flagger, var: pd.Series, flag: float = None, comparator: str = None ) -> Union[pd.Series, DictOfSeries]: """ helper function for `flag` + + Param Combinations + ------------------ + - ``isflagged('var')`` : show me (anything) flagged + - ``isflagged('var', DOUBT)`` : show me ``flags >= DOUBT`` + - ``isflagged('var', DOUBT, '==')`` : show me ``flags == DOUBT`` + + Raises + ------ + ValueError: if `comparator` is passed but no `flag` value. E.g. ``isflagged('var', comparator='>=')`` """ - comparison = _OP[comparator] - return comparison(flagger[var.name], flag) + if flag is None: + if comparator is not None: + raise ValueError('if `comparator` is used, explicitly pass a `flag` level.') + flag = UNFLAGGED + comparator = '>' + + # default + if comparator is None: + comparator = '>=' + + _op = _OP[comparator] + return _op(flagger[var.name], flag) def _execGeneric(flagger: Flagger, data: DictOfSeries, func: Callable[[pd.Series], pd.Series], field: str, diff --git a/test/common.py b/test/common.py index f774cd5ed17f09447b7ffdc6344a43f2b522bca2..f61ddac3e34a142049a383ec6bb7fab870f92acd 100644 --- a/test/common.py +++ b/test/common.py @@ -2,32 +2,11 @@ # -*- coding: utf-8 -*- import io -from typing import get_type_hints - import numpy as np import pandas as pd import dios -from hypothesis.strategies import ( - lists, - sampled_from, - composite, - from_regex, - sampled_from, - datetimes, - integers, - register_type_strategy, - from_type, -) -from hypothesis.extra.numpy import arrays, from_dtype -from hypothesis.strategies._internal.types import 
_global_type_lookup - -from dios import DictOfSeries - from saqc.common import * -from saqc.core.register import FUNC_MAP -from saqc.core.lib import SaQCFunction -from saqc.lib.types import FreqString, ColumnName, IntegerWindow from saqc.flagger import Flagger, initFlagsLike @@ -63,141 +42,3 @@ def writeIO(content): return f -MAX_EXAMPLES = 50 #100000 - - -@composite -def dioses(draw, min_cols=1): - """ - initialize data according to the current restrictions - """ - # NOTE: - # The following restriction showed up and should be enforced during init: - # - Column names need to satisify the following regex: [A-Za-z0-9_-]+ - # - DatetimeIndex needs to be sorted - # - Integer values larger than 2**53 lead to numerical instabilities during - # the integer->float->integer type conversion in _maskData/_unmaskData. - - cols = draw(lists(columnNames(), unique=True, min_size=min_cols)) - columns = { - c: draw(dataSeries(min_size=3)) - for c in cols - } - return DictOfSeries(columns) - -import numbers - -@composite -def dataSeries(draw, min_size=0, max_size=100, dtypes=("float32", "float64", "int32", "int64")): - if np.isscalar(dtypes): - dtypes = (dtypes,) - - dtype = np.dtype(draw(sampled_from(dtypes))) - if issubclass(dtype.type, numbers.Integral): - info = np.iinfo(dtype) - elif issubclass(dtype.type, numbers.Real): - info = np.finfo(dtype) - else: - raise ValueError("only numerical dtypes are supported") - # we don't want to fail just because of overflows - elements = from_dtype(dtype, min_value=info.min+1, max_value=info.max-1) - - index = draw(daterangeIndexes(min_size=min_size, max_size=max_size)) - values = draw(arrays(dtype=dtype, elements=elements, shape=len(index))) - return pd.Series(data=values, index=index) - - -@composite -def columnNames(draw): - return draw(from_regex(r"[A-Za-z0-9_-]+", fullmatch=True)) - - -@composite -def flaggers(draw, data): - """ - initialize a flagger and set some flags - """ - # flagger = draw(sampled_from(TESTFLAGGER)).initFlags(data) 
- flagger = initFlagsLike(data) - for col, srs in data.items(): - loc_st = lists(sampled_from(sorted(srs.index)), unique=True, max_size=len(srs)-1) - flagger[draw(loc_st), col] = BAD - return flagger - - -@composite -def functions(draw, module: str=None): - samples = tuple(FUNC_MAP.values()) - if module: - samples = tuple(f for f in samples if f.name.startswith(module)) - # samples = [FUNC_MAP["drift.correctExponentialDrift"]] - return draw(sampled_from(samples)) - - -@composite -def daterangeIndexes(draw, min_size=0, max_size=100): - min_date = pd.Timestamp("1900-01-01").to_pydatetime() - max_date = pd.Timestamp("2099-12-31").to_pydatetime() - start = draw(datetimes(min_value=min_date, max_value=max_date)) - periods = draw(integers(min_value=min_size, max_value=max_size)) - freq = draw(sampled_from(["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"])) - return pd.date_range(start, periods=periods, freq=freq) - - -@composite -def frequencyStrings(draw, _): - freq = draw(sampled_from(["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"])) - mult = draw(integers(min_value=1, max_value=10)) - value = f"{mult}{freq}" - return value - -@composite -def dataFieldFlagger(draw): - data = draw(dioses()) - field = draw(sampled_from(sorted(data.columns))) - flagger = draw(flaggers(data)) - return data, field, flagger - - -@composite -def functionCalls(draw, module: str=None): - func = draw(functions(module)) - kwargs = draw(functionKwargs(func)) - return func, kwargs - - -@composite -def functionKwargs(draw, func: SaQCFunction): - data = draw(dioses()) - field = draw(sampled_from(sorted(data.columns))) - - kwargs = { - "data": data, - "field": field, - "flagger": draw(flaggers(data)) - } - - column_name_strategy = lambda _: sampled_from(sorted(c for c in data.columns if c != field)) - interger_window_strategy = lambda _: integers(min_value=1, max_value=len(data[field]) - 1) - - register_type_strategy(FreqString, frequencyStrings) - register_type_strategy(ColumnName, 
column_name_strategy) - register_type_strategy(IntegerWindow, interger_window_strategy) - - for k, v in get_type_hints(func.func).items(): - if k not in {"data", "field", "flagger", "return"}: - value = draw(from_type(v)) - # if v is TimestampColumnName: - # value = draw(columnNames()) - # # we don't want to overwrite 'field' - # assume(value != field) - # # let's generate and add a timestamp column - # data[value] = draw(dataSeries(dtypes="datetime64[ns]", length=len(data[field]))) - # # data[value] = draw(dataSeries(dtypes="datetime64[ns]")) - kwargs[k] = value - - del _global_type_lookup[FreqString] - del _global_type_lookup[ColumnName] - del _global_type_lookup[IntegerWindow] - - return kwargs diff --git a/test/funcs/test_generic_config_functions.py b/test/funcs/test_generic_config_functions.py index 7677c3c279d30af1c2ac2348d9ec50251af2f74c..81e91d6430e019231537413862b8cc8305c105db 100644 --- a/test/funcs/test_generic_config_functions.py +++ b/test/funcs/test_generic_config_functions.py @@ -2,22 +2,20 @@ # -*- coding: utf-8 -*- import ast - import pytest import numpy as np import pandas as pd - -from dios import DictOfSeries - -from test.common import TESTFLAGGER, TESTNODATA, initData, writeIO +import dios from saqc.common import * from saqc.flagger import Flagger, initFlagsLike from saqc.core.visitor import ConfigFunctionParser from saqc.core.config import Fields as F from saqc.core.register import register -from saqc import SaQC from saqc.funcs.generic import _execGeneric +from saqc import SaQC + +from test.common import TESTNODATA, initData, writeIO @pytest.fixture @@ -32,7 +30,7 @@ def data_diff(): col1 = data[data.columns[1]] mid = len(col0) // 2 offset = len(col0) // 8 - return DictOfSeries(data={col0.name: col0.iloc[: mid + offset], col1.name: col1.iloc[mid - offset :],}) + return dios.DictOfSeries(data={col0.name: col0.iloc[: mid + offset], col1.name: col1.iloc[mid - offset :],}) def _compileGeneric(expr, flagger): @@ -41,8 +39,8 @@ def 
_compileGeneric(expr, flagger): return kwargs["func"] -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_missingIdentifier(data, flagger): +def test_missingIdentifier(data): + flagger = Flagger() # NOTE: # - the error is only raised at runtime during parsing would be better @@ -57,9 +55,8 @@ def test_missingIdentifier(data, flagger): _execGeneric(flagger, data, func, field="", nodata=np.nan) -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_syntaxError(flagger): - +def test_syntaxError(): + flagger = Flagger() tests = [ "range(x=5", "rangex=5)", @@ -106,8 +103,7 @@ def test_comparisonOperators(data): assert np.all(result == expected) -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_arithmeticOperators(data, flagger): +def test_arithmeticOperators(data): flagger = initFlagsLike(data) var1, *_ = data.columns this = data[var1] @@ -127,8 +123,7 @@ def test_arithmeticOperators(data, flagger): assert np.all(result == expected) -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_nonReduncingBuiltins(data, flagger): +def test_nonReduncingBuiltins(data): flagger = initFlagsLike(data) var1, *_ = data.columns this = var1 @@ -147,10 +142,8 @@ def test_nonReduncingBuiltins(data, flagger): assert (result == expected).all() -@pytest.mark.parametrize("flagger", TESTFLAGGER) @pytest.mark.parametrize("nodata", TESTNODATA) -def test_reduncingBuiltins(data, flagger, nodata): - +def test_reduncingBuiltins(data, nodata): data.loc[::4] = nodata flagger = initFlagsLike(data) var1 = data.columns[0] @@ -171,10 +164,10 @@ def test_reduncingBuiltins(data, flagger, nodata): assert result == expected -@pytest.mark.parametrize("flagger", TESTFLAGGER) @pytest.mark.parametrize("nodata", TESTNODATA) -def test_ismissing(data, flagger, nodata): +def test_ismissing(data, nodata): + flagger = initFlagsLike(data) data.iloc[: len(data) // 2, 0] = np.nan data.iloc[(len(data) // 2) + 1 :, 0] = -9999 this = data.iloc[:, 0] @@ -190,9 +183,8 @@ def test_ismissing(data, 
flagger, nodata): assert np.all(result == expected) -@pytest.mark.parametrize("flagger", TESTFLAGGER) @pytest.mark.parametrize("nodata", TESTNODATA) -def test_bitOps(data, flagger, nodata): +def test_bitOps(data, nodata): var1, var2, *_ = data.columns this = var1 @@ -220,14 +212,26 @@ def test_isflagged(data): (f"isflagged({var1})", flagger[var1] > UNFLAGGED), (f"isflagged({var1}, flag=BAD)", flagger[var1] >= BAD), (f"isflagged({var1}, UNFLAGGED, '==')", flagger[var1] == UNFLAGGED), - (f"~isflagged({var2})", ~(flagger[var2] > UNFLAGGED)), - (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & ~(flagger[var2] > UNFLAGGED)), + (f"~isflagged({var2})", flagger[var2] == UNFLAGGED), + (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (flagger[var2] == UNFLAGGED)), ] - for test, expected in tests: - func = _compileGeneric(f"generic.flag(func={test}, flag=BAD)", flagger) - result = _execGeneric(flagger, data, func, field=None, nodata=np.nan) - assert np.all(result == expected) + for i, (test, expected) in enumerate(tests): + try: + func = _compileGeneric(f"generic.flag(func={test}, flag=BAD)", flagger) + result = _execGeneric(flagger, data, func, field=None, nodata=np.nan) + assert np.all(result == expected) + except Exception: + print(i, test) + raise + + # test bad combination + for comp in ['>', '>=', '==', '!=', '<', '<=']: + fails = f"isflagged({var1}, comparator='{comp}')" + + func = _compileGeneric(f"generic.flag(func={fails}, flag=BAD)", flagger) + with pytest.raises(ValueError): + _execGeneric(flagger, data, func, field=None, nodata=np.nan) def test_variableAssignments(data): @@ -249,6 +253,7 @@ def test_variableAssignments(data): assert set(result_flagger.columns) == set(data.columns) | {"dummy1", "dummy2"} +# TODO: why this must(!) fail ? 
- a comment would be helpful @pytest.mark.xfail(strict=True) def test_processMultiple(data_diff): var1, var2, *_ = data_diff.columns diff --git a/testsfuzzy/__init__.py b/testsfuzzy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4265cc3e6c16c09774190fa55d609cd9fe0808e4 --- /dev/null +++ b/testsfuzzy/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python diff --git a/testsfuzzy/init.py b/testsfuzzy/init.py new file mode 100644 index 0000000000000000000000000000000000000000..adbbffdc5cd2df8920348cb400a938337eee8087 --- /dev/null +++ b/testsfuzzy/init.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python + + +import numbers +import dios +import numpy as np +import pandas as pd +from typing import get_type_hints + +from hypothesis.strategies import ( + lists, + sampled_from, + composite, + from_regex, + sampled_from, + datetimes, + integers, + register_type_strategy, + from_type, +) +from hypothesis.extra.numpy import arrays, from_dtype +from hypothesis.strategies._internal.types import _global_type_lookup + +from saqc.common import * +from saqc.core.register import FUNC_MAP +from saqc.core.lib import SaQCFunction +from saqc.lib.types import FreqString, ColumnName, IntegerWindow +from saqc.flagger import Flagger, initFlagsLike + +MAX_EXAMPLES = 50 +# MAX_EXAMPLES = 100000 + + +@composite +def dioses(draw, min_cols=1): + """ + initialize data according to the current restrictions + """ + # NOTE: + # The following restrictions showed up and should be enforced during init: + # - Column names need to satisfy the following regex: [A-Za-z0-9_-]+ + # - DatetimeIndex needs to be sorted + # - Integer values larger than 2**53 lead to numerical instabilities during + # the integer->float->integer type conversion in _maskData/_unmaskData. 
+ + cols = draw(lists(columnNames(), unique=True, min_size=min_cols)) + columns = { + c: draw(dataSeries(min_size=3)) + for c in cols + } + return dios.DictOfSeries(columns) + + +@composite +def dataSeries(draw, min_size=0, max_size=100, dtypes=("float32", "float64", "int32", "int64")): + if np.isscalar(dtypes): + dtypes = (dtypes,) + + dtype = np.dtype(draw(sampled_from(dtypes))) + if issubclass(dtype.type, numbers.Integral): + info = np.iinfo(dtype) + elif issubclass(dtype.type, numbers.Real): + info = np.finfo(dtype) + else: + raise ValueError("only numerical dtypes are supported") + # we don't want to fail just because of overflows + elements = from_dtype(dtype, min_value=info.min + 1, max_value=info.max - 1) + + index = draw(daterangeIndexes(min_size=min_size, max_size=max_size)) + values = draw(arrays(dtype=dtype, elements=elements, shape=len(index))) + return pd.Series(data=values, index=index) + + +@composite +def columnNames(draw): + return draw(from_regex(r"[A-Za-z0-9_-]+", fullmatch=True)) + + +@composite +def flaggers(draw, data): + """ + initialize a flagger and set some flags + """ + flagger = initFlagsLike(data) + for col, srs in data.items(): + loc_st = lists(sampled_from(sorted(srs.index)), unique=True, max_size=len(srs) - 1) + flagger[draw(loc_st), col] = BAD + return flagger + + +@composite +def functions(draw, module: str = None): + samples = tuple(FUNC_MAP.values()) + if module: + samples = tuple(f for f in samples if f.name.startswith(module)) + # samples = [FUNC_MAP["drift.correctExponentialDrift"]] + return draw(sampled_from(samples)) + + +@composite +def daterangeIndexes(draw, min_size=0, max_size=100): + min_date = pd.Timestamp("1900-01-01").to_pydatetime() + max_date = pd.Timestamp("2099-12-31").to_pydatetime() + start = draw(datetimes(min_value=min_date, max_value=max_date)) + periods = draw(integers(min_value=min_size, max_value=max_size)) + freq = draw(sampled_from(["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"])) + return 
pd.date_range(start, periods=periods, freq=freq) + + +@composite +def frequencyStrings(draw, _): + freq = draw(sampled_from(["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"])) + mult = draw(integers(min_value=1, max_value=10)) + value = f"{mult}{freq}" + return value + + +@composite +def dataFieldFlagger(draw): + data = draw(dioses()) + field = draw(sampled_from(sorted(data.columns))) + flagger = draw(flaggers(data)) + return data, field, flagger + + +@composite +def functionCalls(draw, module: str = None): + func = draw(functions(module)) + kwargs = draw(functionKwargs(func)) + return func, kwargs + + +@composite +def functionKwargs(draw, func: SaQCFunction): + data = draw(dioses()) + field = draw(sampled_from(sorted(data.columns))) + + kwargs = { + "data": data, + "field": field, + "flagger": draw(flaggers(data)) + } + + column_name_strategy = lambda _: sampled_from(sorted(c for c in data.columns if c != field)) + interger_window_strategy = lambda _: integers(min_value=1, max_value=len(data[field]) - 1) + + register_type_strategy(FreqString, frequencyStrings) + register_type_strategy(ColumnName, column_name_strategy) + register_type_strategy(IntegerWindow, interger_window_strategy) + + for k, v in get_type_hints(func.func).items(): + if k not in {"data", "field", "flagger", "return"}: + value = draw(from_type(v)) + # if v is TimestampColumnName: + # value = draw(columnNames()) + # # we don't want to overwrite 'field' + # assume(value != field) + # # let's generate and add a timestamp column + # data[value] = draw(dataSeries(dtypes="datetime64[ns]", length=len(data[field]))) + # # data[value] = draw(dataSeries(dtypes="datetime64[ns]")) + kwargs[k] = value + + del _global_type_lookup[FreqString] + del _global_type_lookup[ColumnName] + del _global_type_lookup[IntegerWindow] + + return kwargs diff --git a/testsfuzzy/test_functions.py b/testsfuzzy/test_functions.py new file mode 100644 index 
0000000000000000000000000000000000000000..fc3caa00ed468731071578df148f8ed656290e0e --- /dev/null +++ b/testsfuzzy/test_functions.py @@ -0,0 +1,115 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + + +from hypothesis import given, settings +from hypothesis.strategies import data, from_type + +from saqc.core.register import FUNC_MAP +from testsfuzzy.init import MAX_EXAMPLES, functionKwargs + + +@settings(max_examples=MAX_EXAMPLES, deadline=None) +@given(drawer=data()) +def callWontBreak(drawer, func_name: str): + func = FUNC_MAP[func_name] + kwargs = drawer.draw(functionKwargs(func)) + + # TODO: workaround until `flag` is explicitly exposed in signature + flag = drawer.draw(from_type(float)) + kwargs.setdefault('flag', flag) + + func(**kwargs) + + +# breaks +# ------ + +# NOTE: +# needs a more elaborated test, as it calls into +# `changepoints.assignChangePointClusters` +def test_breaks_flagJumps(): + callWontBreak("breaks.flagJumps") + + +def test_breaks_flagIsolated(): + callWontBreak("breaks.flagIsolated") + + +def test_breaks_flagMissing(): + callWontBreak("breaks.flagMissing") + + +# constants +# --------- + +def test_constats_flagConstats(): + callWontBreak("constants.flagConstants") + + +def test_constants_flagByVariance(): + callWontBreak("constants.flagByVariance") + + +# flagtools +# --------- + +def test_flagtools_clearFlags(): + callWontBreak("flagtools.clearFlags") + + +def test_flagtools_forceFlags(): + callWontBreak("flagtools.clearFlags") + + +# NOTE: +# all of the following tests fail to sample data for `flag=typing.Any` +# with the new flagger in place this should be easy to fix +def test_flagtools_flagGood(): + callWontBreak("flagtools.flagGood") + + +def test_flagtools_flagUnflagged(): + callWontBreak("flagtools.flagUnflagged") + + +# NOTE: the problem is `mflag` which can be Any +# def test_flagtools_flagManual(): +# callWontBreak("flagtools.flagManual") + + +# outliers +# -------- +# +# NOTE: needs a more elaborated test, I guess +# def 
test_outliers_flagByStray(): +# callWontBreak("outliers.flagByStray") + + +# NOTE: fails in a strategy, maybe `Sequence[ColumnName]` +# def test_outliers_flagMVScores(): +# callWontBreak("outliers.flagMVScores") + + +# NOTE: +# fails as certain combinations of frequency strings don't make sense +# a more elaborate test is needed +# def test_outliers_flagRaise(): +# callWontBreak("outliers.flagRaise") +# +# +# def test_outliers_flagMAD(): +# callWontBreak("outliers.flagMAD") +# +# +# def test_outliers_flagByGrubbs(): +# callWontBreak("outliers.flagByGrubbs") +# +# +# def test_outliers_flagRange(): +# callWontBreak("outliers.flagRange") + + +# NOTE: fails in a strategy, maybe `Sequence[ColumnName]` +# def test_outliers_flagCrossStatistic(): +# callWontBreak("outliers.flagCrossStatistic") diff --git a/test/core/test_masking.py b/testsfuzzy/test_masking.py similarity index 98% rename from test/core/test_masking.py rename to testsfuzzy/test_masking.py index 6236a55e2a0677ea59cbee6eb6ebaeca977838bd..b1eb5861e40581b4365eede2cadeb654d419ec8f 100644 --- a/test/core/test_masking.py +++ b/testsfuzzy/test_masking.py @@ -16,7 +16,7 @@ from saqc.common import * from saqc.flagger import Flagger, initFlagsLike from saqc.core.register import _maskData, _unmaskData -from test.common import dataFieldFlagger, MAX_EXAMPLES +from testsfuzzy.init import dataFieldFlagger, MAX_EXAMPLES logging.disable(logging.CRITICAL)