moved fuzzy tests to own dir

ae00e0e2 · Bert Palm · fba49a89 · ae00e0e2 · ae00e0e2 · ae00e0e2
Commit ae00e0e2 authored 4 years ago by Bert Palm 🎇
--- a/test/common.py
+++ b/test/common.py
@@ -2,32 +2,11 @@
 # -*- coding: utf-8 -*-

 import io
-from typing import get_type_hints
-
 import numpy as np
 import pandas as pd
 import dios

-from hypothesis.strategies import (
-    lists,
-    sampled_from,
-    composite,
-    from_regex,
-    sampled_from,
-    datetimes,
-    integers,
-    register_type_strategy,
-    from_type,
-)
-from hypothesis.extra.numpy import arrays, from_dtype
-from hypothesis.strategies._internal.types import _global_type_lookup
-
-from dios import DictOfSeries
-
 from saqc.common import *
-from saqc.core.register import FUNC_MAP
-from saqc.core.lib import SaQCFunction
-from saqc.lib.types import FreqString, ColumnName, IntegerWindow
 from saqc.flagger import Flagger, initFlagsLike


@@ -63,141 +42,3 @@ def writeIO(content):
    return f


-MAX_EXAMPLES = 50 #100000
-
-
-@composite
-def dioses(draw, min_cols=1):
-    """
-    initialize data according to the current restrictions
-    """
-    # NOTE:
-    # The following restriction showed up and should be enforced during init:
-    # - Column names need to satisify the following regex: [A-Za-z0-9_-]+
-    # - DatetimeIndex needs to be sorted
-    # - Integer values larger than 2**53 lead to numerical instabilities during
-    #   the integer->float->integer type conversion in _maskData/_unmaskData.
-
-    cols = draw(lists(columnNames(), unique=True, min_size=min_cols))
-    columns = {
-        c: draw(dataSeries(min_size=3))
-        for c in cols
-    }
-    return DictOfSeries(columns)
-
-import numbers
-
-@composite
-def dataSeries(draw, min_size=0, max_size=100, dtypes=("float32", "float64", "int32", "int64")):
-    if np.isscalar(dtypes):
-        dtypes = (dtypes,)
-
-    dtype = np.dtype(draw(sampled_from(dtypes)))
-    if issubclass(dtype.type, numbers.Integral):
-        info = np.iinfo(dtype)
-    elif issubclass(dtype.type, numbers.Real):
-        info = np.finfo(dtype)
-    else:
-        raise ValueError("only numerical dtypes are supported")
-    # we don't want to fail just because of overflows
-    elements = from_dtype(dtype, min_value=info.min+1, max_value=info.max-1)
-
-    index = draw(daterangeIndexes(min_size=min_size, max_size=max_size))
-    values = draw(arrays(dtype=dtype, elements=elements, shape=len(index)))
-    return pd.Series(data=values, index=index)
-
-
-@composite
-def columnNames(draw):
-    return draw(from_regex(r"[A-Za-z0-9_-]+", fullmatch=True))
-
-
-@composite
-def flaggers(draw, data):
-    """
-    initialize a flagger and set some flags
-    """
-    # flagger = draw(sampled_from(TESTFLAGGER)).initFlags(data)
-    flagger = initFlagsLike(data)
-    for col, srs in data.items():
-        loc_st = lists(sampled_from(sorted(srs.index)), unique=True, max_size=len(srs)-1)
-        flagger[draw(loc_st), col] = BAD
-    return flagger
-
-
-@composite
-def functions(draw, module: str=None):
-    samples = tuple(FUNC_MAP.values())
-    if module:
-        samples = tuple(f for f in samples if f.name.startswith(module))
-    # samples = [FUNC_MAP["drift.correctExponentialDrift"]]
-    return draw(sampled_from(samples))
-
-
-@composite
-def daterangeIndexes(draw, min_size=0, max_size=100):
-    min_date = pd.Timestamp("1900-01-01").to_pydatetime()
-    max_date = pd.Timestamp("2099-12-31").to_pydatetime()
-    start = draw(datetimes(min_value=min_date, max_value=max_date))
-    periods = draw(integers(min_value=min_size, max_value=max_size))
-    freq = draw(sampled_from(["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]))
-    return pd.date_range(start, periods=periods, freq=freq)
-
-
-@composite
-def frequencyStrings(draw, _):
-    freq = draw(sampled_from(["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]))
-    mult = draw(integers(min_value=1, max_value=10))
-    value = f"{mult}{freq}"
-    return value
-
-@composite
-def dataFieldFlagger(draw):
-    data = draw(dioses())
-    field = draw(sampled_from(sorted(data.columns)))
-    flagger = draw(flaggers(data))
-    return data, field, flagger
-
-
-@composite
-def functionCalls(draw, module: str=None):
-    func = draw(functions(module))
-    kwargs = draw(functionKwargs(func))
-    return func, kwargs
-
-
-@composite
-def functionKwargs(draw, func: SaQCFunction):
-    data = draw(dioses())
-    field = draw(sampled_from(sorted(data.columns)))
-
-    kwargs = {
-        "data": data,
-        "field": field,
-        "flagger": draw(flaggers(data))
-    }
-
-    column_name_strategy = lambda _: sampled_from(sorted(c for c in data.columns if c != field))
-    interger_window_strategy = lambda _: integers(min_value=1, max_value=len(data[field]) - 1)
-
-    register_type_strategy(FreqString, frequencyStrings)
-    register_type_strategy(ColumnName, column_name_strategy)
-    register_type_strategy(IntegerWindow, interger_window_strategy)
-
-    for k, v in get_type_hints(func.func).items():
-        if k not in {"data", "field", "flagger", "return"}:
-            value = draw(from_type(v))
-            # if v is TimestampColumnName:
-            #     value = draw(columnNames())
-            #     # we don't want to overwrite 'field'
-            #     assume(value != field)
-            #     # let's generate and add a timestamp column
-            #     data[value] = draw(dataSeries(dtypes="datetime64[ns]", length=len(data[field])))
-            #     # data[value] = draw(dataSeries(dtypes="datetime64[ns]"))
-            kwargs[k] = value
-
-    del _global_type_lookup[FreqString]
-    del _global_type_lookup[ColumnName]
-    del _global_type_lookup[IntegerWindow]
-
-    return kwargs
--- a/testsfuzzy/__init__.py
+++ b/testsfuzzy/__init__.py
+#!/usr/bin/env python
--- a/testsfuzzy/init.py
+++ b/testsfuzzy/init.py
+#!/usr/bin/env python
+
+
+import numbers
+import dios
+import numpy as np
+import pandas as pd
+from typing import get_type_hints
+
+from hypothesis.strategies import (
+    lists,
+    sampled_from,
+    composite,
+    from_regex,
+    sampled_from,
+    datetimes,
+    integers,
+    register_type_strategy,
+    from_type,
+)
+from hypothesis.extra.numpy import arrays, from_dtype
+from hypothesis.strategies._internal.types import _global_type_lookup
+
+from saqc.common import *
+from saqc.core.register import FUNC_MAP
+from saqc.core.lib import SaQCFunction
+from saqc.lib.types import FreqString, ColumnName, IntegerWindow
+from saqc.flagger import Flagger, initFlagsLike
+
+MAX_EXAMPLES = 50
+
+
+# MAX_EXAMPLES = 100000
+
+
+@composite
+def dioses(draw, min_cols=1):
+    """
+    initialize data according to the current restrictions
+    """
+    # NOTE:
+    # The following restrictions showed up and should be enforced during init:
+    # - Column names need to satisfy the following regex: [A-Za-z0-9_-]+
+    # - DatetimeIndex needs to be sorted
+    # - Integer values larger than 2**53 lead to numerical instabilities during
+    #   the integer->float->integer type conversion in _maskData/_unmaskData.
+
+    cols = draw(lists(columnNames(), unique=True, min_size=min_cols))
+    columns = {
+        c: draw(dataSeries(min_size=3))
+        for c in cols
+    }
+    return dios.DictOfSeries(columns)
+
+
+@composite
+def dataSeries(draw, min_size=0, max_size=100, dtypes=("float32", "float64", "int32", "int64")):
+    if np.isscalar(dtypes):
+        dtypes = (dtypes,)
+
+    dtype = np.dtype(draw(sampled_from(dtypes)))
+    if issubclass(dtype.type, numbers.Integral):
+        info = np.iinfo(dtype)
+    elif issubclass(dtype.type, numbers.Real):
+        info = np.finfo(dtype)
+    else:
+        raise ValueError("only numerical dtypes are supported")
+    # we don't want to fail just because of overflows
+    elements = from_dtype(dtype, min_value=info.min + 1, max_value=info.max - 1)
+
+    index = draw(daterangeIndexes(min_size=min_size, max_size=max_size))
+    values = draw(arrays(dtype=dtype, elements=elements, shape=len(index)))
+    return pd.Series(data=values, index=index)
+
+
+@composite
+def columnNames(draw):
+    return draw(from_regex(r"[A-Za-z0-9_-]+", fullmatch=True))
+
+
+@composite
+def flaggers(draw, data):
+    """
+    initialize a flagger and set some flags
+    """
+    flagger = initFlagsLike(data)
+    for col, srs in data.items():
+        loc_st = lists(sampled_from(sorted(srs.index)), unique=True, max_size=len(srs) - 1)
+        flagger[draw(loc_st), col] = BAD
+    return flagger
+
+
+@composite
+def functions(draw, module: str = None):
+    samples = tuple(FUNC_MAP.values())
+    if module:
+        samples = tuple(f for f in samples if f.name.startswith(module))
+    # samples = [FUNC_MAP["drift.correctExponentialDrift"]]
+    return draw(sampled_from(samples))
+
+
+@composite
+def daterangeIndexes(draw, min_size=0, max_size=100):
+    min_date = pd.Timestamp("1900-01-01").to_pydatetime()
+    max_date = pd.Timestamp("2099-12-31").to_pydatetime()
+    start = draw(datetimes(min_value=min_date, max_value=max_date))
+    periods = draw(integers(min_value=min_size, max_value=max_size))
+    freq = draw(sampled_from(["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]))
+    return pd.date_range(start, periods=periods, freq=freq)
+
+
+@composite
+def frequencyStrings(draw, _):
+    freq = draw(sampled_from(["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]))
+    mult = draw(integers(min_value=1, max_value=10))
+    value = f"{mult}{freq}"
+    return value
+
+
+@composite
+def dataFieldFlagger(draw):
+    data = draw(dioses())
+    field = draw(sampled_from(sorted(data.columns)))
+    flagger = draw(flaggers(data))
+    return data, field, flagger
+
+
+@composite
+def functionCalls(draw, module: str = None):
+    func = draw(functions(module))
+    kwargs = draw(functionKwargs(func))
+    return func, kwargs
+
+
+@composite
+def functionKwargs(draw, func: SaQCFunction):
+    data = draw(dioses())
+    field = draw(sampled_from(sorted(data.columns)))
+
+    kwargs = {
+        "data": data,
+        "field": field,
+        "flagger": draw(flaggers(data))
+    }
+
+    column_name_strategy = lambda _: sampled_from(sorted(c for c in data.columns if c != field))
+    interger_window_strategy = lambda _: integers(min_value=1, max_value=len(data[field]) - 1)
+
+    register_type_strategy(FreqString, frequencyStrings)
+    register_type_strategy(ColumnName, column_name_strategy)
+    register_type_strategy(IntegerWindow, interger_window_strategy)
+
+    for k, v in get_type_hints(func.func).items():
+        if k not in {"data", "field", "flagger", "return"}:
+            value = draw(from_type(v))
+            # if v is TimestampColumnName:
+            #     value = draw(columnNames())
+            #     # we don't want to overwrite 'field'
+            #     assume(value != field)
+            #     # let's generate and add a timestamp column
+            #     data[value] = draw(dataSeries(dtypes="datetime64[ns]", length=len(data[field])))
+            #     # data[value] = draw(dataSeries(dtypes="datetime64[ns]"))
+            kwargs[k] = value
+
+    del _global_type_lookup[FreqString]
+    del _global_type_lookup[ColumnName]
+    del _global_type_lookup[IntegerWindow]
+
+    return kwargs
--- a/test/funcs/test_functions_fuzzy.py
+++ b/test/funcs/test_functions_fuzzy.py
 #! /usr/bin/env python
 # -*- coding: utf-8 -*-

-from saqc.core.register import FUNC_MAP

 from hypothesis import given, settings
-from hypothesis.strategies import (
-    data,
-)
+from hypothesis.strategies import data

-from test.common import MAX_EXAMPLES, functionKwargs
+from saqc.core.register import FUNC_MAP
+from testsfuzzy.init import MAX_EXAMPLES, functionKwargs


 @settings(max_examples=MAX_EXAMPLES, deadline=None)
@@ -23,12 +21,15 @@ def callWontBreak(drawer, func_name: str):
 # ------

 # NOTE:
-# needs a more alaborated test, as it calls into
+# needs a more elaborated test, as it calls into
 # `changepoints.assignChangePointClusters`
-# def test_breaks_flagJumps():
-#     callWontBreak("breaks.flagJumps")
-# def test_breaks_flagIsolated():
-#     callWontBreak("breaks.flagIsolated")
+def test_breaks_flagJumps():
+    callWontBreak("breaks.flagJumps")
+
+
+def test_breaks_flagIsolated():
+    callWontBreak("breaks.flagIsolated")
+

 def test_breaks_flagMissing():
    callWontBreak("breaks.flagMissing")
@@ -40,6 +41,7 @@ def test_breaks_flagMissing():
 def test_constats_flagConstats():
    callWontBreak("constants.flagConstants")

+
 def test_constants_flagByVariance():
    callWontBreak("constants.flagByVariance")

@@ -50,48 +52,58 @@ def test_constants_flagByVariance():
 def test_flagtools_clearFlags():
    callWontBreak("flagtools.clearFlags")

+
 def test_flagtools_forceFlags():
    callWontBreak("flagtools.clearFlags")

+
 # NOTE:
 # all of the following tests fail to sample data for `flag=typing.Any`
 # with the new flagger in place this should be easy to fix
-# def test_flagtools_flagGood():
-#     callWontBreak("flagtools.flagGood")
+def test_flagtools_flagGood():
+    callWontBreak("flagtools.flagGood")

-# def test_flagtools_flagUnflagged():
-#     callWontBreak("flagtools.flagUnflagged")

-# def test_flagtools_flagManual():
-#     callWontBreak("flagtools.flagManual")
+def test_flagtools_flagUnflagged():
+    callWontBreak("flagtools.flagUnflagged")
+
+
+def test_flagtools_flagManual():
+    callWontBreak("flagtools.flagManual")


 # outliers
 # --------

 # NOTE: needs a more elaborated test, I guess
-# def test_outliers_flagByStray():
-#     callWontBreak("outliers.flagByStray")
+def test_outliers_flagByStray():
+    callWontBreak("outliers.flagByStray")
+

 # NOTE: fails in a strategy, maybe `Sequence[ColumnName]`
-# def test_outliers_flagMVScores():
-#     callWontBreak("outliers.flagMVScores")
+def test_outliers_flagMVScores():
+    callWontBreak("outliers.flagMVScores")
+

 # NOTE:
 # fails as certain combinations of frquency strings don't make sense
 # a more elaborate test is needed
-# def test_outliers_flagRaise():
-#     callWontBreak("outliers.flagRaise")
+def test_outliers_flagRaise():
+    callWontBreak("outliers.flagRaise")
+

 def test_outliers_flagMAD():
    callWontBreak("outliers.flagMAD")

+
 def test_outliers_flagByGrubbs():
    callWontBreak("outliers.flagByGrubbs")

+
 def test_outliers_flagRange():
    callWontBreak("outliers.flagRange")

+
 # NOTE: fails in a strategy, maybe `Sequence[ColumnName]`
-# def test_outliers_flagCrossStatistic():
-#     callWontBreak("outliers.flagCrossStatistic")
+def test_outliers_flagCrossStatistic():
+    callWontBreak("outliers.flagCrossStatistic")
--- a/test/core/test_masking.py
+++ b/test/core/test_masking.py
@@ -16,7 +16,7 @@ from saqc.common import *
 from saqc.flagger import Flagger, initFlagsLike
 from saqc.core.register import _maskData, _unmaskData

-from test.common import dataFieldFlagger, MAX_EXAMPLES
+from testsfuzzy.init import dataFieldFlagger, MAX_EXAMPLES


 logging.disable(logging.CRITICAL)