Skip to content
Snippets Groups Projects
Commit ae00e0e2 authored by Bert Palm's avatar Bert Palm 🎇
Browse files

moved fuzzy tests to own dir

parent fba49a89
No related branches found
No related tags found
4 merge requests!271Static expansion of regular expressions,!260Follow-Up Translations,!237Flagger Translations,!232WIP: Fuzzy testing
......@@ -2,32 +2,11 @@
# -*- coding: utf-8 -*-
import io
from typing import get_type_hints
import numpy as np
import pandas as pd
import dios
from hypothesis.strategies import (
lists,
sampled_from,
composite,
from_regex,
sampled_from,
datetimes,
integers,
register_type_strategy,
from_type,
)
from hypothesis.extra.numpy import arrays, from_dtype
from hypothesis.strategies._internal.types import _global_type_lookup
from dios import DictOfSeries
from saqc.common import *
from saqc.core.register import FUNC_MAP
from saqc.core.lib import SaQCFunction
from saqc.lib.types import FreqString, ColumnName, IntegerWindow
from saqc.flagger import Flagger, initFlagsLike
......@@ -63,141 +42,3 @@ def writeIO(content):
return f
MAX_EXAMPLES = 50 #100000
@composite
def dioses(draw, min_cols=1):
    """
    Draw a ``DictOfSeries`` with at least `min_cols` uniquely named columns.

    Column names come from ``columnNames()``; every column is drawn
    independently from ``dataSeries(min_size=3)``.
    """
    # NOTE:
    # The following restrictions showed up and should be enforced during init:
    # - Column names need to satisfy the following regex: [A-Za-z0-9_-]+
    # - DatetimeIndex needs to be sorted
    # - Integer values larger than 2**53 lead to numerical instabilities during
    #   the integer->float->integer type conversion in _maskData/_unmaskData.
    names = draw(lists(columnNames(), unique=True, min_size=min_cols))
    series_by_name = {}
    for name in names:
        series_by_name[name] = draw(dataSeries(min_size=3))
    return DictOfSeries(series_by_name)
import numbers
@composite
def dataSeries(draw, min_size=0, max_size=100, dtypes=("float32", "float64", "int32", "int64")):
    """
    Draw a numeric ``pd.Series`` over an index drawn from ``daterangeIndexes``.

    `dtypes` is either a single dtype or a collection of dtypes to sample
    from; `min_size` and `max_size` are forwarded to ``daterangeIndexes``.
    Raises ``ValueError`` if the sampled dtype is neither integral nor real.
    """
    candidates = (dtypes,) if np.isscalar(dtypes) else dtypes
    dtype = np.dtype(draw(sampled_from(candidates)))

    scalar_type = dtype.type
    if issubclass(scalar_type, numbers.Integral):
        limits = np.iinfo(dtype)
    elif issubclass(scalar_type, numbers.Real):
        limits = np.finfo(dtype)
    else:
        raise ValueError("only numerical dtypes are supported")

    # we don't want to fail just because of overflows
    lower, upper = limits.min + 1, limits.max - 1
    value_strategy = from_dtype(dtype, min_value=lower, max_value=upper)

    index = draw(daterangeIndexes(min_size=min_size, max_size=max_size))
    values = draw(arrays(dtype=dtype, elements=value_strategy, shape=len(index)))
    return pd.Series(data=values, index=index)
@composite
def columnNames(draw):
    """Draw a string that fully matches the regex ``[A-Za-z0-9_-]+``."""
    name_strategy = from_regex(r"[A-Za-z0-9_-]+", fullmatch=True)
    return draw(name_strategy)
@composite
def flaggers(draw, data):
    """
    Create flags via ``initFlagsLike(data)`` and set a drawn subset to ``BAD``.

    For every column of `data`, a list of unique index values with at most
    ``len(series) - 1`` entries is drawn and flagged ``BAD``.
    """
    flagger = initFlagsLike(data)
    for column, series in data.items():
        timestamps = sorted(series.index)
        subset_strategy = lists(
            sampled_from(timestamps),
            unique=True,
            max_size=len(series) - 1,
        )
        flagged = draw(subset_strategy)
        flagger[flagged, column] = BAD
    return flagger
@composite
def functions(draw, module: str = None):
    """
    Draw one of the functions stored in ``FUNC_MAP``.

    If `module` is truthy, only functions whose ``name`` starts with
    `module` are candidates.
    """
    candidates = tuple(FUNC_MAP.values())
    if module:
        candidates = tuple(filter(lambda f: f.name.startswith(module), candidates))
    return draw(sampled_from(candidates))
@composite
def daterangeIndexes(draw, min_size=0, max_size=100):
    """
    Draw an index built with ``pd.date_range``.

    The start is drawn between 1900-01-01 and 2099-12-31, the number of
    periods between `min_size` and `max_size` (both inclusive) and the
    frequency from a fixed list of aliases.
    """
    earliest = pd.Timestamp("1900-01-01").to_pydatetime()
    latest = pd.Timestamp("2099-12-31").to_pydatetime()
    aliases = ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]

    start = draw(datetimes(min_value=earliest, max_value=latest))
    periods = draw(integers(min_value=min_size, max_value=max_size))
    freq = draw(sampled_from(aliases))
    return pd.date_range(start, periods=periods, freq=freq)
@composite
def frequencyStrings(draw, _):
    """
    Draw a frequency string: a multiplier in 1..10 followed by an alias.

    The second positional argument is accepted but ignored.
    """
    aliases = ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
    alias = draw(sampled_from(aliases))
    factor = draw(integers(min_value=1, max_value=10))
    return f"{factor}{alias}"
@composite
def dataFieldFlagger(draw):
    """
    Draw a ``(data, field, flagger)`` triple.

    `data` comes from ``dioses()``, `field` is one of its columns and
    `flagger` is drawn from ``flaggers(data)``.
    """
    data = draw(dioses())
    column_choices = sorted(data.columns)
    field = draw(sampled_from(column_choices))
    return data, field, draw(flaggers(data))
@composite
def functionCalls(draw, module: str = None):
    """
    Draw a ``(func, kwargs)`` pair.

    `func` comes from ``functions(module)`` and `kwargs` from
    ``functionKwargs(func)``.
    """
    chosen = draw(functions(module))
    return chosen, draw(functionKwargs(chosen))
@composite
def functionKwargs(draw, func: SaQCFunction):
    """
    Draw keyword arguments for calling `func`.

    ``data``, ``field`` and ``flagger`` are drawn first. Every other
    type-hinted parameter of ``func.func`` is then drawn with ``from_type``.
    Strategies for ``FreqString``, ``ColumnName`` and ``IntegerWindow`` are
    registered for the duration of this call only, because two of them depend
    on the `data`/`field` drawn here.

    Returns the dictionary of drawn keyword arguments.
    """
    data = draw(dioses())
    field = draw(sampled_from(sorted(data.columns)))
    kwargs = {
        "data": data,
        "field": field,
        "flagger": draw(flaggers(data)),
    }

    # register_type_strategy calls each of these with the requested type,
    # hence the ignored lambda parameter
    type_strategies = {
        FreqString: frequencyStrings,
        # any column except `field`
        ColumnName: lambda _: sampled_from(sorted(c for c in data.columns if c != field)),
        IntegerWindow: lambda _: integers(min_value=1, max_value=len(data[field]) - 1),
    }

    try:
        for type_, strategy in type_strategies.items():
            register_type_strategy(type_, strategy)

        for name, hint in get_type_hints(func.func).items():
            if name in {"data", "field", "flagger", "return"}:
                continue
            kwargs[name] = draw(from_type(hint))
    finally:
        # draw()/from_type() may raise (e.g. when an example is aborted);
        # always drop the per-call strategies so they cannot leak into later
        # examples. pop() keeps the cleanup itself from raising.
        for type_ in type_strategies:
            _global_type_lookup.pop(type_, None)

    return kwargs
#!/usr/bin/env python
#!/usr/bin/env python
import numbers
import dios
import numpy as np
import pandas as pd
from typing import get_type_hints
from hypothesis.strategies import (
lists,
sampled_from,
composite,
from_regex,
sampled_from,
datetimes,
integers,
register_type_strategy,
from_type,
)
from hypothesis.extra.numpy import arrays, from_dtype
from hypothesis.strategies._internal.types import _global_type_lookup
from saqc.common import *
from saqc.core.register import FUNC_MAP
from saqc.core.lib import SaQCFunction
from saqc.lib.types import FreqString, ColumnName, IntegerWindow
from saqc.flagger import Flagger, initFlagsLike
MAX_EXAMPLES = 50
# MAX_EXAMPLES = 100000
@composite
def dioses(draw, min_cols=1):
    """
    Draw a ``dios.DictOfSeries`` with at least `min_cols` uniquely named columns.

    Column names come from ``columnNames()``; every column is drawn
    independently from ``dataSeries(min_size=3)``.
    """
    # NOTE:
    # The following restrictions showed up and should be enforced during init:
    # - Column names need to satisfy the following regex: [A-Za-z0-9_-]+
    # - DatetimeIndex needs to be sorted
    # - Integer values larger than 2**53 lead to numerical instabilities during
    #   the integer->float->integer type conversion in _maskData/_unmaskData.
    names = draw(lists(columnNames(), unique=True, min_size=min_cols))
    series_by_name = {}
    for name in names:
        series_by_name[name] = draw(dataSeries(min_size=3))
    return dios.DictOfSeries(series_by_name)
@composite
def dataSeries(draw, min_size=0, max_size=100, dtypes=("float32", "float64", "int32", "int64")):
    """
    Draw a numeric ``pd.Series`` over an index drawn from ``daterangeIndexes``.

    `dtypes` is either a single dtype or a collection of dtypes to sample
    from; `min_size` and `max_size` are forwarded to ``daterangeIndexes``.
    Raises ``ValueError`` if the sampled dtype is neither integral nor real.
    """
    candidates = (dtypes,) if np.isscalar(dtypes) else dtypes
    dtype = np.dtype(draw(sampled_from(candidates)))

    scalar_type = dtype.type
    if issubclass(scalar_type, numbers.Integral):
        limits = np.iinfo(dtype)
    elif issubclass(scalar_type, numbers.Real):
        limits = np.finfo(dtype)
    else:
        raise ValueError("only numerical dtypes are supported")

    # we don't want to fail just because of overflows
    lower, upper = limits.min + 1, limits.max - 1
    value_strategy = from_dtype(dtype, min_value=lower, max_value=upper)

    index = draw(daterangeIndexes(min_size=min_size, max_size=max_size))
    values = draw(arrays(dtype=dtype, elements=value_strategy, shape=len(index)))
    return pd.Series(data=values, index=index)
@composite
def columnNames(draw):
    """Draw a string that fully matches the regex ``[A-Za-z0-9_-]+``."""
    name_strategy = from_regex(r"[A-Za-z0-9_-]+", fullmatch=True)
    return draw(name_strategy)
@composite
def flaggers(draw, data):
    """
    Create flags via ``initFlagsLike(data)`` and set a drawn subset to ``BAD``.

    For every column of `data`, a list of unique index values with at most
    ``len(series) - 1`` entries is drawn and flagged ``BAD``.
    """
    flagger = initFlagsLike(data)
    for column, series in data.items():
        timestamps = sorted(series.index)
        subset_strategy = lists(
            sampled_from(timestamps),
            unique=True,
            max_size=len(series) - 1,
        )
        flagged = draw(subset_strategy)
        flagger[flagged, column] = BAD
    return flagger
@composite
def functions(draw, module: str = None):
    """
    Draw one of the functions stored in ``FUNC_MAP``.

    If `module` is truthy, only functions whose ``name`` starts with
    `module` are candidates.
    """
    candidates = tuple(FUNC_MAP.values())
    if module:
        candidates = tuple(filter(lambda f: f.name.startswith(module), candidates))
    return draw(sampled_from(candidates))
@composite
def daterangeIndexes(draw, min_size=0, max_size=100):
    """
    Draw an index built with ``pd.date_range``.

    The start is drawn between 1900-01-01 and 2099-12-31, the number of
    periods between `min_size` and `max_size` (both inclusive) and the
    frequency from a fixed list of aliases.
    """
    earliest = pd.Timestamp("1900-01-01").to_pydatetime()
    latest = pd.Timestamp("2099-12-31").to_pydatetime()
    aliases = ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]

    start = draw(datetimes(min_value=earliest, max_value=latest))
    periods = draw(integers(min_value=min_size, max_value=max_size))
    freq = draw(sampled_from(aliases))
    return pd.date_range(start, periods=periods, freq=freq)
@composite
def frequencyStrings(draw, _):
    """
    Draw a frequency string: a multiplier in 1..10 followed by an alias.

    The second positional argument is accepted but ignored.
    """
    aliases = ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
    alias = draw(sampled_from(aliases))
    factor = draw(integers(min_value=1, max_value=10))
    return f"{factor}{alias}"
@composite
def dataFieldFlagger(draw):
    """
    Draw a ``(data, field, flagger)`` triple.

    `data` comes from ``dioses()``, `field` is one of its columns and
    `flagger` is drawn from ``flaggers(data)``.
    """
    data = draw(dioses())
    column_choices = sorted(data.columns)
    field = draw(sampled_from(column_choices))
    return data, field, draw(flaggers(data))
@composite
def functionCalls(draw, module: str = None):
    """
    Draw a ``(func, kwargs)`` pair.

    `func` comes from ``functions(module)`` and `kwargs` from
    ``functionKwargs(func)``.
    """
    chosen = draw(functions(module))
    return chosen, draw(functionKwargs(chosen))
@composite
def functionKwargs(draw, func: SaQCFunction):
    """
    Draw keyword arguments for calling `func`.

    ``data``, ``field`` and ``flagger`` are drawn first. Every other
    type-hinted parameter of ``func.func`` is then drawn with ``from_type``.
    Strategies for ``FreqString``, ``ColumnName`` and ``IntegerWindow`` are
    registered for the duration of this call only, because two of them depend
    on the `data`/`field` drawn here.

    Returns the dictionary of drawn keyword arguments.
    """
    data = draw(dioses())
    field = draw(sampled_from(sorted(data.columns)))
    kwargs = {
        "data": data,
        "field": field,
        "flagger": draw(flaggers(data)),
    }

    # register_type_strategy calls each of these with the requested type,
    # hence the ignored lambda parameter
    type_strategies = {
        FreqString: frequencyStrings,
        # any column except `field`
        ColumnName: lambda _: sampled_from(sorted(c for c in data.columns if c != field)),
        IntegerWindow: lambda _: integers(min_value=1, max_value=len(data[field]) - 1),
    }

    try:
        for type_, strategy in type_strategies.items():
            register_type_strategy(type_, strategy)

        for name, hint in get_type_hints(func.func).items():
            if name in {"data", "field", "flagger", "return"}:
                continue
            kwargs[name] = draw(from_type(hint))
    finally:
        # draw()/from_type() may raise (e.g. when an example is aborted);
        # always drop the per-call strategies so they cannot leak into later
        # examples. pop() keeps the cleanup itself from raising.
        for type_ in type_strategies:
            _global_type_lookup.pop(type_, None)

    return kwargs
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from saqc.core.register import FUNC_MAP
from hypothesis import given, settings
from hypothesis.strategies import (
data,
)
from hypothesis.strategies import data
from test.common import MAX_EXAMPLES, functionKwargs
from saqc.core.register import FUNC_MAP
from testsfuzzy.init import MAX_EXAMPLES, functionKwargs
@settings(max_examples=MAX_EXAMPLES, deadline=None)
......@@ -23,12 +21,15 @@ def callWontBreak(drawer, func_name: str):
# ------
# NOTE:
# needs a more alaborated test, as it calls into
# needs a more elaborated test, as it calls into
# `changepoints.assignChangePointClusters`
# def test_breaks_flagJumps():
# callWontBreak("breaks.flagJumps")
# def test_breaks_flagIsolated():
# callWontBreak("breaks.flagIsolated")
def test_breaks_flagJumps():
    """Run ``callWontBreak`` for ``breaks.flagJumps``."""
    func_name = "breaks.flagJumps"
    callWontBreak(func_name)
def test_breaks_flagIsolated():
    """Run ``callWontBreak`` for ``breaks.flagIsolated``."""
    func_name = "breaks.flagIsolated"
    callWontBreak(func_name)
def test_breaks_flagMissing():
    """Run ``callWontBreak`` for ``breaks.flagMissing``."""
    func_name = "breaks.flagMissing"
    callWontBreak(func_name)
......@@ -40,6 +41,7 @@ def test_breaks_flagMissing():
def test_constats_flagConstats():
    """Run ``callWontBreak`` for ``constants.flagConstants``."""
    func_name = "constants.flagConstants"
    callWontBreak(func_name)
def test_constants_flagByVariance():
    """Run ``callWontBreak`` for ``constants.flagByVariance``."""
    func_name = "constants.flagByVariance"
    callWontBreak(func_name)
......@@ -50,48 +52,58 @@ def test_constants_flagByVariance():
def test_flagtools_clearFlags():
    """Run ``callWontBreak`` for ``flagtools.clearFlags``."""
    func_name = "flagtools.clearFlags"
    callWontBreak(func_name)
def test_flagtools_forceFlags():
    """Run ``callWontBreak`` for ``flagtools.forceFlags``."""
    # This test used to pass "flagtools.clearFlags" (copy/paste slip), so
    # forceFlags itself was never exercised.
    callWontBreak("flagtools.forceFlags")
# NOTE:
# all of the following tests fail to sample data for `flag=typing.Any`
# with the new flagger in place this should be easy to fix
# def test_flagtools_flagGood():
# callWontBreak("flagtools.flagGood")
def test_flagtools_flagGood():
    """Run ``callWontBreak`` for ``flagtools.flagGood``."""
    func_name = "flagtools.flagGood"
    callWontBreak(func_name)
# def test_flagtools_flagUnflagged():
# callWontBreak("flagtools.flagUnflagged")
# def test_flagtools_flagManual():
# callWontBreak("flagtools.flagManual")
def test_flagtools_flagUnflagged():
    """Run ``callWontBreak`` for ``flagtools.flagUnflagged``."""
    func_name = "flagtools.flagUnflagged"
    callWontBreak(func_name)
def test_flagtools_flagManual():
    """Run ``callWontBreak`` for ``flagtools.flagManual``."""
    func_name = "flagtools.flagManual"
    callWontBreak(func_name)
# outliers
# --------
# NOTE: needs a more elaborated test, I guess
# def test_outliers_flagByStray():
# callWontBreak("outliers.flagByStray")
def test_outliers_flagByStray():
    """Run ``callWontBreak`` for ``outliers.flagByStray``."""
    func_name = "outliers.flagByStray"
    callWontBreak(func_name)
# NOTE: fails in a strategy, maybe `Sequence[ColumnName]`
# def test_outliers_flagMVScores():
# callWontBreak("outliers.flagMVScores")
def test_outliers_flagMVScores():
    """Run ``callWontBreak`` for ``outliers.flagMVScores``."""
    func_name = "outliers.flagMVScores"
    callWontBreak(func_name)
# NOTE:
# fails as certain combinations of frequency strings don't make sense
# a more elaborate test is needed
# def test_outliers_flagRaise():
# callWontBreak("outliers.flagRaise")
def test_outliers_flagRaise():
    """Run ``callWontBreak`` for ``outliers.flagRaise``."""
    func_name = "outliers.flagRaise"
    callWontBreak(func_name)
def test_outliers_flagMAD():
    """Run ``callWontBreak`` for ``outliers.flagMAD``."""
    func_name = "outliers.flagMAD"
    callWontBreak(func_name)
def test_outliers_flagByGrubbs():
    """Run ``callWontBreak`` for ``outliers.flagByGrubbs``."""
    func_name = "outliers.flagByGrubbs"
    callWontBreak(func_name)
def test_outliers_flagRange():
    """Run ``callWontBreak`` for ``outliers.flagRange``."""
    func_name = "outliers.flagRange"
    callWontBreak(func_name)
# NOTE: fails in a strategy, maybe `Sequence[ColumnName]`
# def test_outliers_flagCrossStatistic():
# callWontBreak("outliers.flagCrossStatistic")
def test_outliers_flagCrossStatistic():
    """Run ``callWontBreak`` for ``outliers.flagCrossStatistic``."""
    func_name = "outliers.flagCrossStatistic"
    callWontBreak(func_name)
......@@ -16,7 +16,7 @@ from saqc.common import *
from saqc.flagger import Flagger, initFlagsLike
from saqc.core.register import _maskData, _unmaskData
from test.common import dataFieldFlagger, MAX_EXAMPLES
from testsfuzzy.init import dataFieldFlagger, MAX_EXAMPLES
logging.disable(logging.CRITICAL)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment