diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 490a4cf65ce118a0b322e7b436de134272ccc87e..aeb0137bffd0f21fa88765037359bb27451ce146 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,7 +24,7 @@ python37: - schedules image: python:3.7 script: - - pytest tests/core tests/flagger tests/funcs + - pytest tests/core tests/funcs - python -m saqc --config ressources/data/config_ci.csv --data ressources/data/data.csv --outfile /tmp/test.csv @@ -34,7 +34,7 @@ python38: except: - schedules script: - - pytest tests/core tests/flagger tests/funcs + - pytest tests/core tests/funcs - python -m saqc --config ressources/data/config_ci.csv --data ressources/data/data.csv --outfile /tmp/test.csv @@ -46,7 +46,7 @@ coverage: allow_failure: true script: - pip install pytest-cov coverage - - pytest --cov=saqc tests/core tests/flagger tests/funcs + - pytest --cov=saqc tests/core tests/funcs after_script: - coverage xml diff --git a/saqc/__main__.py b/saqc/__main__.py index 7b7a3c5b05f191b974fa3e4e1657f79e245ccadf..06a30ff83281ab67830bfda4272959f79513c2e5 100644 --- a/saqc/__main__.py +++ b/saqc/__main__.py @@ -83,7 +83,7 @@ def writeData(writer_dict, df, fname): def main(config, data, flagger, outfile, nodata, log_level, fail): if SCHEMES[flagger] is NotImplemented: - warnings.warn("flagger is currently not supported") + warnings.warn("--flagger is deprecated", DeprecationWarning) _setupLogging(log_level) reader, writer = setupIO(nodata) @@ -92,11 +92,11 @@ def main(config, data, flagger, outfile, nodata, log_level, fail): saqc = SaQC(data=data, nodata=nodata, error_policy="raise" if fail else "warn",) - data_result, flagger_result = saqc.readConfig(config).getResult(raw=True) + data_result, flags_result = saqc.readConfig(config).getResult(raw=True) if outfile: data_frame = data_result.to_df() - flags_frame = flagger_result.toFrame() + flags_frame = flags_result.toFrame() unflagged = (flags_frame == UNFLAGGED) | flags_frame.isna() flags_frame[unflagged] = GOOD diff --git a/saqc/core/core.py b/saqc/core/core.py index 2fe5c6e1106cbe3c0fc1a267a43c47fa83fb7cbc..8041f6f4f1fea1cf66f03ff6d197961bf43fdd26 100644 --- a/saqc/core/core.py +++ b/saqc/core/core.py @@ -18,7 +18,7 @@ import timeit import inspect from saqc.constants import * -from saqc.core.flags import initFlagsLike, Flags as Flagger +from saqc.core.flags import initFlagsLike, Flags from saqc.core.lib import APIController, ColumnSelector from saqc.core.register import FUNC_MAP, SaQCFunction from saqc.core.modules import FuncModules @@ -71,7 +71,7 @@ def _prepInput(data, flags): if isinstance(flags.index, pd.MultiIndex) or isinstance(flags.columns, pd.MultiIndex): raise TypeError("'flags' should not use MultiIndex") - if isinstance(flags, (dios.DictOfSeries, pd.DataFrame, Flagger)): + if isinstance(flags, (dios.DictOfSeries, pd.DataFrame, Flags)): # NOTE: only test common columns, data as well as flags could # have more columns than the respective other. 
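
Note on the `saqc/__main__.py` hunk above: passing a `--flagger` value that maps to `NotImplemented` no longer warns "not supported" but emits a proper `DeprecationWarning`. A minimal sketch of that pattern — the `main` function and its guard here are illustrative stand-ins, not SaQC's real CLI wiring:

```python
import warnings

def main(flagger=None):
    # hypothetical guard; the real code checks `SCHEMES[flagger] is NotImplemented`
    if flagger is not None:
        warnings.warn("--flagger is deprecated", DeprecationWarning)

warnings.simplefilter("always", DeprecationWarning)
main(flagger="simple")  # emits: DeprecationWarning: --flagger is deprecated
```
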
cols = flags.columns.intersection(data.columns) @@ -80,8 +80,8 @@ def _prepInput(data, flags): raise ValueError(f"the index of 'flags' and 'data' missmatch in column {c}") # this also ensures float dtype - if not isinstance(flags, Flagger): - flags = Flagger(flags, copy=True) + if not isinstance(flags, Flags): + flags = Flags(flags, copy=True) return data, flags @@ -108,30 +108,30 @@ class SaQC(FuncModules): self._data = data self._nodata = nodata self._to_mask = to_mask - self._flagger = self._initFlagger(data, flags) + self._flags = self._initFlags(data, flags) self._error_policy = error_policy # NOTE: will be filled by calls to `_wrap` self._to_call: List[Tuple[ColumnSelector, APIController, SaQCFunction]] = [] - def _initFlagger(self, data, flagger: Union[Flagger, None]): + def _initFlags(self, data, flags: Union[Flags, None]): """ Init the internal flagger object. Ensures that all data columns are present and user passed flags from a flags frame or an already initialised flagger are used. """ - if flagger is None: + if flags is None: return initFlagsLike(data) # add columns that are present in data but not in flagger - for c in data.columns.difference(flagger.columns): - flagger[c] = pd.Series(UNFLAGGED, index=data[c].index, dtype=float) + for c in data.columns.difference(flags.columns): + flags[c] = pd.Series(UNFLAGGED, index=data[c].index, dtype=float) - return flagger + return flags def _constructSimple(self) -> SaQC: return SaQC( data=dios.DictOfSeries(), - flags=Flagger(), + flags=Flags(), nodata=self._nodata, to_mask=self._to_mask, error_policy=self._error_policy @@ -140,7 +140,7 @@ class SaQC(FuncModules): def readConfig(self, fname): from saqc.core.reader import readConfig out = stdcopy.deepcopy(self) - out._to_call.extend(readConfig(fname, self._flagger)) + out._to_call.extend(readConfig(fname, self._flags)) return out def _expandFields(self, selector: ColumnSelector, func: SaQCFunction, variables: pd.Index) -> Sequence[Tuple[ColumnSelector, SaQCFunction]]: @@ -166,15 +166,15 @@ class SaQC(FuncModules): # NOTE: It would be nicer to separate the plotting into an own # method instead of intermingling it with the computation - data, flagger = self._data, self._flagger + data, flags = self._data, self._flags for selector, control, function in self._to_call: - for sel, func in self._expandFields(selector, function, data.columns.union(flagger.columns)): + for sel, func in self._expandFields(selector, function, data.columns.union(flags.columns)): logger.debug(f"processing: {sel.field}, {func.name}, {func.keywords}") t0 = timeit.default_timer() try: - data_result, flagger_result = _saqcCallFunc(sel, control, func, data, flagger) + data_result, flags_result = _saqcCallFunc(sel, control, func, data, flags) except Exception as e: t1 = timeit.default_timer() logger.debug(f"{func.name} failed after {t1 - t0} sec") @@ -188,23 +188,23 @@ class SaQC(FuncModules): plotHook( data_old=data, data_new=data_result, - flagger_old=flagger, - flagger_new=flagger_result, + flagger_old=flags, + flagger_new=flags_result, sources=[], targets=[sel.field], plot_name=func.name, ) data = data_result - flagger = flagger_result + flags = flags_result if any([control.plot for _, control, _ in self._to_call]): - plotAllHook(data, flagger) + plotAllHook(data, flags) # This is way faster for big datasets, than to throw everything in the constructor. # Simply because of _initFlagger -> merge() -> mergeDios() over all columns. 
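
The renamed `_initFlags` keeps the old `_initFlagger` behaviour: build fresh flags when none are given, otherwise backfill columns that exist in `data` but not in `flags`. A sketch with plain dicts of pandas Series standing in for the dios/`Flags` containers (the `UNFLAGGED` sentinel value is an assumption):

```python
from typing import Dict, Optional
import pandas as pd

UNFLAGGED = -float("inf")  # assumption: a float sentinel, as in saqc.constants

def init_flags(data: Dict[str, pd.Series],
               flags: Optional[Dict[str, pd.Series]]) -> Dict[str, pd.Series]:
    if flags is None:
        # initFlagsLike(data): one all-UNFLAGGED float column per data column
        return {c: pd.Series(UNFLAGGED, index=s.index, dtype=float)
                for c, s in data.items()}
    # add columns that are present in data but not in flags
    for c in set(data) - set(flags):
        flags[c] = pd.Series(UNFLAGGED, index=data[c].index, dtype=float)
    return flags

data = {"x": pd.Series([1.0, 2.0]), "y": pd.Series([3.0])}
flags = init_flags(data, {"x": pd.Series([0.0, 0.0])})
assert sorted(flags) == ["x", "y"]
```
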
new = self._constructSimple() - new._flagger, new._data = flagger, data + new._flags, new._data = flags, data return new def getResult(self, raw=False): @@ -217,12 +217,12 @@ class SaQC(FuncModules): """ realization = self.evaluate() - data, flagger = realization._data, realization._flagger + data, flags = realization._data, realization._flags if raw: - return data, flagger + return data, flags - return data.to_df(), flagger.toFrame() + return data.to_df(), flags.toFrame() def _wrap(self, func: SaQCFunction): @@ -267,26 +267,26 @@ class SaQC(FuncModules): return stdcopy.copy(self) -def _saqcCallFunc(locator, controller, function, data, flagger): +def _saqcCallFunc(locator, controller, function, data, flags): # NOTE: # We assure that all columns in data have an equivalent column in flags, # we might have more flagger columns though - assert data.columns.difference(flagger.columns).empty + assert data.columns.difference(flags.columns).empty field = locator.field target = locator.target if (target != field) and (locator.regex is False): - data, flagger = copy(data, field, flagger, target) + data, flags = copy(data, field, flags, target) field = target - data_result, flagger_result = function(data, field, flagger) + data_result, flags_result = function(data, field, flags) # we check the passed function-kwargs after the actual call, because now "hard" errors would already have been # raised (Eg. `TypeError: got multiple values for argument 'data'`, when the user pass data=...) _warnForUnusedKwargs(function) - return data_result, flagger_result + return data_result, flags_result def _warnForUnusedKwargs(func): diff --git a/saqc/core/lib.py b/saqc/core/lib.py index 2236e3b6387679416077e66fc42fe76e84820107..24fb2963356c70ee5e8bb5ca5a8e8e72aec059a5 100644 --- a/saqc/core/lib.py +++ b/saqc/core/lib.py @@ -50,9 +50,9 @@ class SaQCFunction: **{**self.keywords, **keywords} ) - def __call__(self, data, field, flagger, *args, **keywords): + def __call__(self, data, field, flags, *args, **keywords): keywords = {**self.keywords, **keywords} - return self.func(data, field, flagger, *self.args, *args, **keywords) + return self.func(data, field, flags, *self.args, *args, **keywords) def errorMessage(self) -> str: return f"function: {self.name}\narguments: {self.args}\nkeywords: {self.keywords}" diff --git a/saqc/core/modules/breaks.py b/saqc/core/modules/breaks.py index 6fab21ff34a07674f430322f377f410019a5acce..49826f4c17fa9c18c2cdc3b428c4ab5ef9ef288a 100644 --- a/saqc/core/modules/breaks.py +++ b/saqc/core/modules/breaks.py @@ -7,7 +7,7 @@ from dios import DictOfSeries from saqc.constants import * from saqc.core.modules.base import ModuleBase -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.lib.types import FreqString, IntegerWindow, ColumnName @@ -19,7 +19,7 @@ class Breaks(ModuleBase): nodata: float = np.nan, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagMissing", locals()) def flagIsolated( @@ -29,7 +29,7 @@ class Breaks(ModuleBase): group_window: FreqString, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagIsolated", locals()) def flagJumps( @@ -40,5 +40,5 @@ class Breaks(ModuleBase): min_periods: IntegerWindow = 1, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagJumps", locals()) diff --git a/saqc/core/modules/changepoints.py 
b/saqc/core/modules/changepoints.py index 7e5946cc6669ec65c5247fb5e6f6ded4fc601f22..bab2fe897909d01c5240e0c8ec6a80fefed23ceb 100644 --- a/saqc/core/modules/changepoints.py +++ b/saqc/core/modules/changepoints.py @@ -8,7 +8,7 @@ from dios import DictOfSeries from typing_extensions import Literal from saqc.constants import * -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase from saqc.lib.types import FreqString, IntegerWindow @@ -29,7 +29,7 @@ class ChangePoints(ModuleBase): reduce_func: Callable[[np.ndarray, np.ndarray], int] = lambda x, _: x.argmax(), flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagChangePoints", locals()) def assignChangePointCluster( @@ -49,5 +49,5 @@ class ChangePoints(ModuleBase): assign_cluster: bool = True, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("assignChangePointCluster", locals()) diff --git a/saqc/core/modules/constants.py b/saqc/core/modules/constants.py index 6787b08ed7a342b17ab99c0e188fb1b252b4edeb..cf6e9851c79d1f84dd530e1c35ea148d4481d3f6 100644 --- a/saqc/core/modules/constants.py +++ b/saqc/core/modules/constants.py @@ -5,7 +5,7 @@ from typing import Tuple from dios import DictOfSeries from saqc.constants import * -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase from saqc.lib.types import FreqString, ColumnName @@ -21,7 +21,7 @@ class Constants(ModuleBase): max_consec_missing: int = None, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagByVariance", locals()) def flagConstants( @@ -31,5 +31,5 @@ class Constants(ModuleBase): window: FreqString, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagConstants", locals()) diff --git a/saqc/core/modules/curvefit.py b/saqc/core/modules/curvefit.py index de43a906b6658bd8903af73d79bc820f37fbb064..edb9aa75b896f6dd7f7caf037febca3eea5bb6f8 100644 --- a/saqc/core/modules/curvefit.py +++ b/saqc/core/modules/curvefit.py @@ -6,7 +6,7 @@ from dios import DictOfSeries from typing_extensions import Literal from saqc.constants import * -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase @@ -22,5 +22,5 @@ class Curvefit(ModuleBase): return_residues: bool = False, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("fitPolynomial", locals()) diff --git a/saqc/core/modules/drift.py b/saqc/core/modules/drift.py index e063e62f327759857fae71f52d4e604e96582479..0616dd53d9b4f2ed7f86b8cfc2fca9129cfc405f 100644 --- a/saqc/core/modules/drift.py +++ b/saqc/core/modules/drift.py @@ -8,7 +8,7 @@ from scipy.spatial.distance import pdist from saqc.constants import * from saqc.core.modules.base import ModuleBase -from saqc.funcs import LinkageString, DictOfSeries, Flagger +from saqc.funcs import LinkageString, DictOfSeries, Flags from saqc.lib.types import ColumnName, FreqString, CurveFitter @@ -24,7 +24,7 @@ class Drift(ModuleBase): linkage_method: LinkageString = "single", flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagDriftFromNorm", locals()) def flagDriftFromReference( @@ -36,7 +36,7 @@ class Drift(ModuleBase): 
metric: Callable[[np.ndarray, np.ndarray], float] = lambda x, y: pdist(np.array([x, y]), metric='cityblock') / len(x), flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagDriftFromReference", locals()) def flagDriftFromScaledNorm( @@ -51,7 +51,7 @@ class Drift(ModuleBase): linkage_method: LinkageString = "single", flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagDriftFromScaledNorm", locals()) def correctExponentialDrift( @@ -62,7 +62,7 @@ class Drift(ModuleBase): flag_maint_period: bool = False, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("correctExponentialDrift", locals()) def correctRegimeAnomaly( @@ -73,7 +73,7 @@ class Drift(ModuleBase): regime_transmission: Optional[FreqString] = None, x_date: bool = False, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("correctRegimeAnomaly", locals()) def correctOffset( @@ -85,5 +85,5 @@ class Drift(ModuleBase): min_periods: int, regime_transmission: Optional[FreqString] = None, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("correctOffset", locals()) diff --git a/saqc/core/modules/flagtools.py b/saqc/core/modules/flagtools.py index 94b4748f8a7aaf3f3d907cef9fb39fe2d5c53fb2..68bd703646e9821b8c2ac08457dfc9dd42544fe6 100644 --- a/saqc/core/modules/flagtools.py +++ b/saqc/core/modules/flagtools.py @@ -7,7 +7,7 @@ import pandas as pd from dios.dios import DictOfSeries from typing_extensions import Literal -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.constants import * from saqc.core.modules.base import ModuleBase from saqc.lib.types import ColumnName @@ -15,15 +15,15 @@ from saqc.lib.types import ColumnName class FlagTools(ModuleBase): - def clearFlags(self, field: ColumnName, **kwargs) -> Tuple[DictOfSeries, Flagger]: + def clearFlags(self, field: ColumnName, **kwargs) -> Tuple[DictOfSeries, Flags]: return self.defer("clearFlags", locals()) def forceFlags( self, field: ColumnName, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("forceFlags", locals()) - def flagDummy(self, field: ColumnName, **kwargs) -> Tuple[DictOfSeries, Flagger]: + def flagDummy(self, field: ColumnName, **kwargs) -> Tuple[DictOfSeries, Flags]: return self.defer("flagDummy", locals()) def flagForceFail(self, field: ColumnName, **kwargs): @@ -31,10 +31,10 @@ class FlagTools(ModuleBase): def flagUnflagged( self, field: ColumnName, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagUnflagged", locals()) - def flagGood(self, field: ColumnName, flag=BAD, **kwargs) -> Tuple[DictOfSeries, Flagger]: + def flagGood(self, field: ColumnName, flag=BAD, **kwargs) -> Tuple[DictOfSeries, Flags]: return self.defer("flagGood", locals()) def flagManual( @@ -44,5 +44,5 @@ class FlagTools(ModuleBase): method: Literal["plain", "ontime", "left-open", "right-open"] = 'plain', flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagManual", locals()) diff --git a/saqc/core/modules/generic.py b/saqc/core/modules/generic.py index 649f5aafbc13bdc6b529186c3eea3b7006b6634a..87cde4f93f26d14fd28fbd745c5543ce768fc78a 100644 --- a/saqc/core/modules/generic.py 
+++ b/saqc/core/modules/generic.py @@ -8,7 +8,7 @@ import pandas as pd from dios import DictOfSeries from saqc.constants import * -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase @@ -20,7 +20,7 @@ class Generic(ModuleBase): func: Callable[[pd.Series], pd.Series], nodata: float = np.nan, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("process", locals()) def flag( @@ -30,5 +30,5 @@ class Generic(ModuleBase): nodata: float = np.nan, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flag", locals()) diff --git a/saqc/core/modules/interpolation.py b/saqc/core/modules/interpolation.py index 8df763b902c4cd9eaa2dfd0ab8a686e1a46c57fa..0b31e46186462d17d30cd76ad59f4e24cc317696 100644 --- a/saqc/core/modules/interpolation.py +++ b/saqc/core/modules/interpolation.py @@ -8,7 +8,7 @@ import pandas as pd from dios import DictOfSeries from typing_extensions import Literal -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.constants import * from saqc.core.modules.base import ModuleBase from saqc.funcs.interpolation import _SUPPORTED_METHODS @@ -24,7 +24,7 @@ class Interpolation(ModuleBase): min_periods: int = 0, flag: float = UNFLAGGED, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("interpolateByRolling", locals()) def interpolateInvalid( @@ -36,7 +36,7 @@ class Interpolation(ModuleBase): downgrade_interpolation: bool = False, flag: float = UNFLAGGED, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("interpolateInvalid", locals()) def interpolateIndex( @@ -48,6 +48,6 @@ class Interpolation(ModuleBase): inter_limit: int = 2, downgrade_interpolation: bool = False, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("interpolateIndex", locals()) diff --git a/saqc/core/modules/outliers.py b/saqc/core/modules/outliers.py index d202af9b904c7b71c4918581387b0834aac067fc..40737dd6d3271b8d870e48a8f0b13b7dacc13c5d 100644 --- a/saqc/core/modules/outliers.py +++ b/saqc/core/modules/outliers.py @@ -9,7 +9,7 @@ from dios import DictOfSeries from typing_extensions import Literal from saqc.constants import * -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase from saqc.lib.types import IntegerWindow, FreqString, ColumnName @@ -25,7 +25,7 @@ class Outliers(ModuleBase): alpha: float = 0.05, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagByStray", locals()) def flagMVScores( @@ -46,7 +46,7 @@ class Outliers(ModuleBase): reduction_min_periods: int = 1, flag: float = BAD, **kwargs, - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagMVScores", locals()) def flagRaise( @@ -62,7 +62,7 @@ class Outliers(ModuleBase): numba_boost: bool = True, # TODO: rm, not a user decision flag: float = BAD, **kwargs, - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagRaise", locals()) def flagMAD( @@ -72,7 +72,7 @@ class Outliers(ModuleBase): z: float = 3.5, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagMAD", locals()) def flagOffset( @@ -85,7 +85,7 @@ class Outliers(ModuleBase): 
numba_kickin: int = 200000, # TODO: rm, not a user decision flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagOffset", locals()) def flagByGrubbs( @@ -97,7 +97,7 @@ class Outliers(ModuleBase): check_lagged: bool = False, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagByGrubbs", locals()) def flagRange( @@ -107,7 +107,7 @@ class Outliers(ModuleBase): max: float = np.inf, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagRange", locals()) def flagCrossStatistic( @@ -118,5 +118,5 @@ class Outliers(ModuleBase): cross_stat: Literal["modZscore", "Zscore"] = "modZscore", flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagCrossStatistic", locals()) diff --git a/saqc/core/modules/pattern.py b/saqc/core/modules/pattern.py index 56db5f8523d9d4d7c30b16e3b197f653786edd97..16ab2949a56a89c2adc7cb2f746d393fa85f3a5e 100644 --- a/saqc/core/modules/pattern.py +++ b/saqc/core/modules/pattern.py @@ -6,7 +6,7 @@ from typing import Sequence, Tuple from dios import DictOfSeries from saqc.constants import * -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase @@ -20,7 +20,7 @@ class Pattern(ModuleBase): waveform: str = "mexh", flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagPatternByDTW", locals()) def flagPatternByWavelet( @@ -31,5 +31,5 @@ class Pattern(ModuleBase): normalize: bool = True, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("flagPatternByWavelet", locals()) diff --git a/saqc/core/modules/resampling.py b/saqc/core/modules/resampling.py index e5996987a49c8a6ee026b40589e52fc8bb16b32b..3100d79e03958973e7dcd288e873c7d07d28df0e 100644 --- a/saqc/core/modules/resampling.py +++ b/saqc/core/modules/resampling.py @@ -9,7 +9,7 @@ from dios import DictOfSeries from typing_extensions import Literal from saqc.constants import * -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase from saqc.funcs.interpolation import _SUPPORTED_METHODS @@ -25,7 +25,7 @@ class Resampling(ModuleBase): method: Literal["fagg", "bagg", "nagg"] = "nagg", flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("aggregate", locals()) def linear( @@ -33,7 +33,7 @@ class Resampling(ModuleBase): field: str, freq: str, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("linear", locals()) def interpolate( @@ -43,7 +43,7 @@ class Resampling(ModuleBase): method: _SUPPORTED_METHODS, order: int = 1, **kwargs, - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("interpolate", locals()) def mapToOriginal( @@ -55,7 +55,7 @@ class Resampling(ModuleBase): "inverse_interpolation" ], **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("mapToOriginal", locals()) def shift( @@ -65,7 +65,7 @@ class Resampling(ModuleBase): method: Literal["fshift", "bshift", "nshift"] = "nshift", freq_check: Optional[Literal["check", "auto"]] = None, # TODO: not a user decision **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> 
Tuple[DictOfSeries, Flags]: return self.defer("shift", locals()) def resample( @@ -81,7 +81,7 @@ class Resampling(ModuleBase): flag_agg_func: Callable[[pd.Series], float] = max, freq_check: Optional[Literal["check", "auto"]] = None, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("resample", locals()) def reindexFlags( @@ -94,5 +94,5 @@ class Resampling(ModuleBase): source: str, freq: Optional[str] = None, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("reindexFlags", locals()) diff --git a/saqc/core/modules/residues.py b/saqc/core/modules/residues.py index d0a03bac90f52703d0d4d8e44be6b413e25cd80c..dc8fd8bb3a4544a268b566efcf3151203ceb1213 100644 --- a/saqc/core/modules/residues.py +++ b/saqc/core/modules/residues.py @@ -8,7 +8,7 @@ from dios import DictOfSeries from typing_extensions import Literal from saqc.constants import * -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase @@ -24,7 +24,7 @@ class Residues(ModuleBase): min_periods: Optional[int] = 0, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("calculatePolynomialResidues", locals()) def calculateRollingResidues( @@ -37,5 +37,5 @@ class Residues(ModuleBase): center: bool = True, flag: float = BAD, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("calculateRollingResidues", locals()) diff --git a/saqc/core/modules/scores.py b/saqc/core/modules/scores.py index eafc44d096c3dff5442087c0d7a0e0f47c3a9ab9..7b52179c111b85db670f043417386b23bad3437c 100644 --- a/saqc/core/modules/scores.py +++ b/saqc/core/modules/scores.py @@ -8,7 +8,7 @@ import pandas as pd from dios import DictOfSeries from typing_extensions import Literal -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase @@ -29,5 +29,5 @@ class Scores(ModuleBase): metric: str = 'minkowski', p: int = 2, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("assignKNNScore", locals()) diff --git a/saqc/core/modules/tools.py b/saqc/core/modules/tools.py index 16a19bc0e298a683b6d945cb49ed18b71bb14e9a..70469b1f3fc3cb5354bb1af9296673d68dacfdbc 100644 --- a/saqc/core/modules/tools.py +++ b/saqc/core/modules/tools.py @@ -6,18 +6,18 @@ from typing import Optional, Tuple from dios import DictOfSeries from typing_extensions import Literal -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase class Tools(ModuleBase): - def copy(self, field: str, new_field: str, **kwargs) -> Tuple[DictOfSeries, Flagger]: + def copy(self, field: str, new_field: str, **kwargs) -> Tuple[DictOfSeries, Flags]: return self.defer("copy", locals()) - def drop(self, field: str, **kwargs) -> Tuple[DictOfSeries, Flagger]: + def drop(self, field: str, **kwargs) -> Tuple[DictOfSeries, Flags]: return self.defer("drop", locals()) - def rename(self, field: str, new_name: str, **kwargs) -> Tuple[DictOfSeries, Flagger]: + def rename(self, field: str, new_name: str, **kwargs) -> Tuple[DictOfSeries, Flags]: return self.defer("rename", locals()) def mask( @@ -29,5 +29,5 @@ class Tools(ModuleBase): period_end: Optional[str]=None, include_bounds: bool=True, **kwargs, - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("mask", locals()) diff --git 
a/saqc/core/modules/transformation.py b/saqc/core/modules/transformation.py index ff94e0bbd2827ba6dcffd5aa2d3c007783842724..9fcddac47c4cfa50bd221a7442056a036f412430 100644 --- a/saqc/core/modules/transformation.py +++ b/saqc/core/modules/transformation.py @@ -6,7 +6,7 @@ from typing import Callable, Optional, Union, Tuple import pandas as pd from dios import DictOfSeries -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.modules.base import ModuleBase @@ -18,5 +18,5 @@ class Transformation(ModuleBase): func: Callable[[pd.Series], pd.Series], partition_freq: Optional[Union[float, str]] = None, **kwargs - ) -> Tuple[DictOfSeries, Flagger]: + ) -> Tuple[DictOfSeries, Flags]: return self.defer("transform", locals()) diff --git a/saqc/core/reader.py b/saqc/core/reader.py index e5aa0bce925bf798e3ac048ad05aea667c4c58d2..9be731b067f672eb98f5522eac22ca2e688caed9 100644 --- a/saqc/core/reader.py +++ b/saqc/core/reader.py @@ -56,7 +56,7 @@ def _injectOptionalColumns(df): return df -def _parseConfig(df, flagger): +def _parseConfig(df, flags): funcs = [] for lineno, (_, target, expr, plot) in enumerate(df.itertuples()): if target == "None" or pd.isnull(target) or pd.isnull(expr): @@ -68,7 +68,7 @@ def _parseConfig(df, flagger): target = target[1:-1] tree = ast.parse(expr, mode="eval") - func_name, kwargs = ConfigFunctionParser(flagger).parse(tree.body) + func_name, kwargs = ConfigFunctionParser(flags).parse(tree.body) func = FUNC_MAP[func_name] selector = ColumnSelector( @@ -89,7 +89,7 @@ def _parseConfig(df, flagger): return funcs -def readConfig(fname, flagger): +def readConfig(fname, flags): df = pd.read_csv( fname, sep=r"\s*;\s*", @@ -108,4 +108,4 @@ def readConfig(fname, flagger): df[F.TEST] = df[F.TEST].replace(r"^\s*$", np.nan, regex=True) df[F.PLOT] = df[F.PLOT].replace({"False": "", EMPTY: "", np.nan: ""}) df = df.astype({F.PLOT: bool}) - return _parseConfig(df, flagger) + return _parseConfig(df, flags) diff --git a/saqc/core/register.py b/saqc/core/register.py index 5d991e8036dd3ee21d8477486f21146db8ed0e19..b00f353d08953a0b728a6b85f02d504093049b02 100644 --- a/saqc/core/register.py +++ b/saqc/core/register.py @@ -10,7 +10,7 @@ import warnings from saqc.constants import * from saqc.core.lib import SaQCFunction -from saqc.core.flags import initFlagsLike, Flags as Flagger +from saqc.core.flags import initFlagsLike, Flags from saqc.lib.types import FuncReturnT # NOTE: @@ -26,7 +26,7 @@ class CallState: func: callable data: dios.DictOfSeries - flagger: Flagger + flags: Flags field: str args: tuple @@ -48,8 +48,8 @@ def register(masking: MaskingStrT = "all", module: Optional[str] = None): # executed if a register-decorated function is called, # nevertheless if it is called plain or via `SaQC.func`. 
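
For context on the `reader.py` hunks: `readConfig` parses the CSV with a whitespace-tolerant semicolon separator (`sep=r"\s*;\s*"`). A stand-alone illustration with an in-memory file — the column names and the test expression are invented for the demo, not SaQC's exact config schema:

```python
import io
import pandas as pd

csv = "varname ; test\nx ; flagRange(min=0, max=100)\n"
df = pd.read_csv(io.StringIO(csv), sep=r"\s*;\s*", engine="python")
print(df.columns.tolist())  # ['varname', 'test']
print(df.loc[0, "test"])    # flagRange(min=0, max=100)
```
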
@wraps(func) - def callWrapper(data, field, flagger, *args, **kwargs): - args = data, field, flagger, *args + def callWrapper(data, field, flags, *args, **kwargs): + args = data, field, flags, *args args, kwargs, old_state = _preCall(func, args, kwargs, masking, func_name) result = func(*args, **kwargs) return _postCall(result, old_state) @@ -99,25 +99,25 @@ def _preCall(func: callable, args: tuple, kwargs: dict, masking: MaskingStrT, fn mthresh = _getMaskingThresh(masking, kwargs, fname) kwargs['to_mask'] = mthresh - data, field, flagger, *args = args + data, field, flags, *args = args # handle data - masking columns = _getMaskingColumns(data, field, masking) - masked_data, mask = _maskData(data, flagger, columns, mthresh) + masked_data, mask = _maskData(data, flags, columns, mthresh) # store current state state = CallState( func=func, - data=data, flagger=flagger, field=field, + data=data, flags=flags, field=field, args=args, kwargs=kwargs, masking=masking, mthresh=mthresh, mask=mask ) # handle flags - clearing - prepped_flagger = _prepareFlags(flagger, masking) + prepped_flags = _prepareFlags(flags, masking) - args = masked_data, field, prepped_flagger, *args + args = masked_data, field, prepped_flags, *args return args, kwargs, state @@ -140,10 +140,10 @@ def _postCall(result, old_state: CallState) -> FuncReturnT: ------- data, flagger : dios.DictOfSeries, saqc.flagger.Flagger """ - data, flagger = result - flagger = _restoreFlags(flagger, old_state) + data, flags = result + flags = _restoreFlags(flags, old_state) data = _unmaskData(data, old_state) - return data, flagger + return data, flags def _getMaskingColumns(data: dios.DictOfSeries, field: str, masking: MaskingStrT): @@ -220,7 +220,7 @@ def _getMaskingThresh(masking, kwargs, fname): # TODO: this is heavily undertested -def _maskData(data, flagger, columns, thresh) -> Tuple[dios.DictOfSeries, dios.DictOfSeries]: +def _maskData(data, flags, columns, thresh) -> Tuple[dios.DictOfSeries, dios.DictOfSeries]: """ Mask data with Nans by flags worse that a threshold and according to ``masking`` keyword from the functions decorator. @@ -237,7 +237,7 @@ def _maskData(data, flagger, columns, thresh) -> Tuple[dios.DictOfSeries, dios.D # we use numpy here because it is faster for c in columns: - col_mask = _isflagged(flagger[c].to_numpy(), thresh) + col_mask = _isflagged(flags[c].to_numpy(), thresh) if any(col_mask): col_data = data[c].to_numpy(dtype=np.float64) @@ -259,7 +259,7 @@ def _isflagged(flagscol: Union[np.array, pd.Series], thresh: float) -> Union[np. return flagscol >= thresh -def _prepareFlags(flagger: Flagger, masking) -> Flagger: +def _prepareFlags(flags: Flags, masking) -> Flags: """ Prepare flags before each call. Always returns a copy. 
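
The `register` hunks only rename the third positional parameter; the wrapper mechanics stay the same. A self-contained sketch of that decorator pattern — the `_preCall`/`_postCall` bodies below are placeholders, not SaQC's real masking and flag-restoration logic:

```python
from functools import wraps

def register(masking="all", module=None):
    def inner(func):
        @wraps(func)
        def callWrapper(data, field, flags, *args, **kwargs):
            args = data, field, flags, *args
            args, kwargs, state = _preCall(args, kwargs, masking)
            result = func(*args, **kwargs)
            return _postCall(result, state)
        return callWrapper
    return inner

def _preCall(args, kwargs, masking):
    # placeholder: the real _preCall masks data and snapshots the flags state
    return args, kwargs, {"masking": masking}

def _postCall(result, state):
    # placeholder: the real _postCall restores flags and unmasks data
    return result

@register(masking="field")
def flagDummy(data, field, flags, **kwargs):
    return data, flags

assert flagDummy({}, "x", {}) == ({}, {})
```
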
@@ -269,27 +269,27 @@ def _prepareFlags(flagger: Flagger, masking) -> Flagger: """ # Either the index or the columns itself changed if masking == 'none': - return flagger.copy() + return flags.copy() - return initFlagsLike(flagger, initial_value=UNTOUCHED) + return initFlagsLike(flags, initial_value=UNTOUCHED) -def _restoreFlags(flagger: Flagger, old_state: CallState): +def _restoreFlags(flags: Flags, old_state: CallState): if old_state.masking == 'none': - return flagger + return flags - columns = flagger.columns + columns = flags.columns # take field column and all possibly newly added columns if old_state.masking == 'field': - columns = columns.difference(old_state.flagger.columns) + columns = columns.difference(old_state.flags.columns) columns = columns.append(pd.Index([old_state.field])) - out = old_state.flagger.copy() + out = old_state.flags.copy() for c in columns: # this implicitly squash the new-flagger history (RHS) to a single column, which than is appended to # the old history (LHS). The new-flagger history possibly consist of multiple columns, one for each # time flags was set to the flagger. - out[c] = flagger[c] + out[c] = flags[c] return out diff --git a/saqc/core/visitor.py b/saqc/core/visitor.py index 0fadf4878bce9575450a92035bc0156284db317b..7d7203fa0c556991c106c73e0046aafadc034f14 100644 --- a/saqc/core/visitor.py +++ b/saqc/core/visitor.py @@ -137,7 +137,7 @@ class ConfigFunctionParser(ast.NodeVisitor): ast.Attribute ) - def __init__(self, flagger): + def __init__(self, flags): self.kwargs = {} self.environment = { diff --git a/saqc/funcs/breaks.py b/saqc/funcs/breaks.py index d9cbbc6d0e11735ad70888eea72a0f37d4d825a9..f3ad9eeb2e8c7ae84c832a056725b883009c163c 100644 --- a/saqc/funcs/breaks.py +++ b/saqc/funcs/breaks.py @@ -20,18 +20,18 @@ from saqc.constants import * from saqc.lib.tools import groupConsecutives from saqc.lib.types import FreqString, ColumnName, IntegerWindow from saqc.funcs.changepoints import assignChangePointCluster -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags @register(masking='field', module="breaks") def flagMissing( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, nodata: float = np.nan, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function flags all values indicating missing data. @@ -62,20 +62,20 @@ def flagMissing( else: mask = datacol == nodata - flagger[mask, field] = flag - return data, flagger + flags[mask, field] = flag + return data, flags @register(masking='field', module="breaks") def flagIsolated( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, gap_window: FreqString, group_window: FreqString, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function flags arbitrary large groups of values, if they are surrounded by sufficiently large data gaps. @@ -136,21 +136,21 @@ def flagIsolated( if right.all(): bools[start:stop] = True - flagger[mask, field] = flag - return data, flagger + flags[mask, field] = flag + return data, flags @register(masking='field', module="breaks") def flagJumps( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, thresh: float, winsz: FreqString, min_periods: IntegerWindow = 1, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Flag datapoints, where the mean of the values significantly changes (where the value course "jumps"). 
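
The `flagMissing` hunk shows the new write syntax, `flags[mask, field] = flag`. The selection logic itself is unchanged: NaN-`nodata` needs an `isna()` test, any other sentinel an equality test. Sketched with a plain Series as the flags column (the `BAD` value is an assumed float constant):

```python
import numpy as np
import pandas as pd

BAD = 255.0  # assumption: a float flag constant

def flag_missing(datacol, flagcol, nodata=np.nan, flag=BAD):
    if np.isnan(nodata):
        mask = datacol.isna()
    else:
        mask = datacol == nodata
    flagcol[mask] = flag   # flags[mask, field] = flag in the Flags API
    return flagcol

s = pd.Series([1.0, np.nan, 3.0])
out = flag_missing(s, pd.Series(0.0, index=s.index))
assert out.tolist() == [0.0, 255.0, 0.0]
```
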
@@ -174,7 +174,7 @@ def flagJumps( flag to set. """ return assignChangePointCluster( - data, field, flagger, + data, field, flags, stat_func=lambda x, y: np.abs(np.mean(x) - np.mean(y)), thresh_func=lambda x, y: thresh, bwd_window=winsz, diff --git a/saqc/funcs/changepoints.py b/saqc/funcs/changepoints.py index 7c37b9ca1804a14e975c5fac197afed20bdcece1..83439157ec0687f6210a3c9f69962835ea649518 100644 --- a/saqc/funcs/changepoints.py +++ b/saqc/funcs/changepoints.py @@ -13,7 +13,7 @@ from dios import DictOfSeries from saqc.constants import * from saqc.lib.tools import customRoller -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.lib.types import ColumnName, FreqString, IntegerWindow logger = logging.getLogger("SaQC") @@ -21,7 +21,7 @@ logger = logging.getLogger("SaQC") @register(masking='field', module="changepoints") def flagChangePoints( - data: DictOfSeries, field: str, flagger: Flagger, + data: DictOfSeries, field: str, flags: Flags, stat_func: Callable[[np.ndarray, np.ndarray], float], thresh_func: Callable[[np.ndarray, np.ndarray], float], bwd_window: FreqString, @@ -34,7 +34,7 @@ def flagChangePoints( reduce_func: Callable[[np.ndarray, np.ndarray], int] = lambda x, _: x.argmax(), flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Flag datapoints, where the parametrization of the process, the data is assumed to generate by, significantly changes. @@ -88,7 +88,7 @@ def flagChangePoints( return assignChangePointCluster( data, field, - flagger, + flags, stat_func=stat_func, thresh_func=thresh_func, bwd_window=bwd_window, @@ -109,7 +109,7 @@ def flagChangePoints( @register(masking='field', module="changepoints") def assignChangePointCluster( - data: DictOfSeries, field: str, flagger: Flagger, + data: DictOfSeries, field: str, flags: Flags, stat_func: Callable[[np.array, np.array], float], thresh_func: Callable[[np.array, np.array], float], bwd_window: str, @@ -125,7 +125,7 @@ def assignChangePointCluster( assign_cluster: bool = True, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Assigns label to the data, aiming to reflect continous regimes of the processes the data is assumed to be generated by. 
@@ -233,8 +233,8 @@ def assignChangePointCluster( residues = pd.Series(np.nan, index=data[field].index) residues[masked_index] = stat_arr data[field] = residues - flagger[:, field] = UNFLAGGED - return data, flagger + flags[:, field] = UNFLAGGED + return data, flags det_index = masked_index[result_arr] detected = pd.Series(True, index=det_index) @@ -253,11 +253,11 @@ def assignChangePointCluster( # (better to start cluster labels with number one) cluster += 1 data[field] = cluster - flagger[:, field] = UNFLAGGED + flags[:, field] = UNFLAGGED if flag_changepoints: - flagger[det_index, field] = flag - return data, flagger + flags[det_index, field] = flag + return data, flags @numba.jit(parallel=True, nopython=True) diff --git a/saqc/funcs/constants.py b/saqc/funcs/constants.py index a6b99a07c7f354a5c7394e06ab21cf5d36f4cf4b..3791eaabfe164ada318ef713210c8b6dfead6e68 100644 --- a/saqc/funcs/constants.py +++ b/saqc/funcs/constants.py @@ -10,7 +10,7 @@ import pandas as pd from dios import DictOfSeries from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.lib.ts_operators import varQC from saqc.lib.tools import customRoller, getFreqDelta from saqc.lib.types import FreqString, ColumnName @@ -20,12 +20,12 @@ from saqc.lib.types import FreqString, ColumnName def flagConstants( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, thresh: float, window: FreqString, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ This functions flags plateaus/series of constant values of length `window` if their maximum total change is smaller than thresh. @@ -76,22 +76,22 @@ def flagConstants( m2 = r.max() - r.min() <= thresh mask = m1 | m2 - flagger[mask, field] = flag - return data, flagger + flags[mask, field] = flag + return data, flags @register(masking='field', module="constants") def flagByVariance( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, window: FreqString = "12h", thresh: float = 0.0005, max_missing: int = None, max_consec_missing: int = None, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function flags plateaus/series of constant values. 
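
The `flagConstants` hunk touches the plateau test `m2 = r.max() - r.min() <= thresh`: within each rolling window, the total change must stay below `thresh`. A simplified rendering on a regular integer window (SaQC rolls a custom frequency window and combines this with a first-difference test `m1`):

```python
import pandas as pd

def constant_mask(s: pd.Series, window: int, thresh: float) -> pd.Series:
    r = s.rolling(window=window)
    # m2 in the source: total change within the window stays below thresh
    return r.max() - r.min() <= thresh

s = pd.Series([1.0, 1.0, 1.0, 5.0, 6.0])
print(constant_mask(s, window=3, thresh=0.1))
# only index 2 is True (window 1,1,1); later windows exceed the threshold
```
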
Any interval of values y(t),..y(t+n) is flagged, if: @@ -153,12 +153,12 @@ def flagByVariance( # are there any candidates for beeing flagged plateau-ish if plateaus.sum() == 0: - return data, flagger + return data, flags plateaus.fillna(method="bfill", limit=min_periods - 1, inplace=True) # result: plateaus = (plateaus[plateaus == 1.0]).index - flagger[plateaus, field] = flag - return data, flagger + flags[plateaus, field] = flag + return data, flags diff --git a/saqc/funcs/curvefit.py b/saqc/funcs/curvefit.py index d48d7ae4e0ed59b8e8e5b773620ff88d9cb81f3d..3465e07d141d62d84855469161f69e8ebea951d2 100644 --- a/saqc/funcs/curvefit.py +++ b/saqc/funcs/curvefit.py @@ -9,7 +9,7 @@ import pandas as pd from dios import DictOfSeries from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.lib.tools import getFreqDelta from saqc.lib.ts_operators import ( polyRollerIrregular, @@ -24,7 +24,7 @@ from saqc.lib.ts_operators import ( def fitPolynomial( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, winsz: Union[int, str], polydeg: int, numba: Literal[True, False, "auto"] = "auto", @@ -33,7 +33,7 @@ def fitPolynomial( return_residues: bool = False, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function fits a polynomial model to the data and returns the fitted data curve. @@ -112,7 +112,7 @@ def fitPolynomial( """ # TODO: some (rater large) parts are functional similar to saqc.funcs.rolling.roll if data[field].empty: - return data, flagger + return data, flags data = data.copy() to_fit = data[field] regular = getFreqDelta(to_fit.index) @@ -202,7 +202,7 @@ def fitPolynomial( data[field] = residues if eval_flags: # TODO: we does not get any flags here, because of masking=field - worst = flagger[field].rolling(winsz, center=True, min_periods=min_periods).max() - flagger[field] = worst + worst = flags[field].rolling(winsz, center=True, min_periods=min_periods).max() + flags[field] = worst - return data, flagger + return data, flags diff --git a/saqc/funcs/drift.py b/saqc/funcs/drift.py index 0892673a80a4e042511d3bc4a19c022e340f637b..65f5b043dfd22ee3a6327cff7bc935d15a501940 100644 --- a/saqc/funcs/drift.py +++ b/saqc/funcs/drift.py @@ -15,7 +15,7 @@ from scipy.spatial.distance import pdist from saqc.constants import * from saqc.core.register import register -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.funcs.resampling import shift from saqc.funcs.changepoints import assignChangePointCluster from saqc.funcs.tools import drop, copy @@ -30,7 +30,7 @@ LinkageString = Literal["single", "complete", "average", "weighted", "centroid", def flagDriftFromNorm( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, fields: Sequence[ColumnName], segment_freq: FreqString, norm_spread: float, @@ -39,7 +39,7 @@ def flagDriftFromNorm( linkage_method: LinkageString = "single", flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function flags value courses that significantly deviate from a group of normal value courses. 
@@ -138,23 +138,23 @@ def flagDriftFromNorm( drifters = detectDeviants(segment[1], metric, norm_spread, norm_frac, linkage_method, 'variables') for var in drifters: - flagger[segment[1].index, fields[var]] = flag + flags[segment[1].index, fields[var]] = flag - return data, flagger + return data, flags @register(masking='all', module="drift") def flagDriftFromReference( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, fields: Sequence[ColumnName], segment_freq: FreqString, thresh: float, metric: Callable[[np.ndarray, np.ndarray], float] = lambda x, y: pdist(np.array([x, y]), metric='cityblock') / len(x), flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function flags value courses that deviate from a reference course by a margin exceeding a certain threshold. @@ -216,16 +216,16 @@ def flagDriftFromReference( dist = metric(segment[1].iloc[:, i].values, segment[1].loc[:, field].values) if dist > thresh: - flagger[segment[1].index, fields[i]] = flag + flags[segment[1].index, fields[i]] = flag - return data, flagger + return data, flags @register(masking='all', module="drift") def flagDriftFromScaledNorm( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, fields_scale1: Sequence[ColumnName], fields_scale2: Sequence[ColumnName], segment_freq: FreqString, @@ -235,7 +235,7 @@ def flagDriftFromScaledNorm( linkage_method: LinkageString = "single", flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function linearly rescales one set of variables to another set of variables with a different scale and then flags value courses that significantly deviate from a group of normal value courses. @@ -334,22 +334,22 @@ def flagDriftFromScaledNorm( drifters = detectDeviants(segment[1], metric, norm_spread, norm_frac, linkage_method, 'variables') for var in drifters: - flagger[segment[1].index, fields[var]] = flag + flags[segment[1].index, fields[var]] = flag - return data, flagger + return data, flags @register(masking='all', module="drift") def correctExponentialDrift( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, maint_data_field: ColumnName, cal_mean: int = 5, flag_maint_period: bool = False, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function fits an exponential model to chunks of data[field]. 
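
The default `metric` of `flagDriftFromReference`, reproduced standalone: scipy's `pdist` on the 2-row stack `[x, y]` returns the single pairwise city-block distance, which is then averaged over the series length:

```python
import numpy as np
from scipy.spatial.distance import pdist

def mean_cityblock(x: np.ndarray, y: np.ndarray) -> float:
    # pdist on the 2-row stack yields exactly one pairwise distance
    return pdist(np.array([x, y]), metric="cityblock")[0] / len(x)

x = np.array([1.0, 2.0, 3.0])
y = np.array([1.0, 2.0, 5.0])
assert np.isclose(mean_cityblock(x, y), 2.0 / 3.0)
```
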
It is assumed, that between maintenance events, there is a drift effect shifting the meassurements in a way, that @@ -412,7 +412,7 @@ def correctExponentialDrift( """ # 1: extract fit intervals: if data[maint_data_field].empty: - return data, flagger + return data, flags data = data.copy() to_correct = data[field] @@ -446,22 +446,22 @@ def correctExponentialDrift( to_flag = drift_frame["drift_group"] to_flag = to_flag.drop(to_flag[: maint_data.index[0]].index) to_flag = to_flag.dropna() - flagger[to_flag, field] = flag + flags[to_flag, field] = flag - return data, flagger + return data, flags @register(masking='all', module="drift") def correctRegimeAnomaly( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, cluster_field: ColumnName, model: CurveFitter, regime_transmission: Optional[FreqString] = None, x_date: bool = False, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function fits the passed model to the different regimes in data[field] and tries to correct those values, that have assigned a negative label by data[cluster_field]. @@ -561,21 +561,21 @@ def correctRegimeAnomaly( last_valid = 1 data[field] = data_ser - return data, flagger + return data, flags @register(masking='all', module="drift") def correctOffset( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, max_mean_jump: float, normal_spread: float, search_winsz: FreqString, min_periods: int, regime_transmission: Optional[FreqString] = None, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Parameters ---------- @@ -609,23 +609,23 @@ def correctOffset( flagger : saqc.flagger.Flagger The flagger object, holding flags and additional Informations related to `data`. """ - data, flagger = copy(data, field, flagger, field + '_CPcluster') - data, flagger = assignChangePointCluster( - data, field + '_CPcluster', flagger, + data, flags = copy(data, field, flags, field + '_CPcluster') + data, flags = assignChangePointCluster( + data, field + '_CPcluster', flags, lambda x, y: np.abs(np.mean(x) - np.mean(y)), lambda x, y: max_mean_jump, bwd_window=search_winsz, min_periods_bwd=min_periods ) - data, flagger = assignRegimeAnomaly(data, field, flagger, field + '_CPcluster', normal_spread) - data, flagger = correctRegimeAnomaly( - data, field, flagger, field + '_CPcluster', + data, flags = assignRegimeAnomaly(data, field, flags, field + '_CPcluster', normal_spread) + data, flags = correctRegimeAnomaly( + data, field, flags, field + '_CPcluster', lambda x, p1: np.array([p1] * x.shape[0]), regime_transmission=regime_transmission ) - data, flagger = drop(data, field + '_CPcluster', flagger) + data, flags = drop(data, field + '_CPcluster', flags) - return data, flagger + return data, flags def _driftFit(x, shift_target, cal_mean): @@ -660,7 +660,7 @@ def _driftFit(x, shift_target, cal_mean): def flagRegimeAnomaly( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, cluster_field: ColumnName, norm_spread: float, linkage_method: LinkageString = "single", @@ -668,7 +668,7 @@ def flagRegimeAnomaly( norm_frac: float = 0.5, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ A function to flag values belonging to an anomalous regime regarding modelling regimes of field. @@ -716,7 +716,7 @@ def flagRegimeAnomaly( Flags values may have changed, relatively to the flagger input. 
""" return assignRegimeAnomaly( - data, field, flagger, + data, field, flags, cluster_field, norm_spread, linkage_method=linkage_method, @@ -733,7 +733,7 @@ def flagRegimeAnomaly( def assignRegimeAnomaly( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, cluster_field: ColumnName, norm_spread: float, linkage_method: LinkageString = "single", @@ -743,7 +743,7 @@ def assignRegimeAnomaly( set_flags: bool = False, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ A function to detect values belonging to an anomalous regime regarding modelling regimes of field. @@ -805,7 +805,7 @@ def assignRegimeAnomaly( if set_flags: for p in plateaus: - flagger[cluster_dios.iloc[:, p].index, field] = flag + flags[cluster_dios.iloc[:, p].index, field] = flag if set_cluster: for p in plateaus: @@ -813,4 +813,4 @@ def assignRegimeAnomaly( series[series == cluster[p]] = -cluster[p] data[cluster_field] = series - return data, flagger + return data, flags diff --git a/saqc/funcs/flagtools.py b/saqc/funcs/flagtools.py index db0ce930df9b47c9a43e82d963cabd007907c2fa..94b04da10df0c1ff1f94973c0314a95947a9537c 100644 --- a/saqc/funcs/flagtools.py +++ b/saqc/funcs/flagtools.py @@ -7,14 +7,14 @@ from dios import DictOfSeries from saqc.constants import * from saqc.lib.types import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags import warnings @register(masking='field', module="flagtools") def forceFlags( - data: DictOfSeries, field: ColumnName, flagger: Flagger, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: + data: DictOfSeries, field: ColumnName, flags: Flags, flag: float = BAD, **kwargs +) -> Tuple[DictOfSeries, Flags]: """ Set whole column to a flag value. @@ -41,13 +41,13 @@ def forceFlags( clearFlags : set whole column to UNFLAGGED flagUnflagged : set flag value at all unflagged positions """ - flagger[:, field] = flag - return data, flagger + flags[:, field] = flag + return data, flags # masking='none' is sufficient because call is redirected @register(masking='none', module="flagtools") -def clearFlags(data: DictOfSeries, field: ColumnName, flagger: Flagger, **kwargs) -> Tuple[DictOfSeries, Flagger]: +def clearFlags(data: DictOfSeries, field: ColumnName, flags: Flags, **kwargs) -> Tuple[DictOfSeries, Flags]: """ Set whole column to UNFLAGGED. @@ -77,13 +77,13 @@ def clearFlags(data: DictOfSeries, field: ColumnName, flagger: Flagger, **kwargs flag = kwargs.pop('flag') warnings.warn(f'`flag={flag}` is ignored here.') - return forceFlags(data, field, flagger, flag=UNFLAGGED, **kwargs) + return forceFlags(data, field, flags, flag=UNFLAGGED, **kwargs) @register(masking='field', module="flagtools") def flagUnflagged( - data: DictOfSeries, field: ColumnName, flagger: Flagger, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: + data: DictOfSeries, field: ColumnName, flags: Flags, flag: float = BAD, **kwargs +) -> Tuple[DictOfSeries, Flags]: """ Function sets a flag at all unflagged positions. 
@@ -112,13 +112,13 @@ def flagUnflagged( clearFlags : set whole column to UNFLAGGED forceFlags : set whole column to a flag value """ - unflagged = flagger[field].isna() | (flagger[field] == UNFLAGGED) - flagger[unflagged, field] = flag - return data, flagger + unflagged = flags[field].isna() | (flags[field] == UNFLAGGED) + flags[unflagged, field] = flag + return data, flags @register(masking='field', module="flagtools") -def flagGood(data: DictOfSeries, field: ColumnName, flagger: Flagger, flag=BAD, **kwargs) -> Tuple[DictOfSeries, Flagger]: +def flagGood(data: DictOfSeries, field: ColumnName, flags: Flags, flag=BAD, **kwargs) -> Tuple[DictOfSeries, Flags]: """ Function sets the GOOD flag at all unflagged positions. @@ -139,18 +139,18 @@ def flagGood(data: DictOfSeries, field: ColumnName, flagger: Flagger, flag=BAD, The flagger object, holding flags and additional Informations related to `data`. """ warnings.warn("'flagGood' is deprecated and does nothing, use 'flagUnflagged' instead", DeprecationWarning) - return data, flagger + return data, flags @register(masking='field', module="flagtools") def flagManual( - data: DictOfSeries, field: ColumnName, flagger: Flagger, + data: DictOfSeries, field: ColumnName, flags: Flags, mdata: Union[pd.Series, pd.DataFrame, DictOfSeries], mflag: Any = 1, method: Literal["plain", "ontime", "left-open", "right-open"] = 'plain', flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Flag data by given, "manually generated" data. @@ -280,12 +280,12 @@ def flagManual( mask = mdata == mflag mask = mask.reindex(dat.index).fillna(False) - flagger[mask, field] = flag - return data, flagger + flags[mask, field] = flag + return data, flags @register(masking='none', module="flagtools") -def flagDummy(data: DictOfSeries, field: ColumnName, flagger: Flagger, **kwargs) -> Tuple[DictOfSeries, Flagger]: +def flagDummy(data: DictOfSeries, field: ColumnName, flags: Flags, **kwargs) -> Tuple[DictOfSeries, Flags]: """ Function does nothing but returning data and flagger. @@ -305,11 +305,11 @@ def flagDummy(data: DictOfSeries, field: ColumnName, flagger: Flagger, **kwargs flagger : saqc.flagger.Flagger The flagger object, holding flags and additional Informations related to `data`. """ - return data, flagger + return data, flags @register(masking='none', module="flagtools") -def flagForceFail(data: DictOfSeries, field: ColumnName, flagger: Flagger, **kwargs): +def flagForceFail(data: DictOfSeries, field: ColumnName, flags: Flags, **kwargs): """ Function raises a runtime error. 
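
The three `flagtools` helpers above relate as follows: `forceFlags` overwrites the whole column, `clearFlags` redirects to it with `flag=UNFLAGGED`, and `flagUnflagged` only touches positions that are NaN or still hold the `UNFLAGGED` sentinel. Sketched with a plain Series as the flags column and assumed float sentinels:

```python
import numpy as np
import pandas as pd

UNFLAGGED, BAD = -np.inf, 255.0  # assumed float sentinels, as in saqc.constants

def force_flags(flagcol: pd.Series, flag: float = BAD) -> pd.Series:
    flagcol[:] = flag                             # forceFlags: flags[:, field] = flag
    return flagcol

def clear_flags(flagcol: pd.Series) -> pd.Series:
    return force_flags(flagcol, flag=UNFLAGGED)   # clearFlags redirects to forceFlags

def flag_unflagged(flagcol: pd.Series, flag: float = BAD) -> pd.Series:
    unflagged = flagcol.isna() | (flagcol == UNFLAGGED)
    flagcol[unflagged] = flag                     # already-flagged positions stay put
    return flagcol

col = pd.Series([UNFLAGGED, 100.0, np.nan])
assert flag_unflagged(col).tolist() == [BAD, 100.0, BAD]
assert (clear_flags(col) == UNFLAGGED).all()
```
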
diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index 095ae1eea9cc9c750ee5c7044b9b820d143c6358..329514fcf511a0d95866950e1522b1e46a26a605 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -11,7 +11,7 @@ import pandas as pd from dios import DictOfSeries from saqc.constants import * -from saqc.core import register, initFlagsLike, Flags as Flagger +from saqc.core import register, initFlagsLike, Flags from saqc.core.visitor import ENVIRONMENT import operator as op @@ -20,7 +20,7 @@ _OP = {'<': op.lt, '<=': op.le, '==': op.eq, '!=': op.ne, '>': op.gt, '>=': op.g def _dslIsFlagged( - flagger: Flagger, var: pd.Series, flag: float = None, comparator: str = None + flags: Flags, var: pd.Series, flag: float = None, comparator: str = None ) -> Union[pd.Series, DictOfSeries]: """ helper function for `flag` @@ -46,10 +46,10 @@ def _dslIsFlagged( comparator = '>=' _op = _OP[comparator] - return _op(flagger[var.name], flag) + return _op(flags[var.name], flag) -def _execGeneric(flagger: Flagger, data: DictOfSeries, func: Callable[[pd.Series], pd.Series], field: str, +def _execGeneric(flags: Flags, data: DictOfSeries, func: Callable[[pd.Series], pd.Series], field: str, nodata: float) -> pd.Series: # TODO: # - check series.index compatibility @@ -65,7 +65,7 @@ def _execGeneric(flagger: Flagger, data: DictOfSeries, func: Callable[[pd.Series args.append(data[k]) globs = { - "isflagged": partial(_dslIsFlagged, flagger), + "isflagged": partial(_dslIsFlagged, flags), "ismissing": lambda var: ((var == nodata) | pd.isnull(var)), "mask": lambda cond: data[cond.name].mask(cond), "this": field, @@ -83,11 +83,11 @@ def _execGeneric(flagger: Flagger, data: DictOfSeries, func: Callable[[pd.Series def process( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, func: Callable[[pd.Series], pd.Series], nodata: float = np.nan, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ generate/process data with generically defined functions. @@ -137,27 +137,27 @@ def process( >>> lambda temperature, uncertainty: np.round(temperature) * np.sqrt(uncertainty) """ - data[field] = _execGeneric(flagger, data, func, field, nodata).squeeze() + data[field] = _execGeneric(flags, data, func, field, nodata).squeeze() # TODO: the former comment wished to overwrite the column, but i'm not sure -- palmb # see #GL177 - if field in flagger: - flagger.drop(field) + if field in flags: + flags.drop(field) - flagger[field] = initFlagsLike(data[field])[field] - return data, flagger + flags[field] = initFlagsLike(data[field])[field] + return data, flags @register(masking='all', module="generic") def flag( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, func: Callable[[pd.Series], pd.Series], nodata: float = np.nan, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: # TODO : fix docstring, check if all still works """ a function to flag a data column by evaluation of a generic expression. 
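
The `_dslIsFlagged` helper in `generic.py` dispatches a comparator string through the `_OP` map onto the flags column. Standalone, with a Series standing in for `flags[var.name]`:

```python
import operator as op
import pandas as pd

_OP = {'<': op.lt, '<=': op.le, '==': op.eq, '!=': op.ne, '>': op.gt, '>=': op.ge}

def is_flagged(flagcol: pd.Series, flag: float, comparator: str = ">=") -> pd.Series:
    return _OP[comparator](flagcol, flag)

col = pd.Series([0.0, 255.0, 100.0])
assert is_flagged(col, 100.0).tolist() == [False, True, True]
```
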
@@ -238,18 +238,18 @@ def flag( # NOTE: # The naming of the func parameter is pretty confusing # as it actually holds the result of a generic expression - mask = _execGeneric(flagger, data, func, field, nodata).squeeze() + mask = _execGeneric(flags, data, func, field, nodata).squeeze() if np.isscalar(mask): raise TypeError(f"generic expression does not return an array") if not np.issubdtype(mask.dtype, np.bool_): raise TypeError(f"generic expression does not return a boolean array") - if field not in flagger.columns: - flagger[field] = pd.Series(UNFLAGGED, index=mask.index, name=field) + if field not in flags.columns: + flags[field] = pd.Series(UNFLAGGED, index=mask.index, name=field) # if flagger.getFlags(field).empty: # flagger = flagger.merge( # flagger.initFlags( # data=pd.Series(name=field, index=mask.index, dtype=np.float64))) - flagger[mask, field] = flag - return data, flagger + flags[mask, field] = flag + return data, flags diff --git a/saqc/funcs/interpolation.py b/saqc/funcs/interpolation.py index 9f3b985d7b7b7fb11242aa2e002383f02090b369..32fcd2ba45568d1a1e8fd4234f19ba484649b2d5 100644 --- a/saqc/funcs/interpolation.py +++ b/saqc/funcs/interpolation.py @@ -7,7 +7,7 @@ import pandas as pd from dios import DictOfSeries from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.core.register import _isflagged from saqc.core.history import applyFunctionOnHistory from saqc.lib.ts_operators import interpolateNANs @@ -20,14 +20,14 @@ _SUPPORTED_METHODS = Literal[ @register(masking='field', module="interpolation") def interpolateByRolling( - data: DictOfSeries, field: str, flagger: Flagger, + data: DictOfSeries, field: str, flags: Flags, winsz: Union[str, int], func: Callable[[pd.Series], float] = np.median, center: bool = True, min_periods: int = 0, flag: float = UNFLAGGED, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Interpolates nan-values in the data by assigning them the aggregation result of the window surrounding them. @@ -87,23 +87,23 @@ def interpolateByRolling( data[field] = datcol if flag is not None: - flagger[interpolated, field] = flag + flags[interpolated, field] = flag - return data, flagger + return data, flags @register(masking='field', module="interpolation") def interpolateInvalid( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, method: _SUPPORTED_METHODS, inter_order: int = 2, inter_limit: int = 2, downgrade_interpolation: bool = False, flag: float = UNFLAGGED, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function to interpolate nan values in the data. @@ -160,10 +160,10 @@ def interpolateInvalid( interpolated = data[field].isna() & inter_data.notna() if flag is not None: - flagger[interpolated, field] = flag + flags[interpolated, field] = flag data[field] = inter_data - return data, flagger + return data, flags def _resampleOverlapping(data: pd.Series, freq: str, fill_value): @@ -181,14 +181,14 @@ def _resampleOverlapping(data: pd.Series, freq: str, fill_value): def interpolateIndex( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, freq: str, method: _SUPPORTED_METHODS, inter_order: int = 2, inter_limit: int = 2, downgrade_interpolation: bool = False, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function to interpolate the data at regular (equidistant) timestamps (or Grid points). 
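NOTE (reviewer sketch, not part of the patch): interpolateInvalid keeps its flag=UNFLAGGED default, i.e. freshly interpolated points are marked unflagged unless told otherwise. A sketch with invented data; "linear" is assumed to be among the truncated _SUPPORTED_METHODS literals:

import numpy as np
import pandas as pd
import dios

from saqc.core import initFlagsLike
from saqc.funcs.interpolation import interpolateInvalid

s = pd.Series([0., np.nan, 2., 3., np.nan, 5.],
              index=pd.date_range("2021-01-01", periods=6, freq="H"))
data = dios.DictOfSeries(data={"var": s})
flags = initFlagsLike(data)

data, flags = interpolateInvalid(data, "var", flags, method="linear", inter_limit=2)
# both single gaps are filled, and the filled positions received the default
# UNFLAGGED through the flags[interpolated, field] = flag write shown above
assert not data["var"].isna().any()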
@@ -238,14 +238,14 @@ def interpolateIndex( Flags values and shape may have changed relatively to the flagger input. """ if data[field].empty: - return data, flagger + return data, flags datcol = data[field].copy() start, end = datcol.index[0].floor(freq), datcol.index[-1].ceil(freq) grid_index = pd.date_range(start=start, end=end, freq=freq, name=datcol.index.name) - flagged = _isflagged(flagger[field], kwargs['to_mask']) + flagged = _isflagged(flags[field], kwargs['to_mask']) # drop all points that hold no relevant grid information datcol = datcol[~flagged].dropna() @@ -275,11 +275,11 @@ def interpolateIndex( data[field] = inter_data[grid_index] # do the reshaping on the history - flagger.history[field] = applyFunctionOnHistory( - flagger.history[field], + flags.history[field] = applyFunctionOnHistory( + flags.history[field], hist_func=_resampleOverlapping, hist_kws=dict(freq=freq, fill_value=UNFLAGGED), mask_func=_resampleOverlapping, mask_kws=dict(freq=freq, fill_value=False), last_column='dummy' ) - return data, flagger + return data, flags diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py index e6f804be895311dc4ca7a0850fac6d53a417e083..844643f61856dbd5ea6027a4fba44d26a91697fb 100644 --- a/saqc/funcs/outliers.py +++ b/saqc/funcs/outliers.py @@ -12,7 +12,7 @@ from outliers import smirnov_grubbs from scipy.optimize import curve_fit from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.lib.types import ColumnName, FreqString, IntegerWindow from saqc.lib.tools import customRoller, findIndex, getFreqDelta from saqc.funcs.scores import assignKNNScore @@ -23,14 +23,14 @@ import saqc.lib.ts_operators as ts_ops def flagByStray( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, partition_freq: Optional[Union[IntegerWindow, FreqString]] = None, partition_min: int = 11, iter_start: float = 0.5, alpha: float = 0.05, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Flag outliers in 1-dimensional (score) data with the STRAY Algorithm. @@ -79,7 +79,7 @@ def flagByStray( scores = data[field].dropna() if scores.empty: - return data, flagger + return data, flags if not partition_freq: partition_freq = scores.shape[0] @@ -117,16 +117,16 @@ def flagByStray( for iter_index in range(i_start - 1, sample_size): if gaps[iter_index] > log_alpha * ghat[iter_index]: index = partition.index[sorted_i[iter_index:]] - flagger[index, field] = flag + flags[index, field] = flag break - return data, flagger + return data, flags def _evalStrayLabels( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, fields: Sequence[str], reduction_range: Optional[str] = None, reduction_drop_flagged: bool = False, # TODO: still a case ? @@ -135,7 +135,7 @@ def _evalStrayLabels( at_least_one: bool = True, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function "reduces" an observations flag to components of it, by applying MAD (See references) test onto every components temporal surrounding. 
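NOTE (reviewer sketch, not part of the patch): flagByStray writes through the new Flags item assignment (flags[index, field] = flag). A sketch on synthetic score data; all values invented:

import numpy as np
import pandas as pd
import dios

from saqc.constants import BAD
from saqc.core import initFlagsLike
from saqc.funcs.outliers import flagByStray

rng = np.random.default_rng(0)
idx = pd.date_range("2021-01-01", periods=200, freq="10min")
scores = pd.Series(rng.normal(0, 1, 200), index=idx)
scores.iloc[50] = 25.0  # one gross outlier in score space

data = dios.DictOfSeries(data={"scores": scores})
flags = initFlagsLike(data)

# partition_freq=None: the whole series is treated as one partition (see hunk above)
data, flags = flagByStray(data, "scores", flags, alpha=0.05, flag=BAD)
# flags["scores"].iloc[50] is expected to exceed UNFLAGGED now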
@@ -178,14 +178,14 @@ def _evalStrayLabels( [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm """ val_frame = data[fields].to_df() - stray_detects = flagger[field] > UNFLAGGED + stray_detects = flags[field] > UNFLAGGED stray_detects = stray_detects[stray_detects] to_flag_frame = pd.DataFrame(False, columns=fields, index=stray_detects.index) if reduction_range is None: for field in to_flag_frame.columns: - flagger[to_flag_frame.index, field] = flag - return data, flagger + flags[to_flag_frame.index, field] = flag + return data, flags for var in fields: for index in enumerate(to_flag_frame.index): @@ -232,9 +232,9 @@ def _evalStrayLabels( for field in to_flag_frame.columns: col = to_flag_frame[field] - flagger[col[col].index, field] = flag + flags[col[col].index, field] = flag - return data, flagger + return data, flags def _expFit(val_frame, scoring_method="kNNMaxGap", n_neighbors=10, iter_start=0.5, alpha=0.05, bin_frac=10): @@ -352,7 +352,7 @@ def _expFit(val_frame, scoring_method="kNNMaxGap", n_neighbors=10, iter_start=0. def flagMVScores( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, fields: Sequence[ColumnName], trafo: Callable[[pd.Series], pd.Series] = lambda x: x, alpha: float = 0.05, @@ -368,7 +368,7 @@ def flagMVScores( reduction_min_periods: int = 1, flag: float = BAD, **kwargs, -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The algorithm implements a 3-step outlier detection procedure for simultaneously flagging of higher dimensional data (dimensions > 3). @@ -473,8 +473,8 @@ def flagMVScores( outliers. See description of the `threshing` parameter for more details. Although [2] gives a fully detailed overview over the `stray` algorithm. """ - data, flagger = assignKNNScore( - data, 'dummy', flagger, + data, flags = assignKNNScore( + data, 'dummy', flags, fields=fields, n_neighbors=n_neighbors, trafo=trafo, @@ -485,8 +485,8 @@ def flagMVScores( kNN_algorithm='ball_tree', partition_min=stray_partition_min, **kwargs) - data, flagger = flagByStray( - data, 'kNN_scores', flagger, + data, flags = flagByStray( + data, 'kNN_scores', flags, partition_freq=stray_partition, partition_min=stray_partition_min, iter_start=iter_start, @@ -494,8 +494,8 @@ def flagMVScores( flag=flag, **kwargs) - data, flagger = _evalStrayLabels( - data, 'kNN_scores', flagger, + data, flags = _evalStrayLabels( + data, 'kNN_scores', flags, fields=fields, reduction_range=reduction_range, reduction_drop_flagged=reduction_drop_flagged, @@ -504,14 +504,14 @@ def flagMVScores( flag=flag, **kwargs) - return data, flagger + return data, flags @register(masking='field', module="outliers") def flagRaise( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, thresh: float, raise_window: FreqString, intended_freq: FreqString, @@ -522,7 +522,7 @@ def flagRaise( numba_boost: bool = True, # TODO: rm, not a user decision flag: float = BAD, **kwargs, -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function flags raises and drops in value courses, that exceed a certain threshold within a certain timespan. 
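NOTE (reviewer sketch, not part of the patch): the flagMVScores hunks above show the payoff of the rename most clearly, the (data, flags) pair is simply re-bound through assignKNNScore -> flagByStray -> _evalStrayLabels. A top-level call sketch on invented two-column data (the method targets dimensions > 3; two columns merely keep the sketch short):

import numpy as np
import pandas as pd
import dios

from saqc.constants import BAD
from saqc.core import initFlagsLike
from saqc.funcs.outliers import flagMVScores

rng = np.random.default_rng(1)
idx = pd.date_range("2021-01-01", periods=100, freq="H")
data = dios.DictOfSeries(data={
    "t1": pd.Series(rng.normal(20, 1, 100), index=idx),
    "t2": pd.Series(rng.normal(10, 1, 100), index=idx),
})
flags = initFlagsLike(data)

# all stray/kNN tuning parameters left at their defaults
data, flags = flagMVScores(data, "dummy", flags, fields=["t1", "t2"], flag=BAD)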
@@ -629,7 +629,7 @@ def flagRaise( raise_series = raise_series.apply(raise_check, args=(thresh,), raw=True) if raise_series.isna().all(): - return data, flagger + return data, flags # "unflag" values of insufficient deviation to their predecessors if min_slope is not None: @@ -672,21 +672,21 @@ def flagRaise( # check means against critical raise value: to_flag = dataseries >= weighted_rolling_mean + (raise_series / mean_raise_factor) to_flag &= raise_series.notna() - flagger[to_flag[to_flag].index, field] = flag + flags[to_flag[to_flag].index, field] = flag - return data, flagger + return data, flags @register(masking='field', module="outliers") def flagMAD( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, window: FreqString, z: float = 3.5, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function represents an implementation of the modyfied Z-score outlier detection method. @@ -739,15 +739,15 @@ def flagMAD( index = mask.index mask.loc[index < index[0] + pd.to_timedelta(window)] = False - flagger[mask, field] = flag - return data, flagger + flags[mask, field] = flag + return data, flags @register(masking='field', module="outliers") def flagOffset( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, thresh: float, tolerance: float, window: Union[IntegerWindow, FreqString], @@ -755,7 +755,7 @@ def flagOffset( numba_kickin: int = 200000, # TODO: rm, not a user decision flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ A basic outlier test that is designed to work for harmonized and not harmonized data. @@ -842,7 +842,7 @@ def flagOffset( post_jumps = post_jumps[post_jumps] if post_jumps.empty: - return data, flagger + return data, flags # get all the entries preceding a significant jump and its successors within "length" range to_roll = post_jumps.reindex(dataseries.index, method="bfill", tolerance=window, fill_value=False).dropna() @@ -897,22 +897,22 @@ def flagOffset( cresult = calcResult(result) cresult = cresult[cresult].index - flagger[cresult, field] = flag - return data, flagger + flags[cresult, field] = flag + return data, flags @register(masking='field', module="outliers") def flagByGrubbs( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, winsz: Union[FreqString, IntegerWindow], alpha: float = 0.05, min_periods: int = 8, check_lagged: bool = False, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The function flags values that are regarded outliers due to the grubbs test. @@ -1006,20 +1006,20 @@ def flagByGrubbs( to_flag &= to_flag_lagged - flagger[to_flag, field] = flag - return data, flagger + flags[to_flag, field] = flag + return data, flags @register(masking='field', module="outliers") def flagRange( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, min: float = -np.inf, max: float = np.inf, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function flags values not covered by the closed interval [`min`, `max`]. 
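NOTE (reviewer sketch, not part of the patch, before the flagRange hunks continue below): flagMAD, the modified Z-score test from its docstring, keeps its shape too. A sketch with invented data:

import numpy as np
import pandas as pd
import dios

from saqc.constants import BAD, UNFLAGGED
from saqc.core import initFlagsLike
from saqc.funcs.outliers import flagMAD

idx = pd.date_range("2021-01-01", periods=200, freq="15min")
values = pd.Series(np.sin(np.linspace(0, 20, 200)), index=idx)
values.iloc[150] = 50.0  # an obvious spike, placed well past the first window

data = dios.DictOfSeries(data={"var": values})
flags = initFlagsLike(data)

# note: points inside the very first `window` are never flagged
# (see the index-based cutoff in the hunk above)
data, flags = flagMAD(data, "var", flags, window="1D", z=3.5, flag=BAD)
assert flags["var"].iloc[150] > UNFLAGGED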
@@ -1050,21 +1050,21 @@ def flagRange( # using .values is much faster datacol = data[field].values mask = (datacol < min) | (datacol > max) - flagger[mask, field] = flag - return data, flagger + flags[mask, field] = flag + return data, flags @register(masking='all', module="outliers") def flagCrossStatistic( data: DictOfSeries, field: ColumnName, - flagger: Flagger, + flags: Flags, fields: Sequence[ColumnName], thresh: float, cross_stat: Literal["modZscore", "Zscore"] = "modZscore", flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function checks for outliers relatively to the "horizontal" input data axis. @@ -1139,6 +1139,6 @@ def flagCrossStatistic( mask = diff_scores > thresh for var in fields: - flagger[mask[var], var] = flag + flags[mask[var], var] = flag - return data, flagger + return data, flags diff --git a/saqc/funcs/pattern.py b/saqc/funcs/pattern.py index a51a157a4e8483c90beac2858b5593bda8cba537..5f4829e9cd612fe472b0754d3652f71a72e0e7ae 100644 --- a/saqc/funcs/pattern.py +++ b/saqc/funcs/pattern.py @@ -9,7 +9,7 @@ from mlxtend.evaluate import permutation_test from dios.dios import DictOfSeries from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.lib.tools import customRoller @@ -17,13 +17,13 @@ from saqc.lib.tools import customRoller def flagPatternByDTW( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, ref_field: str, widths: Sequence[int] = (1, 2, 4, 8), waveform: str = "mexh", flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Pattern recognition via wavelets. @@ -97,21 +97,21 @@ def flagPatternByDTW( sz = len(ref) mask = customRoller(dat, window=sz, min_periods=sz).apply(isPattern, raw=True) - flagger[mask, field] = flag - return data, flagger + flags[mask, field] = flag + return data, flags @register(masking='field', module="pattern") def flagPatternByWavelet( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, ref_field: str, max_distance: float = 0.03, normalize: bool = True, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Pattern Recognition via Dynamic Time Warping. The steps are: @@ -169,5 +169,5 @@ def flagPatternByWavelet( sz = len(ref) mask = customRoller(dat, window=sz, min_periods=sz).apply(isPattern, raw=True) - flagger[mask, field] = flag - return data, flagger + flags[mask, field] = flag + return data, flags diff --git a/saqc/funcs/resampling.py b/saqc/funcs/resampling.py index 466bcf382837f2687468e7c29592fca283d466ca..967966c9c05476a0059be12b690abcbda6aee129 100644 --- a/saqc/funcs/resampling.py +++ b/saqc/funcs/resampling.py @@ -9,7 +9,7 @@ import pandas as pd from dios import DictOfSeries from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.core.register import _isflagged from saqc.core.history import applyFunctionOnHistory from saqc.lib.tools import evalFreqStr, getFreqDelta @@ -35,14 +35,14 @@ METHOD2ARGS = { def aggregate( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, freq: str, value_func, flag_func: Callable[[pd.Series], float] = np.nanmax, method: Literal["fagg", "bagg", "nagg"] = "nagg", flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ A method to "regularize" data by aggregating (resampling) data at a regular timestamp. 
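NOTE (reviewer sketch, not part of the patch): flagRange is the simplest instance of the new write path (flags[mask, field] = flag) and is exercised directly in tests/funcs/test_functions.py further down. A condensed version:

import pandas as pd
import dios

from saqc.constants import BAD, UNFLAGGED
from saqc.core import initFlagsLike
from saqc.funcs.outliers import flagRange

data = dios.DictOfSeries(data={"var": pd.Series([5., 50., 120.])})
flags = initFlagsLike(data)

data, flags = flagRange(data, "var", flags, min=10, max=90, flag=BAD)
# positions 0 and 2 lie outside [10, 90] and are now flagged
assert list(flags["var"] > UNFLAGGED) == [True, False, True]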
@@ -106,9 +106,9 @@ def aggregate( Flags values and shape may have changed relatively to the flagger input. """ - data, flagger = copy(data, field, flagger, field + '_original') + data, flags = copy(data, field, flags, field + '_original') return resample( - data, field, flagger, + data, field, flags, freq=freq, agg_func=value_func, flag_agg_func=flag_func, @@ -122,10 +122,10 @@ def aggregate( def linear( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, freq: str, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ A method to "regularize" data by interpolating linearly the data at regular timestamp. @@ -165,20 +165,20 @@ def linear( Flags values and shape may have changed relatively to the flagger input. """ - data, flagger = copy(data, field, flagger, field + '_original') - return interpolateIndex(data, field, flagger, freq, "time", **kwargs) + data, flags = copy(data, field, flags, field + '_original') + return interpolateIndex(data, field, flags, freq, "time", **kwargs) @register(masking='none', module="resampling") def interpolate( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, freq: str, method: _SUPPORTED_METHODS, order: int = 1, **kwargs, -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ A method to "regularize" data by interpolating the data at regular timestamp. @@ -232,22 +232,22 @@ def interpolate( Flags values and shape may have changed relatively to the flagger input. """ - data, flagger = copy(data, field, flagger, field + '_original') - return interpolateIndex(data, field, flagger, freq, method=method, inter_order=order, **kwargs) + data, flags = copy(data, field, flags, field + '_original') + return interpolateIndex(data, field, flags, freq, method=method, inter_order=order, **kwargs) @register(masking='none', module="resampling") def mapToOriginal( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, method: Literal[ "inverse_fagg", "inverse_bagg", "inverse_nagg", "inverse_fshift", "inverse_bshift", "inverse_nshift", "inverse_interpolation" ], **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The Function function "undoes" regularization, by regaining the original data and projecting the flags calculated for the regularized data onto the original ones. @@ -312,21 +312,21 @@ def mapToOriginal( Flags values and shape may have changed relatively to the flagger input. """ newfield = str(field) + '_original' - data, flagger = reindexFlags(data, newfield, flagger, method, source=field, to_mask=False) - data, flagger = drop(data, field, flagger) - return rename(data, newfield, flagger, field) + data, flags = reindexFlags(data, newfield, flags, method, source=field, to_mask=False) + data, flags = drop(data, field, flags) + return rename(data, newfield, flags, field) @register(masking='none', module="resampling") def shift( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, freq: str, method: Literal["fshift", "bshift", "nshift"] = "nshift", freq_check: Optional[Literal["check", "auto"]] = None, # TODO: not a user decision **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function to shift data and flags to a regular (equidistant) timestamp grid, according to ``method``. @@ -370,19 +370,19 @@ def shift( The flagger object, holding flags and additional Informations related to `data`. Flags values and shape may have changed relatively to the flagger input. 
""" - data, flagger = copy(data, field, flagger, field + '_original') - return _shift(data, field, flagger, freq, method=method, freq_check=freq_check, **kwargs) + data, flags = copy(data, field, flags, field + '_original') + return _shift(data, field, flags, freq, method=method, freq_check=freq_check, **kwargs) def _shift( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, freq: str, method: Literal["fshift", "bshift", "nshift"] = "nshift", freq_check: Optional[Literal["check", "auto"]] = None, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function to shift data points to regular (equidistant) timestamps. @@ -390,7 +390,7 @@ def _shift( -------- shift : Main caller, docstring """ - flagged = _isflagged(flagger[field], kwargs['to_mask']) + flagged = _isflagged(flags[field], kwargs['to_mask']) datcol = data[field] datcol[flagged] = np.nan freq = evalFreqStr(freq, freq_check, datcol.index) @@ -399,7 +399,7 @@ def _shift( datcol = shift2Freq(datcol, method, freq, fill_value=np.nan) # do the shift on the history - history = flagger.history[field] + history = flags.history[field] history.hist = shift2Freq(history.hist, method, freq, fill_value=UNTOUCHED) history.mask = shift2Freq(history.mask, method, freq, fill_value=False) @@ -409,16 +409,16 @@ def _shift( dummy = pd.Series(UNTOUCHED, index=datcol.index, dtype=float) history.append(dummy, force=True) - flagger.history[field] = history + flags.history[field] = history data[field] = datcol - return data, flagger + return data, flags @register(masking='none', module="resampling") def resample( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, freq: str, agg_func: Callable[[pd.Series], pd.Series] = np.mean, method: Literal["fagg", "bagg", "nagg"] = "bagg", @@ -429,7 +429,7 @@ def resample( flag_agg_func: Callable[[pd.Series], float] = max, freq_check: Optional[Literal["check", "auto"]] = None, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function to resample the data. Afterwards the data will be sampled at regular (equidistant) timestamps (or Grid points). Sampling intervals therefor get aggregated with a function, specifyed by 'agg_func' parameter and @@ -513,7 +513,7 @@ def resample( The flagger object, holding flags and additional Informations related to `data`. Flags values and shape may have changed relatively to the flagger input. 
""" - flagged = _isflagged(flagger[field], kwargs['to_mask']) + flagged = _isflagged(flags[field], kwargs['to_mask']) datcol = data[field] datcol[flagged] = np.nan freq = evalFreqStr(freq, freq_check, datcol.index) @@ -537,15 +537,15 @@ def resample( max_invalid_consec=max_invalid_consec_f, ) - flagger.history[field] = applyFunctionOnHistory( - flagger.history[field], + flags.history[field] = applyFunctionOnHistory( + flags.history[field], hist_func=aggregate2Freq, hist_kws=kws, mask_func=aggregate2Freq, mask_kws=kws, last_column='dummy' ) data[field] = datcol - return data, flagger + return data, flags def _getChunkBounds(target: pd.Series, flagscol: pd.Series, freq: str): @@ -602,7 +602,7 @@ def _inverseShift(source: pd.Series, target: pd.Series, drop_mask: pd.Series, def reindexFlags( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, method: Literal[ "inverse_fagg", "inverse_bagg", "inverse_nagg", "inverse_fshift", "inverse_bshift", "inverse_nshift" @@ -610,7 +610,7 @@ def reindexFlags( source: str, freq: Optional[str] = None, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ The Function projects flags of "source" onto flags of "field". Wherever the "field" flags are "better" then the source flags projected on them, they get overridden with this associated source flag value. @@ -672,7 +672,7 @@ def reindexFlags( The flagger object, holding flags and additional Informations related to `data`. Flags values and shape may have changed relatively to the flagger input. """ - flagscol = flagger[source] + flagscol = flags[source] if freq is None: freq = getFreqDelta(flagscol.index) @@ -681,7 +681,7 @@ def reindexFlags( 'projection range to freq parameter') target_datcol = data[field] - target_flagscol = flagger[field] + target_flagscol = flags[field] dummy = pd.Series(np.nan, target_flagscol.index, dtype=float) if method[-13:] == "interpolation": @@ -709,6 +709,6 @@ def reindexFlags( else: raise ValueError(f"unknown method {method}") - history = applyFunctionOnHistory(flagger.history[source], func, func_kws, func, mask_kws, last_column=dummy) - flagger.history[field] = flagger.history[field].append(history, force=False) - return data, flagger + history = applyFunctionOnHistory(flags.history[source], func, func_kws, func, mask_kws, last_column=dummy) + flags.history[field] = flags.history[field].append(history, force=False) + return data, flags diff --git a/saqc/funcs/residues.py b/saqc/funcs/residues.py index b58c0cdf3cb322da5aa9fc464395cb0e3c006ce9..ad7b88a6414c5bb6876ab2c8dfcab2f09fe1c8c1 100644 --- a/saqc/funcs/residues.py +++ b/saqc/funcs/residues.py @@ -7,7 +7,7 @@ import numpy as np from dios import DictOfSeries from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.funcs.rolling import roll from saqc.funcs.curvefit import fitPolynomial @@ -16,7 +16,7 @@ from saqc.funcs.curvefit import fitPolynomial def calculatePolynomialResidues( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, winsz: Union[str, int], polydeg: int, numba: Literal[True, False, "auto"] = "auto", # TODO: rm, not a a user decision @@ -24,7 +24,7 @@ def calculatePolynomialResidues( min_periods: Optional[int] = 0, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function fits a polynomial model to the data and returns the residues. 
@@ -101,7 +101,7 @@ def calculatePolynomialResidues( """ return fitPolynomial( - data, field, flagger, + data, field, flags, winsz=winsz, polydeg=polydeg, numba=numba, @@ -117,7 +117,7 @@ def calculatePolynomialResidues( def calculateRollingResidues( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, winsz: Union[str, int], func: Callable[[np.ndarray], np.ndarray] = np.mean, eval_flags: bool = True, @@ -125,10 +125,10 @@ def calculateRollingResidues( center: bool = True, flag: float = BAD, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ TODO: docstring needed""" return roll( - data, field, flagger, + data, field, flags, winsz=winsz, func=func, eval_flags=eval_flags, diff --git a/saqc/funcs/rolling.py b/saqc/funcs/rolling.py index 6990bb72fcd7ed35c9e9c9515e0ae1d7c6e78762..4b8a5f64e2e509a17357aa2e257a3162d41e8571 100644 --- a/saqc/funcs/rolling.py +++ b/saqc/funcs/rolling.py @@ -7,7 +7,7 @@ import pandas as pd from dios import DictOfSeries from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.lib.tools import getFreqDelta @@ -15,7 +15,7 @@ from saqc.lib.tools import getFreqDelta def roll( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, winsz: Union[str, int], func: Callable[[pd.Series], float]=np.mean, eval_flags: bool=True, # TODO: not applicable anymore @@ -73,7 +73,7 @@ def roll( data = data.copy() to_fit = data[field] if to_fit.empty: - return data, flagger + return data, flags regular = getFreqDelta(to_fit.index) # starting with the annoying case: finding the rolling interval centers of not-harmonized input time series: @@ -123,7 +123,7 @@ def roll( data[field] = means if eval_flags: # TODO: we does not get any flags here, because of masking=field - worst = flagger[field].rolling(winsz, center=True, min_periods=min_periods).max() - flagger[field] = worst + worst = flags[field].rolling(winsz, center=True, min_periods=min_periods).max() + flags[field] = worst - return data, flagger + return data, flags diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py index f1690e0fa4ad605bcabe5274c025f7a4048af86d..d5b192aa6c3a9b5b2250682e948609997663bef8 100644 --- a/saqc/funcs/scores.py +++ b/saqc/funcs/scores.py @@ -7,7 +7,7 @@ import pandas as pd from dios import DictOfSeries from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.lib.tools import toSequence import saqc.lib.ts_operators as ts_ops @@ -16,7 +16,7 @@ import saqc.lib.ts_operators as ts_ops def assignKNNScore( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, fields: Sequence[str], n_neighbors: int = 10, trafo: Callable[[pd.Series], pd.Series] = lambda x: x, @@ -29,7 +29,7 @@ def assignKNNScore( metric: str = 'minkowski', p: int = 2, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ TODO: docstring need a rework Score datapoints by an aggregation of the dictances to their k nearest neighbors. @@ -123,7 +123,7 @@ def assignKNNScore( val_frame = val_frame.transform(trafo) if val_frame.empty: - return data, flagger + return data, flags # partitioning if not partition_freq: @@ -155,9 +155,9 @@ def assignKNNScore( score_ser[partition.index] = resids # TODO: this unconditionally overwrite a column, may we should fire a warning ? 
-- palmb - if target_field in flagger.columns: - flagger.drop(target_field) - flagger[target_field] = pd.Series(UNFLAGGED, index=score_ser.index, dtype=float) + if target_field in flags.columns: + flags.drop(target_field) + flags[target_field] = pd.Series(UNFLAGGED, index=score_ser.index, dtype=float) data[target_field] = score_ser - return data, flagger + return data, flags diff --git a/saqc/funcs/tools.py b/saqc/funcs/tools.py index 4ac072016a67afac6df94b90e653961df529c1de..90db8705572a16ef4fabb1d5c579b0a1405443df 100644 --- a/saqc/funcs/tools.py +++ b/saqc/funcs/tools.py @@ -7,12 +7,12 @@ import numpy as np from dios import DictOfSeries from saqc.constants import * -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags from saqc.lib.tools import periodicMask @register(masking='none', module="tools") -def copy(data: DictOfSeries, field: str, flagger: Flagger, new_field: str, **kwargs) -> Tuple[DictOfSeries, Flagger]: +def copy(data: DictOfSeries, field: str, flags: Flags, new_field: str, **kwargs) -> Tuple[DictOfSeries, Flags]: """ The function generates a copy of the data "field" and inserts it under the name field + suffix into the existing data. @@ -37,17 +37,17 @@ def copy(data: DictOfSeries, field: str, flagger: Flagger, new_field: str, **kwa The flagger object, holding flags and additional Informations related to `data`. Flags shape may have changed relatively to the flagger input. """ - if new_field in flagger.columns.union(data.columns): + if new_field in flags.columns.union(data.columns): raise ValueError(f"{field}: field already exist") data[new_field] = data[field].copy() # implicit copy in history access - flagger.history[new_field] = flagger.history[field] - return data, flagger + flags.history[new_field] = flags.history[field] + return data, flags @register(masking='none', module="tools") -def drop(data: DictOfSeries, field: str, flagger: Flagger, **kwargs) -> Tuple[DictOfSeries, Flagger]: +def drop(data: DictOfSeries, field: str, flags: Flags, **kwargs) -> Tuple[DictOfSeries, Flags]: """ The function drops field from the data dios and the flagger. @@ -70,12 +70,12 @@ def drop(data: DictOfSeries, field: str, flagger: Flagger, **kwargs) -> Tuple[Di Flags shape may have changed relatively to the flagger input. """ del data[field] - del flagger[field] - return data, flagger + del flags[field] + return data, flags @register(masking='none', module="tools") -def rename(data: DictOfSeries, field: str, flagger: Flagger, new_name: str, **kwargs) -> Tuple[DictOfSeries, Flagger]: +def rename(data: DictOfSeries, field: str, flags: Flags, new_name: str, **kwargs) -> Tuple[DictOfSeries, Flags]: """ The function renames field to new name (in both, the flagger and the data). @@ -98,24 +98,24 @@ def rename(data: DictOfSeries, field: str, flagger: Flagger, new_name: str, **kw The flagger object, holding flags and additional Informations related to `data`. 
""" data[new_name] = data[field] - flagger.history[new_name] = flagger.history[field] + flags.history[new_name] = flags.history[field] del data[field] - del flagger[field] - return data, flagger + del flags[field] + return data, flags @register(masking='none', module="tools") def mask( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, mode: Literal["periodic", "mask_var"], mask_var: Optional[str]=None, period_start: Optional[str]=None, period_end: Optional[str]=None, include_bounds: bool=True, **kwargs, -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ This function realizes masking within saqc. @@ -225,5 +225,5 @@ def mask( raise ValueError("Keyword passed as masking mode is unknown ({})!".format(mode)) data.aloc[to_mask, field] = np.nan - flagger[to_mask, field] = UNFLAGGED - return data, flagger + flags[to_mask, field] = UNFLAGGED + return data, flags diff --git a/saqc/funcs/transformation.py b/saqc/funcs/transformation.py index fbda3ea6ff283dd53fb556022e36bfbb3085af2e..48a072909dc3235c46e0baa21f29217f9afda5ca 100644 --- a/saqc/funcs/transformation.py +++ b/saqc/funcs/transformation.py @@ -6,18 +6,18 @@ import numpy as np import pandas as pd from dios import DictOfSeries -from saqc.core import register, Flags as Flagger +from saqc.core import register, Flags @register(masking='field', module="transformation") def transform( data: DictOfSeries, field: str, - flagger: Flagger, + flags: Flags, func: Callable[[pd.Series], pd.Series], partition_freq: Optional[Union[float, str]] = None, **kwargs -) -> Tuple[DictOfSeries, Flagger]: +) -> Tuple[DictOfSeries, Flags]: """ Function to transform data columns with a transformation that maps series onto series of the same length. @@ -70,4 +70,4 @@ def transform( val_ser[partition.index] = func(partition) data[field] = val_ser - return data, flagger + return data, flags diff --git a/tests/common.py b/tests/common.py index eddda827d1df0852134a426f051eaeffa9e2c7d2..1a3f501a768762130c05bd8e1f75830982255fad 100644 --- a/tests/common.py +++ b/tests/common.py @@ -7,18 +7,18 @@ import pandas as pd import dios from saqc.constants import * -from saqc.core import initFlagsLike, Flags as Flagger +from saqc.core import initFlagsLike, Flags TESTNODATA = (np.nan, -9999) -TESTFLAGGER = (Flagger(),) +TESTFLAGGER = (Flags(),) -def flagAll(data, field, flagger, **kwargs): +def flagAll(data, field, flags, **kwargs): # NOTE: remember to rename flag -> flag_values - flagger.copy() - flagger[:, field] = BAD - return data, flagger + flags.copy() + flags[:, field] = BAD + return data, flags def initData(cols=2, start_date="2017-01-01", end_date="2017-12-31", freq=None, rows=None): @@ -42,7 +42,7 @@ def writeIO(content): return f -def checkDataFlaggerInvariants(data, flagger, field, identical=True): +def checkDataFlagsInvariants(data, flags, field, identical=True): """ Check all invariants that must hold at any point for * field @@ -68,23 +68,23 @@ def checkDataFlaggerInvariants(data, flagger, field, identical=True): identical (True, default) of just for equality. 
""" assert isinstance(data, dios.DictOfSeries) - assert isinstance(flagger, Flagger) + assert isinstance(flags, Flags) # all columns in data are in flagger - assert data.columns.difference(flagger.columns).empty + assert data.columns.difference(flags.columns).empty # ------------------------------------------------------------------------ # below here, we just check on and with field # ------------------------------------------------------------------------ assert field in data - assert field in flagger + assert field in flags - assert flagger[field].dtype == float + assert flags[field].dtype == float # `pd.Index.identical` also check index attributes like `freq` if identical: - assert data[field].index.identical(flagger[field].index) + assert data[field].index.identical(flags[field].index) else: - assert data[field].index.equals(flagger[field].index) + assert data[field].index.equals(flags[field].index) diff --git a/tests/core/test_core.py b/tests/core/test_core.py index a784cdbaceeecba991b3da662dd9a444acf4f1e5..5370f520d6cf8939bb52ec737770ad931f83cc1a 100644 --- a/tests/core/test_core.py +++ b/tests/core/test_core.py @@ -39,7 +39,7 @@ def flags(data, optional): def test_errorHandling(data): @register(masking='field') - def raisingFunc(data, field, flagger, **kwargs): + def raisingFunc(data, field, flags, **kwargs): raise TypeError var1 = data.columns[0] @@ -73,11 +73,11 @@ def test_sourceTarget(): var1 = data.columns[0] target = "new" - pdata, pflagger = SaQC(data).flagAll(field=var1, target=target).getResult(raw=True) + pdata, pflags = SaQC(data).flagAll(field=var1, target=target).getResult(raw=True) assert (pdata[var1] == pdata[target]).all(axis=None) - assert all(pflagger[var1] == UNFLAGGED) - assert all(pflagger[target] > UNFLAGGED) + assert all(pflags[var1] == UNFLAGGED) + assert all(pflags[target] > UNFLAGGED) @pytest.mark.parametrize("optional", OPTIONAL) @@ -85,14 +85,14 @@ def test_dtypes(data, flags): """ Test if the categorical dtype is preserved through the core functionality """ - flagger = initFlagsLike(data) - flags_raw = flagger.toDios() + flags = initFlagsLike(data) + flags_raw = flags.toDios() var1, var2 = data.columns[:2] - pdata, pflagger = SaQC(data, flags=flags_raw).flagAll(var1).flagAll(var2).getResult(raw=True) + pdata, pflags = SaQC(data, flags=flags_raw).flagAll(var1).flagAll(var2).getResult(raw=True) - for c in pflagger.columns: - assert pflagger[c].dtype == flagger[c].dtype + for c in pflags.columns: + assert pflags[c].dtype == flags[c].dtype def test_plotting(data): @@ -104,10 +104,10 @@ def test_plotting(data): """ pytest.importorskip("matplotlib", reason="requires matplotlib") field, *_ = data.columns - flagger = initFlagsLike(data) - _, flagger_range = flagRange(data, field, flagger, min=10, max=90, flag=BAD) - data_new, flagger_range = flagRange(data, field, flagger_range, min=40, max=60, flag=DOUBT) + flags = initFlagsLike(data) + _, flags_range = flagRange(data, field, flags, min=10, max=90, flag=BAD) + data_new, flags_range = flagRange(data, field, flags_range, min=40, max=60, flag=DOUBT) splot._interactive = False - splot._plotSingleVariable(data, data_new, flagger, flagger_range, sources=[], targets=[data_new.columns[0]]) - splot._plotMultipleVariables(data, data_new, flagger, flagger_range, targets=data_new.columns) + splot._plotSingleVariable(data, data_new, flags, flags_range, sources=[], targets=[data_new.columns[0]]) + splot._plotMultipleVariables(data, data_new, flags, flags_range, targets=data_new.columns) splot._interactive = True diff --git 
a/tests/core/test_creation.py b/tests/core/test_creation.py index 295d2adfca3efd4c3c6ada4e51ca60cf2c76c6ca..b9b931d292be47f508c10624c554dfd32ccef86a 100644 --- a/tests/core/test_creation.py +++ b/tests/core/test_creation.py @@ -6,7 +6,7 @@ import dios def test_init(): - from saqc import SaQC, Flags as Flagger + from saqc import SaQC, Flags arr = np.array([ [0, 1, 2], @@ -16,5 +16,5 @@ def test_init(): qc = SaQC(data) assert isinstance(qc, SaQC) - assert isinstance(qc._flagger, Flagger) + assert isinstance(qc._flags, Flags) assert isinstance(qc._data, dios.DictOfSeries) diff --git a/tests/flagger/test_flagger.py b/tests/core/test_flagger.py similarity index 100% rename from tests/flagger/test_flagger.py rename to tests/core/test_flagger.py diff --git a/tests/flagger/test_flags.py b/tests/core/test_flags.py similarity index 99% rename from tests/flagger/test_flags.py rename to tests/core/test_flags.py index d0d1585bcec5d49c425ef90b3cbdc70d5e878441..79445b487cc4e63d2ff12bfe004fbd9da8c357e3 100644 --- a/tests/flagger/test_flags.py +++ b/tests/core/test_flags.py @@ -7,7 +7,7 @@ import pandas as pd from saqc.constants import * from saqc.core.flags import Flags -from tests.flagger.test_history import ( +from tests.core.test_history import ( History, is_equal as hist_equal, ) diff --git a/tests/flagger/test_history.py b/tests/core/test_history.py similarity index 100% rename from tests/flagger/test_history.py rename to tests/core/test_history.py diff --git a/tests/core/test_reader.py b/tests/core/test_reader.py index e2d80042bfb5a10400306f937300106dcd8b759e..ded1bdf829fd8643ade1998d49cf3a79345397ce 100644 --- a/tests/core/test_reader.py +++ b/tests/core/test_reader.py @@ -106,8 +106,8 @@ def test_configChecks(data): var1, _, var3, *_ = data.columns @register(masking="none") - def flagFunc(data, field, flagger, arg, opt_arg=None, **kwargs): - return data, flagger + def flagFunc(data, field, flags, arg, opt_arg=None, **kwargs): + return data, flags header = f"{F.VARNAME};{F.TEST}" tests = [ @@ -131,8 +131,8 @@ def test_supportedArguments(data): # TODO: necessary? 
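NOTE (reviewer sketch, not part of the patch, placed between test hunks): the renamed tests lean on two Flags indexing idioms that replace the old flagger methods:

import pandas as pd
import dios

from saqc.constants import BAD, UNFLAGGED
from saqc.core import initFlagsLike

data = dios.DictOfSeries(data={"var": pd.Series([1., 2., 3.])})
flags = initFlagsLike(data)         # every column starts at UNFLAGGED

flags[:, "var"] = BAD               # column-wide write (cf. test_clearFlags)
assert (flags["var"] == BAD).all()  # plain read returns a float series

mask = data["var"] > 2
flags[mask, "var"] = UNFLAGGED      # boolean-mask write appends to the history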
@register(masking='field') - def func(data, field, flagger, kwarg, **kwargs): - return data, flagger + def func(data, field, flags, kwarg, **kwargs): + return data, flags var1 = data.columns[0] diff --git a/tests/flagger/__init__.py b/tests/flagger/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/tests/funcs/test_constants_detection.py b/tests/funcs/test_constants_detection.py index d6b7a68f8845bb420965aaf29178bebae5ab3a83..a7a7b5b82e5e0ba2f94c40ae1060c7501b098424 100644 --- a/tests/funcs/test_constants_detection.py +++ b/tests/funcs/test_constants_detection.py @@ -6,7 +6,7 @@ import numpy as np from saqc.constants import * from saqc.funcs.constants import flagConstants, flagByVariance -from saqc.core import initFlagsLike, Flags as Flagger +from saqc.core import initFlagsLike, Flags from tests.common import initData @@ -21,18 +21,18 @@ def data(): def test_constants_flagBasic(data): expected = np.arange(5, 22) field, *_ = data.columns - flagger = initFlagsLike(data) - data, flagger_result = flagConstants(data, field, flagger, window="15Min", thresh=0.1, flag=BAD) - flagscol = flagger_result[field] + flags = initFlagsLike(data) + data, flags_result = flagConstants(data, field, flags, window="15Min", thresh=0.1, flag=BAD) + flagscol = flags_result[field] assert np.all(flagscol[expected] == BAD) def test_constants_flagVarianceBased(data): expected = np.arange(5, 25) field, *_ = data.columns - flagger = initFlagsLike(data) - data, flagger_result1 = flagByVariance(data, field, flagger, window="1h", flag=BAD) + flags = initFlagsLike(data) + data, flags_result1 = flagByVariance(data, field, flags, window="1h", flag=BAD) - flag_result1 = flagger_result1[field] + flag_result1 = flags_result1[field] test_sum = (flag_result1[expected] == BAD).sum() assert test_sum == len(expected) diff --git a/tests/funcs/test_functions.py b/tests/funcs/test_functions.py index 06eef82da8b962c12c1142a0629f71871e3c4e98..7d625d71e63c09edb73cba2b5434704910cec788 100644 --- a/tests/funcs/test_functions.py +++ b/tests/funcs/test_functions.py @@ -28,9 +28,9 @@ def field(data): def test_flagRange(data, field): min, max = 10, 90 - flagger = initFlagsLike(data) - data, flagger = flagRange(data, field, flagger, min=min, max=max, flag=BAD) - flagged = flagger[field] > UNFLAGGED + flags = initFlagsLike(data) + data, flags = flagRange(data, field, flags, min=min, max=max, flag=BAD) + flagged = flags[field] > UNFLAGGED expected = (data[field] < min) | (data[field] > max) assert all(flagged == expected) @@ -47,47 +47,47 @@ def test_flagSesonalRange(data, field): ] for test, expected in tests: - flagger = initFlagsLike(data) + flags = initFlagsLike(data) newfield = f"{field}_masked" start = f"{test['startmonth']:02}-{test['startday']:02}T00:00:00" end = f"{test['endmonth']:02}-{test['endday']:02}T00:00:00" - data, flagger = copy(data, field, flagger, field + "_masked") - data, flagger = mask( - data, newfield, flagger, + data, flags = copy(data, field, flags, field + "_masked") + data, flags = mask( + data, newfield, flags, mode='periodic', period_start=start, period_end=end, include_bounds=True, flag=BAD ) - data, flagger = flagRange(data, newfield, flagger, min=test['min'], max=test['max'], flag=BAD) - data, flagger = reindexFlags(data, field, flagger, method='match', source=newfield, flag=BAD) - data, flagger = drop(data, newfield, flagger) - flagged = flagger[field] > UNFLAGGED + data, flags = flagRange(data, newfield, flags, 
min=test['min'], max=test['max'], flag=BAD) + data, flags = reindexFlags(data, field, flags, method='match', source=newfield, flag=BAD) + data, flags = drop(data, newfield, flags) + flagged = flags[field] > UNFLAGGED assert flagged.sum() == expected def test_clearFlags(data, field): - flagger = initFlagsLike(data) - flagger[:, field] = BAD - assert all(flagger[field] == BAD) + flags = initFlagsLike(data) + flags[:, field] = BAD + assert all(flags[field] == BAD) - _, flagger = clearFlags(data, field, flagger) - assert all(flagger[field] == UNFLAGGED) + _, flags = clearFlags(data, field, flags) + assert all(flags[field] == UNFLAGGED) def test_forceFlags(data, field): - flagger = initFlagsLike(data) - flagger[:, field] = BAD - assert all(flagger[field] == BAD) + flags = initFlagsLike(data) + flags[:, field] = BAD + assert all(flags[field] == BAD) - _, flagger = forceFlags(data, field, flagger, flag=DOUBT) - assert all(flagger[field] == DOUBT) + _, flags = forceFlags(data, field, flags, flag=DOUBT) + assert all(flags[field] == DOUBT) def test_flagIsolated(data, field): - flagger = initFlagsLike(data) + flags = initFlagsLike(data) data.iloc[1:3, 0] = np.nan data.iloc[4:5, 0] = np.nan - flagger[data[field].index[5:6], field] = BAD + flags[data[field].index[5:6], field] = BAD data.iloc[11:13, 0] = np.nan data.iloc[15:17, 0] = np.nan @@ -102,15 +102,15 @@ def test_flagIsolated(data, field): # 2016-01-08 7.0 -inf # .. .. .. - _, flagger_result = flagIsolated(data, field, flagger, group_window="1D", gap_window="2.1D", flag=BAD) + _, flags_result = flagIsolated(data, field, flags, group_window="1D", gap_window="2.1D", flag=BAD) - assert flagger_result[field].iloc[[3, 5]].all() + assert flags_result[field].iloc[[3, 5]].all() - data, flagger_result = flagIsolated( - data, field, flagger_result, + data, flags_result = flagIsolated( + data, field, flags_result, group_window="2D", gap_window="2.1D", continuation_range="1.1D", flag=BAD ) - assert flagger_result[field].iloc[[3, 5, 13, 14]].all() + assert flags_result[field].iloc[[3, 5, 13, 14]].all() @pytest.mark.parametrize("dat", [pytest.lazy_fixture("course_2")]) @@ -123,16 +123,16 @@ def test_flagCrossScoring(dat): s1 = pd.Series(data=s1.values, index=s1.index) s2 = pd.Series(data=s2.values, index=s1.index) data = dios.DictOfSeries([s1, s2], columns=["data1", "data2"]) - flagger = initFlagsLike(data) - _, flagger_result = flagCrossStatistic(data, field, flagger, fields=fields, thresh=3, cross_stat=np.mean, flag=BAD) + flags = initFlagsLike(data) + _, flags_result = flagCrossStatistic(data, field, flags, fields=fields, thresh=3, cross_stat=np.mean, flag=BAD) for field in fields: - isflagged = flagger_result[field] > UNFLAGGED + isflagged = flags_result[field] > UNFLAGGED assert isflagged[characteristics["raise"]].all() def test_flagManual(data, field): - flagger = initFlagsLike(data) - args = data, field, flagger + flags = initFlagsLike(data) + args = data, field, flags dat = data[field] mdata = pd.Series("lala", index=dat.index) @@ -220,31 +220,31 @@ def test_flagDriftFromNormal(dat): data['d4'] = 3 + 4 * data['d1'] data['d5'] = 3 + 4 * data['d1'] - flagger = initFlagsLike(data) - data_norm, flagger_norm = flagDriftFromNorm( - data, 'dummy', flagger, + flags = initFlagsLike(data) + data_norm, flags_norm = flagDriftFromNorm( + data, 'dummy', flags, ['d1', 'd2', 'd3'], segment_freq="200min", norm_spread=5, flag=BAD, ) - data_ref, flagger_ref = flagDriftFromReference( - data, 'd1', flagger, + data_ref, flags_ref = flagDriftFromReference( + data, 'd1', 
flags, ['d1', 'd2', 'd3'], segment_freq="3D", thresh=20, flag=BAD, ) - data_scale, flagger_scale = flagDriftFromScaledNorm( - data, 'dummy', flagger, + data_scale, flags_scale = flagDriftFromScaledNorm( + data, 'dummy', flags, ['d1', 'd3'], ['d4', 'd5'], segment_freq="3D", thresh=20, norm_spread=5, flag=BAD, ) - assert all(flagger_norm['d3'] > UNFLAGGED) - assert all(flagger_ref['d3'] > UNFLAGGED) - assert all(flagger_scale['d3'] > UNFLAGGED) + assert all(flags_norm['d3'] > UNFLAGGED) + assert all(flags_ref['d3'] > UNFLAGGED) + assert all(flags_scale['d3'] > UNFLAGGED) diff --git a/tests/funcs/test_generic_api_functions.py b/tests/funcs/test_generic_api_functions.py index 8c3ce15ff3289c440d0e37a6254e26371ced22b8..64b922fae11414525eb82003affe2cd2ff6002d7 100644 --- a/tests/funcs/test_generic_api_functions.py +++ b/tests/funcs/test_generic_api_functions.py @@ -24,19 +24,19 @@ def test_addFieldFlagGeneric(data): saqc = SaQC(data=data) func = lambda var1: pd.Series(False, index=data[var1.name].index) - data, flagger = saqc.generic.flag("tmp1", func, flag=BAD).getResult() - assert "tmp1" in flagger.columns and "tmp1" not in data + data, flags = saqc.generic.flag("tmp1", func, flag=BAD).getResult() + assert "tmp1" in flags.columns and "tmp1" not in data def test_addFieldProcGeneric(data): saqc = SaQC(data=data) func = lambda: pd.Series([]) - data, flagger = saqc.generic.process("tmp1", func, flag=BAD ).getResult(raw=True) + data, flags = saqc.generic.process("tmp1", func, flag=BAD ).getResult(raw=True) assert "tmp1" in data.columns and data["tmp1"].empty func = lambda var1, var2: var1 + var2 - data, flagger = saqc.generic.process("tmp2", func, flag=BAD).getResult() + data, flags = saqc.generic.process("tmp2", func, flag=BAD).getResult() assert "tmp2" in data.columns and (data["tmp2"] == data["var1"] + data["var2"]).all(axis=None) @@ -48,6 +48,6 @@ def test_mask(data): data, _ = saqc.generic.process("var1", lambda var1: mask(var1 < mean), flag=BAD).getResult() assert ((data["var1"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna()).all(axis=None) - data, flagger = saqc.generic.process("tmp", lambda var1: mask(var1 < mean), flag=BAD).getResult() - assert ("tmp" in data.columns) and ("tmp" in flagger.columns) + data, flags = saqc.generic.process("tmp", lambda var1: mask(var1 < mean), flag=BAD).getResult() + assert ("tmp" in data.columns) and ("tmp" in flags.columns) assert ((data["tmp"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna()).all(axis=None) diff --git a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py index 5b3a28cb52a75dd4281ea884648f0152622dfbd2..a47407866f2c3923d00581210775b2ab82dd0b1f 100644 --- a/tests/funcs/test_generic_config_functions.py +++ b/tests/funcs/test_generic_config_functions.py @@ -8,7 +8,7 @@ import pandas as pd import dios from saqc.constants import * -from saqc.core import initFlagsLike, Flags as Flagger +from saqc.core import initFlagsLike, Flags from saqc.core.visitor import ConfigFunctionParser from saqc.core.config import Fields as F from saqc.core.register import register @@ -33,14 +33,14 @@ def data_diff(): return dios.DictOfSeries(data={col0.name: col0.iloc[: mid + offset], col1.name: col1.iloc[mid - offset :],}) -def _compileGeneric(expr, flagger): +def _compileGeneric(expr, flags): tree = ast.parse(expr, mode="eval") - _, kwargs = ConfigFunctionParser(flagger).parse(tree.body) + _, kwargs = ConfigFunctionParser(flags).parse(tree.body) return kwargs["func"] def test_missingIdentifier(data): - 
flagger = Flagger() + flags = Flags() # NOTE: # - the error is only raised at runtime during parsing would be better @@ -50,13 +50,13 @@ def test_missingIdentifier(data): ] for test in tests: - func = _compileGeneric(f"generic.flag(func={test})", flagger) + func = _compileGeneric(f"generic.flag(func={test})", flags) with pytest.raises(NameError): - _execGeneric(flagger, data, func, field="", nodata=np.nan) + _execGeneric(flags, data, func, field="", nodata=np.nan) def test_syntaxError(): - flagger = Flagger() + flags = Flags() tests = [ "range(x=5", "rangex=5)", @@ -65,7 +65,7 @@ def test_syntaxError(): for test in tests: with pytest.raises(SyntaxError): - _compileGeneric(f"flag(func={test})", flagger) + _compileGeneric(f"flag(func={test})", flags) def test_typeError(): @@ -73,18 +73,18 @@ def test_typeError(): test that forbidden constructs actually throw an error TODO: find a few more cases or get rid of the test """ - flagger = Flagger() + flags = Flags() # : think about cases that should be forbidden tests = ("lambda x: x * 2",) for test in tests: with pytest.raises(TypeError): - _compileGeneric(f"generic.flag(func={test})", flagger) + _compileGeneric(f"generic.flag(func={test})", flags) def test_comparisonOperators(data): - flagger = initFlagsLike(data) + flags = initFlagsLike(data) var1, var2, *_ = data.columns this = var1 @@ -98,13 +98,13 @@ def test_comparisonOperators(data): ] for test, expected in tests: - func = _compileGeneric(f"generic.flag(func={test})", flagger) - result = _execGeneric(flagger, data, func, field=var1, nodata=np.nan) + func = _compileGeneric(f"generic.flag(func={test})", flags) + result = _execGeneric(flags, data, func, field=var1, nodata=np.nan) assert np.all(result == expected) def test_arithmeticOperators(data): - flagger = initFlagsLike(data) + flags = initFlagsLike(data) var1, *_ = data.columns this = data[var1] @@ -118,13 +118,13 @@ def test_arithmeticOperators(data): ] for test, expected in tests: - func = _compileGeneric(f"generic.process(func={test})", flagger) - result = _execGeneric(flagger, data, func, field=var1, nodata=np.nan) + func = _compileGeneric(f"generic.process(func={test})", flags) + result = _execGeneric(flags, data, func, field=var1, nodata=np.nan) assert np.all(result == expected) def test_nonReduncingBuiltins(data): - flagger = initFlagsLike(data) + flags = initFlagsLike(data) var1, *_ = data.columns this = var1 mean = data[var1].mean() @@ -137,15 +137,15 @@ def test_nonReduncingBuiltins(data): ] for test, expected in tests: - func = _compileGeneric(f"generic.process(func={test})", flagger) - result = _execGeneric(flagger, data, func, field=this, nodata=np.nan) + func = _compileGeneric(f"generic.process(func={test})", flags) + result = _execGeneric(flags, data, func, field=this, nodata=np.nan) assert (result == expected).all() @pytest.mark.parametrize("nodata", TESTNODATA) def test_reduncingBuiltins(data, nodata): data.loc[::4] = nodata - flagger = initFlagsLike(data) + flags = initFlagsLike(data) var1 = data.columns[0] this = data.iloc[:, 0] @@ -159,15 +159,15 @@ def test_reduncingBuiltins(data, nodata): ] for test, expected in tests: - func = _compileGeneric(f"generic.process(func={test})", flagger) - result = _execGeneric(flagger, data, func, field=this.name, nodata=nodata) + func = _compileGeneric(f"generic.process(func={test})", flags) + result = _execGeneric(flags, data, func, field=this.name, nodata=nodata) assert result == expected @pytest.mark.parametrize("nodata", TESTNODATA) def test_ismissing(data, nodata): - flagger = 
initFlagsLike(data) + flags = initFlagsLike(data) data.iloc[: len(data) // 2, 0] = np.nan data.iloc[(len(data) // 2) + 1 :, 0] = -9999 this = data.iloc[:, 0] @@ -178,8 +178,8 @@ def test_ismissing(data, nodata): ] for test, expected in tests: - func = _compileGeneric(f"generic.flag(func={test})", flagger) - result = _execGeneric(flagger, data, func, this.name, nodata) + func = _compileGeneric(f"generic.flag(func={test})", flags) + result = _execGeneric(flags, data, func, this.name, nodata) assert np.all(result == expected) @@ -188,7 +188,7 @@ def test_bitOps(data, nodata): var1, var2, *_ = data.columns this = var1 - flagger = initFlagsLike(data) + flags = initFlagsLike(data) tests = [ ("~(this > mean(this))", ~(data[this] > np.nanmean(data[this]))), @@ -197,29 +197,29 @@ def test_bitOps(data, nodata): ] for test, expected in tests: - func = _compileGeneric(f"generic.flag(func={test})", flagger) - result = _execGeneric(flagger, data, func, this, nodata) + func = _compileGeneric(f"generic.flag(func={test})", flags) + result = _execGeneric(flags, data, func, this, nodata) assert np.all(result == expected) def test_isflagged(data): var1, var2, *_ = data.columns - flagger = initFlagsLike(data) - flagger[data[var1].index[::2], var1] = BAD + flags = initFlagsLike(data) + flags[data[var1].index[::2], var1] = BAD tests = [ - (f"isflagged({var1})", flagger[var1] > UNFLAGGED), - (f"isflagged({var1}, flag=BAD)", flagger[var1] >= BAD), - (f"isflagged({var1}, UNFLAGGED, '==')", flagger[var1] == UNFLAGGED), - (f"~isflagged({var2})", flagger[var2] == UNFLAGGED), - (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (flagger[var2] == UNFLAGGED)), + (f"isflagged({var1})", flags[var1] > UNFLAGGED), + (f"isflagged({var1}, flag=BAD)", flags[var1] >= BAD), + (f"isflagged({var1}, UNFLAGGED, '==')", flags[var1] == UNFLAGGED), + (f"~isflagged({var2})", flags[var2] == UNFLAGGED), + (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (flags[var2] == UNFLAGGED)), ] for i, (test, expected) in enumerate(tests): try: - func = _compileGeneric(f"generic.flag(func={test}, flag=BAD)", flagger) - result = _execGeneric(flagger, data, func, field=None, nodata=np.nan) + func = _compileGeneric(f"generic.flag(func={test}, flag=BAD)", flags) + result = _execGeneric(flags, data, func, field=None, nodata=np.nan) assert np.all(result == expected) except Exception: print(i, test) @@ -229,9 +229,9 @@ def test_isflagged(data): for comp in ['>', '>=', '==', '!=', '<', '<=']: fails = f"isflagged({var1}, comparator='{comp}')" - func = _compileGeneric(f"generic.flag(func={fails}, flag=BAD)", flagger) + func = _compileGeneric(f"generic.flag(func={fails}, flag=BAD)", flags) with pytest.raises(ValueError): - _execGeneric(flagger, data, func, field=None, nodata=np.nan) + _execGeneric(flags, data, func, field=None, nodata=np.nan) def test_variableAssignments(data): @@ -245,12 +245,12 @@ def test_variableAssignments(data): fobj = writeIO(config) saqc = SaQC(data).readConfig(fobj) - result_data, result_flagger = saqc.getResult(raw=True) + result_data, result_flags = saqc.getResult(raw=True) assert set(result_data.columns) == set(data.columns) | { "dummy1", } - assert set(result_flagger.columns) == set(data.columns) | {"dummy1", "dummy2"} + assert set(result_flags.columns) == set(data.columns) | {"dummy1", "dummy2"} # TODO: why this must(!) fail ? 
- a comment would be helpful @@ -266,8 +266,8 @@ def test_processMultiple(data_diff): fobj = writeIO(config) saqc = SaQC(data_diff).readConfig(fobj) - result_data, result_flagger = saqc.getResult() - assert len(result_data["dummy"]) == len(result_flagger["dummy"]) + result_data, result_flags = saqc.getResult() + assert len(result_data["dummy"]) == len(result_flags["dummy"]) def test_callableArgumentsUnary(data): @@ -275,7 +275,7 @@ def test_callableArgumentsUnary(data): window = 5 @register(masking='field') - def testFuncUnary(data, field, flagger, func, **kwargs): + def testFuncUnary(data, field, flags, func, **kwargs): data[field] = data[field].rolling(window=window).apply(func) return data, initFlagsLike(data) @@ -304,7 +304,7 @@ def test_callableArgumentsBinary(data): var1, var2 = data.columns[:2] @register(masking='field') - def testFuncBinary(data, field, flagger, func, **kwargs): + def testFuncBinary(data, field, flags, func, **kwargs): data[field] = func(data[var1], data[var2]) return data, initFlagsLike(data) diff --git a/tests/funcs/test_harm_funcs.py b/tests/funcs/test_harm_funcs.py index 3f0140bd6dffb4f890c964bcde60988683c28ff5..a8606ca72faa8ebd48dc642bbaa16255c3530893 100644 --- a/tests/funcs/test_harm_funcs.py +++ b/tests/funcs/test_harm_funcs.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd import dios -from saqc.core import initFlagsLike, Flags as Flagger +from saqc.core import initFlagsLike, Flags from saqc.constants import BAD, UNFLAGGED from saqc.funcs.resampling import ( linear, @@ -16,7 +16,7 @@ from saqc.funcs.resampling import ( mapToOriginal, ) -from tests.common import checkDataFlaggerInvariants +from tests.common import checkDataFlagsInvariants @pytest.fixture @@ -44,14 +44,14 @@ def data(): def test_wrapper(data, func, kws): field = 'data' freq = "15min" - flagger = initFlagsLike(data) + flags = initFlagsLike(data) import saqc func = getattr(saqc.funcs, func) - data, flagger = func(data, field, flagger, freq, **kws) + data, flags = func(data, field, flags, freq, **kws) # check minimal requirements - checkDataFlaggerInvariants(data, flagger, field) + checkDataFlagsInvariants(data, flags, field) assert data[field].index.freq == pd.Timedelta(freq) @@ -62,18 +62,18 @@ def test_gridInterpolation(data, method): data = data[field] data = (data * np.sin(data)).append(data.shift(1, "2h")).shift(1, "3s") data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) + flags = initFlagsLike(data) # we are just testing if the interpolation gets passed to the series without causing an error: - res = interpolate(data, field, flagger, freq, method=method, downcast_interpolation=True) + res = interpolate(data, field, flags, freq, method=method, downcast_interpolation=True) if method == "polynomial": - res = interpolate(data, field, flagger, freq, order=2, method=method, downcast_interpolation=True) - res = interpolate(data, field, flagger, freq, order=10, method=method, downcast_interpolation=True) + res = interpolate(data, field, flags, freq, order=2, method=method, downcast_interpolation=True) + res = interpolate(data, field, flags, freq, order=10, method=method, downcast_interpolation=True) # check minimal requirements - rdata, rflagger = res - checkDataFlaggerInvariants(rdata, rflagger, field, identical=False) + rdata, rflags = res + checkDataFlagsInvariants(rdata, rflags, field, identical=False) assert rdata[field].index.freq == pd.Timedelta(freq) @@ -105,23 +105,23 @@ def test_flagsSurviveBackprojection(): @pytest.mark.parametrize("reshaper", ["nshift", "fshift", 
"bshift", "nagg", "bagg", "fagg", "interpolation"]) def test_harmSingleVarIntermediateFlagging(data, reshaper): - flagger = initFlagsLike(data) + flags = initFlagsLike(data) field = 'data' pre_data = data.copy() - pre_flagger = flagger.copy() + pre_flags = flags.copy() - data, flagger = linear(data, field, flagger, freq="15min") - checkDataFlaggerInvariants(data, flagger, field, identical=True) + data, flags = linear(data, field, flags, freq="15min") + checkDataFlagsInvariants(data, flags, field, identical=True) assert data[field].index.freq == pd.Timedelta('15min') # flag something bad - flagger[data[field].index[3:4], field] = BAD - data, flagger = mapToOriginal(data, field, flagger, method="inverse_" + reshaper) + flags[data[field].index[3:4], field] = BAD + data, flags = mapToOriginal(data, field, flags, method="inverse_" + reshaper) - assert len(data[field]) == len(flagger[field]) + assert len(data[field]) == len(flags[field]) assert data[field].equals(pre_data[field]) - assert flagger[field].index.equals(pre_flagger[field].index) + assert flags[field].index.equals(pre_flags[field].index) if 'agg' in reshaper: if reshaper == "nagg": @@ -133,9 +133,9 @@ def test_harmSingleVarIntermediateFlagging(data, reshaper): else: raise NotImplementedError('untested test case') - assert all(flagger[field].iloc[start:end] > UNFLAGGED) - assert all(flagger[field].iloc[:start] == UNFLAGGED) - assert all(flagger[field].iloc[end:] == UNFLAGGED) + assert all(flags[field].iloc[start:end] > UNFLAGGED) + assert all(flags[field].iloc[:start] == UNFLAGGED) + assert all(flags[field].iloc[end:] == UNFLAGGED) elif 'shift' in reshaper: if reshaper == "nshift": @@ -147,7 +147,7 @@ def test_harmSingleVarIntermediateFlagging(data, reshaper): else: raise NotImplementedError('untested test case') - flagged = flagger[field] > UNFLAGGED + flagged = flags[field] > UNFLAGGED assert all(flagged == exp) elif reshaper == 'interpolation': @@ -166,22 +166,22 @@ def test_harmSingleVarIntermediateFlagging(data, reshaper): (("bagg", "30Min"), pd.Series(data=[-50.0, -75.0, 50.0, 50.0], index=pd.date_range("2010-12-31 23:30:00", "2011-01-01 01:00:00", freq="30min"))), ]) def test_harmSingleVarInterpolationAgg(data, params, expected): - flagger = initFlagsLike(data) + flags = initFlagsLike(data) field = 'data' pre_data = data.copy() - pre_flaggger = flagger.copy() + pre_flaggger = flags.copy() method, freq = params - data_harm, flagger_harm = aggregate(data, field, flagger, freq, value_func=np.sum, method=method) - checkDataFlaggerInvariants(data_harm, flagger_harm, field, identical=True) + data_harm, flags_harm = aggregate(data, field, flags, freq, value_func=np.sum, method=method) + checkDataFlagsInvariants(data_harm, flags_harm, field, identical=True) assert data_harm[field].index.freq == pd.Timedelta(freq) assert data_harm[field].equals(expected) - data_deharm, flagger_deharm = mapToOriginal(data_harm, "data", flagger_harm, method="inverse_" + method) - checkDataFlaggerInvariants(data_harm, flagger_harm, field, identical=True) + data_deharm, flags_deharm = mapToOriginal(data_harm, "data", flags_harm, method="inverse_" + method) + checkDataFlagsInvariants(data_harm, flags_harm, field, identical=True) assert data_deharm[field].equals(pre_data[field]) - assert flagger_deharm[field].equals(pre_flaggger[field]) + assert flags_deharm[field].equals(pre_flaggger[field]) @pytest.mark.parametrize( @@ -195,17 +195,17 @@ def test_harmSingleVarInterpolationAgg(data, params, expected): (("nshift", "30min"), pd.Series(data=[np.nan, -37.5, 
12.5, 50.0], index=pd.date_range("2010-12-31 23:30:00", "2011-01-01 01:00:00", freq="30Min"))), ]) def test_harmSingleVarInterpolationShift(data, params, expected): - flagger = initFlagsLike(data) + flags = initFlagsLike(data) field = 'data' pre_data = data.copy() - pre_flagger = flagger.copy() + pre_flags = flags.copy() method, freq = params - data_harm, flagger_harm = shift(data, field, flagger, freq, method=method) + data_harm, flags_harm = shift(data, field, flags, freq, method=method) assert data_harm[field].equals(expected) - data_deharm, flagger_deharm = mapToOriginal(data_harm, "data", flagger_harm, method="inverse_" + method) + data_deharm, flags_deharm = mapToOriginal(data_harm, "data", flags_harm, method="inverse_" + method) assert data_deharm[field].equals(pre_data[field]) - assert flagger_deharm[field].equals(pre_flagger[field]) + assert flags_deharm[field].equals(pre_flags[field]) diff --git a/tests/funcs/test_modelling.py b/tests/funcs/test_modelling.py index de9f1efb89ad2140522b4b58ea7ced54d324bc3e..5bfdfba88c92186d3504c9d1f19b79acf79ee3ad 100644 --- a/tests/funcs/test_modelling.py +++ b/tests/funcs/test_modelling.py @@ -20,16 +20,16 @@ def test_modelling_polyFit_forRegular(dat): # add some nice sine distortion data = data + 10 * np.sin(np.arange(0, len(data.indexes[0]))) data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) - result1, _ = calculatePolynomialResidues(data, "data", flagger, 11, 2, numba=False) - result2, _ = calculatePolynomialResidues(data, "data", flagger, 11, 2, numba=True) + flags = initFlagsLike(data) + result1, _ = calculatePolynomialResidues(data, "data", flags, 11, 2, numba=False) + result2, _ = calculatePolynomialResidues(data, "data", flags, 11, 2, numba=True) assert (result1["data"] - result2["data"]).abs().max() < 10 ** -10 - result3, _ = calculatePolynomialResidues(data, "data", flagger, "110min", 2, numba=False) + result3, _ = calculatePolynomialResidues(data, "data", flags, "110min", 2, numba=False) assert result3["data"].equals(result1["data"]) - result4, _ = calculatePolynomialResidues(data, "data", flagger, 11, 2, numba=True, min_periods=11) + result4, _ = calculatePolynomialResidues(data, "data", flags, 11, 2, numba=True, min_periods=11) assert (result4["data"] - result2["data"]).abs().max() < 10 ** -10 data.iloc[13:16] = np.nan - result5, _ = calculatePolynomialResidues(data, "data", flagger, 11, 2, numba=True, min_periods=9) + result5, _ = calculatePolynomialResidues(data, "data", flags, 11, 2, numba=True, min_periods=9) assert result5["data"].iloc[10:19].isna().all() @@ -37,45 +37,45 @@ def test_modelling_polyFit_forRegular(dat): def test_modelling_rollingMean_forRegular(dat): data, _ = dat(freq="10min", periods=30, initial_level=0, final_level=100, out_val=-100) data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) - calculateRollingResidues(data, "data", flagger, 5, func=np.mean, eval_flags=True, min_periods=0, center=True) - calculateRollingResidues(data, "data", flagger, 5, func=np.mean, eval_flags=True, min_periods=0, center=False) + flags = initFlagsLike(data) + calculateRollingResidues(data, "data", flags, 5, func=np.mean, eval_flags=True, min_periods=0, center=True) + calculateRollingResidues(data, "data", flags, 5, func=np.mean, eval_flags=True, min_periods=0, center=False) @pytest.mark.parametrize("dat", [pytest.lazy_fixture("course_1")]) def test_modelling_mask(dat): data, _ = dat() data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) + flags = initFlagsLike(data) field = "data" # set flags 
everywhere to test unflagging - flagger[:, field] = BAD + flags[:, field] = BAD - common = dict(data=data, field=field, flagger=flagger, mode='periodic') - data_seasonal, flagger_seasonal = mask(**common, period_start="20:00", period_end="40:00", include_bounds=False) - flagscol = flagger_seasonal[field] + common = dict(data=data, field=field, flags=flags, mode='periodic') + data_seasonal, flags_seasonal = mask(**common, period_start="20:00", period_end="40:00", include_bounds=False) + flagscol = flags_seasonal[field] m = (20 <= flagscol.index.minute) & (flagscol.index.minute <= 40) - assert all(flagger_seasonal[field][m] == UNFLAGGED) + assert all(flags_seasonal[field][m] == UNFLAGGED) assert all(data_seasonal[field][m].isna()) - data_seasonal, flagger_seasonal = mask(**common, period_start="15:00:00", period_end="02:00:00") - flagscol = flagger_seasonal[field] + data_seasonal, flags_seasonal = mask(**common, period_start="15:00:00", period_end="02:00:00") + flagscol = flags_seasonal[field] m = (15 <= flagscol.index.hour) & (flagscol.index.hour <= 2) - assert all(flagger_seasonal[field][m] == UNFLAGGED) + assert all(flags_seasonal[field][m] == UNFLAGGED) assert all(data_seasonal[field][m].isna()) - data_seasonal, flagger_seasonal = mask(**common, period_start="03T00:00:00", period_end="10T00:00:00") - flagscol = flagger_seasonal[field] + data_seasonal, flags_seasonal = mask(**common, period_start="03T00:00:00", period_end="10T00:00:00") + flagscol = flags_seasonal[field] m = (3 <= flagscol.index.hour) & (flagscol.index.hour <= 10) - assert all(flagger_seasonal[field][m] == UNFLAGGED) + assert all(flags_seasonal[field][m] == UNFLAGGED) assert all(data_seasonal[field][m].isna()) mask_ser = pd.Series(False, index=data["data"].index) mask_ser[::5] = True data["mask_ser"] = mask_ser - flagger = initFlagsLike(data) - data_masked, flagger_masked = mask(data, "data", flagger, mode='mask_var', mask_var="mask_ser") + flags = initFlagsLike(data) + data_masked, flags_masked = mask(data, "data", flags, mode='mask_var', mask_var="mask_ser") m = mask_ser - assert all(flagger_masked[field][m] == UNFLAGGED) + assert all(flags_masked[field][m] == UNFLAGGED) assert all(data_masked[field][m].isna()) diff --git a/tests/funcs/test_pattern_rec.py b/tests/funcs/test_pattern_rec.py index db3c50249a109c67d2d3fed51e97bc3cd1a1e0e5..1cd7b7b4d171985f04a36fe591af4b342b93788e 100644 --- a/tests/funcs/test_pattern_rec.py +++ b/tests/funcs/test_pattern_rec.py @@ -28,12 +28,12 @@ def test_flagPattern_wavelet(): pattern = data.iloc[1:6] data = dios.DictOfSeries(dict(data=data, pattern_data=pattern)) - flagger = initFlagsLike(data, name='data') - data, flagger = flagPatternByDTW(data, "data", flagger, ref_field="pattern_data", flag=BAD) + flags = initFlagsLike(data, name='data') + data, flags = flagPatternByDTW(data, "data", flags, ref_field="pattern_data", flag=BAD) - assert all(flagger["data"][1:6]) - assert any(flagger["data"][:1]) - assert any(flagger["data"][7:]) + assert all(flags["data"][1:6]) + assert any(flags["data"][:1]) + assert any(flags["data"][7:]) @pytest.mark.skip(reason='faulty implementation - will get fixed by GL-MR191') @@ -43,9 +43,9 @@ def test_flagPattern_dtw(): pattern = data.iloc[1:6] data = dios.DictOfSeries(dict(data=data, pattern_data=pattern)) - flagger = initFlagsLike(data, name='data') - data, flagger = flagPatternByWavelet(data, "data", flagger, ref_field="pattern_data", flag=BAD) + flags = initFlagsLike(data, name='data') + data, flags = flagPatternByWavelet(data, "data", flags, 
ref_field="pattern_data", flag=BAD) - assert all(flagger["data"][1:6]) - assert any(flagger["data"][:1]) - assert any(flagger["data"][7:]) + assert all(flags["data"][1:6]) + assert any(flags["data"][:1]) + assert any(flags["data"][7:]) diff --git a/tests/funcs/test_proc_functions.py b/tests/funcs/test_proc_functions.py index 04739b40b6663703533a9a4cb30630761c49b112..cfcd5bcf62685dc8c2874d8c2880d7b216ebfcd7 100644 --- a/tests/funcs/test_proc_functions.py +++ b/tests/funcs/test_proc_functions.py @@ -21,15 +21,15 @@ def test_rollingInterpolateMissing(course_5): data, characteristics = course_5(periods=10, nan_slice=[5, 6]) field = data.columns[0] data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) + flags = initFlagsLike(data) dataInt, *_ = interpolateByRolling( - data, field, flagger, 3, func=np.median, center=True, min_periods=0, interpol_flag=UNFLAGGED + data, field, flags, 3, func=np.median, center=True, min_periods=0, interpol_flag=UNFLAGGED ) # import pdb # pdb.set_trace() assert dataInt[field][characteristics["missing"]].notna().all() dataInt, *_ = interpolateByRolling( - data, field, flagger, 3, func=np.nanmean, center=False, min_periods=3, interpol_flag=UNFLAGGED + data, field, flags, 3, func=np.nanmean, center=False, min_periods=3, interpol_flag=UNFLAGGED ) assert dataInt[field][characteristics["missing"]].isna().all() @@ -38,15 +38,15 @@ def test_interpolateMissing(course_5): data, characteristics = course_5(periods=10, nan_slice=[5]) field = data.columns[0] data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) - dataLin, *_ = interpolateInvalid(data, field, flagger, method="linear") - dataPoly, *_ = interpolateInvalid(data, field, flagger, method="polynomial") + flags = initFlagsLike(data) + dataLin, *_ = interpolateInvalid(data, field, flags, method="linear") + dataPoly, *_ = interpolateInvalid(data, field, flags, method="polynomial") assert dataLin[field][characteristics["missing"]].notna().all() assert dataPoly[field][characteristics["missing"]].notna().all() data, characteristics = course_5(periods=10, nan_slice=[5, 6, 7]) - dataLin1, *_ = interpolateInvalid(data, field, flagger, method="linear", inter_limit=2) - dataLin2, *_ = interpolateInvalid(data, field, flagger, method="linear", inter_limit=3) - dataLin3, *_ = interpolateInvalid(data, field, flagger, method="linear", inter_limit=4) + dataLin1, *_ = interpolateInvalid(data, field, flags, method="linear", inter_limit=2) + dataLin2, *_ = interpolateInvalid(data, field, flags, method="linear", inter_limit=3) + dataLin3, *_ = interpolateInvalid(data, field, flags, method="linear", inter_limit=4) assert dataLin1[field][characteristics["missing"]].isna().all() assert dataLin2[field][characteristics["missing"]].isna().all() assert dataLin3[field][characteristics["missing"]].notna().all() @@ -56,13 +56,13 @@ def test_transform(course_5): data, characteristics = course_5(periods=10, nan_slice=[5, 6]) field = data.columns[0] data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) - data1, *_ = transform(data, field, flagger, func=linearInterpolation) + flags = initFlagsLike(data) + data1, *_ = transform(data, field, flags, func=linearInterpolation) assert data1[field][characteristics["missing"]].isna().all() - data1, *_ = transform(data, field, flagger, func=lambda x: linearInterpolation(x, inter_limit=3)) + data1, *_ = transform(data, field, flags, func=lambda x: linearInterpolation(x, inter_limit=3)) assert data1[field][characteristics["missing"]].notna().all() data1, *_ = transform( - data, field, 
flagger, func=lambda x: polynomialInterpolation(x, inter_limit=3, inter_order=3) + data, field, flags, func=lambda x: polynomialInterpolation(x, inter_limit=3, inter_order=3) ) assert data1[field][characteristics["missing"]].notna().all() @@ -71,8 +71,8 @@ def test_resample(course_5): data, characteristics = course_5(freq="1min", periods=30, nan_slice=[1, 11, 12, 22, 24, 26]) field = data.columns[0] data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) - data1, *_ = resample(data, field, flagger, "10min", np.mean, max_invalid_total_d=2, max_invalid_consec_d=1) + flags = initFlagsLike(data) + data1, *_ = resample(data, field, flags, "10min", np.mean, max_invalid_total_d=2, max_invalid_consec_d=1) assert ~np.isnan(data1[field].iloc[0]) assert np.isnan(data1[field].iloc[1]) assert np.isnan(data1[field].iloc[2]) @@ -83,8 +83,8 @@ def test_interpolateGrid(course_5, course_3): data_grid, characteristics = course_3() data['grid'] = data_grid.to_df() # data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) - dataInt, *_ = interpolateIndex(data, 'data', flagger, '1h', 'time', grid_field='grid', inter_limit=10) + flags = initFlagsLike(data) + dataInt, *_ = interpolateIndex(data, 'data', flags, '1h', 'time', grid_field='grid', inter_limit=10) def test_offsetCorrecture(): @@ -92,7 +92,7 @@ def test_offsetCorrecture(): data.iloc[30:40] = -100 data.iloc[70:80] = 100 data = dios.DictOfSeries(data) - flagger = initFlagsLike(data) - data, _ = correctOffset(data, 'dat', flagger, 40, 20, '3d', 1) + flags = initFlagsLike(data) + data, _ = correctOffset(data, 'dat', flags, 40, 20, '3d', 1) assert (data == 0).all()[0] diff --git a/tests/funcs/test_spikes_detection.py b/tests/funcs/test_spikes_detection.py index 727ef415775108d8e88367a7274e2fdc4eebc878..9481d7eb0f96048b098375bd295603b59bb0bc12 100644 --- a/tests/funcs/test_spikes_detection.py +++ b/tests/funcs/test_spikes_detection.py @@ -29,9 +29,9 @@ def spiky_data(): def test_flagMad(spiky_data): data = spiky_data[0] field, *_ = data.columns - flagger = initFlagsLike(data) - data, flagger_result = flagMAD(data, field, flagger, "1H", flag=BAD) - flag_result = flagger_result[field] + flags = initFlagsLike(data) + data, flags_result = flagMAD(data, field, flags, "1H", flag=BAD) + flag_result = flags_result[field] test_sum = (flag_result[spiky_data[1]] == BAD).sum() assert test_sum == len(spiky_data[1]) @@ -39,9 +39,9 @@ def test_flagMad(spiky_data): def test_flagSpikesBasic(spiky_data): data = spiky_data[0] field, *_ = data.columns - flagger = initFlagsLike(data) - data, flagger_result = flagOffset(data, field, flagger, thresh=60, tolerance=10, window="20min", flag=BAD) - flag_result = flagger_result[field] + flags = initFlagsLike(data) + data, flags_result = flagOffset(data, field, flags, thresh=60, tolerance=10, window="20min", flag=BAD) + flag_result = flags_result[field] test_sum = (flag_result[spiky_data[1]] == BAD).sum() assert test_sum == len(spiky_data[1]) @@ -59,14 +59,14 @@ def test_flagSpikesBasic(spiky_data): def test_flagSpikesLimitRaise(dat): data, characteristics = dat() field, *_ = data.columns - flagger = initFlagsLike(data) - _, flagger_result = flagRaise( - data, field, flagger, + flags = initFlagsLike(data) + _, flags_result = flagRaise( + data, field, flags, thresh=2, intended_freq="10min", raise_window="20min", numba_boost=False, flag=BAD ) - assert np.all(flagger_result[field][characteristics["raise"]] > UNFLAGGED) - assert not np.any(flagger_result[field][characteristics["return"]] > UNFLAGGED) - assert not 
np.any(flagger_result[field][characteristics["drop"]] > UNFLAGGED) + assert np.all(flags_result[field][characteristics["raise"]] > UNFLAGGED) + assert not np.any(flags_result[field][characteristics["return"]] > UNFLAGGED) + assert not np.any(flags_result[field][characteristics["drop"]] > UNFLAGGED) # see test/functs/fixtures.py for the 'course_N' @@ -80,12 +80,12 @@ def test_flagMultivarScores(dat): s1 = pd.Series(data=s1.values, index=s1.index) s2 = pd.Series(data=s2.values, index=s1.index) data = dios.DictOfSeries([s1, s2], columns=["data1", "data2"]) - flagger = initFlagsLike(data) - _, flagger_result = flagMVScores( - data, field, flagger, fields=fields, trafo=np.log, iter_start=0.95, n_neighbors=10, flag=BAD + flags = initFlagsLike(data) + _, flags_result = flagMVScores( + data, field, flags, fields=fields, trafo=np.log, iter_start=0.95, n_neighbors=10, flag=BAD ) for field in fields: - isflagged = flagger_result[field] > UNFLAGGED + isflagged = flags_result[field] > UNFLAGGED assert isflagged[characteristics["raise"]].all() assert not isflagged[characteristics["return"]].any() assert not isflagged[characteristics["drop"]].any() @@ -99,7 +99,7 @@ def test_grubbs(dat): crowd_size=1, crowd_spacing=3, out_val=-10, ) - flagger = initFlagsLike(data) - data, result_flagger = flagByGrubbs(data, "data", flagger, winsz=20, min_periods=15, flag=BAD) - assert np.all(result_flagger["data"][char_dict["drop"]] > UNFLAGGED) + flags = initFlagsLike(data) + data, result_flags = flagByGrubbs(data, "data", flags, winsz=20, min_periods=15, flag=BAD) + assert np.all(result_flags["data"][char_dict["drop"]] > UNFLAGGED) diff --git a/tests/fuzzy/init.py b/tests/fuzzy/init.py index 4096823b575097ded6621f5a241b35942e6ab755..b08bb65d8b215a6b0a47e2d679751d1be52e3aa7 100644 --- a/tests/fuzzy/init.py +++ b/tests/fuzzy/init.py @@ -25,7 +25,7 @@ from saqc.constants import * from saqc.core.register import FUNC_MAP from saqc.core.lib import SaQCFunction from saqc.lib.types import FreqString, ColumnName, IntegerWindow -from saqc.core import initFlagsLike, Flags as Flagger +from saqc.core import initFlagsLike, Flags MAX_EXAMPLES = 50 # MAX_EXAMPLES = 100000 @@ -77,15 +77,15 @@ def columnNames(draw): @composite -def flaggers(draw, data): +def flagses(draw, data): """ - initialize a flagger and set some flags + initialize a Flags object and set some flags """ - flagger = initFlagsLike(data) + flags = initFlagsLike(data) for col, srs in data.items(): loc_st = lists(sampled_from(sorted(srs.index)), unique=True, max_size=len(srs) - 1) - flagger[draw(loc_st), col] = BAD - return flagger + flags[draw(loc_st), col] = BAD + return flags @composite @@ -116,11 +116,11 @@ def frequencyStrings(draw, _): @composite -def dataFieldFlagger(draw): +def dataFieldFlags(draw): data = draw(dioses()) field = draw(sampled_from(sorted(data.columns))) - flagger = draw(flaggers(data)) - return data, field, flagger + flags = draw(flagses(data)) + return data, field, flags @composite @@ -138,7 +138,7 @@ def functionKwargs(draw, func: SaQCFunction): kwargs = { "data": data, "field": field, - "flagger": draw(flaggers(data)) + "flags": draw(flagses(data)) } column_name_strategy = lambda _: sampled_from(sorted(c for c in data.columns if c != field)) @@ -149,7 +149,7 @@ def functionKwargs(draw, func: SaQCFunction): register_type_strategy(IntegerWindow, interger_window_strategy) for k, v in get_type_hints(func.func).items(): - if k not in {"data", "field", "flagger", "return"}: + if k not in {"data", "field", "flags", "return"}: value = draw(from_type(v)) # if
v is TimestampColumnName: # value = draw(columnNames()) diff --git a/tests/fuzzy/test_masking.py b/tests/fuzzy/test_masking.py index 9567ea7f84e58ec5f221b4d69dfcb6651dc33cb5..be77e9872f5222e6cc332dc31236081f2c25a28e 100644 --- a/tests/fuzzy/test_masking.py +++ b/tests/fuzzy/test_masking.py @@ -8,60 +8,60 @@ import pandas as pd from hypothesis import given, settings from saqc.constants import * -from saqc.core import Flags as Flagger +from saqc.core import Flags from saqc.core.register import _maskData, _unmaskData -from tests.fuzzy.init import dataFieldFlagger, MAX_EXAMPLES +from tests.fuzzy.init import dataFieldFlags, MAX_EXAMPLES logging.disable(logging.CRITICAL) @settings(max_examples=MAX_EXAMPLES, deadline=None) -@given(data_field_flagger=dataFieldFlagger()) -def test_maskingMasksData(data_field_flagger): +@given(data_field_flags=dataFieldFlags()) +def test_maskingMasksData(data_field_flags): """ test if flagged values are replaced by np.nan """ - flagger: Flagger - data_in, field, flagger = data_field_flagger - data_masked, _ = _maskData(data_in, flagger, columns=[field], to_mask=BAD) - assert data_masked.aloc[flagger.toDios() == BAD].isna().all(axis=None) + flags: Flags + data_in, field, flags = data_field_flags + data_masked, _ = _maskData(data_in, flags, columns=[field], to_mask=BAD) + assert data_masked.aloc[flags.toDios() == BAD].isna().all(axis=None) @settings(max_examples=MAX_EXAMPLES, deadline=None) -@given(data_field_flagger=dataFieldFlagger()) -def test_dataMutationPreventsUnmasking(data_field_flagger): +@given(data_field_flags=dataFieldFlags()) +def test_dataMutationPreventsUnmasking(data_field_flags): """ test if (un)masking works as expected on data-changes. if `data` is mutated after `_maskData`, `_unmaskData` should be a no-op """ filler = -9999 - data_in, field, flagger = data_field_flagger - data_masked, mask = _maskData(data_in, flagger, columns=[field], to_mask=BAD) + data_in, field, flags = data_field_flags + data_masked, mask = _maskData(data_in, flags, columns=[field], to_mask=BAD) data_masked[field] = filler - data_out = _unmaskData(data_in, mask, data_masked, flagger, to_mask=BAD) + data_out = _unmaskData(data_in, mask, data_masked, flags, to_mask=BAD) assert (data_out[field] == filler).all(axis=None) @settings(max_examples=MAX_EXAMPLES, deadline=None) -@given(data_field_flagger=dataFieldFlagger()) -def test_flaggerMutationPreventsUnmasking(data_field_flagger): +@given(data_field_flags=dataFieldFlags()) +def test_flagsMutationPreventsUnmasking(data_field_flags): """ test if (un)masking works as expected on flagger-changes. 
if `flagger` is mutated after `_maskData`, `_unmaskData` should be a no-op """ - data_in, field, flagger = data_field_flagger - data_masked, mask = _maskData(data_in, flagger, columns=[field], to_mask=BAD) - flagger = flagger[field] = UNFLAGGED - data_out = _unmaskData(data_in, mask, data_masked, flagger, to_mask=BAD) - assert (data_out.loc[flagger[field] == BAD, field].isna()).all(axis=None) + data_in, field, flags = data_field_flags + data_masked, mask = _maskData(data_in, flags, columns=[field], to_mask=BAD) + flags[field] = UNFLAGGED  # mutate the flags object after masking + data_out = _unmaskData(data_in, mask, data_masked, flags, to_mask=BAD) + assert (data_out.loc[flags[field] == BAD, field].isna()).all(axis=None) @settings(max_examples=MAX_EXAMPLES, deadline=None) -@given(data_field_flagger=dataFieldFlagger()) -def test_reshapingPreventsUnmasking(data_field_flagger): +@given(data_field_flags=dataFieldFlags()) +def test_reshapingPreventsUnmasking(data_field_flags): """ test if (un)masking works as expected on index-changes. If the index of data (and flags) change in the func, the unmasking, @@ -70,30 +70,30 @@ def test_reshapingPreventsUnmasking(data_field_flagger): filler = -1111 - data_in, field, flagger = data_field_flagger - data_masked, mask = _maskData(data_in, flagger, columns=[field], to_mask=BAD) + data_in, field, flags = data_field_flags + data_masked, mask = _maskData(data_in, flags, columns=[field], to_mask=BAD) # mutate indexes of `data` and `flagger` index = data_masked[field].index.to_series() index.iloc[-len(data_masked[field])//2:] += pd.Timedelta("7.5Min") data_masked[field] = pd.Series(data=filler, index=index) - flagger.drop(field) - flagger[field] = pd.Series(data=flagger[field].values, index=index) + flags.drop(field) + flags[field] = pd.Series(data=flags[field].values, index=index) - data_out = _unmaskData(data_in, mask, data_masked, flagger, to_mask=BAD) + data_out = _unmaskData(data_in, mask, data_masked, flags, to_mask=BAD) assert (data_out[field] == filler).all(axis=None) @settings(max_examples=MAX_EXAMPLES, deadline=None) -@given(data_field_flagger=dataFieldFlagger()) -def test_unmaskingInvertsMasking(data_field_flagger): +@given(data_field_flags=dataFieldFlags()) +def test_unmaskingInvertsMasking(data_field_flags): """ unmasking data should invert the masking """ - data_in, field, flagger = data_field_flagger - data_masked, mask = _maskData(data_in, flagger, columns=[field], to_mask=BAD) - data_out = _unmaskData(data_in, mask, data_masked, flagger, to_mask=BAD) + data_in, field, flags = data_field_flags + data_masked, mask = _maskData(data_in, flags, columns=[field], to_mask=BAD) + data_out = _unmaskData(data_in, mask, data_masked, flags, to_mask=BAD) assert data_in.to_df().equals(data_out.to_df())
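For reference, the round-trip contract these fuzzy tests pin down can be stated independently of saqc: masking hides flagged values from the wrapped function, unmasking restores them afterwards, and unmasking degrades to a no-op when the function mutated the hidden values or reshaped the index. The sketch below illustrates that contract with plain pandas; mask_data and unmask_data are hypothetical stand-ins and do not reproduce the signatures of saqc's internal _maskData/_unmaskData, and the flag constants are merely illustrative (any ordering with UNFLAGGED < BAD works).

import numpy as np
import pandas as pd

# illustrative flag constants: flags are floats, UNFLAGGED < BAD
UNFLAGGED, BAD = -np.inf, 255.0

def mask_data(data, flags, to_mask):
    # hide every value whose flag reaches the threshold
    mask = flags >= to_mask
    masked = data.copy()
    masked[mask] = np.nan
    return masked, mask

def unmask_data(orig, mask, result):
    # a reshaped index makes restoring impossible -> keep the function's result
    if not result.index.equals(orig.index):
        return result
    out = result.copy()
    # restore hidden values only where the function left them untouched (still NaN)
    restore = mask & out.isna()
    out[restore] = orig[restore]
    return out

data = pd.Series([1.0, 2.0, 3.0, 4.0])
flags = pd.Series([UNFLAGGED, BAD, UNFLAGGED, BAD])

masked, mask = mask_data(data, flags, to_mask=BAD)
assert masked[mask].isna().all()                         # cf. test_maskingMasksData
assert unmask_data(data, mask, masked).equals(data)      # cf. test_unmaskingInvertsMasking

mutated = masked.fillna(-9999.0)                         # the wrapped "function" overwrote the masked values
assert unmask_data(data, mask, mutated).equals(mutated)  # cf. test_dataMutationPreventsUnmasking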