diff --git a/saqc/funcs/changepoints.py b/saqc/funcs/changepoints.py index c4d6db0955e555b5728b4c531487cf56f10125c7..544f07f756d830d4d6dad6ed43d1d21ed66d40fb 100644 --- a/saqc/funcs/changepoints.py +++ b/saqc/funcs/changepoints.py @@ -39,12 +39,10 @@ class ChangepointsMixin: **kwargs, ) -> "SaQC": """ - Flag data where it significantly changes. + Flag values that represent a system state transition. - Flag data points, where the parametrization of the process, the data is assumed to - generate by, significantly changes. - - The change points detection is based on a sliding window search. + Flag data points where the parametrization of the process assumed to generate the data + changes significantly. Parameters ---------- diff --git a/saqc/funcs/drift.py b/saqc/funcs/drift.py index 9e8cf172aa8591fa4ad0663ee19f9dcc53431b66..aaf9e410e6975da065a71786a608d77d270cdce0 100644 --- a/saqc/funcs/drift.py +++ b/saqc/funcs/drift.py @@ -571,7 +571,7 @@ class DriftMixin: **kwargs, ) -> "SaQC": """ - Flags anomalous regimes regarding to modelling regimes of field. + Flags anomalous regimes with respect to the modelling regimes of ``field``. "Normality" is determined in terms of a maximum spreading distance, regimes must not exceed in respect to a certain metric and linkage method. diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index a4125acab22ab247d8a25404cd37f5266c66215e..67237dd5666dafaa1268b46cf0ea4b3f9e105f73 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -105,14 +105,7 @@ class GenericMixin: """ Generate/process data with user defined functions. - Formally, what the function does, is the following: - - 1. Let F be a Callable, depending on fields f_1, f_2,...f_K, (F = F(f_1, f_2,...f_K)) - Than, for every timestamp t_i that occurs in at least one of the timeseries data[f_j] (outer join), - The value v_i is computed via: - v_i = data([f_1][t_i], data[f_2][t_i], ..., data[f_K][t_i]), if all data[f_j][t_i] do exist - v_i = ``np.nan``, if at least one of the data[f_j][t_i] is missing. - 2. The result is stored to ``data[target]``, if ``target`` is given or to ``data[field]`` otherwise + Call the given ``func`` on the variables given in ``field``. Parameters ---------- @@ -214,12 +207,9 @@ class GenericMixin: **kwargs, ) -> "SaQC": """ - Flag data with user defined functions. + Flag data based on a given function. - Formally, what the function does, is the following: - Let X be a Callable, depending on fields f_1, f_2,...f_K, (X = X(f_1, f_2,...f_K)) - Than for every timestamp t_i in data[field]: - data[field][t_i] is flagged if X(data[f_1][t_i], data[f_2][t_i], ..., data[f_K][t_i]) is True. + Evaluate ``func`` on all variables given in ``field``. Parameters ---------- @@ -227,21 +217,18 @@ class GenericMixin: The variable(s) passed to func. func : callable - Function to call on the variables given in ``field``. The function needs to accept the same - number of arguments (of type pandas.Series) as variables given in ``field`` and return an - iterable of array-like objects of with dtype bool and with the same number of elements as - given in ``target`` (or ``field`` if ``target`` is not specified). The function output - determines the values to flag. + Function to call. The function needs to accept the same number of arguments + (of type pandas.Series) as variables given in ``field`` and return an + iterable of array-like objects of data type ``bool`` with the same length as + ``target``. target: str or list of str The variable(s) to write the result of ``func`` to.
If not given, the variable(s) - specified in ``field`` will be overwritten. If a ``target`` is not given, it will be - created. + specified in ``field`` will be overwritten. Non-existing ``target``s will be created + as all-``NaN`` timeseries. flag: float, default ``BAD`` - The quality flag to set. The default ``BAD`` states the general idea, that - ``processGeneric`` generates 'new' data without direct relation to the potentially - already present flags. + Quality flag to set. dfilter: float, default ``FILTER_ALL`` Threshold flag. Flag values greater than ``dfilter`` indicate that the associated @@ -251,10 +238,6 @@ class GenericMixin: ------- saqc.SaQC - Note - ----- - All the numpy functions are available within the generic expressions. - Examples -------- diff --git a/saqc/funcs/noise.py b/saqc/funcs/noise.py index 8945f22332a763794d6beb7c4179ac34767df7dd..aeed5d91a84d09dc8d0239b001ecd5f784cd147d 100644 --- a/saqc/funcs/noise.py +++ b/saqc/funcs/noise.py @@ -36,11 +36,11 @@ class NoiseMixin: **kwargs, ) -> "SaQC": """ - Flag *chunks* of length, `window`: - 1. If they excexceed `thresh` with regard to `stat`: - 2. If all (maybe overlapping) *sub-chunks* of *chunk*, with length `sub_window`, - `excexceed `sub_thresh` with regard to `stat`: + Flag data chunks of length ``window``, if: + 1. they exceed ``thresh`` with regard to ``func`` and + 2. all (maybe overlapping) sub-chunks of the data chunks with length ``sub_window`` + exceed ``sub_thresh`` with regard to ``func``. Parameters ---------- @@ -48,21 +48,24 @@ class NoiseMixin: The fieldname of the column, holding the data-to-be-flagged. func: Callable[[np.array, pd.Series], float] - Function to aggregate chunk contnent with. + Aggregation function applied to every chunk. window: str - Temporal extension of the chunks to test + Window (i.e. chunk) size. thresh: float - Threshold, that triggers flagging, if exceeded by stat value. + Threshold. A given chunk is flagged if the return value of ``func`` exceeds ``thresh``. sub_window: str, default None, - Window size of the sub chunks, that are additionally tested for exceeding - `sub_thresh` with respect to `stat`. + Window size of the sub-chunks that are additionally tested for exceeding ``sub_thresh`` + with respect to ``func``. sub_thresh: float, default None + Threshold. A given sub-chunk is flagged if the return value of ``func`` exceeds ``sub_thresh``. min_periods: int, default None + Minimum number of values needed in a chunk to perform the test. + Ignored if ``window`` is an integer. flag : float, default BAD flag to set diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py index 7e0575e213f50cf1fcceb36a7c40abb3e5880382..1fc519f035502bbfa0bbed2dc623138260c85de3 100644 --- a/saqc/funcs/outliers.py +++ b/saqc/funcs/outliers.py @@ -122,8 +122,10 @@ class OutliersMixin: References ---------- - [1] Talagala, P. D., Hyndman, R. J., & Smith-Miles, K. (2019). Anomaly detection in - high dimensional data. arXiv preprint arXiv:1908.04000. + [1] Priyanga Dilini Talagala, Rob J. Hyndman & Kate Smith-Miles (2021): + Anomaly Detection in High-Dimensional Data, + Journal of Computational and Graphical Statistics, 30:2, 360-374, + DOI: 10.1080/10618600.2020.1807997 """ scores = self._data[field].dropna() @@ -280,10 +282,6 @@ class OutliersMixin: flag : float, default BAD flag to set. - Returns - ------- - saqc.SaQC - Notes ----- The basic steps are: @@ -321,6 +319,18 @@ class OutliersMixin: this gap, get flagged outliers. See description of the `threshing` parameter for more details.
Although [2] gives a fully detailed overview over the `stray` algorithm. + + Returns + ------- + saqc.SaQC + + References + ---------- + [1] Priyanga Dilini Talagala, Rob J. Hyndman & Kate Smith-Miles (2021): + Anomaly Detection in High-Dimensional Data, + Journal of Computational and Graphical Statistics, 30:2, 360-374, + DOI: 10.1080/10618600.2020.1807997 + """ fields = toSequence(field) @@ -571,11 +581,13 @@ class OutliersMixin: **kwargs, ) -> "SaQC": """ - The function represents an implementation of the modyfied Z-score outlier detection method. + Flag outliers using the modified Z-score outlier detection method. See references [1] for more details on the algorithm. - Note, that the test needs the input data to be sampled regularly (fixed sampling rate). + Note + ---- + Data needs to be sampled on a regular (equidistant) time grid. Parameters ---------- @@ -858,20 +870,19 @@ class OutliersMixin: **kwargs, ) -> "SaQC": """ - The function flags values that are regarded outliers due to the grubbs test. - - See reference [1] for more information on the grubbs tests definition. + Flag outliers using the Grubbs algorithm. - The (two-sided) test gets applied onto data chunks of size "window". The tests - application will be iterated on each data-chunk under test, till no more - outliers are detected in that chunk. + See [1] for more information on the definition of the Grubbs test. - Note, that the test performs poorely for small data chunks (resulting in heavy - overflagging). Therefor you should select "window" so that every window contains - at least > 8 values and also adjust the min_periods values accordingly. + The (two-sided) test gets applied to data chunks of size ``window``. The + test is iterated chunkwise until no more outliers are detected. - Note, that the data to be tested by the grubbs test are expected to be distributed - "normalish". + Note + ---- + * The test performs poorly for small data chunks, resulting in considerable + overflagging. Select ``window`` such that every data chunk contains at + least 8 values and also adjust the ``min_periods`` value accordingly. + * The data is expected to be normally distributed. Parameters ---------- @@ -879,25 +890,22 @@ class OutliersMixin: The fieldname of the column, holding the data-to-be-flagged. window : {int, str} - The size of the window you want to use for outlier testing. If an integer is - passed, the size refers to the number of periods of every testing window. If a - string is passed, it has to be an offset string, and will denote the total - temporal extension of every window. + Size of the testing window. + If an integer, the fixed number of observations used for each window. + If an offset string, the temporal extent of each window. alpha : float, default 0.05 - The level of significance, the grubbs test is to be performed at. (between 0 and 1) + Level of significance the Grubbs test is performed at. Must be between 0 and 1. min_periods : int, default 8 - The minimum number of values that have to be present in an interval under test, - for a grubbs test result to be accepted. Only makes sence in case `window` is - an offset string. + Minimum number of values needed in a ``window`` in order to perform the Grubbs test. + Ignored if ``window`` is an integer. pedantic: boolean, default False - If True, every value gets checked twice for being an outlier. Ones in the - initial rolling window and one more time in a rolling window that is lagged - by half the windows delimeter (window/2).
Recommended for avoiding false - positives at the window edges. Only available when rolling with integer - defined window size. + If ``True``, every value gets checked twice. First in the initial rolling ``window`` + and a second time in a rolling window that is lagged by ``window``/2. Recommended to avoid + false positives at the window edges. + Ignored if ``window`` is an offset string. flag : float, default BAD flag to set. diff --git a/saqc/funcs/resampling.py b/saqc/funcs/resampling.py index c7dcf511e5a25b0345ff65c4556993bee2d736ac..c27aa1f1318da55b54d4cdca7509b73794a8df5c 100644 --- a/saqc/funcs/resampling.py +++ b/saqc/funcs/resampling.py @@ -132,7 +132,7 @@ class ResamplingMixin: **kwargs, ) -> "SaQC": """ - Function to shift data and flags to a regular (equidistant) timestamp grid, according to ``method``. + Shift data points and flags to a regular frequency grid. Parameters ---------- @@ -140,24 +140,19 @@ class ResamplingMixin: The fieldname of the column, holding the data-to-be-shifted. freq : str - An frequency Offset String that will be interpreted as the sampling rate you want the data to be shifted to. + Offset string. Sampling rate of the target frequency grid. method : {'fshift', 'bshift', 'nshift'}, default 'nshift' - Specifies how misaligned data-points get propagated to a grid timestamp. - Following choices are available: + Method used to assign values to the grid timestamps: - * 'nshift' : every grid point gets assigned the nearest value in its range. (range = +/- 0.5 * `freq`) - * 'bshift' : every grid point gets assigned its first succeeding value, if one is available in - the succeeding sampling interval. - * 'fshift' : every grid point gets assigned its ultimately preceding value, if one is available in - the preceeding sampling interval. + * 'nshift' : every grid point gets assigned the nearest value within the range +/- 0.5 * ``freq`` + * 'bshift' : every grid point gets assigned its first succeeding value (if any) + * 'fshift' : every grid point gets assigned its last preceding value (if any) freq_check : {None, 'check', 'auto'}, default None - - * ``None`` : do not validate frequency-string passed to `freq` - * 'check' : estimate frequency and log a warning if estimate miss matches frequency string passed to `freq`, - or if no uniform sampling rate could be estimated - * 'auto' : estimate frequency and use estimate. (Ignores `freq` parameter.) + * ``None`` : do not validate the ``freq`` string. + * 'check' : check ``freq`` against a frequency estimation; produces a warning in case of a mismatch. + * 'auto' : estimate the frequency, ``freq`` is ignored. Returns ------- @@ -202,12 +197,12 @@ class ResamplingMixin: **kwargs, ) -> "SaQC": """ - Function to resample the data. + Resample data points and flags to a regular frequency. - The data will be sampled at regular (equidistant) timestamps aka. Grid points. + The data will be sampled at regular (equidistant) timestamps. Sampling intervals therefore get aggregated with a function, specified by - 'func' parameter and the result gets projected onto the new timestamps with a - method, specified by "method". The following method (keywords) are available: + ``func``, and the result is projected to the new timestamps using + ``method``. The following methods are available: * ``'nagg'``: all values in the range (+/- `freq`/2) of a grid point get aggregated with func and assigned to it. * ``'bagg'``: all values in a sampling interval get aggregated with func and the result gets assigned to the last grid point. * ``'fagg'``: all values in a sampling interval get aggregated with func and the result gets assigned to the next grid point. - Note, that.
if possible, functions passed to func will get projected - internally onto pandas.resample methods, wich results in some reasonable - performance boost - however, for this to work, you should pass functions that - have the __name__ attribute initialised and the according methods name assigned - to it. Furthermore, you shouldnt pass numpys nan-functions (``nansum``, - ``nanmean``,...) because those for example, have ``__name__ == 'nansum'`` and - they will thus not trigger ``resample.func()``, but the slower ``resample.apply( - nanfunc)``. Also, internally, no nans get passed to the functions anyway, - so that there is no point in passing the nan functions. + Note + ---- + For performance reasons, ``func`` will be mapped to pandas.resample methods, + if possible. However, for this to work, functions need an initialized + ``__name__`` attribute, holding the function's name. Furthermore, you should + not pass numpy's nan-functions (``nansum``, ``nanmean``,...) because they + cannot be optimised and the handling of ``NaN`` is already taken care of. Parameters ---------- @@ -233,19 +226,18 @@ class ResamplingMixin: The fieldname of the column, holding the data-to-be-resampled. freq : str - An Offset String, that will be interpreted as the frequency you want to - resample your data with. + Offset string. Sampling rate of the target frequency grid. func : Callable - The function you want to use for aggregation. + Aggregation function. See notes for performance considerations. method: {'fagg', 'bagg', 'nagg'}, default 'bagg' Specifies which intervals to be aggregated for a certain timestamp. (preceding, succeeding or "surrounding" interval). See description above for more details. maxna : {None, int}, default None - Maximum number NaNs in a resampling interval. If maxna is exceeded, the interval - is set entirely to NaN. + Maximum number of allowed ``NaN``s in a resampling interval. If exceeded, the + entire interval is filled with ``NaN``. maxna_group : {None, int}, default None Same as `maxna` but for consecutive NaNs. @@ -341,37 +333,14 @@ class ResamplingMixin: **kwargs, ) -> "SaQC": """ - The Function appends flags history of ``fields`` to flags history of ``target``. - Before appending, columns in ``field`` history are projected onto the target index via ``method`` - - method: (field_flag associated with "field", source_flags associated with "source") - - * 'inverse_nagg' - all target_flags within the range +/- freq/2 of a field_flag, get assigned this field flags value. - (if field_flag > target_flag) - - * 'inverse_bagg' - all target_flags succeeding a field_flag within the range of "freq", get assigned this field flags - value. (if field_flag > target_flag) - - * 'inverse_fagg' - all target_flags preceeding a field_flag within the range of "freq", get assigned this field flags - value. (if field_flag > target_flag) - - * 'inverse_interpolation' - all target_flags within the range +/- freq of a field_flag, get assigned this source flags value. - (if field_flag > target_flag) + Append the flags/history of ``field`` to ``target``. If necessary, the flags are + projected onto the ``target`` frequency grid. - * 'inverse_nshift' - That target_flag within the range +/- freq/2, that is nearest to a field_flag, gets the source - flags value. (if field_flag > target_flag) - - * 'inverse_bshift' - That target_flag succeeding a field flag within the range freq, that is nearest to a - field_flag, gets assigned this field flags value.
(if field_flag > target_flag) - - * 'inverse_nshift' - That target_flag preceeding a field flag within the range freq, that is nearest to a - field_flag, gets assigned this field flags value. (if field_flag > target_flag) - - * 'match' - any target_flag with a timestamp matching a field_flags timestamp gets this field_flags value - (if field_flag > target_flag) - - Note, to undo or backtrack a resampling/shifting/interpolation that has been performed with a certain method, - you can just pass the associated "inverse" method. Also you should pass the same ``drop`` keyword. + Note + ---- + To undo or backtrack resampling, shifting or interpolation operations, use the + associated inversion method (e.g. to undo a former interpolation, use + ``method="inverse_interpolation"``). Parameters ---------- @@ -382,22 +351,28 @@ class ResamplingMixin: Field name of flags history to append to. method : {'inverse_fagg', 'inverse_bagg', 'inverse_nagg', 'inverse_fshift', 'inverse_bshift', 'inverse_nshift', 'match'}, default 'match' - The method used for projection of ``field`` flags onto ``target`` flags. See description above for more details. + Method to project the flags of ``field`` onto the flags of ``target``: + + * 'inverse_nagg': project a flag of ``field`` to all timestamps of ``target`` within the range +/- ``freq``/2. + * 'inverse_bagg': project a flag of ``field`` to all succeeding timestamps of ``target`` within the range ``freq``. + * 'inverse_fagg': project a flag of ``field`` to all preceding timestamps of ``target`` within the range ``freq``. + * 'inverse_interpolation': project a flag of ``field`` to all timestamps of ``target`` within the range +/- ``freq``. + * 'inverse_nshift': project a flag of ``field`` to the nearest timestamp in ``target`` within the range +/- ``freq``/2. + * 'inverse_bshift': project a flag of ``field`` to the nearest succeeding timestamp in ``target``. + * 'inverse_fshift': project a flag of ``field`` to the nearest preceding timestamp in ``target``. + * 'match': project a flag of ``field`` to all identical timestamps in ``target``. freq : str or None, default None - The ``freq`` determines the projection range for the projection method. See above description for more details. - Defaultly (None), the sampling frequency of ``field`` is used. + Projection range. If ``None``, the sampling frequency of ``field`` is used. drop : bool, default False - If set to `True`, the `field` column will be removed after processing + Remove ``field`` if ``True``. squeeze : bool, default False - If set to `True`, the appended flags frame will be squeezed - resulting in function specific flags informations - getting lost. + Squeeze the history into a single column if ``True``. Function-specific flag information is lost. overwrite: bool, default False - If set to True, the newly appended flags will overwrite exsiting flags. This might result in a loss of previous - flagging information.
+ Overwrite existing flags if ``True``. Returns ------- diff --git a/saqc/funcs/transformation.py b/saqc/funcs/transformation.py index a55568ddb1b300cf438898513c07a11a73fdec3f..ee6d4292442795a24753bcb551ff2900e8ab3b22 100644 --- a/saqc/funcs/transformation.py +++ b/saqc/funcs/transformation.py @@ -23,14 +23,12 @@ class TransformationMixin: def transform( self: "SaQC", field: str, - func: Callable[[pd.Series], pd.Series], + func: Callable[[pd.Series | np.ndarray], pd.Series], freq: Optional[Union[float, str]] = None, **kwargs, ) -> "SaQC": """ - Function to transform data columns with a transformation that maps series onto series of the same length. - - Note, that flags get preserved. + Transform data by applying a custom function on data chunks of variable size. Existing flags are preserved. Parameters ---------- @@ -38,15 +36,14 @@ class TransformationMixin: The fieldname of the column, holding the data-to-be-transformed. func : Callable[{pd.Series, np.array}, np.array] - Function to transform data[field] with. + Transformation function. freq : {None, float, str}, default None - Determines the segmentation of the data into partitions, the transformation is applied on individually + Size of the data partitions. The transformation is applied to each partition individually. - * ``np.inf``: Apply transformation on whole data set at once - * ``x`` > 0 : Apply transformation on successive data chunks of periods length ``x`` - * Offset String : Apply transformation on successive partitions of temporal extension matching the passed offset - string + * ``None``: Apply transformation on the entire data set at once + * ``int`` : Apply transformation on successive data chunks of the given length. Must be greater than 0. + * Offset String : Apply transformation on successive data chunks of the given temporal extension. Returns --------
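As a quick orientation for the generic and transformation functions documented above, here is a minimal usage sketch. It assumes the usual ``saqc.SaQC`` method-chaining interface with the ``processGeneric``, ``flagGeneric`` and ``transform`` methods; the data set, the variable names, the threshold and the partition frequency are invented for illustration and are not part of this patch.

# Illustrative sketch only -- data, variable names and thresholds are made up;
# parameter names (field, target, func, freq) follow the docstrings above.
import numpy as np
import pandas as pd
import saqc

idx = pd.date_range("2021-01-01", periods=48, freq="30min")
data = pd.DataFrame(
    {
        "level": np.random.uniform(0.0, 2.0, len(idx)),
        "volume": np.random.uniform(10.0, 20.0, len(idx)),
    },
    index=idx,
)

qc = saqc.SaQC(data)

# processGeneric: call ``func`` on the variables given in ``field`` and write
# the result to ``target`` (created if it does not exist yet).
qc = qc.processGeneric(
    field=["level", "volume"], target="ratio", func=lambda x, y: x / y
)

# flagGeneric: evaluate ``func`` on ``field`` and flag wherever it returns True.
qc = qc.flagGeneric(field="ratio", func=lambda x: x > 0.15)

# transform: apply a custom function on successive daily partitions (freq="1D");
# existing flags are preserved.
qc = qc.transform(field="level", func=lambda x: (x - x.mean()) / x.std(), freq="1D")

Each call returns a new ``SaQC`` object (see the ``-> "SaQC"`` annotations in the hunks above), so the steps can be chained.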