diff --git a/requirements.txt b/requirements.txt
index 022c709f6739f70ce9256e128e79823a78c65888..39556855e9dc88520918b2d2a52450de55a99632 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,6 @@ Click==8.1.3
 docstring_parser==0.15
 dtw==1.4.0
 matplotlib==3.7.1
-numba==0.57.0
 numpy==1.24.3
 outlier-utils==0.0.3
 pyarrow==11.0.0
@@ -14,4 +13,4 @@ pandas==2.0.1
 scikit-learn==1.2.2
 scipy==1.10.1
 typing_extensions==4.5.0
-fancy-collections==0.2.1
+fancy-collections==0.2.1
\ No newline at end of file
diff --git a/saqc/funcs/changepoints.py b/saqc/funcs/changepoints.py
index b1db550fc0f07825c243796a666aee1e06286444..8037b4345369fe50b5665f336a32e203e51b311f 100644
--- a/saqc/funcs/changepoints.py
+++ b/saqc/funcs/changepoints.py
@@ -8,9 +8,8 @@
 from __future__ import annotations

 import typing
-from typing import TYPE_CHECKING, Callable, Tuple
+from typing import TYPE_CHECKING, Callable, Literal, Tuple

-import numba
 import numpy as np
 import pandas as pd

@@ -44,9 +43,11 @@ class ChangepointsMixin:

         Parameters
         ----------
         stat_func :
-            A function that assigns a value to every twin window. The backward-facing
-            window content will be passed as the first array, the forward-facing window
-            content as the second.
+            * If callable: A function that assigns a scalar value to every twin window.
+              The backward-facing window content will be passed as the first array, the
+              forward-facing window content as the second.
+            * If string: The respective statistic will be calculated for both windows and
+              the absolute difference of the results will be returned.
         thresh_func :
             A function that determines the value level, exceeding wich qualifies a
@@ -245,31 +246,8 @@ def _getChangePoints(
     check_len = len(fwd_end)
     data_arr = data.values

-    # Please keep this as I sometimes need to disable jitting manually
-    # to make it work with my debugger :/
-    # --palmb
-    try_to_jit = True
-    if try_to_jit:
-        jit_sf = numba.jit(stat_func, nopython=True)
-        jit_tf = numba.jit(thresh_func, nopython=True)
-        try:
-            jit_sf(
-                data_arr[bwd_start[0] : bwd_end[0]], data_arr[fwd_start[0] : fwd_end[0]]
-            )
-            jit_tf(
-                data_arr[bwd_start[0] : bwd_end[0]], data_arr[fwd_start[0] : fwd_end[0]]
-            )
-            stat_func = jit_sf
-            thresh_func = jit_tf
-        except (numba.TypingError, numba.UnsupportedError, IndexError):
-            try_to_jit = False
-
     args = data_arr, bwd_start, fwd_end, split, stat_func, thresh_func, check_len
-
-    if try_to_jit:
-        stat_arr, thresh_arr = _slidingWindowSearchNumba(*args)
-    else:
-        stat_arr, thresh_arr = _slidingWindowSearch(*args)
+    stat_arr, thresh_arr = _slidingWindowSearch(*args)

     result_arr = stat_arr > thresh_arr

@@ -324,20 +302,6 @@ def _getChangePoints(
     )


-@numba.jit(parallel=True, nopython=True)
-def _slidingWindowSearchNumba(
-    data_arr, bwd_start, fwd_end, split, stat_func, thresh_func, num_val
-):
-    stat_arr = np.zeros(num_val)
-    thresh_arr = np.zeros(num_val)
-    for win_i in numba.prange(0, num_val - 1):
-        x = data_arr[bwd_start[win_i] : split[win_i]]
-        y = data_arr[split[win_i] : fwd_end[win_i]]
-        stat_arr[win_i] = stat_func(x, y)
-        thresh_arr[win_i] = thresh_func(x, y)
-    return stat_arr, thresh_arr
-
-
 def _slidingWindowSearch(
     data_arr, bwd_start, fwd_end, split, stat_func, thresh_func, num_val
 ):
@@ -353,7 +317,7 @@ def _reduceCPCluster(stat_arr, thresh_arr, start, end, obj_func, num_val):
     out_arr = np.zeros(shape=num_val, dtype=bool)
-    for win_i in numba.prange(0, num_val):
+    for win_i in range(num_val):
         s, e = start[win_i], end[win_i]
         x = stat_arr[s:e]
         y = thresh_arr[s:e]
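For orientation (not part of the patch): the jitted kernel deleted above is replaced by the retained pure-Python `_slidingWindowSearch`, whose body is not shown in this diff. A minimal sketch of what such a kernel looks like, assuming it mirrors the removed `_slidingWindowSearchNumba` with a plain `range` loop (the `_sketch` suffix marks the function as hypothetical):

```python
import numpy as np

def _slidingWindowSearch_sketch(
    data_arr, bwd_start, fwd_end, split, stat_func, thresh_func, num_val
):
    # Evaluate stat_func / thresh_func on every backward/forward twin window;
    # the loop bound mirrors the removed prange(0, num_val - 1).
    stat_arr = np.zeros(num_val)
    thresh_arr = np.zeros(num_val)
    for win_i in range(num_val - 1):
        x = data_arr[bwd_start[win_i] : split[win_i]]  # backward-facing window
        y = data_arr[split[win_i] : fwd_end[win_i]]    # forward-facing window
        stat_arr[win_i] = stat_func(x, y)
        thresh_arr[win_i] = thresh_func(x, y)
    return stat_arr, thresh_arr
```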
diff --git a/saqc/funcs/curvefit.py b/saqc/funcs/curvefit.py
index c5382b6dc586212c0a1d540d60aadf89167b7a6c..f79250808ebcc3911300470b2c93c40665ca41d7 100644
--- a/saqc/funcs/curvefit.py
+++ b/saqc/funcs/curvefit.py
@@ -20,8 +20,6 @@ from saqc.lib.ts_operators import (
     polyRoller,
     polyRollerIrregular,
     polyRollerNoMissing,
-    polyRollerNoMissingNumba,
-    polyRollerNumba,
 )

 if TYPE_CHECKING:
@@ -176,17 +174,13 @@ def _fitPolynomial(
         fitted = to_fit.rolling(
             pd.Timedelta(window), closed="both", min_periods=min_periods, center=True
         ).apply(polyRollerIrregular, args=(centers, order))
-    else:
+    else:  # if regular
         if isinstance(window, str):
             window = pd.Timedelta(window) // regular
         if window % 2 == 0:
             window = int(window - 1)
         if min_periods is None:
             min_periods = window
-        if len(to_fit) < 200000:
-            numba = False
-        else:
-            numba = True

         val_range = np.arange(0, window)
         center_index = window // 2
@@ -202,43 +196,20 @@ def _fitPolynomial(
             miss_marker = np.floor(miss_marker - 1)
             na_mask = to_fit.isna()
             to_fit[na_mask] = miss_marker
-            if numba:
-                fitted = to_fit.rolling(window).apply(
-                    polyRollerNumba,
-                    args=(miss_marker, val_range, center_index, order),
-                    raw=True,
-                    engine="numba",
-                    engine_kwargs={"no_python": True},
-                )
-                # due to a tiny bug - rolling with center=True doesnt work
-                # when using numba engine.
-                fitted = fitted.shift(-int(center_index))
-            else:
-                fitted = to_fit.rolling(window, center=True).apply(
-                    polyRoller,
-                    args=(miss_marker, val_range, center_index, order),
-                    raw=True,
-                )
+
+            fitted = to_fit.rolling(window, center=True).apply(
+                polyRoller,
+                args=(miss_marker, val_range, center_index, order),
+                raw=True,
+            )
             fitted[na_mask] = np.nan
         else:
             # we only fit fully populated intervals:
-            if numba:
-                fitted = to_fit.rolling(window).apply(
-                    polyRollerNoMissingNumba,
-                    args=(val_range, center_index, order),
-                    engine="numba",
-                    engine_kwargs={"no_python": True},
-                    raw=True,
-                )
-                # due to a tiny bug - rolling with center=True doesnt work
-                # when using numba engine.
-                fitted = fitted.shift(-int(center_index))
-            else:
-                fitted = to_fit.rolling(window, center=True).apply(
-                    polyRollerNoMissing,
-                    args=(val_range, center_index, order),
-                    raw=True,
-                )
+            fitted = to_fit.rolling(window, center=True).apply(
+                polyRollerNoMissing,
+                args=(val_range, center_index, order),
+                raw=True,
+            )

         data[field] = fitted
         worst = flags[field].rolling(window, center=True, min_periods=min_periods).max()
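Context for the simplification above (not part of the patch): the removed numba branches used a trailing window plus a manual `shift(-center_index)` only because `center=True` did not work with the numba rolling engine. With the default engine a centered window gives the same alignment directly, which a small self-contained check illustrates (using `mean` as a stand-in for the polynomial roller):

```python
import numpy as np
import pandas as pd

s = pd.Series(np.arange(10, dtype=float))
w = 5  # odd window, as enforced by the code above

# trailing window, shifted back to the window centre ...
trailing = s.rolling(w).mean().shift(-(w // 2))
# ... matches a centred window computed directly
centered = s.rolling(w, center=True).mean()

assert trailing.equals(centered)
```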
diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py
index 9be06ce2d1fba8a1a051de380a525ffea14962e0..78ad2486bf58da4546dc490a111b96c1baea6f86 100644
--- a/saqc/funcs/outliers.py
+++ b/saqc/funcs/outliers.py
@@ -12,7 +12,6 @@ import uuid
 import warnings
 from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple

-import numba
 import numpy as np
 import numpy.polynomial.polynomial as poly
 import pandas as pd
@@ -742,14 +741,7 @@ class OutliersMixin:

         # get invalid-raise/drop mask:
         raise_series = dataseries.rolling(raise_window_td, min_periods=2, closed="both")
-        numba_boost = True
-        if numba_boost:
-            raise_check_boosted = numba.jit(raise_check, nopython=True)
-            raise_series = raise_series.apply(
-                raise_check_boosted, args=(thresh,), raw=True, engine="numba"
-            )
-        else:
-            raise_series = raise_series.apply(raise_check, args=(thresh,), raw=True)
+        raise_series = raise_series.apply(raise_check, args=(thresh,), raw=True)

         if raise_series.isna().all():
             return self
@@ -790,21 +782,10 @@ class OutliersMixin:
         weights_rolling_sum = weights.rolling(
             average_window, min_periods=2, closed="both"
         )
-        if numba_boost:
-            custom_rolling_mean_boosted = numba.jit(custom_rolling_mean, nopython=True)
-            weighted_rolling_mean = weighted_rolling_mean.apply(
-                custom_rolling_mean_boosted, raw=True, engine="numba"
-            )
-            weights_rolling_sum = weights_rolling_sum.apply(
-                custom_rolling_mean_boosted, raw=True, engine="numba"
-            )
-        else:
-            weighted_rolling_mean = weighted_rolling_mean.apply(
-                custom_rolling_mean, raw=True
-            )
-            weights_rolling_sum = weights_rolling_sum.apply(
-                custom_rolling_mean, raw=True, engine="numba"
-            )
+        weighted_rolling_mean = weighted_rolling_mean.apply(
+            custom_rolling_mean, raw=True
+        )
+        weights_rolling_sum = weights_rolling_sum.apply(custom_rolling_mean, raw=True)

         weighted_rolling_mean = weighted_rolling_mean / weights_rolling_sum
         # check means against critical raise value:
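The retained code path above is a plain `rolling(...).apply(..., raw=True)`, which hands each window to the callback as a NumPy array, with no numba engine involved. A minimal, self-contained illustration of that pattern (`window_span` is a hypothetical stand-in for `raise_check` / `custom_rolling_mean`, which are not shown in this diff):

```python
import numpy as np
import pandas as pd

def window_span(values: np.ndarray) -> float:
    # hypothetical stand-in for the real rolling callbacks
    return float(values.max() - values.min())

s = pd.Series([1.0, 3.0, 2.0, 8.0, 5.0])
print(s.rolling(3, min_periods=2).apply(window_span, raw=True))
```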
""" - current = 0 - idx = 0 - while idx < arr.size: - while idx < arr.size and arr[idx]: - current += 1 - idx += 1 - if current > max_consec: - return True - current = 0 - idx += 1 - return False + s = arr.shape[0] + if s <= max_consec: + return False + views = np.lib.stride_tricks.sliding_window_view( + arr, window_shape=min(s, max_consec + 1) + ) + return bool(views.all(axis=1).any()) def validationTrafo(data, max_nan_total, max_nan_consec): @@ -538,61 +532,6 @@ def butterFilter( return y -@nb.njit -def _coeffMat(x, deg): - # helper function to construct numba-compatible polynomial fit function - mat_ = np.zeros(shape=(x.shape[0], deg + 1)) - const = np.ones_like(x) - mat_[:, 0] = const - mat_[:, 1] = x - if deg > 1: - for n in range(2, deg + 1): - mat_[:, n] = x**n - return mat_ - - -@nb.jit(nopython=True) -def _fitX(a, b): - # helper function to construct numba-compatible polynomial fit function - # linalg solves ax = b - det_ = np.linalg.lstsq(a, b)[0] - return det_ - - -@nb.jit(nopython=True) -def _fitPoly(x, y, deg): - # a numba compatible polynomial fit function - a = _coeffMat(x, deg) - p = _fitX(a, y) - # Reverse order so p[0] is coefficient of highest order - return p[::-1] - - -@nb.jit(nopython=True) -def evalPolynomial(P, x): - # a numba compatible polynomial evaluator - result = 0 - for coeff in P: - result = x * result + coeff - return result - - -def polyRollerNumba(in_slice, miss_marker, val_range, center_index, poly_deg): - # numba compatible function to roll with when modelling data with polynomial model - miss_mask = in_slice == miss_marker - x_data = val_range[~miss_mask] - y_data = in_slice[~miss_mask] - fitted = _fitPoly(x_data, y_data, deg=poly_deg) - return evalPolynomial(fitted, center_index) - - -def polyRollerNoMissingNumba(in_slice, val_range, center_index, poly_deg): - # numba compatible function to roll with when modelling data with polynomial model - - # it is assumed, that in slice is an equidistant sample - fitted = _fitPoly(val_range, in_slice, deg=poly_deg) - return evalPolynomial(fitted, center_index) - - def polyRoller(in_slice, miss_marker, val_range, center_index, poly_deg): # function to roll with when modelling data with polynomial model miss_mask = in_slice == miss_marker diff --git a/tests/funcs/test_outlier_detection.py b/tests/funcs/test_outlier_detection.py index 0541525865c734382a3abddd27a481e0259c4543..ad75d71d768f335331d0ddefb7b1e4df7899906d 100644 --- a/tests/funcs/test_outlier_detection.py +++ b/tests/funcs/test_outlier_detection.py @@ -72,7 +72,6 @@ def test_flagSpikesLimitRaise(dat): thresh=2, freq="10min", raise_window="20min", - numba_boost=False, flag=BAD, ) assert np.all(qc.flags[field][characteristics["raise"]] > UNFLAGGED) @@ -221,10 +220,7 @@ def test_flagUniLOF(spiky_data, n, p, thresh): qc = SaQC(data).flagUniLOF(field, n=n, p=p, thresh=thresh) flag_result = qc.flags[field] test_sum = (flag_result[spiky_data[1]] == BAD).sum() - try: - assert test_sum == len(spiky_data[1]) - except AssertionError: - print("stop") + assert test_sum == len(spiky_data[1]) @pytest.mark.parametrize("vars", [1, 2, 3]) diff --git a/tests/lib/test_ts_operators.py b/tests/lib/test_ts_operators.py index 96fed103586c828ca5ac4003d0d6dc2fd349aa76..875e78a80bb67e0e7773bdae349ad8762471282e 100644 --- a/tests/lib/test_ts_operators.py +++ b/tests/lib/test_ts_operators.py @@ -45,9 +45,12 @@ F = False (np.array([F, T, T, F, T, T, F]), 2, False), ], ) -def test__exceedConsecutiveNanLimit(arr, maxc, expected): +def test_exceedConsecutiveNanLimit(arr, maxc, 
diff --git a/tests/funcs/test_outlier_detection.py b/tests/funcs/test_outlier_detection.py
index 0541525865c734382a3abddd27a481e0259c4543..ad75d71d768f335331d0ddefb7b1e4df7899906d 100644
--- a/tests/funcs/test_outlier_detection.py
+++ b/tests/funcs/test_outlier_detection.py
@@ -72,7 +72,6 @@ def test_flagSpikesLimitRaise(dat):
         thresh=2,
         freq="10min",
         raise_window="20min",
-        numba_boost=False,
         flag=BAD,
     )
     assert np.all(qc.flags[field][characteristics["raise"]] > UNFLAGGED)
@@ -221,10 +220,7 @@ def test_flagUniLOF(spiky_data, n, p, thresh):
     qc = SaQC(data).flagUniLOF(field, n=n, p=p, thresh=thresh)
     flag_result = qc.flags[field]
     test_sum = (flag_result[spiky_data[1]] == BAD).sum()
-    try:
-        assert test_sum == len(spiky_data[1])
-    except AssertionError:
-        print("stop")
+    assert test_sum == len(spiky_data[1])


 @pytest.mark.parametrize("vars", [1, 2, 3])
diff --git a/tests/lib/test_ts_operators.py b/tests/lib/test_ts_operators.py
index 96fed103586c828ca5ac4003d0d6dc2fd349aa76..875e78a80bb67e0e7773bdae349ad8762471282e 100644
--- a/tests/lib/test_ts_operators.py
+++ b/tests/lib/test_ts_operators.py
@@ -45,9 +45,9 @@ F = False
         (np.array([F, T, T, F, T, T, F]), 2, False),
     ],
 )
-def test__exceedConsecutiveNanLimit(arr, maxc, expected):
+def test_exceedConsecutiveNanLimit(arr, maxc, expected):
     result = tsops._exceedConsecutiveNanLimit(arr, maxc)
     assert result is expected


 def dtSeries(data, freq="1d"):