diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py index e0cc62ef8258c0124288b9b3274aacd14108b031..3cd681221b37cfa17eb580128becd98a1fc774f2 100644 --- a/saqc/funcs/functions.py +++ b/saqc/funcs/functions.py @@ -266,12 +266,14 @@ def flagPattern(data, field, flagger, reference_field, method='dtw', partition_f waveform='mexh', **kwargs): """ Implementation of two pattern recognition algorithms: - - Dynamic Time Warping (dtw) [1] - - Pattern recognition via wavelets [2] + + * Dynamic Time Warping (dtw) [1] + * Pattern recognition via wavelets [2] The steps are: + 1. Get the frequency of partitions, in which the time series has to be divided (for example: a pattern occurs daily, - or every hour) + or every hour) 2. Compare each partition with the given pattern 3. Check if the compared partition contains the pattern or not 4. Flag partition if it contains the pattern @@ -290,7 +292,7 @@ def flagPattern(data, field, flagger, reference_field, method='dtw', partition_f Pattern Recognition method to be used. partition_freq : str, default 'days' Frequency, in which the pattern occurs. - Has to be an offset string or one out of {"days", "months"}. If 'days' or 'months' is passed, + Has to be an offset string or one out of {``'days'``, ``'months'``}. If ``'days'`` or ``'months'`` is passed, then precise length of partition is calculated from pattern length. partition_offset : str, default '0' If partition frequency is given by an offset string and the pattern starts after a timely offset, this offset @@ -308,10 +310,10 @@ def flagPattern(data, field, flagger, reference_field, method='dtw', partition_f Weather or not, the ending of the probe and of the pattern have to be projected onto each other in the search for the optimal dtw-mapping. Recommendation of [1]. widths : tuple[int], default (1,2,4,8) - Only effective if method = 'wavelets'. + Only effective if `method` = ``'wavelets'``. Widths for wavelet decomposition. [2] recommends a dyadic scale. waveform: str, default 'mexh' - Only effective if method = 'wavelets'. + Only effective if `method` = ``'wavelets'``. Wavelet to be used for decomposition. Default: 'mexh' Returns ------- @@ -326,7 +328,7 @@ def flagPattern(data, field, flagger, reference_field, method='dtw', partition_f ---------- [1] https://cran.r-project.org/web/packages/dtw/dtw.pdf [2] Maharaj, E.A. (2002): Pattern Recognition of Time Series using Wavelets. In: Härdle W., Rönz B. (eds) Compstat. - Physica, Heidelberg, 978-3-7908-1517-7. + Physica, Heidelberg, 978-3-7908-1517-7. """ test = data[field].copy() @@ -695,11 +697,12 @@ def flagManual(data, field, flagger, mdata, mflag: Any = 1, method="plain", **kw method : {'plain', 'ontime', 'left-open', 'right-open'}, default plain Defines how mdata is projected on data. Except for the 'plain' method, the methods assume mdata to have an index. + * 'plain': mdata must have the same length as data and is projected one-to-one on data. * 'ontime': works only with indexed mdata. mdata entries are matched with data entries that have the same index. * 'right-open': mdata defines intervals, values are to be projected on. - The intervals are defined by any two consecutive timestamps t_1 and 1_2 in mdata. - the value at t_1 gets projected onto all data timestamps t with t_1 <= t < t_2. + The intervals are defined by any two consecutive timestamps t_1 and t_2 in mdata. + The value at t_1 gets projected onto all data timestamps t with t_1 <= t < t_2 (see the sketch below). * 'left-open': like 'right-open', but the projected interval now covers all t with t_1 < t <= t_2.
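A minimal pandas sketch (not SaQC's implementation; series names and values are made up) of the ``'right-open'`` interval semantics described above: every mdata value at t_1 is spread onto all data timestamps t with t_1 <= t < t_2, which a plain forward-fill reindex reproduces:

import pandas as pd

# toy stand-ins for data[field] and mdata
data = pd.Series(range(6), index=pd.date_range("2021-01-01", periods=6, freq="10min"))
mdata = pd.Series(
    [1, 0, 1],
    index=pd.to_datetime(["2021-01-01 00:00", "2021-01-01 00:20", "2021-01-01 00:50"]),
)

# forward-filling mdata onto the data index mimics the right-open projection:
# the value set at t_1 holds for every t with t_1 <= t < t_2
projected = mdata.reindex(data.index, method="ffill")
to_flag = projected == 1  # timestamps that would receive the manual flag
print(to_flag)

The ``'left-open'`` variant only moves the covered interval to t_1 < t <= t_2.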
Returns @@ -802,14 +805,16 @@ def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat='modZscore """ Function checks for outliers relatively to the "horizontal" input data axis. - For fields=[f_1,f_2,...,f_N] and timestamps [t_1,t_2,...,t_K], the following steps are taken for outlier detection: + For `fields` :math:`=[f_1,f_2,...,f_N]` and timestamps :math:`[t_1,t_2,...,t_K]`, the following steps are taken + for outlier detection: - 1. All timestamps t_i, where there is one f_k, with data[f_K] having no entry at t_i, are excluded from the - following process (inner join of the f_i fields.) - 2. for every 0 <= i <= K, the value m_j = median({data[f_1][t_i], data[f_2][t_i], ..., data[f_N][t_i]}) is - calculated - 2. for every 0 <= i <= K, the set {data[f_1][t_i] - m_j, data[f_2][t_i] - m_j, ..., data[f_N][t_i] - m_j} is tested - for outliers with the specified method (`cross_stat` parameter) + 1. All timestamps :math:`t_i`, where there is one :math:`f_k`, with :math:`data[f_k]` having no entry at + :math:`t_i`, are excluded from the following process (inner join of the :math:`f_i` fields). + 2. For every :math:`1 <= i <= K`, the value + :math:`m_i = median(\\{data[f_1][t_i], data[f_2][t_i], ..., data[f_N][t_i]\\})` is calculated + 3. For every :math:`1 <= i <= K`, the set + :math:`\\{data[f_1][t_i] - m_i, data[f_2][t_i] - m_i, ..., data[f_N][t_i] - m_i\\}` is tested for outliers with the + specified method (`cross_stat` parameter). Parameters ---------- @@ -825,9 +830,10 @@ def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat='modZscore Threshold which the outlier score of an value must exceed, for being flagged an outlier. cross_stat : {'modZscore', 'Zscore'}, default 'modZscore' Method used for calculating the outlier scores. - * 'modZscore': Median based "sigma"-ish approach. See Referenecs [1]. - * 'Zscore': Score values by how many times the standard deviation they differ from the median. - See References [1] + + * ``'modZscore'``: Median based "sigma"-ish approach. See References [1]. + * ``'Zscore'``: Score values by how many times the standard deviation they differ from the median. + See References [1] Returns ------- @@ -925,13 +931,15 @@ def flagDriftFromNorm(data, field, flagger, fields, segment_freq, norm_spread, n following steps are performed for every data "segment" of length `segment_freq` in order to find the "abnormal" data: - 1. Calculate the distances d(x_i,x_j) for all x_i in parameter `fields` and "d" denoting the distance function - passed to the parameter `metric`. - 2. Calculate a dendogram with a hierarchical linkage algorithm, specified by the parameter `linkage_method` + 1. Calculate the distances :math:`d(x_i,x_j)` for all :math:`x_i` in parameter `fields` (with :math:`d` + denoting the distance function + passed to the parameter `metric`). + 2. Calculate a dendrogram with a hierarchical linkage algorithm, specified by the parameter `linkage_method`. 3. Flatten the dendogram at the level, the agglomeration costs exceed the value given by the parameter `norm_spread` 4. check if there is a cluster containing more than `norm_frac` percentage of the variables in fields. - if yes: flag all the variables that are not in that cluster (inside the segment) - if no: flag nothing + + 1. if yes: flag all the variables that are not in that cluster (inside the segment) + 2.
if no: flag nothing The main parameter giving control over the algorithms behavior is the `norm_spread` parameter, that determines the maximum spread of a normal group by limiting the costs, a cluster agglomeration must not exceed in every diff --git a/saqc/funcs/harm_functions.py b/saqc/funcs/harm_functions.py index 4039a16444f79a52f04c74480ef737c748892cd2..49762412c2e0473d0604b59209739cd7641f0ce5 100644 --- a/saqc/funcs/harm_functions.py +++ b/saqc/funcs/harm_functions.py @@ -28,30 +28,30 @@ def harm_shift2Grid(data, field, flagger, freq, method="nshift", to_drop=None, * Method keywords: - 'nshift' - every grid point gets assigned the nearest value in its range ( range = +/-(freq/2) ) - 'bshift' - every grid point gets assigned its first succeeding value - if there is one available in the - succeeding sampling interval. (equals resampling wih "first") - 'fshift' - every grid point gets assigned its ultimately preceeding value - if there is one available in - the preceeding sampling interval. (equals resampling with "last") + * ``'nshift'``: every grid point gets assigned the nearest value in its range (*range = +/-(freq/2)*) + * ``'bshift'``: every grid point gets assigned its first succeeding value - if there is one available in the + succeeding sampling interval. + * ``'fshift'``: every grid point gets assigned its ultimately preceding value - if there is one available in + the preceding sampling interval. Note: the flags associated with every datapoint will just get shifted with them. - Note: if there is no valid data (exisiing and not-na) available in a sampling interval assigned to a regular + Note: if there is no valid data (existing and not-na) available in a sampling interval assigned to a regular timestamp by the selected method, nan gets assigned to this timestamp. The associated flag will be of value - flagger.UNFLAGGED. + ``flagger.UNFLAGGED``. - Note: all data nans get excluded defaultly from shifting. If to_drop is None - all BAD flagged values get + Note: all data nans get excluded from shifting by default. If to_drop is None - all *BAD* flagged values get excluded as well. - Note: the method will likely and significantly alter values and shape of data[field]. The original data is kept - in the data dios and assigned to the fieldname field + "_original". + Note: the method will likely and significantly alter values and shape of ``data[field]``. The original data is kept + in the data dios and assigned to the fieldname ``field + '_original'``. Parameters ---------- data : dios.DictOfSeries A dictionary of pandas.Series, holding all the data. field : str - The fieldname of the column, holding the data-to-be-regularized. + The field name of the column, holding the data-to-be-regularized. flagger : saqc.flagger.BaseFlagger A flagger object, holding flags and additional Informations related to `data`.freq freq : str @@ -60,14 +60,9 @@ def harm_shift2Grid(data, field, flagger, freq, method="nshift", to_drop=None, * Specifies if datapoints get propagated forwards, backwards or to the nearest grid timestamp. See description above for details to_drop : {List[str], str}, default None - Flagtypes you want to drop before shifting - effectively excluding values that are flagged + Flag types you want to drop before shifting - effectively excluding values that are flagged with a flag in to_drop from the shifting process. Default - results in flagger.BAD values being dropped initially.
- freq_check : {None, 'check', 'auto'}, default None - - None: do not validate frequency-string passed to `freq` - - 'check': estimate frequency and log a warning if estimate miss matchs frequency string passed to 'freq', or - if no uniform sampling rate could be estimated - - 'auto': estimate frequency and use estimate. (Ignores `freq` parameter.) Returns ------- @@ -100,23 +95,19 @@ def harm_aggregate2Grid( The following method (keywords) are available: - 'nagg' (aggreagtion to nearest) - all values in the range (+/- freq/2) of a grid point get aggregated with agg_func - and assigned to it. - Flags get aggregated by `flag_func` and assigned the same way. - 'bagg' (backwards aggregation) - all values in a sampling interval get aggregated with agg_func and the result gets - assigned to the last regular timestamp. - Flags get aggregated by flag_func and assigned the same way. - 'fagg' (forward aggregation) - all values in a sampling interval get aggregated with agg_func and the result gets - assigned to the next regular timestamp. - Flags get aggregated by flag_func and assigned the same way. - + * ``'nagg'``: (aggregation to nearest) - all values in the range (+/- freq/2) of a grid point get aggregated with + `agg_func` and assigned to it. Flags get aggregated by `flag_func` and assigned the same way. + * ``'bagg'``: (backwards aggregation) - all values in a sampling interval get aggregated with agg_func and the + result gets assigned to the last regular timestamp. Flags get aggregated by `flag_func` and assigned the same way. + * ``'fagg'``: (forward aggregation) - all values in a sampling interval get aggregated with agg_func and the result + gets assigned to the next regular timestamp. Flags get aggregated by `flag_func` and assigned the same way. Note, that, if there is no valid data (exisitng and not-na) available in a sampling interval assigned to a regular timestamp by the selected method, nan gets assigned to this timestamp. The associated flag will be of value - flagger.UNFLAGGED. + ``flagger.UNFLAGGED``. - Note: the method will likely and significantly alter values and shape of data[field]. The original data is kept - in the data dios and assigned to the fieldname field + "_original". + Note: the method will likely and significantly alter values and shape of ``data[field]``. The original data is kept + in the data dios and assigned to the fieldname ``field + '_original'``. Parameters ---------- @@ -177,13 +168,13 @@ def harm_linear2Grid(data, field, flagger, freq, to_drop=None, **kwargs): Interpolated values will get assigned the worst flag within freq-range. - Note: the method will likely and significantly alter values and shape of data[field]. The original data is kept - in the data dios and assigned to the fieldname field + "_original". + Note: the method will likely and significantly alter values and shape of ``data[field]``. The original data is kept + in the data dios and assigned to the fieldname ``field + '_original'``. Note, that the data only gets interpolated at those (regular) timestamps, that have a valid (existing and not-na) datapoint preceeding them and one succeeding them within freq range. Regular timestamp that do not suffice this condition get nan assigned AND The associated flag will be of value - flagger.UNFLAGGED. + ``flagger.UNFLAGGED``.
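To make the note above concrete, a rough sketch (not the SaQC internals; values are invented) of time-aware linear interpolation onto a regular grid. With ``limit_area='inside'``, grid points that lack a valid observation on both sides stay nan, mirroring the nan/UNFLAGGED behaviour described:

import pandas as pd

s = pd.Series(
    [0.0, 2.0, 4.0],
    index=pd.to_datetime(["2021-01-01 00:03", "2021-01-01 00:12", "2021-01-01 00:21"]),
)
grid = pd.date_range("2021-01-01 00:00", "2021-01-01 00:30", freq="10min")

# insert the regular timestamps, interpolate time-aware, keep only the grid points
combined = s.reindex(s.index.union(grid))
regularized = combined.interpolate(method="time", limit_area="inside").reindex(grid)
print(regularized)  # 00:00 and 00:30 stay nan: no valid neighbour on both sides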
Parameters ---------- @@ -229,15 +220,16 @@ def harm_interpolate2Grid(data, field, flagger, freq, method, order=1, to_drop=N There are available all the interpolations from the pandas.Series.interpolate method and they are called by the very same keywords. - Note, that, to perform a timestamp aware, linear interpolation, you have to pass 'time' as method, and NOT 'linear'. + Note, that, to perform a timestamp aware, linear interpolation, you have to pass ``'time'`` as `method`, + and NOT ``'linear'``. - Note: the method will likely and significantly alter values and shape of data[field]. The original data is kept - in the data dios and assigned to the fieldname field + "_original". + Note: the `method` will likely and significantly alter values and shape of ``data[field]``. The original data is + kept in the data dios and assigned to the fieldname ``field + '_original'``. Note, that the data only gets interpolated at those (regular) timestamps, that have a valid (existing and not-na) datapoint preceeding them and one succeeding them within freq range. Regular timestamp that do not suffice this condition get nan assigned AND The associated flag will be of value - flagger.UNFLAGGED. + ``flagger.UNFLAGGED``. Parameters ---------- @@ -253,11 +245,11 @@ def harm_interpolate2Grid(data, field, flagger, freq, method, order=1, to_drop=N "polynomial", "krogh", "piecewise_polynomial", "spline", "pchip", "akima"}: string The interpolation method you want to apply. order : int, default 1 - If your selected interpolation method can be performed at different 'orders' - here you pass the desired + If your selected interpolation method can be performed at different *orders* - here you pass the desired order. to_drop : {List[str], str}, default None Flagtypes you want to drop before interpolation - effectively excluding values that are flagged - with a flag in to_drop from the interpolation process. Default results in flagger.BAD + with a flag in `to_drop` from the interpolation process. Default results in ``flagger.BAD`` values being dropped initially. Returns @@ -291,16 +283,16 @@ def harm_deharmonize(data, field, flagger, method, to_drop=None, **kwargs): The Function function "undoes" regularization, by regaining the original data and projecting the flags calculated for the regularized data onto the original ones. - Afterwards the regularized data is removed from the data dios and 'field' will be associated - to the original data "again". + Afterwards the regularized data is removed from the data dios and ``'field'`` will be associated + with the original data "again". Wherever the flags in the original data are "better" then the regularized flags projected on them, they get overridden with this regularized flags value. Which regularized flags are to be projected on which original flags, is controlled by the "method" parameters. - Generally, if you regularized with the method 'X', you should pass the method 'inverse_X' to the deharmonization. - If you regularized with an interpolation, the method 'inverse_interpolation' would be the appropriate choice. + Generally, if you regularized with the method "X", you should pass the method "inverse_X" to the deharmonization. + If you regularized with an interpolation, the method "inverse_interpolation" would be the appropriate choice. Also you should pass the same drop flags keyword. 
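A small illustration (made-up values) of why the harm_interpolate2Grid note above insists on passing ``'time'`` and not ``'linear'``: on an irregularly sampled datetime index, ``'linear'`` ignores the actual time gaps, while ``'time'`` weights by them:

import numpy as np
import pandas as pd

s = pd.Series(
    [0.0, np.nan, 10.0],
    index=pd.to_datetime(["2021-01-01 00:00", "2021-01-01 00:01", "2021-01-01 00:10"]),
)

print(s.interpolate(method="linear"))  # fills 5.0 - treats the points as equidistant
print(s.interpolate(method="time"))    # fills 1.0 - one of ten minutes elapsed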
The deharm methods in detail: @@ -308,26 +300,26 @@ def harm_deharmonize(data, field, flagger, method, to_drop=None, **kwargs): "regularized_flags" are associated with the regularized data that is to be "deharmonized", "freq" refers to the regularized datas sampling frequencie) - 'inverse_nagg' - all original_flags within the range +/- freq/2 of a regularized_flag, get assigned this - regularized flags value. (if regularized_flags > original_flag) - 'inverse_bagg' - all original_flags succeeding a regularized_flag within the range of "freq", get assigned this - regularized flags value. (if regularized_flag > original_flag) - 'inverse_fagg' - all original_flags preceeding a regularized_flag within the range of "freq", get assigned this - regularized flags value. (if regularized_flag > original_flag) + * ``'inverse_nagg'``: all original_flags within the range *+/- freq/2* of a regularized_flag, get assigned this + regularized flags value. (if regularized_flag > original_flag) + * ``'inverse_bagg'``: all original_flags succeeding a regularized_flag within the range of "freq", get assigned this + regularized flags value. (if regularized_flag > original_flag) + * ``'inverse_fagg'``: all original_flags preceeding a regularized_flag within the range of "freq", get assigned this + regularized flags value. (if regularized_flag > original_flag) - 'inverse_interpolation' - all original_flags within the range +/- freq of a regularized_flag, get assigned this - regularized flags value (if regularized_flag > original_flag). + * ``'inverse_interpolation'``: all original_flags within the range *+/- freq* of a regularized_flag, get assigned this + regularized flags value (if regularized_flag > original_flag). - 'inverse_nshift' - That original_flag within the range +/- freq/2, that is nearest to a regularized_flag, gets the - regularized flags value. (if regularized_flag > original_flag) - 'inverse_bshift' - That original_flag succeeding a source flag within the range freq, that is nearest to a - regularized_flag, gets assigned this regularized flags value. (if regularized_flag > original_flag) - 'inverse_nshift' - That original_flag preceeding a regularized flag within the range freq, that is nearest to a - regularized_flag, gets assigned this regularized flags value. (if source_flag > original_flag) + * ``'inverse_nshift'``: That original_flag within the range +/- *freq/2*, that is nearest to a regularized_flag, + gets the regularized flags value. (if regularized_flag > original_flag) + * ``'inverse_bshift'``: That original_flag succeeding a regularized_flag within the range freq, that is nearest to a + regularized_flag, gets assigned this regularized flags value. (if regularized_flag > original_flag) + * ``'inverse_fshift'``: That original_flag preceding a regularized_flag within the range freq, that is nearest to a + regularized_flag, gets assigned this regularized flags value. (if regularized_flag > original_flag) Parameters ---------- - data : dios.DictOfSeries + data : dios.DictOfSeries A dictionary of pandas.Series, holding all the data. field : str The fieldname of the column, holding the data-to-be-deharmonized. flagger : saqc.flagger.BaseFlagger A flagger object, holding flags and additional Informations related to `data`.freq method : {'inverse_fagg', 'inverse_bagg', 'inverse_nagg', 'inverse_fshift', 'inverse_bshift', 'inverse_nshift', 'inverse_interpolation'} - The method used for projection of regularized flags onto opriginal flags.
See description above for more + The method used for projection of regularized flags onto original flags. See description above for more details. to_drop : {List[str], str}, default None Flagtypes you want to drop before interpolation - effectively excluding values that are flagged diff --git a/saqc/funcs/proc_functions.py b/saqc/funcs/proc_functions.py index e4bc5a262e6c0c47ed8918f5e26417f8a2405bb2..be64c1a92e6b472366a017d3aa337edb0876e2a0 100644 --- a/saqc/funcs/proc_functions.py +++ b/saqc/funcs/proc_functions.py @@ -113,7 +113,7 @@ def proc_interpolateMissing( Function to interpolate nan values in the data. There are available all the interpolation methods from the pandas.interpolate method and they are applicable by - the very same key words, that you would pass to pd.Series.interpolates's method parameter. + the very same key words, that you would pass to the ``pd.Series.interpolate``'s method parameter. Note, that the `inter_limit` keyword really restricts the interpolation to chunks, not containing more than `inter_limit` successive nan entries. @@ -141,8 +141,8 @@ def proc_interpolateMissing( Flag that is to be inserted for the interpolated values. You can either pass one of the three major flag-classes or specify directly a certain flag from the passed flagger. downgrade_interpolation : bool, default False - If interpolation can not be performed at 'inter_order' - (not enough values or not implemented at this order) - - automaticalyy try to interpolate at order 'inter_order' - 1. + If interpolation can not be performed at `inter_order` - (not enough values or not implemented at this order) - + automaticalyy try to interpolate at order `inter_order` :math:`- 1`. not_interpol_flags : {None, str, List[str]}, default None A list of flags or a single Flag, marking values, you want NOT to be interpolated. @@ -204,7 +204,7 @@ def proc_interpolateGrid( """ Function to interpolate the data at regular (equidistant) timestamps (or Grid points). - Note, that the interpolation will only be calculated, for grid timestamps that have a preceeding AND a succeeding + Note, that the interpolation will only be calculated, for grid timestamps that have a preceding AND a succeeding valid data value within "freq" range. Note, that the function differs from proc_interpolateMissing, by returning a whole new data set, only containing @@ -233,26 +233,28 @@ def proc_interpolateGrid( If there your selected interpolation method can be performed at different 'orders' - here you pass the desired order. to_drop : {None, str, List[str]}, default None - Flags that refer to values you want to drop before interpotion - effectively excluding grid points from + Flags that refer to values you want to drop before interpolation - effectively excluding grid points from interpolation, that are only surrounded by values having a flag in them, that is listed in drop flags. Default - results in the flaggers 'BAD' flag to be the drop_flag. + results in the flaggers *BAD* flag to be the drop_flag. downgrade_interpolation : bool, default False - If interpolation can not be performed at 'inter_order' - (not enough values or not implemented at this order) - - automaticalyy try to interpolate at order 'inter_order' - 1. + If interpolation can not be performed at `inter_order` - (not enough values or not implemented at this order) - + automatically try to interpolate at order `inter_order` :math:`- 1`. 
empty_intervals_flag : str, default None - A Flag, that you want to assign to those values resulting equidistant sample grid, that were not surrounded by - valid (flagged) data in the original dataset and thus werent interpolated. Default automatically assigns - flagger.BAD flag to those values. + A Flag, that you want to assign to those values in the resulting equidistant sample grid, that were not + surrounded by valid data in the original dataset, and thus were not interpolated. Default automatically assigns + ``flagger.BAD`` flag to those values. grid_field : String, default None - Use the timestamp of another variable as (not nessecarily regular) "grid" to be interpolated. + Use the timestamp of another variable as (not necessarily regular) "grid" to be interpolated. inter_limit : Integer, default 2 Maximum number of consecutive Grid values allowed for interpolation. If set - to "n", in the result, chunks of "n" consecutive grid values wont be interpolated. + to *n*, chunks of *n* and more consecutive grid values, where there is no value in between, wont be + interpolated. freq_check : {None, 'check', 'auto'}, default None - - None: do not validate frequency-string passed to `freq` - - 'check': estimate frequency and log a warning if estimate miss matchs frequency string passed to 'freq', or - if no uniform sampling rate could be estimated - - 'auto': estimate frequency and use estimate. (Ignores `freq` parameter.) + + * ``None``: do not validate frequency-string passed to `freq` + * ``'check'``: estimate frequency and log a warning if estimate miss matchs frequency string passed to 'freq', or + if no uniform sampling rate could be estimated + * ``'auto'``: estimate frequency and use estimate. (Ignores `freq` parameter.) Returns ------- @@ -405,20 +407,20 @@ def proc_resample( the result gets projected onto the new timestamps with a method, specified by "method". The following method (keywords) are available: - 'nagg' - all values in the range (+/- freq/2) of a grid point get aggregated with agg_func and assigned to it. - 'bagg' - all values in a sampling interval get aggregated with agg_func and the result gets assigned to the last - grid point. - 'fagg' - all values in a sampling interval get aggregated with agg_func and the result gets assigned to the next - grid point. + * ``'nagg'``: all values in the range (+/- `freq`/2) of a grid point get aggregated with agg_func and assigned to it. + * ``'bagg'``: all values in a sampling interval get aggregated with agg_func and the result gets assigned to the last + grid point. + * ``'fagg'``: all values in a sampling interval get aggregated with agg_func and the result gets assigned to the next + grid point. Note, that. if possible, functions passed to agg_func will get projected internally onto pandas.resample methods, wich results in some reasonable performance boost - however, for this to work, you should pass functions that have the __name__ attribute initialised and the according methods name assigned to it. Furthermore, you shouldnt pass numpys nan-functions - (nansum, nanmean,...) because those for example, have __name__ == 'nansum' and they will thus not trigger - resample.func(), but the slower resample.apply(nanfunc). Also, internally, no nans get passed to the functions - anyway, so that there is no point in passing the nan functions. + (``nansum``, ``nanmean``,...) because those for example, have ``__name__ == 'nansum'`` and they will thus not + trigger ``resample.func()``, but the slower ``resample.apply(nanfunc)``. 
Also, internally, no nans get passed to + the functions anyway, so that there is no point in passing the nan functions. Parameters ---------- @@ -433,12 +435,12 @@ def proc_resample( agg_func : Callable The function you want to use for aggregation. method: {'fagg', 'bagg', 'nagg'}, default 'bagg' - Specifies which intervals to be aggregated for a certain timestamp. (preceeding, succeeding or + Specifies which intervals to be aggregated for a certain timestamp. (preceding, succeeding or "surrounding" interval). See description above for more details. max_invalid_total_d : {np.inf, int}, np.inf Maximum number of invalid (nan) datapoints, allowed per resampling interval. If max_invalid_total_d is - exceeded, the interval gets resampled to nan. By default (np.inf), there is no bound to the number of nan values - in an interval and only intervals containing ONLY nan values or those, containing no values at all, + exceeded, the interval gets resampled to nan. By default (``np.inf``), there is no bound to the number of nan + values in an interval and only intervals containing ONLY nan values or those, containing no values at all, get projected onto nan max_invalid_consec_d : {np.inf, int}, default np.inf Maximum number of consecutive invalid (nan) data points, allowed per resampling interval. there is no bound to the number of consecutive nan values in an interval and only intervals containing ONLY nan values, or those containing no values at all, get projected onto nan. max_invalid_total_f : {np.inf, int}, default np.inf - Same as "max_invalid_total_d", only applying for the flags. The flag regarded as "invalid" value, - is the one passed to empty_intervals_flag (default=flagger.BAD). + Same as `max_invalid_total_d`, only applying for the flags. The flag regarded as "invalid" value, + is the one passed to empty_intervals_flag (default=``flagger.BAD``). Also this is the flag assigned to invalid/empty intervals. max_invalid_total_f : {np.inf, int}, default np.inf - Same as "max_invalid_total_f", only applying onto flgas. The flag regarded as "invalid" value, is the one passed + Same as `max_invalid_consec_d`, only applying onto flags. The flag regarded as "invalid" value, is the one passed to empty_intervals_flag (default=flagger.BAD). Also this is the flag assigned to invalid/empty intervals. flag_agg_func : Callable, default: max The function you want to aggregate the flags with. It should be capable of operating on the flags dtype empty_intervals_flag : {None, str}, default None A Flag, that you want to assign to invalid intervals. Invalid are those intervals, that contain nan values only, or no values at all. Furthermore the empty_intervals_flag is the flag, serving as "invalid" identifyer when - checking for "max_total_invalid_f" and "max_consec_invalid_f patterns". Default triggers flagger.BAD to be + checking for `max_total_invalid_f` and `max_consec_invalid_f` patterns. Default triggers ``flagger.BAD`` to be assigned. to_drop : {None, str, List[str]}, default None Flags that refer to values you want to drop before resampling - effectively excluding values that are flagged with a flag in to_drop from the resampling process - this means that they also will not be counted in the - the max_consec/max_total evaluation. to_drop = None results in NO flags being dropped initially. + the `max_consec`/`max_total` evaluation. `to_drop` = ``None`` results in NO flags being dropped initially.
freq_check : {None, 'check', 'auto'}, default None - - None: do not validate frequency-string passed to `freq` - - 'check': estimate frequency and log a warning if estimate miss matchs frequency string passed to 'freq', or - if no uniform sampling rate could be estimated - - 'auto': estimate frequency and use estimate. (Ignores `freq` parameter.) + + * ``None``: do not validate frequency-string passed to `freq` + * ``'check'``: estimate frequency and log a warning if the estimate mismatches the frequency string passed to `freq`, or + if no uniform sampling rate could be estimated + * ``'auto'``: estimate frequency and use estimate. (Ignores `freq` parameter.) Returns ------- @@ -532,16 +535,16 @@ def proc_resample( def proc_shift(data, field, flagger, freq, method, to_drop=None, empty_intervals_flag=None, freq_check=None, **kwargs): """ Function to shift data points to regular (equidistant) timestamps. - Values get shifted according to the keyword passed to 'method'. + Values get shifted according to the keyword passed to the `method` parameter. - Note: all data nans get excluded defaultly from shifting. If to_drop is None - all BAD flagged values get - excluded as well. + * ``'nshift'``: every grid point gets assigned the nearest value in its range. (range = +/- 0.5 * `freq`) + * ``'bshift'``: every grid point gets assigned its first succeeding value - if there is one available in the + succeeding sampling interval. + * ``'fshift'``: every grid point gets assigned its ultimately preceding value - if there is one available in + the preceding sampling interval. - 'nshift' - every grid point gets assigned the nearest value in its range ( range = +/-(freq/2) ) - 'bshift' - every grid point gets assigned its first succeeding value - if there is one available in the - succeeding sampling interval. (equals resampling wih "first") - 'fshift' - every grid point gets assigned its ultimately preceeding value - if there is one available in - the preceeding sampling interval. (equals resampling with "last") + Note: all data nans get excluded from shifting by default. If `to_drop` is ``None``, all *BAD* flagged values get + excluded as well. Parameters ---------- @@ -564,10 +567,11 @@ def proc_shift(data, field, flagger, freq, method, to_drop=None, empty_intervals with a flag in to_drop from the shifting process. Default - to_drop = None - results in flagger.BAD values being dropped initially. freq_check : {None, 'check', 'auto'}, default None - - None: do not validate frequency-string passed to `freq` - - 'check': estimate frequency and log a warning if estimate miss matchs frequency string passed to 'freq', or - if no uniform sampling rate could be estimated - - 'auto': estimate frequency and use estimate. (Ignores `freq` parameter.) + + * ``None``: do not validate frequency-string passed to `freq` + * ``'check'``: estimate frequency and log a warning if the estimate mismatches the frequency string passed to `freq`, + or if no uniform sampling rate could be estimated + * ``'auto'``: estimate frequency and use estimate. (Ignores `freq` parameter.) Returns ------- @@ -1081,11 +1085,11 @@ def proc_correctRegimeAnomaly(data, field, flagger, cluster_field, model, regime Currently, the only correction mode supported is the "parameter propagation."
- This means, any regime z, labeled negatively and being modeled by the parameters p, gets corrected via: + This means, any regime :math:`z`, labeled negatively and being modeled by the parameters :math:`p`, gets corrected via: - z_correct = z + (m(p*) - m(p)), + :math:`z_{correct} = z + (m(p^*) - m(p))`, - where p* denotes the parameterset belonging to the fit of the nearest not-negatively labeled cluster. + where :math:`p^*` denotes the parameter set belonging to the fit of the nearest not-negatively labeled cluster. Parameters ---------- @@ -1099,14 +1103,14 @@ def proc_correctRegimeAnomaly(data, field, flagger, cluster_field, model, regime A string denoting the field in data, holding the cluster label for the data you want to correct. model : Callable The model function to be fitted to the regimes. - Must be a function of the form f(x, *p), where `x` is the numpy.array holding the independent variables and - `p` are the model parameters that are to be obtained by fitting. + It must be a function of the form :math:`f(x, *p)`, where :math:`x` is the ``numpy.array`` holding the + independent variables and :math:`p` are the model parameters that are to be obtained by fitting. Depending on the `x_date` parameter, independent variable x will either be the timestamps of every regime transformed to seconds from epoch, or it will be just seconds, counting the regimes length. regime_transmission : {None, str}, default None: - If an offset string is passed, a data chunk of length `regime_transimission` rigth at the - start and right at the end is ignored when fitting the model. This is to account for the unrelyability of data - near the changepoints of regimes. + If an offset string is passed, a data chunk of length `regime_transmission` right at the + start and right at the end is ignored when fitting the model. This is to account for the + unreliability of data near the changepoints of regimes. x_date : bool, default False If True, use "seconds from epoch" as x input to the model func, instead of "seconds from regime start".
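A hedged sketch of the "parameter propagation" correction described above (not the SaQC code path; model, parameters and data are invented): fit the passed model to the anomalous regime and to its nearest not-negatively labeled regime, then shift the anomalous values by the difference of the two fitted curves:

import numpy as np
from scipy.optimize import curve_fit

def model(x, a, b):
    # example model of the required form f(x, *p)
    return a * x + b

rng = np.random.default_rng(0)
x = np.arange(50.0)                                      # "seconds from regime start"
z = model(x, 0.5, 3.0) + rng.normal(0, 0.1, x.size)      # negatively labeled (offset) regime
z_ref = model(x, 0.5, 0.0) + rng.normal(0, 0.1, x.size)  # nearest not-negative regime

p, _ = curve_fit(model, x, z)           # parameters p fitted to the anomalous regime
p_star, _ = curve_fit(model, x, z_ref)  # parameters p* fitted to the reference regime

# z_correct = z + (m(p*) - m(p))
z_correct = z + (model(x, *p_star) - model(x, *p))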