"""
"""
def flagMissing(field, nodata, flag):
"""
The function flags all values indicating missing data.
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
nodata : any, default np.nan
A value that defines missing data.
flag : float, default BAD
flag to set.
"""
pass
def flagIsolated(field, gap_window, group_window, flag):
"""
The function flags arbitrarily large groups of values, if they are surrounded by sufficiently
large data gaps.
A gap is a timespan containing either no data or only invalid data (usually `nan`).
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
gap_window : str
The minimum size of the gap before and after a group of valid values, for that group to be
considered isolated. See conditions (2) and (3).
group_window : str
The maximum temporal extension allowed for a group that is isolated by gaps of size `gap_window`,
to actually be flagged as an isolated group. See condition (1).
flag : float, default BAD
flag to set.
Notes
-----
A series of values :math:`x_k,x_{k+1},...,x_{k+n}`, with associated timestamps :math:`t_k,t_{k+1},...,t_{k+n}`,
is considered to be isolated, if:
1. :math:`t_{k+n} - t_k <` `group_window`
2. None of the :math:`x_j` with :math:`0 < t_k - t_j <` `gap_window`, is valid (preceding gap).
3. None of the :math:`x_j` with :math:`0 < t_j - t_{k+n} <` `gap_window`, is valid (succeeding gap).
See Also
--------
:py:func:`flagMissing`
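Examples
--------
A minimal usage sketch (``data`` and ``flags`` denote the library's data and flags containers, as in the
:py:func:`flagManual` example further below; the exact call signature may differ):
>>> _, fl = flagIsolated(data, field, flags, gap_window='2D', group_window='6H')
Here, groups of valid values spanning at most 6 hours would get flagged, if they are preceded and
succeeded by at least 2 days without valid data.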
"""
pass
def flagJumps(field, thresh, winsz, min_periods, flag):
"""
Flag datapoints, where the mean of the values significantly changes (where the value course "jumps").
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
thresh : float
The threshold by which the mean of the values has to change to trigger flagging.
winsz : str
The temporal extension of the rolling windows from which the mean values to be compared
are obtained.
min_periods : int, default 1
Minimum number of periods that have to be present in a window of size `winsz`, so that
the mean value obtained from that window is regarded valid.
flag : float, default BAD
flag to set.
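Examples
--------
A minimal usage sketch (``data`` and ``flags`` denote the library's data and flags containers; 'temperature'
is a hypothetical field name; the exact call signature may differ):
>>> _, fl = flagJumps(data, 'temperature', flags, thresh=2.0, winsz='6H')
Roughly, timestamps where the mean of the preceding 6 hours and the mean of the succeeding 6 hours
differ by more than 2.0 would get flagged.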
"""
pass
def flagChangePoints(field, stat_func, thresh_func, bwd_window, min_periods_bwd, fwd_window, min_periods_fwd, closed, reduce_window, reduce_func, flag):
"""
Flag datapoints where the parametrization of the process, that the data is assumed to be generated by,
changes significantly.
The change points detection is based on a sliding window search.
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
stat_func : Callable[numpy.array, numpy.array]
A function that assigns a value to every twin window. Left window content will be passed to the first argument,
right window content to the second.
thresh_func : Callable[numpy.array, numpy.array]
A function that determines the value level, exceeding which qualifies a timestamp's stat_func value as
denoting a changepoint.
bwd_window : str
The left (backwards facing) windows temporal extension (freq-string).
min_periods_bwd : {str, int}
Minimum number of periods that have to be present in a backwards facing window, for a changepoint test to be
performed.
fwd_window : {None, str}, default None
The right (forward facing) windows temporal extension (freq-string).
min_periods_fwd : {None, str, int}, default None
Minimum number of periods that have to be present in a forward facing window, for a changepoint test to be
performed.
closed : {'right', 'left', 'both', 'neither'}, default 'both'
Determines the closure of the sliding windows.
reduce_window : {None, str}, default None
The sliding window search method is not an exact changepoint search method; usually not a single
changepoint is detected, but a "region" of change around a changepoint.
If `reduce_window` is given, for every window of size `reduce_window`, only the value with index
`reduce_func(x, y)` is kept and the others are dropped.
If `reduce_window` is None, the reduction window size equals the
twin window size the changepoints were detected with.
reduce_func : Callable[[numpy.ndarray, numpy.ndarray], int], default lambda x, y: x.argmax()
A function that must return an index value upon input of two arrays x and y.
First input parameter will hold the result from the stat_func evaluation for every
reduction window. Second input parameter holds the result from the thresh_func evaluation.
The default reduction function just selects the value that maximizes the stat_func.
flag : float, default BAD
flag to set.
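Examples
--------
A sketch of typical ``stat_func`` and ``thresh_func`` arguments (illustrative only): a mean difference
statistic compared against a constant threshold level (``data`` and ``flags`` denote the library's data
and flags containers; the exact call signature may differ).
>>> stat_func = lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y))
>>> thresh_func = lambda x, y: 0.5
>>> _, fl = flagChangePoints(data, field, flags, stat_func=stat_func, thresh_func=thresh_func,
...                          bwd_window='1D', min_periods_bwd=10)
Roughly, a timestamp would be flagged as a changepoint, if the mean of the preceding window and the mean
of the succeeding window differ by more than 0.5.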
"""
pass
def assignChangePointCluster(field, stat_func, thresh_func, bwd_window, min_periods_bwd, fwd_window, min_periods_fwd, closed, reduce_window, reduce_func, flag_changepoints, model_by_resids, assign_cluster, flag):
"""
Assigns labels to the data, aiming to reflect continuous regimes of the processes the data is assumed to be
generated by.
The regime change points detection is based on a sliding window search.
Note, that the cluster labels will be stored to the `field` field of the input data, so that the data that is
clustered gets overridden.
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-clustered.
stat_func : Callable[[numpy.array, numpy.array], float]
A function that assigns a value to every twin window. Left window content will be passed to the first argument,
right window content to the second.
thresh_func : Callable[[numpy.array, numpy.array], float]
A function that determines the value level, exceeding which qualifies a timestamp's stat_func value as
denoting a changepoint.
bwd_window : str
The left (backwards facing) windows temporal extension (freq-string).
min_periods_bwd : int
Minimum number of periods that have to be present in a backwards facing window, for a changepoint test to be
performed.
fwd_window : {None, str}, default None
The right (forward facing) windows temporal extension (freq-string).
min_periods_fwd : {None, int}, default None
Minimum number of periods that have to be present in a forward facing window, for a changepoint test to be
performed.
closed : {'right', 'left', 'both', 'neither'}, default 'both'
Determines the closure of the sliding windows.
reduce_window : {None, str}, default None
The sliding window search method is not an exact changepoint search method; usually not a single
changepoint is detected, but a "region" of change around a changepoint.
If `reduce_window` is given, for every window of size `reduce_window`, only the value with index
`reduce_func(x, y)` is kept and the others are dropped.
If `reduce_window` is None, the reduction window size equals the
twin window size the changepoints were detected with.
reduce_func : Callable[[numpy.array, numpy.array], int], default lambda x, y: x.argmax()
A function that must return an index value upon input of two arrays x and y.
First input parameter will hold the result from the stat_func evaluation for every
reduction window. Second input parameter holds the result from the thresh_func evaluation.
The default reduction function just selects the value that maximizes the stat_func.
flag_changepoints : bool, default False
If True, the points where a change in the data modelling regime is detected get flagged.
model_by_resids : bool, default False
If True, the data is replaced by the stat_funcs results instead of regime labels.
assign_cluster : bool, default True
Is set to False, if called by a function that only wants to calculate flags.
flag : float, default BAD
flag to set.
"""
pass
def flagConstants(field, thresh, window, flag):
"""
This function flags plateaus/series of (semi-)constant values of minimum length `window`, if
their maximum total change is smaller than `thresh`.
Any interval of values y(t),..,y(t+n) is flagged, if:
(1) n > `window`
(2) |y(t + i) - y(t + j)| < `thresh`, for all i,j in [0, 1, ..., n]
Parameters
----------
field : str
Name of the column, holding the data-to-be-flagged.
thresh : float
Upper bound for the maximum total change of an interval to be flagged constant.
window : str
Lower bound for the size of an interval to be flagged constant.
flag : float, default BAD
flag to set.
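Examples
--------
A minimal usage sketch (``data`` and ``flags`` denote the library's data and flags containers; 'level' is a
hypothetical field name; the exact call signature may differ):
>>> _, fl = flagConstants(data, 'level', flags, thresh=0.1, window='12H')
Any stretch of values longer than 12 hours, in which all values lie within 0.1 of each other, would get flagged.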
"""
pass
def flagByVariance(field, window, thresh, max_missing, max_consec_missing, flag):
"""
Function flags plateaus/series of constant values. Any interval of values y(t),..y(t+n) is flagged, if:
(1) n > `window`
(2) variance(y(t),...,y(t+n)) < `thresh`
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
window : str
Only intervals of minimum size `window` have the chance to get flagged as constant intervals.
thresh : float
The upper bound the variance of an interval must not exceed, for the interval to be flagged a plateau.
max_missing : {None, int}, default None
Maximum number of nan values tolerated in an interval, for retrieving a valid
variance from it. (Intervals with a number of nans exceeding "max_missing"
have no chance to get flagged a plateau!)
max_consec_missing : {None, int}, default None
Maximum number of consecutive nan values allowed in an interval to retrieve a
valid variance from it. (Intervals with a number of nans exceeding
"max_consec_missing" have no chance to get flagged a plateau!)
flag : float, default BAD
flag to set.
"""
pass
def fitPolynomial(field, winsz, polydeg, numba, eval_flags, min_periods, return_residues, flag):
"""
Function fits a polynomial model to the data and returns the fitted data curve.
The fit is calculated by fitting a polynomial of degree `polydeg` to a data slice
of size `winsz`, that has x at its center.
Note, that the resulting fit is stored to the `field` field of the input data, so that the original data, the
polynomial is fitted to, gets overridden.
Note, that, if data[field] is not aligned to an equidistant frequency grid, the window size passed
has to be an offset string. Also, numba boost options don't apply to irregularly sampled
timeseries.
Note, that calculating the residues tends to be quite costly, because a function fitting is performed for every
sample. To improve performance, consider the following possibilities:
In case your data is sampled at an equidistant frequency grid:
(1) If you know your data to have no significant number of missing values, or if you do not want to
calculate residues for windows containing missing values anyway, performance can be increased by setting
min_periods=winsz.
(2) If your data consists of more than around 200000 samples, setting numba=True will boost the
calculations up to a factor of 5 (for sample sizes > 300000) - however for lower sample sizes,
numba will slow down the calculations, also up to a factor of 5, for sample sizes < 50000.
By default (numba='auto'), numba is set to True, if the data sample size exceeds 200000.
In case your data is not sampled at an equidistant frequency grid:
(1) Harmonization/resampling of your data will have a noticeable impact on the polyfit's performance, since
the numba boost doesn't apply to irregularly sampled data in the current implementation.
Note, that in the current implementation, the initial and final winsz/2 values do not get fitted.
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-modelled.
winsz : {str, int}
The size of the window you want to use for fitting. If an integer is passed, the size
refers to the number of periods for every fitting window. If an offset string is passed,
the size refers to the total temporal extension. The window will be centered around the value-to-be-fitted.
For regularly sampled timeseries the period number will be cast down to an odd number if
even.
polydeg : int
The degree of the polynomial used for fitting
numba : {True, False, "auto"}, default "auto"
Whether or not to apply numba's just-in-time compilation to the poly fit function. This will noticeably
increase the speed of calculation, if the sample size is sufficiently high.
If "auto" is selected, numba compatible fit functions get applied for data consisting of > 200000 samples.
eval_flags : bool, default True
Whether or not to assign new flags to the calculated residuals. If True, a residual gets assigned the worst
flag present in the interval, the data for its calculation was obtained from.
min_periods : {int, None}, default 0
The minimum number of periods, that has to be available in every values fitting surrounding for the polynomial
fit to be performed. If there are not enough values, np.nan gets assigned. Default (0) results in fitting
regardless of the number of values present (results in overfitting for too sparse intervals). To automatically
set the minimum number of periods to the number of values in an offset defined window size, pass np.nan.
return_residues : bool, default False
Internal parameter. Makes the method return the residues instead of the fit.
flag : float, default BAD
flag to set.
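Examples
--------
A minimal usage sketch (``data`` and ``flags`` denote the library's data and flags containers; 'temperature'
is a hypothetical field name; the exact call signature may differ):
>>> _, fl = fitPolynomial(data, 'temperature', flags, winsz='3H', polydeg=2, min_periods=3)
A polynomial of degree 2 is fitted to every centered 3 hour window; windows containing fewer than 3 values
yield ``np.nan``.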
"""
pass
def flagDriftFromNorm(field, fields, segment_freq, norm_spread, norm_frac, metric, linkage_method, flag):
"""
The function flags value courses that significantly deviate from a group of normal value courses.
"Normality" is determined in terms of a maximum spreading distance, that members of a normal group must not exceed.
In addition, only a group is considered "normal", if it contains more than `norm_frac` percent of the
variables in `fields`.
See the Notes section for a more detailed presentation of the algorithm
Parameters
----------
field : str
A dummy parameter.
fields : str
List of fieldnames in data, determining which variables are to be included into the flagging process.
segment_freq : str
An offset string, determining the size of the separate data chunks that the algorithm is to be piecewise
applied on.
norm_spread : float
A parameter limiting the maximum "spread" of the timeseries, allowed in the "normal" group. See Notes section
for more details.
norm_frac : float, default 0.5
Has to be in [0,1]. Determines the minimum percentage of variables, the "normal" group has to comprise to be the
normal group actually. The higher that value, the more stable the algorithm will be with respect to false
positives. Also, nobody knows what happens, if this value is below 0.5.
metric : Callable[[numpy.array, numpy.array], float]
A distance function. It should be a function of 2 1-dimensional arrays and return a float scalar value.
This value is interpreted as the distance of the two input arrays. The default is the averaged manhattan metric.
See the Notes section to get an idea of why this could be a good choice.
linkage_method : {"single", "complete", "average", "weighted", "centroid", "median", "ward"}, default "single"
The linkage method used for hierarchical (agglomerative) clustering of the timeseries.
See the Notes section for more details.
The keyword gets passed on to scipy.cluster.hierarchy.linkage. See its documentation to learn more about the
different keywords (References [1]).
See wikipedia for an introduction to hierarchical clustering (References [2]).
flag : float, default BAD
flag to set.
Notes
-----
The following steps are performed for every data "segment" of length `segment_freq` in order to find the
"abnormal" data:
1. Calculate the distances :math:`d(x_i,x_j)` for all :math:`x_i` in parameter `fields` (with :math:`d`
denoting the distance function
passed to the parameter `metric`).
2. Calculate a dendrogram with a hierarchical linkage algorithm, specified by the parameter `linkage_method`.
3. Flatten the dendrogram at the level, the agglomeration costs exceed the value given by the parameter `norm_spread`.
4. Check if there is a cluster containing more than `norm_frac` percent of the variables in `fields`.
1. if yes: flag all the variables that are not in that cluster (inside the segment)
2. if no: flag nothing
The main parameter giving control over the algorithm's behavior is the `norm_spread` parameter, that determines
the maximum spread of a normal group by limiting the costs, a cluster agglomeration must not exceed in every
linkage step.
For singleton clusters, that cost just equals half the distance, the timeseries in the clusters have to
each other. So, no timeseries can be clustered together, that are more than
2*`norm_spread` distant from each other.
When timeseries get clustered together, this new cluster's distance to all the other timeseries/clusters is
calculated according to the linkage method specified by `linkage_method`. By default, it is the minimum distance
the members of the clusters have to each other.
Having that in mind, it is advisable to choose a distance function, that can be well interpreted in the units
dimension of the measurement and where the interpretation is invariant over the length of the timeseries.
That is why the "averaged manhattan metric" is set as the metric default, since it corresponds to the
averaged value distance two timeseries have (as opposed to euclidean, for example).
References
----------
Documentation of the underlying hierarchical clustering algorithm:
[1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
Introduction to Hierarchical clustering:
[2] https://en.wikipedia.org/wiki/Hierarchical_clustering
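Examples
--------
The clustering outlined in the Notes section can be sketched with scipy (illustrative only, not the exact
implementation; ``series_matrix`` is assumed to hold one row per variable in `fields`, restricted to one
segment, and ``norm_spread`` is the threshold described above):
>>> from scipy.spatial.distance import pdist
>>> from scipy.cluster import hierarchy
>>> metric = lambda x, y: np.nanmean(np.abs(x - y))  # averaged manhattan distance
>>> Z = hierarchy.linkage(pdist(series_matrix, metric=metric), method='single')
>>> labels = hierarchy.fcluster(Z, t=norm_spread, criterion='distance')
Variables not belonging to a cluster that comprises more than `norm_frac` of all variables would then be
flagged for that segment.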
"""
pass
def flagDriftFromReference(field, fields, segment_freq, thresh, metric, flag):
"""
The function flags value courses that deviate from a reference course by a margin exceeding a certain threshold.
The deviation is measured by the distance function passed to parameter metric.
Parameters
----------
field : str
The reference variable, the deviation from which determines the flagging.
fields : str
List of fieldnames in data, determining which variables are to be included into the flagging process.
segment_freq : str
An offset string, determining the size of the separate data chunks that the algorithm is to be piecewise
applied on.
thresh : float
The maximum distance by which normal variables may deviate from the reference variable.
metric : Callable[[numpy.array, numpy.array], float]
A distance function. It should be a function of 2 1-dimensional arrays and return a float scalar value.
This value is interpreted as the distance of the two input arrays. The default is the averaged manhattan metric.
See the Notes section to get an idea of why this could be a good choice.
flag : float, default BAD
flag to set.
Notes
-----
It is advisable to choose a distance function, that can be well interpreted in the units
dimension of the measurement and where the interpretation is invariant over the length of the timeseries.
That is why the "averaged manhattan metric" is set as the metric default, since it corresponds to the
averaged value distance two timeseries have (as opposed to euclidean, for example).
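Examples
--------
A minimal usage sketch (``data`` and ``flags`` denote the library's data and flags containers; the field
names are hypothetical; the exact call signature may differ):
>>> metric = lambda x, y: np.nanmean(np.abs(x - y))  # averaged manhattan distance
>>> _, fl = flagDriftFromReference(data, 'sensor_ref', flags, fields=['sensor_1', 'sensor_2'],
...                                segment_freq='1D', thresh=5.0, metric=metric)
Within every daily segment, any of the listed variables whose distance to 'sensor_ref' exceeds 5.0 would
get flagged.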
"""
pass
def flagDriftFromScaledNorm(field, fields_scale1, fields_scale2, segment_freq, norm_spread, norm_frac, metric, linkage_method, flag):
"""
The function linearly rescales one set of variables to another set of variables with a different scale and then
flags value courses that significantly deviate from a group of normal value courses.
The two sets of variables can be linearly scaled one to another and hence the scaling transformation is performed
via linear regression: A linear regression is performed on each pair of variables giving a slope and an intercept.
The transformation is then calculated as the median of all the calculated slopes and intercepts.
Once the transformation is performed, the function flags those values, that deviate from a group of normal values.
"Normality" is determined in terms of a maximum spreading distance, that members of a normal group must not exceed.
In addition, only a group is considered "normal", if it contains more than `norm_frac` percent of the
variables in `fields`.
Parameters
----------
field : str
A dummy parameter.
fields_scale1 : str
List of fieldnames in data to be included into the flagging process which are scaled according to scaling
scheme 1.
fields_scale2 : str
List of fieldnames in data to be included into the flagging process which are scaled according to scaling
scheme 2.
segment_freq : str
An offset string, determining the size of the separate data chunks that the algorithm is to be piecewise
applied on.
norm_spread : float
A parameter limiting the maximum "spread" of the timeseries, allowed in the "normal" group. See Notes section
for more details.
norm_frac : float, default 0.5
Has to be in [0,1]. Determines the minimum percentage of variables, the "normal" group has to comprise to be the
normal group actually. The higher that value, the more stable the algorithm will be with respect to false
positives. Also, nobody knows what happens, if this value is below 0.5.
metric : Callable[[numpy.array, numpy.array], float]
A distance function. It should be a function of 2 1-dimensional arrays and return a float scalar value.
This value is interpreted as the distance of the two input arrays. The default is the averaged manhattan metric.
See the Notes section to get an idea of why this could be a good choice.
linkage_method : {"single", "complete", "average", "weighted", "centroid", "median", "ward"}, default "single"
The linkage method used for hierarchical (agglomerative) clustering of the timeseries.
See the Notes section for more details.
The keyword gets passed on to scipy.cluster.hierarchy.linkage. See its documentation to learn more about the
different keywords (References [1]).
See wikipedia for an introduction to hierarchical clustering (References [2]).
flag : float, default BAD
flag to set.
References
----------
Documentation of the underlying hierarchical clustering algorithm:
[1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
Introduction to Hierarchical clustering:
[2] https://en.wikipedia.org/wiki/Hierarchical_clustering
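Examples
--------
The rescaling step described above can be sketched as follows (illustrative only, not the exact
implementation; ``series_scale1`` and ``series_scale2`` are assumed to be lists of aligned numpy arrays,
one per variable of the respective scaling scheme):
>>> fits = [np.polyfit(x, y, 1) for x in series_scale1 for y in series_scale2]
>>> slope, intercept = np.median(fits, axis=0)
>>> rescaled = [slope * x + intercept for x in series_scale1]
After the rescaling, the flagging proceeds as in :py:func:`flagDriftFromNorm`.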
"""
pass
def correctDrift(field, maint_data_field, driftModel, cal_mean, flag_maint_period, flag):
"""
The function corrects drifting behavior.
See the Notes section for an overview over the correction algorithm.
Parameters
----------
field : str
The fieldname of the data column, you want to correct.
maint_data_field : str
The fieldname of the datacolumn holding the support-points information.
The maintenance data is expected to have the following form:
The series' timestamp itself represents the beginning of a
maintenance event, whereas the values represent the endings of the maintenance intervals.
driftModel : Callable
A modelfunction describing the drift behavior, that is to be corrected.
The model function must always contain the keyword parameters 'origin' and 'target'.
The starting parameter must always be the parameter, by which the data is passed to the model.
After the data parameter, there can occur an arbitrary number of model calibration arguments in
the signature.
See the Notes section for an extensive description.
cal_mean : int, default 5
The number of values the mean is computed over, for obtaining the value level directly after and
directly before a maintenance event. These values are needed for shift calibration. (see above description)
flag_maint_period : bool, default False
Whether or not to flag the values obtained while maintenance.
flag : float, default BAD
flag to set.
Notes
-----
It is assumed, that between support points, there is a drift effect shifting the measurements in a way, that
can be described by a model function M(t, *p, origin, target). (With 0<=t<=1, p being a parameter set, and origin,
target being floats.)
Note, that it is possible for the model to have no free parameters p at all (mainly the case for linear drift).
The drift model, directly after the last support point (t=0),
should evaluate to the origin - calibration level (origin), and directly before the next support point
(t=1), it should evaluate to the target calibration level (target).
M(0, *p, origin, target) = origin
M(1, *p, origin, target) = target
The model is then fitted to any data chunk in between support points, by optimizing the parameters p, and
thus, obtaining the optimal parameter set P*.
The new values at t are computed via:
new_vals(t) = old_vals(t) + M(t, *P, origin, target) - M_drift(t, *P, origin, new_target)
Whereas new_target represents the value level immediately after the next support point.
Examples
--------
Some examples of meaningful drift models.
Linear drift model (no free parameters), evaluating to origin at t=0 and to target at t=1:
>>> M = lambda t, origin, target: origin + t * (target - origin)
Exponential drift model (exponential raise!), with one free parameter p:
>>> expFunc = lambda t, a, b, c: a + b * (np.exp(c * t) - 1)
>>> M = lambda t, p, origin, target: expFunc(t, origin, (target - origin) / (np.exp(abs(p)) - 1), abs(p))
Exponential and linear driftmodels are part of the ts_operators library, under the names
expDriftModel and linearDriftModel.
"""
pass
def correctRegimeAnomaly(field, cluster_field, model, regime_transmission, x_date):
"""
Function fits the passed model to the different regimes in data[field] and tries to correct
those values, that have been assigned a negative label by data[cluster_field].
Currently, the only correction mode supported is the "parameter propagation."
This means, any regime :math:`z`, labeled negatively and being modeled by the parameters p, gets corrected via:
:math:`z_{correct} = z + (m(p^*) - m(p))`,
where :math:`p^*` denotes the parameter set belonging to the fit of the nearest not-negatively labeled cluster.
Parameters
----------
field : str
The fieldname of the data column, you want to correct.
cluster_field : str
A string denoting the field in data, holding the cluster label for the data you want to correct.
model : Callable
The model function to be fitted to the regimes.
It must be a function of the form :math:`f(x, *p)`, where :math:`x` is the ``numpy.array`` holding the
independent variables and :math:`p` are the model parameters that are to be obtained by fitting.
Depending on the `x_date` parameter, independent variable x will either be the timestamps
of every regime transformed to seconds from epoch, or it will be just seconds, counting the regimes length.
regime_transmission : {None, str}, default None
If an offset string is passed, a data chunk of length `regime_transmission` right at the
start and right at the end is ignored when fitting the model. This is to account for the
unreliability of data near the changepoints of regimes.
x_date : bool, default False
If True, use "seconds from epoch" as x input to the model func, instead of "seconds from regime start".
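Examples
--------
Examples of valid ``model`` functions (hypothetical; any function of the form f(x, *p) works):
>>> model = lambda x, a, b: a * x + b  # linear regime model
>>> model = lambda x, a, b, c: a + b * (np.exp(c * x) - 1)  # exponential raise model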
"""
pass
def correctOffset():
"""
Parameters
----------
data : dios.DictOfSeries
A dictionary of pandas.Series, holding all the data.
field : str
The fieldname of the data column, you want to correct.
flags : saqc.Flags
Container to store flags of the data.
max_mean_jump : float
when searching for changepoints in mean - this is the threshold a mean difference in the
sliding window search must exceed to trigger changepoint detection.
normal_spread : float
threshold denoting the maximum, regimes are allowed to absolutely differ in their means
to form the "normal group" of values.
search_winsz : str
Size of the adjacent windows that are used to search for the mean changepoints.
min_periods : int
Minimum number of periods a search window has to contain, for the result of the changepoint
detection to be considered valid.
regime_transmission : {None, str}, default None
If an offset string is passed, a data chunk of length `regime_transmission` right at the
start and right before the end of any regime is ignored when calculating a regime's mean for data correction.
This is to account for the unreliability of data near the changepoints of regimes.
"""
pass
def flagRegimeAnomaly(field, cluster_field, norm_spread, linkage_method, metric, norm_frac, flag):
"""
A function to flag values belonging to an anomalous regime regarding modelling regimes of field.
"Normality" is determined in terms of a maximum spreading distance, regimes must not exceed in respect
to a certain metric and linkage method.
In addition, only a range of regimes is considered "normal", if it models more then `norm_frac` percentage of
the valid samples in "field".
Note, that you must detect the regime changepoints prior to calling this function.
Note, that it is possible to perform hypothesis tests for regime equality by passing the metric
a function for p-value calculation and selecting linkage method "complete".
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
cluster_field : str
The name of the column in data, holding the cluster labels for the samples in field. (has to be indexed
equal to field)
norm_spread : float
A threshold denoting the value level, up to which clusters are agglomerated.
linkage_method : {"single", "complete", "average", "weighted", "centroid", "median", "ward"}, default "single"
The linkage method used for hierarchical (agglomerative) clustering of the variables.
metric : Callable[[numpy.array, numpy.array], float], default lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y))
A metric function for calculating the dissimilarity between 2 regimes. Defaults to just the difference in mean.
norm_frac : float
Has to be in [0,1]. Determines the minimum percentage of samples,
the "normal" group has to comprise to be the normal group actually.
flag : float, default BAD
flag to set.
"""
pass
def assignRegimeAnomaly(field, cluster_field, norm_spread, linkage_method, metric, norm_frac, set_cluster, set_flags, flag):
"""
A function to detect values belonging to an anomalous regime regarding modelling regimes of field.
The function changes the value of the regime cluster labels to be negative.
"Normality" is determined in terms of a maximum spreading distance, regimes must not exceed in respect
to a certain metric and linkage method.
In addition, only a range of regimes is considered "normal", if it models more then `norm_frac` percentage of
the valid samples in "field".
Note, that you must detect the regime changepoints prior to calling this function. (They are expected to be stored
parameter `cluster_field`.)
Note, that it is possible to perform hypothesis tests for regime equality by passing the metric
a function for p-value calculation and selecting linkage method "complete".
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
cluster_field : str
The name of the column in data, holding the cluster labels for the samples in field. (has to be indexed
equal to field)
norm_spread : float
A threshold denoting the value level, up to which clusters are agglomerated.
linkage_method : {"single", "complete", "average", "weighted", "centroid", "median", "ward"}, default "single"
The linkage method used for hierarchical (agglomerative) clustering of the variables.
metric : Callable[[numpy.array, numpy.array], float], default lambda x, y: np.abs(np.nanmean(x) - np.nanmean(y))
A metric function for calculating the dissimilarity between 2 regimes. Defaults to just the difference in mean.
norm_frac : float
Has to be in [0,1]. Determines the minimum percentage of samples,
the "normal" group has to comprise to be the normal group actually.
set_cluster : bool, default False
If True, all data considered "abnormal" gets assigned a negative cluster label. This option
is present for further use (correction) of the anomaly information.
set_flags : bool, default True
Whether or not to flag abnormal values (do not flag them, if you want to correct them
afterwards, because flagged values usually are not visible in further tests).
flag : float, default BAD
flag to set.
"""
pass
def forceFlags(field, flag, kwargs):
"""
Set whole column to a flag value.
Parameters
----------
field : str
column name that holds the data
flag : float, default BAD
flag to set
kwargs : dict
unused
See Also
--------
clearFlags : set whole column to UNFLAGGED
flagUnflagged : set flag value at all unflagged positions
"""
pass
def clearFlags(field, kwargs):
"""
Set whole column to UNFLAGGED.
Parameters
----------
field : str
column name that holds the data
kwargs : dict
unused
See Also
--------
forceFlags : set whole column to a flag value
flagUnflagged : set flag value at all unflagged positions
"""
pass
def flagUnflagged(field, flag, kwargs):
"""
Function sets a flag at all unflagged positions.
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
flag : float, default BAD
flag value to set
kwargs : Dict
unused
See Also
--------
clearFlags : set whole column to UNFLAGGED
forceFlags : set whole column to a flag value
"""
pass
def flagManual(field, mdata, mflag, method, flag):
"""
Flag data by given, "manually generated" data.
The data is flagged at locations where `mdata` is equal to a provided flag (`mflag`).
The format of mdata can be an indexed object, like pd.Series, pd.DataFrame or dios.DictOfSeries,
but it can also be a plain list- or array-like.
How indexed mdata is aligned to data is specified via the `method` parameter.
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
mdata : {pd.Series, pd.DataFrame, DictOfSeries}
The "manually generated" data
mflag : scalar
The flag that indicates data points in `mdata`, of which the projection in data should be flagged.
method : {'plain', 'ontime', 'left-open', 'right-open'}, default plain
Defines how mdata is projected on data. Except for the 'plain' method, the methods assume mdata to have an
index.
* 'plain': mdata must have the same length as data and is projected one-to-one on data.
* 'ontime': works only with indexed mdata. mdata entries are matched with data entries that have the same index.
* 'right-open': mdata defines intervals, values are to be projected on.
The intervals are defined by any two consecutive timestamps t_1 and t_2 in mdata.
The value at t_1 gets projected onto all data timestamps t with t_1 <= t < t_2.
* 'left-open': like 'right-open', but the projected interval now covers all t with t_1 < t <= t_2.
flag : float, default BAD
flag to set.
Examples
--------
An example for mdata
>>> mdata = pd.Series([1,0,1], index=pd.to_datetime(['2000-02', '2000-03', '2001-05']))
>>> mdata
2000-02-01 1
2000-03-01 0
2001-05-01 1
dtype: int64
On *daily* data, with the 'ontime' method, only the provided timestamps are used.
Bear in mind that only exact timestamps apply, any offset will result in ignoring
the timestamp.
>>> _, fl = flagManual(data, field, flags, mdata, mflag=1, method='ontime')
>>> fl[field] > UNFLAGGED
2000-01-31 False
2000-02-01 True
2000-02-02 False
2000-02-03 False
.. ..
2000-02-29 False
2000-03-01 True
2000-03-02 False
Freq: D, dtype: bool
With the 'right-open' method, the mdata is forward filled:
>>> _, fl = flagManual(data, field, flags, mdata, mflag=1, method='right-open')
>>> fl[field] > UNFLAGGED
2000-01-31 False
2000-02-01 True
2000-02-02 True
.. ..
2000-02-29 True
2000-03-01 False
2000-03-02 False
Freq: D, dtype: bool
With the 'left-open' method, backward filling is used:
>>> _, fl = flagManual(data, field, flags, mdata, mflag=1, method='left-open')
>>> fl[field] > UNFLAGGED
2000-01-31 False
2000-02-01 False
2000-02-02 True
.. ..
2000-02-29 True
2000-03-01 True
2000-03-02 False
Freq: D, dtype: bool
"""
pass
def flagDummy(field):
"""
Function does nothing but return data and flags.
Parameters
----------
field : str
The fieldname of the column, holding the data-to-be-flagged.
"""
pass
def process(field, func, nodata):
"""
Generate/process data with generically defined functions.
The functions can depend on any of the fields present in data.
Formally, what the function does, is the following:
1. Let F be a Callable, depending on fields f_1, f_2,...,f_K, (F = F(f_1, f_2,...,f_K)).
Then, for every timestamp t_i that occurs in at least one of the timeseries data[f_j] (outer join),
the value v_i is computed via:
v_i = F(data[f_1][t_i], data[f_2][t_i], ..., data[f_K][t_i]), if all data[f_j][t_i] do exist
v_i = `nodata`, if at least one of the data[f_j][t_i] is missing.
2. The result is stored to data[field] (gets generated if not present)
Parameters
----------
field : str
The fieldname of the column, where you want the result from the generic expressions processing to be written to.
func : Callable
The data processing function with parameter names that will be
interpreted as data column entries.
See the examples section to learn more.
nodata : any, default np.nan
The value that indicates missing/invalid data
Examples
--------
Some examples on what to pass to the func parameter:
To compute the sum of the variables "temperature" and "uncertainty", you would pass the function:
>>> lambda temperature, uncertainty: temperature + uncertainty
You also can pass numpy and pandas functions:
>>> lambda temperature, uncertainty: np.round(temperature) * np.sqrt(uncertainty)
"""
pass
def flag(field, func, nodata, flag):
"""
A function to flag a data column by evaluation of a generic expression.
The expression can depend on any of the fields present in data.
Formally, what the function does, is the following:
Let X be an expression, depending on fields f_1, f_2,...,f_K, (X = X(f_1, f_2,...,f_K)).
Then for every timestamp t_i in data[field]:
data[field][t_i] is flagged if X(data[f_1][t_i], data[f_2][t_i], ..., data[f_K][t_i]) is True.
Note, that all value series included in the expression to evaluate must be labeled identically to field.
Note, that the expression is passed in the form of a Callable and that this callables variable names are
interpreted as actual names in the data header. See the examples section to get an idea.
Note, that all the numpy functions are available within the generic expressions.
Parameters
----------
field : str
The fieldname of the column, where you want the result from the generic expressions evaluation to be projected
to.
func : Callable
The expression that is to be evaluated is passed in form of a callable, with parameter names that will be
interpreted as data column entries. The Callable must return a boolean array-like.
See the examples section to learn more.
nodata : any, default np.nan
The value that indicates missing/invalid data
flag : float, default BAD
flag to set.
Examples
--------
Some examples on what to pass to the func parameter:
To flag the variable `field`, if the sum of the variables
"temperature" and "uncertainty" is below zero, you would pass the function:
>>> lambda temperature, uncertainty: temperature + uncertainty < 0
There is the reserved name 'This', that always refers to `field`. So, to flag field if field is negative, you can
also pass:
>>> lambda this: this < 0
If you want to make the flagging dependent on flags already present in the data, you can use the built-in
``isflagged`` method. For example, to flag the 'temperature', if 'level' is flagged, you would use:
>>> lambda level: isflagged(level)
You can furthermore specify a flagging level, you want to compare the flags to. For example, for flagging
'temperature', if 'level' is flagged at a level named DOUBTFUL or worse, use:
>>> lambda level: isflagged(level, flag=DOUBTFUL, comparator='>')
If you are unsure about the used flagger's flagging level names, you can use the reserved key words BAD, UNFLAGGED
and GOOD, to refer to the worst (BAD), best (GOOD) or unflagged (UNFLAGGED) flagging levels. For example:
>>> lambda level: isflagged(level, flag=UNFLAGGED, comparator='==')
Your expression also is allowed to include pandas and numpy functions
>>> lambda level: np.sqrt(level) > 7
"""
pass
def interpolateByRolling(field, winsz, func, center, min_periods, flag):
"""
Interpolates nan-values in the data by assigning them the aggregation result of the window surrounding them.
Parameters
----------
field : str
Name of the column, holding the data-to-be-interpolated.
winsz : int, str
The size of the window, the aggregation is computed from. An integer defines the number of periods to be used,
a string is interpreted as an offset (see `pandas.rolling` for more information).
Integer windows may result in skewed aggregations if called on non-harmonized or irregular data.
func : Callable
The function used for aggregation.
center : bool, default True
Center the window around the value. Can only be used with integer windows, otherwise it is silently ignored.
min_periods : int
Minimum number of valid (not np.nan) values that have to be available in a window for its aggregation to be
computed.
flag : float or None, default UNFLAGGED
Flag that is to be inserted for the interpolated values. If ``None`` no flags are set.
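Examples
--------
A minimal usage sketch (``data`` and ``flags`` denote the library's data and flags containers; 'temperature'
is a hypothetical field name; the exact call signature may differ):
>>> _, fl = interpolateByRolling(data, 'temperature', flags, winsz='2H', func=np.median, min_periods=3)
Every NaN would get replaced by the median of its surrounding 2 hour window, provided at least 3 valid
values are present in that window.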
"""
pass
def interpolateInvalid(field, method, inter_order, inter_limit, flag, downgrade_interpolation):
"""
Function to interpolate nan values in the data.
All interpolation methods of the pandas.interpolate method are available and applicable by
the very same keywords, that you would pass to ``pd.Series.interpolate``'s method parameter.
Parameters
----------