diff --git a/saqc/flagger/baseflagger.py b/saqc/flagger/baseflagger.py index b380013ef39e32e63671f1fdba0370d0305149f5..113e973db6d966b6630f36358aa0bbd7eba1e166 100644 --- a/saqc/flagger/baseflagger.py +++ b/saqc/flagger/baseflagger.py @@ -76,14 +76,6 @@ class BaseFlagger(ABC): newflagger = self.copy(flags=flags) return newflagger - def rename(self, field: str, new_name: str, inplace=False): - if inplace: - out = self - else: - out = self.copy() - out._flags.columns = mutateIndex(out._flags.columns, field, new_name) - return out - def merge(self, other: BaseFlaggerT, subset: Optional[List] = None, join: str = "merge", inplace=False): """ Merge the given flagger 'other' into self @@ -300,6 +292,51 @@ class BaseFlagger(ABC): # version of it. return flag == self.BAD or flag == self.GOOD or flag == self.UNFLAGGED or self.isSUSPICIOUS(flag) + def replaceField(self, field, flags, inplace=False, **kwargs): + """ Replace or delete all data for a given field. + + Parameters + ---------- + field : str + The field to replace / delete. If the field already exists, the respective data + is replaced, otherwise the data is inserted in the respective field column. + flags : pandas.Series or None + If None, the series denoted by `field` will be deleted. Otherwise + a series of flags (dtype flagger.dtype) that will replace the series + currently stored under `field` + inplace : bool, default False + If False, a flagger copy is returned, otherwise the flagger is not copied. + **kwargs : dict + ignored. + + Returns + ------- + flagger: saqc.flagger.BaseFlagger + The flagger object or a copy of it (if inplace=False). 
+ + Raises + ------ + ValueError: (delete) if field does not exist + TypeError: (replace / insert) if flags is not a pd.Series + """ + + assertScalar("field", field, optional=False) + + out = self if inplace else deepcopy(self) + + # delete + if flags is None: + if field not in self._flags: + raise ValueError(f"{field}: field does not exist") + del out._flags[field] + + # insert / replace + else: + if not isinstance(flags, pd.Series): + raise TypeError(f"`flags` must be pd.Series.") + out._flags[field] = flags.astype(self.dtype) + return out + def _check_field(self, field): """ Check if (all) field(s) in self._flags. """ diff --git a/saqc/flagger/dmpflagger.py b/saqc/flagger/dmpflagger.py index 3c436ab50cef17138de5984a2be8a3bff575e6f7..62275574f20f0faca019f82f8d998df6affd839b 100644 --- a/saqc/flagger/dmpflagger.py +++ b/saqc/flagger/dmpflagger.py @@ -86,12 +86,6 @@ class DmpFlagger(CategoricalFlagger): newflagger._comments = self._comments.aloc[flags, ...] return newflagger - def rename(self, field: str, new_name: str, inplace=False): - newflagger = super().rename(field, new_name, inplace=inplace) - newflagger._causes.columns = newflagger._flags.columns - newflagger._comments.columns = newflagger._flags.columns - return newflagger - def merge(self, other: DmpFlaggerT, subset: Optional[List] = None, join: str = "merge", inplace=False): assert isinstance(other, DmpFlagger) flags = mergeDios(self._flags, other._flags, subset=subset, join=join) @@ -169,6 +163,68 @@ class DmpFlagger(CategoricalFlagger): out._comments.aloc[row_indexer, field] = comment return out + def replaceField(self, field, flags, inplace=False, cause=None, comment=None, **kwargs): + """ Replace or delete all data for a given field. + + Parameters + ---------- + field : str + The field to replace / delete. If the field already exists, the respective data + is replaced, otherwise the data is inserted in the respective field column. 
+ flags : pandas.Series or None + If None, the series denoted by `field` will be deleted. Otherwise + a series of flags (dtype flagger.dtype) that will replace the series + currently stored under `field` + cause : pandas.Series + A series of causes (dtype str). + comment : pandas.Series + A series of comments (dtype str). + inplace : bool, default False + If False, a flagger copy is returned, otherwise the flagger is not copied. + **kwargs : dict + ignored. + + Returns + ------- + flagger: saqc.flagger.BaseFlagger + The flagger object or a copy of it (if inplace=False). + + Raises + ------ + ValueError: (delete) if field does not exist + TypeError: (replace / insert) if flags, cause or comment is not a pd.Series + AssertionError: (replace / insert) if flags, cause and comment do not have the same index + + Notes + ----- + If deletion is requested (`flags=None`), `cause` and `comment` are ignored. + + If `flags` is not None, `flags`, `cause` and `comment` must have the same index; + each is implicitly cast to its respective dtype. 
+ """ + assertScalar("field", field, optional=False) + out = self if inplace else deepcopy(self) + causes, comments = cause, comment + + # delete + if flags is None: + if field not in self._flags: + raise ValueError(f"{field}: field does not exist") + del out._flags[field] + del out._comments[field] + del out._causes[field] + + # insert / replace + else: + for val in [flags, causes, comments]: + if not isinstance(val, pd.Series): + raise TypeError(f"`flag`, `cause`, `comment` must be pd.Series.") + assert flags.index.equals(comments.index) and flags.index.equals(causes.index) + out._flags[field] = flags.astype(self.dtype) + out._causes[field] = causes.astype(str) + out._comments[field] = comments.astype(str) + return out + def _construct_new(self, flags, causes, comments) -> DmpFlaggerT: new = DmpFlagger() new.project_version = self.project_version diff --git a/saqc/funcs/proc_functions.py b/saqc/funcs/proc_functions.py index ac3b19499ca458d918b334fa5fd8a89e6c58d4b4..e4f39ed482a2f697279c0842e2258b0698a2a7f5 100644 --- a/saqc/funcs/proc_functions.py +++ b/saqc/funcs/proc_functions.py @@ -777,12 +777,15 @@ def proc_fork(data, field, flagger, suffix=ORIGINAL_SUFFIX, **kwargs): Flags shape may have changed relatively to the flagger input. 
""" - fork_field = str(field) + suffix - fork_dios = dios.DictOfSeries({fork_field: data[field]}) - fork_flagger = flagger.slice(drop=data.columns.drop(field)).rename(field, fork_field, inplace=True) - data = mergeDios(data, fork_dios) - flagger = flagger.merge(fork_flagger) - return data, flagger + newfield = str(field) + suffix + if newfield in flagger.flags.columns.union(data.columns): + raise ValueError(f"{field}: field already exist") + + flags, extras = flagger.getFlags(field, full=True) + newflagger = flagger.replaceField(newfield, flags=flags, **extras) + newdata = data.copy() + newdata[newfield] = data[field].copy() + return newdata, newflagger @register(masking='field') @@ -837,10 +840,20 @@ def proc_rename(data, field, flagger, new_name, **kwargs): flagger : saqc.flagger The flagger object, holding flags and additional Informations related to `data`. """ + # store + s = data[field] + f, e = flagger.getFlags(field, full=True) + + # delete + data = data.copy() + del data[field] + flagger = flagger.replaceField(field, flags=None) + + # insert + data[new_name] = s + flagger = flagger.replaceField(new_name, inplace=True, flags=f, **e) - data.columns = mutateIndex(data.columns, field, new_name) - new_flagger = flagger.rename(field, new_name) - return data, new_flagger + return data, flagger def _drift_fit(x, shift_target, cal_mean): diff --git a/test/flagger/test_dmpflagger.py b/test/flagger/test_dmpflagger.py index b2eca1193d01a9a3bf4880a33d638333fe2a41c9..b1a9c1b73df1d2a58866291119e5c709bfc06f90 100644 --- a/test/flagger/test_dmpflagger.py +++ b/test/flagger/test_dmpflagger.py @@ -113,16 +113,3 @@ def test_sliceFlaggerDrop(data): assert (filtered._comments.to_df().index == expected.index).all(axis=None) assert (filtered._causes.to_df().index == expected.index).all(axis=None) - -def test_rename(data_4cols): - new_name = "mysterious" - cols = pd.Index(["var1", "var2", new_name, "var4"]) - cols = cols.rename(data_4cols.columns.name) - - flagger = 
DmpFlagger().initFlags(data_4cols) - new_flagger = flagger.rename(data_4cols.columns[2], new_name) - new_flagger.getFlags(new_name) - - assert new_flagger._causes.columns.equals(cols) - assert new_flagger._comments.columns.equals(cols) - assert new_flagger._flags.columns.equals(cols) diff --git a/test/flagger/test_flagger.py b/test/flagger/test_flagger.py index ee803ad2baf3ebec29e7f98f499fb640a2afeb82..143283ca0ee13e143a2376a8faf1517fa1ab13e3 100644 --- a/test/flagger/test_flagger.py +++ b/test/flagger/test_flagger.py @@ -597,3 +597,90 @@ def test_classicUseCases(data, flagger): dt_idx = data[field].iloc[indices].index flagged = flagger.setFlags(field, loc=dt_idx, flag=flagger.BAD).isFlagged(field) assert (flagged.iloc[indices] == flagged[flagged]).all() + + +@pytest.mark.parametrize("data", DATASETS) +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_getFlagsWithExtras(data, flagger): + flagger = flagger.initFlags(data) + field, *_ = data.columns + + flags, extra = flagger.getFlags(field, full=True) + assert isinstance(flags, pd.Series) + assert isinstance(extra, dict) + for k, v in extra.items(): + assert isinstance(v, pd.Series) + assert flags.index.equals(v.index) + + flags, extra = flagger.getFlags(full=True) + assert isinstance(flags, dios.DictOfSeries) + assert isinstance(extra, dict) + for k, v in extra.items(): + assert isinstance(v, dios.DictOfSeries) + assert flags.columns.equals(v.columns) + for c in flags: + assert flags[c].index.equals(v[c].index) + + +@pytest.mark.parametrize("data", DATASETS) +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_replace_delete(data, flagger): + flagger = flagger.initFlags(data) + field, *_ = data.columns + newflagger = flagger.replaceField(field=field, flags=None) + + new, newextra = newflagger.getFlags(full=True) + assert field not in newflagger.flags + for k in newextra: + assert field not in newextra[k] + + with pytest.raises(ValueError): + flagger.replaceField(field="i_dont_exist", flags=None) + 
+@pytest.mark.parametrize("data", DATASETS) +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_replace_insert(data, flagger): + flagger = flagger.initFlags(data) + field, *_ = data.columns + newfield = 'fooo' + flags, extra = flagger.getFlags(field, full=True) + newflagger = flagger.replaceField(field=newfield, flags=flags, **extra) + + old, oldextra = flagger.getFlags(full=True) + new, newextra = newflagger.getFlags(full=True) + assert newfield in newflagger.flags + assert (newflagger._flags[newfield] == flagger._flags[field]).all() + assert newflagger._flags[newfield] is not flagger._flags[field] # equal values, but must not be the identical object + for k in newextra: + assert newfield in newextra[k] + assert (newextra[k][newfield] == oldextra[k][field]).all() + + +@pytest.mark.parametrize("data", DATASETS) +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_replace_replace(data, flagger): + flagger = flagger.initFlags(data) + field, *_ = data.columns + flags, extra = flagger.getFlags(field, full=True) + + # set everything to BAD + flags[:] = flagger.BAD + for k, v in extra.items(): + v[:] = flagger.BAD + extra[k] = v + + newflagger = flagger.replaceField(field=field, flags=flags, **extra) + + old, oldextra = flagger.getFlags(full=True) + new, newextra = newflagger.getFlags(full=True) + assert old.columns.equals(new.columns) + assert (new[field] == flagger.BAD).all() + + assert oldextra.keys() == newextra.keys() + for k in newextra: + o, n = oldextra[k], newextra[k] + assert n.columns.equals(o.columns) + assert (n[field] == flagger.BAD).all() + + + diff --git a/test/funcs/test_proc_functions.py b/test/funcs/test_proc_functions.py index 604c1a5032218ab3e6d567adff571162538a648b..382939e7b5f4be9532d1427eeec7098a4001fa92 100644 --- a/test/funcs/test_proc_functions.py +++ b/test/funcs/test_proc_functions.py @@ -85,13 +85,12 @@ def test_resample(course_5, flagger): assert np.isnan(data1[field].iloc[1]) assert np.isnan(data1[field].iloc[2]) + @pytest.mark.parametrize("flagger", 
TESTFLAGGER) def test_interpolateGrid(course_5, course_3, flagger): data, _ = course_5() data_grid, characteristics = course_3() - data['grid']=data_grid.to_df() - #data = dios.DictOfSeries(data) + data['grid'] = data_grid.to_df() + # data = dios.DictOfSeries(data) flagger = flagger.initFlags(data) dataInt, *_ = proc_interpolateGrid(data, 'data', flagger, '1h', 'time', grid_field='grid', inter_limit=10) - -