David Schäfer · d4435d6c · 1a1cf9cd · 23c1f093 · f31ba14f · bc930214
--- a/saqc/core/translator/dmptranslator.py

+ 76

− 2
+++ b/saqc/core/translator/dmptranslator.py

+ 76

− 2
 @@ -38,6 +38,18 @@ class DmpTranslator(Translator):
        "BAD": BAD,
    }

+    _QUALITY_CAUSES = {
+        "BATTERY_LOW",
+        "BELOW_MINIMUM",
+        "ABOVE_MAXIMUM",
+        "BELOW_OR_ABOVE_MIN_MAX",
+        "ISOLATED_SPIKE",
+        "DEFECTIVE_SENSOR",
+        "LEFT_CENSORED_DATA",
+        "RIGHT_CENSORED_DATA",
+        "OTHER",
+    }
+
    def __init__(self):
        super().__init__(forward=self._FORWARD)

 @@ -59,8 +71,30 @@ class DmpTranslator(Translator):
        Note
        ----
        Could (and maybe should) be implemented as a method of `CallGraph`
+
+        Computations on a variable are tracked by the variable's name, yet
+        `tools.rename` allows that name (i.e. our key) to be changed. For
+        now we work around this in a somewhat hacky way by following the
+        renames. There are better ways to solve this (e.g. global function
+        pointers), but for the moment this has to do the trick.
        """
-        return [SaQCFunction(name="")] + [f for l, f in call_stack if l.field == field]
+        # backtrack name changes to find out whether our field
+        # originally had another name
+        for sel, func in call_stack[::-1]:
+            if func.name == "tools.rename":
+                new_name = func.keywords.get("new_name") or func.args[3]
+                if new_name == field:
+                    field = sel.field
+
+        out = [SaQCFunction(name="")]
+        for sel, func in call_stack:
+            if sel.field == field:
+                out.append(func)
+                # forward track name changes
+                if func.name == "tools.rename":
+                    field = func.keywords.get("new_name") or func.args[3]
+
+        return out

    def forward(self, flags: pd.DataFrame) -> Tuple[Flags, MaterializedGraph]:
        """
 @@ -144,6 +178,11 @@ class DmpTranslator(Translator):
            flag_call_history = self._getFieldFunctions(field, call_graph)
            flag_pos = flags.history[field].idxmax()
            comments, causes = [], []
+            # NOTE:
+            # Strangely enough, this loop withstood all my efforts
+            # to speed it up through vectorization - the simple
+            # loop always outperformed even careful `pd.DataFrame.apply`
+            # versions. The latest attempt is kept as commented-out code below.
            for p in flag_pos:
                func = flag_call_history[p]
                cause = func.keywords.get("cause", self.ARGUMENTS["cause"])
 @@ -158,10 +197,45 @@ class DmpTranslator(Translator):
                causes.append(cause)
                comments.append(comment)

+            # DMP quality_cause needs some special care as only certain values
+            # and combinations are allowed.
+            # See: https://wiki.intranet.ufz.de/wiki/dmp/index.php/Qualit%C3%A4tsflags
+            causes = pd.Series(causes, index=flags[field].index)
+            causes[
+                (causes == self.ARGUMENTS["cause"]) & (flags[field] > GOOD)
+            ] = "OTHER"
+            if not ((causes == "") | causes.isin(self._QUALITY_CAUSES)).all():
+                raise ValueError(
+                    f"quality causes needs to be one of {self._QUALITY_CAUSES}"
+                )
+
            var_flags = {
                "quality_flag": tflags[field],
                "quality_comment": pd.Series(comments, index=flags[field].index),
-                "quality_cause": pd.Series(causes, index=flags[field].index),
+                "quality_cause": causes,
            }
            out[field] = pd.DataFrame(var_flags)
        return pd.concat(out, axis="columns")
+
+        # for field in tflags.columns:
+        #     call_history = []
+        #     for func in self._getFieldFunctions(field, call_graph):
+        #         func_info = {
+        #             "cause": func.keywords.get("cause", self.ARGUMENTS["cause"]),
+        #             "comment": json.dumps({
+        #                 "test": func.name,
+        #                 "comment": func.keywords.get("comment", self.ARGUMENTS["comment"]),
+        #             })
+        #         }
+        #         call_history.append(func_info)
+
+        #     functions = pd.DataFrame(call_history)
+        #     flag_pos = flags.history[field].idxmax()
+
+        #     var_flags = {
+        #         "quality_flag": tflags[field].reset_index(drop=True),
+        #         "quality_comment": functions.loc[flag_pos, "comment"].reset_index(drop=True),
+        #         "quality_cause": functions.loc[flag_pos, "cause"].reset_index(drop=True),
+        #     }
+        #     out[field] = pd.DataFrame(var_flags, index=flag_pos.index)
+        # return pd.concat(out, axis="columns")