From cd226f3e6f46916b533ff870cc5e4d9671a3c00a Mon Sep 17 00:00:00 2001
From: Nikita Karaev
Date: Mon, 30 Oct 2023 11:35:42 +0000
Subject: [PATCH] fixed a bug in compute_tapvid_metrics

---
 cotracker/evaluation/core/eval_utils.py | 48 +++++++++++--------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/cotracker/evaluation/core/eval_utils.py b/cotracker/evaluation/core/eval_utils.py
index 405aa8b..7002fa5 100644
--- a/cotracker/evaluation/core/eval_utils.py
+++ b/cotracker/evaluation/core/eval_utils.py
@@ -55,32 +55,29 @@ def compute_tapvid_metrics(
     """
     metrics = {}
+    # The bug fixed here is described in:
+    # https://github.com/facebookresearch/co-tracker/issues/20
+    eye = np.eye(gt_tracks.shape[2], dtype=np.int32)
+
+    if query_mode == "first":
+        # evaluate frames after the query frame
+        query_frame_to_eval_frames = np.cumsum(eye, axis=1) - eye
+    elif query_mode == "strided":
+        # evaluate all frames except the query frame
+        query_frame_to_eval_frames = 1 - eye
+    else:
+        raise ValueError("Unknown query mode " + query_mode)
 
-    # Don't evaluate the query point. Numpy doesn't have one_hot, so we
-    # replicate it by indexing into an identity matrix.
-    one_hot_eye = np.eye(gt_tracks.shape[2])
     query_frame = query_points[..., 0]
     query_frame = np.round(query_frame).astype(np.int32)
-    evaluation_points = one_hot_eye[query_frame] == 0
-
-    # If we're using the first point on the track as a query, don't evaluate the
-    # other points.
-    if query_mode == "first":
-        for i in range(gt_occluded.shape[0]):
-            index = np.where(gt_occluded[i] == 0)[0][0]
-            evaluation_points[i, :index] = False
-    elif query_mode != "strided":
-        raise ValueError("Unknown query mode " + query_mode)
+    evaluation_points = query_frame_to_eval_frames[query_frame] > 0
 
     # Occlusion accuracy is simply how often the predicted occlusion equals the
     # ground truth.
-    occ_acc = (
-        np.sum(
-            np.equal(pred_occluded, gt_occluded) & evaluation_points,
-            axis=(1, 2),
-        )
-        / np.sum(evaluation_points)
-    )
+    occ_acc = np.sum(
+        np.equal(pred_occluded, gt_occluded) & evaluation_points,
+        axis=(1, 2),
+    ) / np.sum(evaluation_points)
     metrics["occlusion_accuracy"] = occ_acc
 
     # Next, convert the predictions and ground truth positions into pixel
@@ -92,13 +89,10 @@ def compute_tapvid_metrics(
     for thresh in [1, 2, 4, 8, 16]:
         # True positives are points that are within the threshold and where both
         # the prediction and the ground truth are listed as visible.
-        within_dist = (
-            np.sum(
-                np.square(pred_tracks - gt_tracks),
-                axis=-1,
-            )
-            < np.square(thresh)
-        )
+        within_dist = np.sum(
+            np.square(pred_tracks - gt_tracks),
+            axis=-1,
+        ) < np.square(thresh)
         is_correct = np.logical_and(within_dist, visible)
 
         # Compute the frac_within_threshold, which is the fraction of points
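
Note (illustration, not part of the patch): the identity-matrix construction above builds a lookup table in which row q of query_frame_to_eval_frames marks the frames that get evaluated when a point is queried on frame q; indexing that table by each point's actual query frame yields the evaluation mask in one vectorized step. A minimal NumPy sketch, assuming a toy 4-frame sequence and hypothetical per-point query frames:

import numpy as np

num_frames = 4  # toy sequence length, stands in for gt_tracks.shape[2]
eye = np.eye(num_frames, dtype=np.int32)

# "first" mode: row q marks frames strictly after the query frame.
first_mode = np.cumsum(eye, axis=1) - eye   # row 1 -> [0, 0, 1, 1]

# "strided" mode: row q marks every frame except the query frame itself.
strided_mode = 1 - eye                      # row 1 -> [1, 0, 1, 1]

# Hypothetical query frames for three tracked points.
query_frame = np.array([1, 0, 3])

# Indexing the lookup table by query frame gives the evaluation mask.
evaluation_points = first_mode[query_frame] > 0
print(evaluation_points)
# [[False False  True  True]
#  [False  True  True  True]
#  [False False False False]]

This is also why the change fixes the bug: the old loop derived the cutoff from each track's first visible frame (np.where(gt_occluded[i] == 0)[0][0]), silently assuming the query point always sits there, whereas the table is indexed by the query frame actually stored in query_points.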