Compare commits


No commits in common. "19767a9d65d6c55f053e4956ce62cdd97aadfcf3" and "9c9a97d158115f89e367382f99178db8abd06cb4" have entirely different histories.

6 changed files with 17 additions and 58 deletions

View File

@@ -4,7 +4,7 @@
[Nikita Karaev](https://nikitakaraevv.github.io/), [Ignacio Rocco](https://www.irocco.info/), [Benjamin Graham](https://ai.facebook.com/people/benjamin-graham/), [Natalia Neverova](https://nneverova.github.io/), [Andrea Vedaldi](https://www.robots.ox.ac.uk/~vedaldi/), [Christian Rupprecht](https://chrirupp.github.io/)
### [Project Page](https://co-tracker.github.io/) | [Paper](https://arxiv.org/abs/2307.07635) | [X Thread](https://twitter.com/n_karaev/status/1742638906355470772) | [BibTeX](#citing-cotracker)
[[`Paper`](https://arxiv.org/abs/2307.07635)] [[`Project`](https://co-tracker.github.io/)] [[`BibTeX`](#citing-cotracker)]
<a target="_blank" href="https://colab.research.google.com/github/facebookresearch/co-tracker/blob/main/notebooks/demo.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@@ -26,7 +26,6 @@ CoTracker can track:
Try these tracking modes for yourself with our [Colab demo](https://colab.research.google.com/github/facebookresearch/co-tracker/blob/master/notebooks/demo.ipynb) or in the [Hugging Face Space 🤗](https://huggingface.co/spaces/facebook/cotracker).
**Updates:**
- [June 14, 2024] 📣 We have released the code for [VGGSfM](https://github.com/facebookresearch/vggsfm), a model for recovering camera poses and 3D structure from any image sequences based on point tracking! VGGSfM is the first fully differentiable SfM framework that unlocks scalability and outperforms conventional SfM methods on standard benchmarks.
- [December 27, 2023] 📣 CoTracker2 is now available! It can now track many more (up to **265*265**!) points jointly and it has a cleaner and more memory-efficient implementation. It also supports online processing. See the [updated paper](https://arxiv.org/abs/2307.07635) for more details. The old version remains available [here](https://github.com/facebookresearch/co-tracker/tree/8d364031971f6b3efec945dd15c468a183e58212).
@@ -120,7 +119,7 @@ We strongly recommend installing both PyTorch and TorchVision with CUDA support,
git clone https://github.com/facebookresearch/co-tracker
cd co-tracker
pip install -e .
pip install matplotlib flow_vis tqdm tensorboard imageio[ffmpeg]
pip install matplotlib flow_vis tqdm tensorboard
```
You can manually download the CoTracker2 checkpoint from the links below and place it in the `checkpoints` folder as follows:
@@ -133,11 +132,6 @@ cd ..
```
For old checkpoints, see [this section](#previous-version).
After installation, this is how you could run the model on `./assets/apple.mp4` (results will be saved to `./saved_videos/apple.mp4`):
```bash
python demo.py --checkpoint checkpoints/cotracker2.pth
```
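The same checkpoint can also be driven from Python rather than the CLI demo. Below is a minimal sketch of the offline predictor, assuming `CoTrackerPredictor` from `cotracker/predictor.py` as changed further down in this diff; the dummy video tensor and `grid_size=10` are illustrative values, not taken from this comparison:
```python
import torch
from cotracker.predictor import CoTrackerPredictor

# Dummy clip; batch size must be 1 (the predictor now asserts B == 1).
video = torch.randn(1, 24, 3, 480, 640)  # (B, T, 3, H, W)

model = CoTrackerPredictor(checkpoint="checkpoints/cotracker2.pth")
# Track a regular 10x10 grid of points from the first frame.
pred_tracks, pred_visibility = model(video, grid_size=10)  # (1, T, N, 2), (1, T, N)
```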
## Evaluation
To reproduce the results presented in the paper, download the following datasets:
@@ -209,15 +203,6 @@ make -C docs html
## Previous version
You can use CoTracker v1 directly via pytorch hub:
```python
import torch
import einops
import timm
import tqdm
cotracker = torch.hub.load("facebookresearch/co-tracker:v1.0", "cotracker_w8")
```
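If the hub module exposes the same predictor-style interface as the snippet above (an assumption; the exact v1 call signature is not shown in this diff), usage would look roughly like:
```python
import torch

# Load CoTracker v1 from torch hub (einops, timm and tqdm must be installed, as above).
cotracker = torch.hub.load("facebookresearch/co-tracker:v1.0", "cotracker_w8")

video = torch.randn(1, 48, 3, 480, 640)  # dummy (B, T, 3, H, W) clip
# Assumed v1 call convention: track a 30x30 grid of points from the first frame.
pred_tracks, pred_visibility = cotracker(video=video, grid_size=30)
```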
The old version of the code is available [here](https://github.com/facebookresearch/co-tracker/tree/8d364031971f6b3efec945dd15c468a183e58212).
You can also download the corresponding checkpoints:
```bash

View File

@@ -38,6 +38,7 @@ class EvaluationPredictor(torch.nn.Module):
B, N, D = queries.shape
assert D == 3
assert B == 1
video = video.reshape(B * T, C, H, W)
video = F.interpolate(video, tuple(self.interp_shape), mode="bilinear", align_corners=True)

View File

@@ -23,11 +23,11 @@ class CoTrackerPredictor(torch.nn.Module):
@torch.no_grad()
def forward(
self,
video, # (B, T, 3, H, W)
video, # (1, T, 3, H, W)
# input prompt types:
# - None. Dense tracks are computed in this case. You can adjust *query_frame* to compute tracks starting from a specific frame.
# *backward_tracking=True* will compute tracks in both directions.
# - queries. Queried points of shape (B, N, 3) in format (t, x, y) for frame index and pixel coordinates.
# - queries. Queried points of shape (1, N, 3) in format (t, x, y) for frame index and pixel coordinates.
# - grid_size. Grid of N*N points from the first frame. if segm_mask is provided, then computed only for the mask.
# You can adjust *query_frame* and *backward_tracking* for the regular grid in the same way as for dense tracks.
queries: torch.Tensor = None,
@@ -92,6 +92,7 @@ class CoTrackerPredictor(torch.nn.Module):
backward_tracking=False,
):
B, T, C, H, W = video.shape
assert B == 1
video = video.reshape(B * T, C, H, W)
video = F.interpolate(video, tuple(self.interp_shape), mode="bilinear", align_corners=True)
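For the `queries` prompt type documented in the forward signature above, each row is `(t, x, y)`: the frame index at which tracking starts plus pixel coordinates, with the batch dimension now fixed to 1. A short illustrative sketch; the checkpoint path, coordinates, and `backward_tracking=True` are hypothetical choices, not part of this diff:
```python
import torch
from cotracker.predictor import CoTrackerPredictor

model = CoTrackerPredictor(checkpoint="checkpoints/cotracker2.pth")
video = torch.randn(1, 24, 3, 480, 640)  # (1, T, 3, H, W)

queries = torch.tensor([
    [0.0, 400.0, 350.0],    # start tracking this pixel at frame 0
    [10.0, 600.0, 500.0],   # start tracking this pixel at frame 10
])[None]                    # -> (1, N, 3) in (t, x, y) format

# backward_tracking=True also fills in positions before each query frame.
tracks, visibilities = model(video, queries=queries, backward_tracking=True)
```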
@@ -120,14 +121,13 @@ class CoTrackerPredictor(torch.nn.Module):
queries = torch.cat(
[torch.ones_like(grid_pts[:, :, :1]) * grid_query_frame, grid_pts],
dim=2,
).repeat(B, 1, 1)
)
if add_support_grid:
grid_pts = get_points_on_a_grid(
self.support_grid_size, self.interp_shape, device=video.device
)
grid_pts = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)
grid_pts = grid_pts.repeat(B, 1, 1)
queries = torch.cat([queries, grid_pts], dim=1)
tracks, visibilities, __ = self.model.forward(video=video, queries=queries, iters=6)
@@ -174,7 +174,7 @@ class CoTrackerPredictor(torch.nn.Module):
inv_visibilities = inv_visibilities.flip(1)
arange = torch.arange(video.shape[1], device=queries.device)[None, :, None]
mask = (arange < queries[:, None, :, 0]).unsqueeze(-1).repeat(1, 1, 1, 2)
mask = (arange < queries[None, :, :, 0]).unsqueeze(-1).repeat(1, 1, 1, 2)
tracks[mask] = inv_tracks[mask]
visibilities[mask[:, :, :, 0]] = inv_visibilities[mask[:, :, :, 0]]
@@ -201,7 +201,6 @@ class CoTrackerOnlinePredictor(torch.nn.Module):
grid_query_frame: int = 0,
add_support_grid=False,
):
B, T, C, H, W = video_chunk.shape
# Initialize online video processing and save queried points
# This needs to be done before processing *each new video*
if is_first_step:
@@ -232,7 +231,7 @@ class CoTrackerOnlinePredictor(torch.nn.Module):
queries = torch.cat([queries, grid_pts], dim=1)
self.queries = queries
return (None, None)
B, T, C, H, W = video_chunk.shape
video_chunk = video_chunk.reshape(B * T, C, H, W)
video_chunk = F.interpolate(
video_chunk, tuple(self.interp_shape), mode="bilinear", align_corners=True

View File

@@ -83,12 +83,11 @@ if __name__ == "__main__":
print("computed")
# save a video with predicted tracks
seq_name = os.path.splitext(args.video_path.split("/")[-1])[0]
seq_name = args.video_path.split("/")[-1]
vis = Visualizer(save_dir="./saved_videos", pad_value=120, linewidth=3)
vis.visualize(
video,
pred_tracks,
pred_visibility,
query_frame=0 if args.backward_tracking else args.grid_query_frame,
filename=seq_name,
)

View File

@@ -1,10 +1,3 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import os
import torch
import gradio as gr
@@ -29,12 +22,7 @@ def cotracker_demo(
model = model.cuda()
load_video = load_video.cuda()
model(
video_chunk=load_video,
is_first_step=True,
grid_size=grid_size,
grid_query_frame=grid_query_frame,
)
model(video_chunk=load_video, is_first_step=True, grid_size=grid_size)
for ind in range(0, load_video.shape[1] - model.step, model.step):
pred_tracks, pred_visibility = model(
video_chunk=load_video[:, ind : ind + model.step * 2]

View File

@@ -4,7 +4,6 @@
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import os
import torch
import argparse
import imageio.v3 as iio
@@ -45,9 +44,6 @@ if __name__ == "__main__":
args = parser.parse_args()
if not os.path.isfile(args.video_path):
raise ValueError("Video file does not exist")
if args.checkpoint is not None:
model = CoTrackerOnlinePredictor(checkpoint=args.checkpoint)
else:
@@ -56,33 +52,25 @@ if __name__ == "__main__":
window_frames = []
def _process_step(window_frames, is_first_step, grid_size, grid_query_frame):
def _process_step(window_frames, is_first_step, grid_size):
video_chunk = (
torch.tensor(np.stack(window_frames[-model.step * 2 :]), device=DEFAULT_DEVICE)
.float()
.permute(0, 3, 1, 2)[None]
) # (1, T, 3, H, W)
return model(
video_chunk,
is_first_step=is_first_step,
grid_size=grid_size,
grid_query_frame=grid_query_frame,
)
return model(video_chunk, is_first_step=is_first_step, grid_size=grid_size)
# Iterating over video frames, processing one window at a time:
is_first_step = True
for i, frame in enumerate(
iio.imiter(
args.video_path,
"https://github.com/facebookresearch/co-tracker/blob/main/assets/apple.mp4",
plugin="FFMPEG",
)
):
if i % model.step == 0 and i != 0:
pred_tracks, pred_visibility = _process_step(
window_frames,
is_first_step,
grid_size=args.grid_size,
grid_query_frame=args.grid_query_frame,
window_frames, is_first_step, grid_size=args.grid_size
)
is_first_step = False
window_frames.append(frame)
@@ -91,13 +79,12 @@ if __name__ == "__main__":
window_frames[-(i % model.step) - model.step - 1 :],
is_first_step,
grid_size=args.grid_size,
grid_query_frame=args.grid_query_frame,
)
print("Tracks are computed")
# save a video with predicted tracks
seq_name = os.path.splitext(args.video_path.split("/")[-1])[0]
seq_name = args.video_path.split("/")[-1]
video = torch.tensor(np.stack(window_frames), device=DEFAULT_DEVICE).permute(0, 3, 1, 2)[None]
vis = Visualizer(save_dir="./saved_videos", pad_value=120, linewidth=3)
vis.visualize(video, pred_tracks, pred_visibility, query_frame=args.grid_query_frame, filename=seq_name)
vis.visualize(video, pred_tracks, pred_visibility, query_frame=args.grid_query_frame)
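For reference, the online predictor in these demos is driven in two phases: a first call with `is_first_step=True` that only registers the grid or query points and returns `(None, None)`, followed by calls on sliding windows of `2 * model.step` frames. A condensed, self-contained sketch of that pattern; the dummy video tensor and `grid_size=10` are illustrative, not taken from this diff:
```python
import torch
from cotracker.predictor import CoTrackerOnlinePredictor

model = CoTrackerOnlinePredictor(checkpoint="checkpoints/cotracker2.pth")
video = torch.randn(1, 48, 3, 480, 640)  # dummy (1, T, 3, H, W) clip

# First step: register the query grid; no tracks are returned yet.
model(video_chunk=video, is_first_step=True, grid_size=10)

# Stream overlapping windows of 2 * model.step frames.
for ind in range(0, video.shape[1] - model.step, model.step):
    pred_tracks, pred_visibility = model(
        video_chunk=video[:, ind : ind + model.step * 2]
    )
# online_demo.py above additionally processes the trailing frames after the loop.
```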