diff --git a/cotracker/predictor.py b/cotracker/predictor.py
index 575095b..baded92 100644
--- a/cotracker/predictor.py
+++ b/cotracker/predictor.py
@@ -23,7 +23,7 @@ class CoTrackerPredictor(torch.nn.Module):
     @torch.no_grad()
     def forward(
         self,
-        video,  # (B, T, 3, H, W)
+        video,  # (B, T, 3, H, W): batch size, time, RGB channels, height, width
         # input prompt types:
         # - None. Dense tracks are computed in this case. You can adjust *query_frame* to compute tracks starting from a specific frame.
         # *backward_tracking=True* will compute tracks in both directions.
@@ -59,14 +59,23 @@ class CoTrackerPredictor(torch.nn.Module):
         *_, H, W = video.shape
         grid_step = W // grid_size
         grid_width = W // grid_step
-        grid_height = H // grid_step
+        grid_height = H // grid_step  # split the frame into roughly grid_size x grid_size cells
         tracks = visibilities = None
         grid_pts = torch.zeros((1, grid_width * grid_height, 3)).to(video.device)
+        # grid_pts has shape (batch_size, num_grid_points, 3); the last dim holds (t, x, y)
        grid_pts[0, :, 0] = grid_query_frame
+        # iterate over every pixel offset (ox, oy) within a grid cell
         for offset in range(grid_step * grid_step):
             print(f"step {offset} / {grid_step * grid_step}")
             ox = offset % grid_step
             oy = offset // grid_step
+            # fill in the x coordinates of the grid points; for example, with
+            # grid_width = 4, grid_height = 3, grid_step = 10, ox = 1:
+            #   torch.arange(grid_width) = [0, 1, 2, 3]
+            #   torch.arange(grid_width).repeat(grid_height) = [0,1,2,3, 0,1,2,3, 0,1,2,3]
+            #   torch.arange(grid_width).repeat(grid_height) * grid_step = [0,10,20,30, 0,10,20,30, 0,10,20,30]
+            #   ... + ox = [1,11,21,31, 1,11,21,31, 1,11,21,31]
+            # i.e. the pixel x locations of the grid points in the image
             grid_pts[0, :, 1] = torch.arange(grid_width).repeat(grid_height) * grid_step + ox
             grid_pts[0, :, 2] = (
                 torch.arange(grid_height).repeat_interleave(grid_width) * grid_step + oy
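
For reference, a minimal standalone sketch of the grid-point layout that the added comments describe. The H, W, grid_size, grid_query_frame, ox, and oy values below are made up for illustration and are not part of the patch:

    import torch

    H, W = 30, 40          # illustrative frame height and width
    grid_size = 4          # requested number of grid points along the width
    grid_query_frame = 0   # frame index each track starts from

    grid_step = W // grid_step if False else W // grid_size  # 10: pixel spacing between grid points
    grid_width = W // grid_step                              # 4:  grid points per row
    grid_height = H // grid_step                             # 3:  grid points per column

    # one query point per grid location: (t, x, y)
    grid_pts = torch.zeros((1, grid_width * grid_height, 3))
    grid_pts[0, :, 0] = grid_query_frame

    ox, oy = 1, 2  # one example offset out of the grid_step * grid_step passes
    grid_pts[0, :, 1] = torch.arange(grid_width).repeat(grid_height) * grid_step + ox
    grid_pts[0, :, 2] = torch.arange(grid_height).repeat_interleave(grid_width) * grid_step + oy

    print(grid_pts[0, :, 1])  # tensor([ 1., 11., 21., 31.,  1., 11., 21., 31.,  1., 11., 21., 31.])
    print(grid_pts[0, :, 2])  # tensor([ 2.,  2.,  2.,  2., 12., 12., 12., 12., 22., 22., 22., 22.])

Looping over all grid_step * grid_step offsets shifts this coarse grid across each cell, so over the full run every pixel of the frame is eventually used as a query point.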