diff --git a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
index 750c2a32da34..3431bf706373 100644
--- a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
+++ b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
@@ -4,6 +4,7 @@ class MultiTensorApply(object):
     """
     Apply an operation to a list of tensors efficiently.
+    Move tensors to CUDA if they are on CPU.
 
     Args:
         chunk_size (int): Size of a chunk.
 
@@ -32,4 +33,10 @@ def check_avail(self):
 
     def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
         self.check_avail()
+        # Move tensors to GPU if not already on GPU
+        for i, tensor_list in enumerate(tensor_lists):
+            for j, tensor in enumerate(tensor_list):
+                if tensor.device.type == "cpu":
+                    tensor_lists[i][j] = tensor.to("cuda")
+
         return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)