fixed Float8Tensor creation with deferred init, all tests passing locally
denera committed Jan 12, 2024
1 parent 2f225cf commit cb055e5
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion transformer_engine/pytorch/module/_common.py
@@ -223,7 +223,7 @@ def init_as_weight(self, param: torch.Tensor, set_tp_attributes: bool = False) -
         if FP8GlobalStateManager.with_fp8_parameters():
             self.parent.init_fp8_metadata()
             self.parent.fp8_meta["update_amax_and_scale_fwd"] = True
-            param = Float8Tensor(
+            param = Float8Tensor.to_float8(
                 param,
                 fp8_meta=self.parent.fp8_meta,
                 fp8_meta_index=self.fp8_meta_index
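
For context on the _common.py change: Float8Tensor.to_float8() casts an existing higher-precision tensor into FP8, while constructing Float8Tensor directly expects already-quantized data, so the deferred-init path needs the classmethod. Below is a minimal sketch of that cast, assuming Transformer Engine is installed and a CUDA device is available; the import path and the reliance on default scaling arguments are assumptions, not taken from this diff.

    # Hedged sketch: assumes transformer_engine is installed and that
    # Float8Tensor.to_float8() works with its default scaling arguments.
    import torch
    from transformer_engine.pytorch.float8_tensor import Float8Tensor

    weight = torch.randn(256, 256, device="cuda")

    # Cast a high-precision tensor to FP8 via the classmethod used by this commit,
    # rather than wrapping it with the Float8Tensor constructor.
    fp8_weight = Float8Tensor.to_float8(weight)
    print(type(fp8_weight))
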
4 changes: 2 additions & 2 deletions transformer_engine/pytorch/module/base.py
@@ -768,7 +768,7 @@ def reset_parameters(self, defer_init: Optional[bool] = False) -> None:
         for name, param in self.named_parameters(recurse=False):
             # Ensure parameter is on a real device
             if param.device == torch.device('meta'):
-                param.to(device='cuda')
+                param = param.to(device='cuda')
 
             if 'weight' in name:
                 # Initialize weight values on device
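
A note on the hunk above: torch.Tensor.to() is out-of-place, so the original call discarded its result and the parameter stayed on the meta device. The sketch below shows the same behavior with a CPU-only dtype conversion so it runs without a GPU.

    import torch

    t = torch.zeros(4)        # float32 tensor
    t.to(torch.float64)       # returns a converted copy; the result is discarded
    print(t.dtype)            # torch.float32 -- t itself is unchanged
    t = t.to(torch.float64)   # reassignment keeps the converted tensor
    print(t.dtype)            # torch.float64
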
@@ -778,7 +778,7 @@ def reset_parameters(self, defer_init: Optional[bool] = False) -> None:
                 param = self.param_init_meta[name].init_as_bias(param)
 
             # Redo parameter wrap in case we broke it above
-            param = torch.nn.Parameter(param)
+            setattr(self, name, torch.nn.Parameter(param))
 
     @abstractmethod
     def forward(self):
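
And for the second base.py hunk: rebinding the loop variable param only changes a local name; the parameter registered on the module is untouched, which is why the fix re-registers it with setattr(). A minimal sketch with a toy module (the module and function names are illustrative, not from Transformer Engine):

    import torch

    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.weight = torch.nn.Parameter(torch.zeros(2))

    def broken_reset(module):
        for name, param in module.named_parameters(recurse=False):
            param = torch.nn.Parameter(torch.ones_like(param))  # rebinds the local only

    def fixed_reset(module):
        for name, param in module.named_parameters(recurse=False):
            setattr(module, name, torch.nn.Parameter(torch.ones_like(param)))  # replaces the registered parameter

    m = Toy()
    broken_reset(m)
    print(m.weight.detach())  # tensor([0., 0.]) -- unchanged
    fixed_reset(m)
    print(m.weight.detach())  # tensor([1., 1.]) -- replaced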
