From 59267e3e192cc7c0b61f34d5a4d2ad1bec89f393 Mon Sep 17 00:00:00 2001
From: Dongyan Chen <chendongyan@isrc.iscas.ac.cn>
Date: Fri, 12 Jun 2026 06:43:34 +0000
Subject: [PATCH 1/4] fix(pto): canonicalize rank2 view IR forms

---
 include/PTO/Transforms/Passes.h               |   1 +
 include/PTO/Transforms/Passes.td              |  15 +
 lib/PTO/Transforms/CMakeLists.txt             |   1 +
 lib/PTO/Transforms/PTOCanonicalizeIR.cpp      | 259 ++++++++++++++++++
 .../issue31_partition_view_parser_compat.pto  |   8 +-
 .../pto/issue783_canonicalize_rank2_views.pto |  26 ++
 .../tpush_tpop_globaltensor_frontend_a3.pto   |  12 +-
 tools/ptoas/ptoas.cpp                         |   1 +
 8 files changed, 313 insertions(+), 10 deletions(-)
 create mode 100644 lib/PTO/Transforms/PTOCanonicalizeIR.cpp
 create mode 100644 test/lit/pto/issue783_canonicalize_rank2_views.pto
diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h
index 87cc454af..e22e19e75 100644
--- a/include/PTO/Transforms/Passes.h
+++ b/include/PTO/Transforms/Passes.h
@@ -103,6 +103,7 @@ std::unique_ptr<Pass> createPTOValidateVPTOEmissionIRPass();
 std::unique_ptr<Pass> createExpandTileOpPass();
 std::unique_ptr<Pass> createExpandTileOpPass(const ExpandTileOpOptions &options);
 std::unique_ptr<Pass> createFoldTileBufIntrinsicsPass();
+std::unique_ptr<Pass> createPTOCanonicalizeIRPass();
 std::unique_ptr<Pass> createFoldTileBufIntrinsicsPass(llvm::StringRef foldMode);
 std::unique_ptr<Pass>
 createPTOInlineLibCallPass(const PTOInlineLibCallOptions &options = {});
diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td
index e92152dbf..abe31b018 100644
--- a/include/PTO/Transforms/Passes.td
+++ b/include/PTO/Transforms/Passes.td
@@ -480,6 +480,21 @@ def FoldTileBufIntrinsics : Pass<"pto-fold-tile-buf-intrinsics", "mlir::func::Fu
   ];
 }
 
+def PTOCanonicalizeIR : Pass<"pto-canonicalize-ir", "func::FuncOp"> {
+  let summary = "Canonicalize PTO IR forms before backend lowering";
+  let description = [{
+    Rewrites shorthand or legacy PTO IR forms into canonical forms before
+    backend-specific lowering. Currently this canonicalizes rank-2 tensor_view /
+    partition_tensor_view descriptors into the canonical right-aligned rank-5
+    form: [R, C] -> [1, 1, 1, R, C].
+  }];
+  let constructor = "mlir::pto::createPTOCanonicalizeIRPass()";
+  let dependentDialects = [
+    "mlir::pto::PTODialect",
+    "mlir::arith::ArithDialect"
+  ];
+}
+
 def PTOInlineLibCall : Pass<"pto-inline-libcall", "ModuleOp"> {
   let summary = "Materialize OP-Lib instance bodies and inline OP-Lib calls";
   let description = [{
diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt
index 2ebc448a7..e372c3d71 100644
--- a/lib/PTO/Transforms/CMakeLists.txt
+++ b/lib/PTO/Transforms/CMakeLists.txt
@@ -55,6 +55,7 @@ add_mlir_dialect_library(PTOTransforms
   PTORemoveRedundantBarrier.cpp
   InferPTOLayout.cpp
   PTOA5NormalizeTMovPass.cpp
+  PTOCanonicalizeIR.cpp
   PTOMaterializeTileHandles.cpp
   BufferizableOpInterfaceImpl.cpp
   ConvertToPTOOp.cpp
diff --git a/lib/PTO/Transforms/PTOCanonicalizeIR.cpp b/lib/PTO/Transforms/PTOCanonicalizeIR.cpp
new file mode 100644
index 000000000..cf5ee8283
--- /dev/null
+++ b/lib/PTO/Transforms/PTOCanonicalizeIR.cpp
@@ -0,0 +1,259 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include "PTO/IR/PTO.h"
+#include "PTO/Transforms/Passes.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+
+#include <utility>
+
+namespace mlir {
+namespace pto {
+#define GEN_PASS_DEF_PTOCANONICALIZEIR
+#include "PTO/Transforms/Passes.h.inc"
+} // namespace pto
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::pto;
+
+namespace {
+
+constexpr unsigned kLogicalRank2 = 2;
+constexpr unsigned kCanonicalRank5 = 5;
+constexpr int64_t kUnitExtent = 1;
+constexpr unsigned kRank2Rows = 0;
+constexpr unsigned kRank2Cols = 1;
+constexpr int64_t kRank2ToRank5DimOffset = 3;
+
+static SmallVector<int64_t, kCanonicalRank5>
+rightAlignRank2Shape(ArrayRef<int64_t> shape) {
+  return {kUnitExtent, kUnitExtent, kUnitExtent, shape[kRank2Rows],
+          shape[kRank2Cols]};
+}
+
+static Value getOrCreateIndexConstant(OpBuilder &builder, Location loc,
+                                      int64_t value) {
+  return builder.create<arith::ConstantIndexOp>(loc, value);
+}
+
+static SmallVector<Value, kCanonicalRank5>
+prependThreeValues(ValueRange values, Value fill) {
+  return {fill, fill, fill, values[kRank2Rows], values[kRank2Cols]};
+}
+
+static SmallVector<Value, kCanonicalRank5>
+buildCanonicalRank2Strides(MakeTensorViewOp op) {
+  Value rowStride = op.getStrides()[kRank2Rows];
+  Value colStride = op.getStrides()[kRank2Cols];
+  auto layout = op.getLayoutAttr();
+  if (layout && layout.getLayout() == Layout::DN)
+    return {colStride, colStride, colStride, rowStride, colStride};
+  return {rowStride, rowStride, rowStride, rowStride, colStride};
+}
+
+static bool isRank2ViewLike(Type type) {
+  if (auto viewType = dyn_cast<TensorViewType>(type))
+    return viewType.getRank() == kLogicalRank2;
+  if (auto viewType = dyn_cast<PartitionTensorViewType>(type))
+    return viewType.getRank() == kLogicalRank2;
+  return false;
+}
+
+static Type canonicalViewType(Type type) {
+  if (auto viewType = dyn_cast<TensorViewType>(type)) {
+    if (viewType.getRank() == kLogicalRank2)
+      return TensorViewType::get(type.getContext(),
+                                 rightAlignRank2Shape(viewType.getShape()),
+                                 viewType.getElementType());
+    return type;
+  }
+  if (auto viewType = dyn_cast<PartitionTensorViewType>(type)) {
+    if (viewType.getRank() == kLogicalRank2)
+      return PartitionTensorViewType::get(
+          type.getContext(), rightAlignRank2Shape(viewType.getShape()),
+          viewType.getElementType());
+    return type;
+  }
+  return type;
+}
+
+static bool canonicalizeValueType(Value value) {
+  Type oldType = value.getType();
+  Type newType = canonicalViewType(oldType);
+  if (newType == oldType)
+    return false;
+  value.setType(newType);
+  return true;
+}
+
+static LogicalResult rewriteMakeTensorView(MakeTensorViewOp op,
+                                           IRRewriter &rewriter) {
+  auto oldType = dyn_cast<TensorViewType>(op.getResult().getType());
+  if (!oldType || oldType.getRank() != kLogicalRank2)
+    return success();
+
+  if (op.getShape().size() != kLogicalRank2 ||
+      op.getStrides().size() != kLogicalRank2)
+    return op.emitOpError(
+        "rank-2 tensor_view must have exactly 2 shape and stride operands");
+
+  rewriter.setInsertionPoint(op);
+  Value one = getOrCreateIndexConstant(rewriter, op.getLoc(), kUnitExtent);
+  SmallVector<Value, kCanonicalRank5> newShape =
+      prependThreeValues(op.getShape(), one);
+  SmallVector<Value, kCanonicalRank5> newStrides =
+      buildCanonicalRank2Strides(op);
+  auto newType = cast<TensorViewType>(canonicalViewType(oldType));
+
+  auto newOp = rewriter.create<MakeTensorViewOp>(
+      op.getLoc(), newType, op.getPtr(), newShape, newStrides,
+      op.getLayoutAttr());
+  rewriter.replaceOp(op, newOp.getResult());
+  return success();
+}
+
+static LogicalResult rewritePartitionView(PartitionViewOp op,
+                                          IRRewriter &rewriter) {
+  auto sourceType = dyn_cast<TensorViewType>(op.getSource().getType());
+  auto resultType = dyn_cast<PartitionTensorViewType>(op.getResult().getType());
+  if (!sourceType || !resultType)
+    return success();
+
+  if (op.getOffsets().size() != kLogicalRank2 ||
+      op.getSizes().size() != kLogicalRank2)
+    return success();
+
+  if (sourceType.getRank() != kCanonicalRank5)
+    return op.emitOpError(
+        "rank-2 partition_tensor_view normalization expects canonical rank-5 "
+        "source tensor_view");
+
+  rewriter.setInsertionPoint(op);
+  Value zero = getOrCreateIndexConstant(rewriter, op.getLoc(), 0);
+  Value one = getOrCreateIndexConstant(rewriter, op.getLoc(), kUnitExtent);
+  SmallVector<Value, kCanonicalRank5> newOffsets =
+      prependThreeValues(op.getOffsets(), zero);
+  SmallVector<Value, kCanonicalRank5> newSizes =
+      prependThreeValues(op.getSizes(), one);
+  auto newType = cast<PartitionTensorViewType>(canonicalViewType(resultType));
+
+  auto newOp = rewriter.create<PartitionViewOp>(
+      op.getLoc(), newType, op.getSource(), newOffsets, newSizes);
+  rewriter.replaceOp(op, newOp.getResult());
+  return success();
+}
+
+static Value buildCanonicalDimIndex(Value dimIndex, IRRewriter &rewriter,
+                                    Location loc) {
+  rewriter.setInsertionPointAfterValue(dimIndex);
+  Value offset =
+      getOrCreateIndexConstant(rewriter, loc, kRank2ToRank5DimOffset);
+  return rewriter.create<arith::AddIOp>(loc, dimIndex, offset);
+}
+
+static void rewriteTensorViewDimOperand(Operation *op, Value dimIndex,
+                                        IRRewriter &rewriter) {
+  Value newDim = buildCanonicalDimIndex(dimIndex, rewriter, op->getLoc());
+  op->setOperand(1, newDim);
+}
+
+static void canonicalizeFunctionType(func::FuncOp func) {
+  auto oldType = func.getFunctionType();
+  SmallVector<Type> inputs;
+  SmallVector<Type> results;
+  bool changed = false;
+
+  inputs.reserve(oldType.getNumInputs());
+  for (Type type : oldType.getInputs()) {
+    Type newType = canonicalViewType(type);
+    changed |= newType != type;
+    inputs.push_back(newType);
+  }
+
+  results.reserve(oldType.getNumResults());
+  for (Type type : oldType.getResults()) {
+    Type newType = canonicalViewType(type);
+    changed |= newType != type;
+    results.push_back(newType);
+  }
+
+  if (changed)
+    func.setFunctionType(FunctionType::get(func.getContext(), inputs, results));
+}
+
+static void canonicalizeValueTypes(func::FuncOp func) {
+  canonicalizeFunctionType(func);
+
+  func->walk([](Operation *op) {
+    for (Region &region : op->getRegions()) {
+      for (Block &block : region) {
+        for (BlockArgument arg : block.getArguments())
+          canonicalizeValueType(arg);
+      }
+    }
+
+    for (OpResult result : op->getResults())
+      canonicalizeValueType(result);
+  });
+}
+
+struct PTOCanonicalizeIRPass
+    : public mlir::pto::impl::PTOCanonicalizeIRBase<PTOCanonicalizeIRPass> {
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+    SmallVector<MakeTensorViewOp> makeViews;
+    SmallVector<PartitionViewOp> partitionViews;
+    SmallVector<std::pair<Operation *, Value>> dimIndexOps;
+
+    func.walk([&](MakeTensorViewOp op) {
+      if (isRank2ViewLike(op.getResult().getType()))
+        makeViews.push_back(op);
+    });
+    func.walk([&](PartitionViewOp op) {
+      if (op.getOffsets().size() == kLogicalRank2 &&
+          op.getSizes().size() == kLogicalRank2)
+        partitionViews.push_back(op);
+    });
+    func.walk([&](GetTensorViewDimOp op) {
+      if (isRank2ViewLike(op.getTensorView().getType()))
+        dimIndexOps.emplace_back(op.getOperation(), op.getDimIndex());
+    });
+    func.walk([&](GetTensorViewStrideOp op) {
+      if (isRank2ViewLike(op.getTensorView().getType()))
+        dimIndexOps.emplace_back(op.getOperation(), op.getDimIndex());
+    });
+
+    IRRewriter rewriter(func.getContext());
+    for (MakeTensorViewOp op : makeViews) {
+      if (failed(rewriteMakeTensorView(op, rewriter))) {
+        signalPassFailure();
+        return;
+      }
+    }
+    for (auto [op, dimIndex] : dimIndexOps)
+      rewriteTensorViewDimOperand(op, dimIndex, rewriter);
+    canonicalizeValueTypes(func);
+    for (PartitionViewOp op : partitionViews) {
+      if (failed(rewritePartitionView(op, rewriter))) {
+        signalPassFailure();
+        return;
+      }
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::pto::createPTOCanonicalizeIRPass() {
+  return std::make_unique<PTOCanonicalizeIRPass>();
+}
diff --git a/test/lit/pto/issue31_partition_view_parser_compat.pto b/test/lit/pto/issue31_partition_view_parser_compat.pto
index f6f5bfef8..e1eef5fd9 100755
--- a/test/lit/pto/issue31_partition_view_parser_compat.pto
+++ b/test/lit/pto/issue31_partition_view_parser_compat.pto
@@ -46,9 +46,9 @@ module {
 }
 
 // CHECK-LABEL: func.func @new_format_static
-// CHECK: %[[SV0:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view<?x?xf32>{{$}}
-// CHECK: pto.tload ins(%[[SV0]] : !pto.partition_tensor_view<16x32xf32>)
+// CHECK: %[[SV0:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}}
+// CHECK: pto.tload ins(%[[SV0]] : !pto.partition_tensor_view<1x1x1x16x32xf32>)
 // CHECK-LABEL: func.func @old_format_static
-// CHECK: %[[SV1:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view<?x?xf32>{{$}}
+// CHECK: %[[SV1:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}}
 // CHECK-LABEL: func.func @old_format_dynamic
-// CHECK: %[[SV2:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view<?x?xf32>{{$}}
+// CHECK: %[[SV2:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}}
diff --git a/test/lit/pto/issue783_canonicalize_rank2_views.pto b/test/lit/pto/issue783_canonicalize_rank2_views.pto
new file mode 100644
index 000000000..8caecf343
--- /dev/null
+++ b/test/lit/pto/issue783_canonicalize_rank2_views.pto
@@ -0,0 +1,26 @@
+// RUN: ptoas --pto-arch=a5 --emit-pto-ir --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @canonicalize_rank2_views(%src: !pto.ptr<bf16, gm>, %dst: !pto.ptr<bf16, gm>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c512 = arith.constant 512 : index
+    %c8192 = arith.constant 8192 : index
+
+    %src_view = pto.make_tensor_view %src, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xbf16>
+    %dst_view = pto.make_tensor_view %dst, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xbf16>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c512], sizes = [%c16, %c512] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x512xbf16>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c512], sizes = [%c16, %c512] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x512xbf16>
+    %tile = pto.declare_tile -> !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=512, v_row=16, v_col=512, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tload ins(%src_part : !pto.partition_tensor_view<16x512xbf16>) outs(%tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=512, v_row=16, v_col=512, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.section.vector {
+    }
+    return
+  }
+}
+
+// CHECK: pto.make_tensor_view {{.*}} shape = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}], strides = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] {{.*}} : !pto.tensor_view<1x1x1x?x?xbf16>
+// CHECK: pto.partition_view {{.*}} offsets = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}], sizes = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] : !pto.tensor_view<1x1x1x?x?xbf16>
+// CHECK: !pto.partition_tensor_view<1x1x1x16x512xbf16>
+// CHECK-NOT: !pto.partition_tensor_view<16x512xbf16>
diff --git a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto
index 9ad19ec8f..68cbea217 100644
--- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto
+++ b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto
@@ -66,17 +66,17 @@ module {
 // CHECK-LABEL: AICORE void cube_kernel
 // CHECK-SAME: (__gm__ float* [[CUBE_GM:v[0-9]+]],
 // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>([[CUBE_GM]], {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TALLOC<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]);
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TALLOC<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]);
 // CHECK: TSTORE
-// CHECK: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]);
+// CHECK: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]);
 // CHECK-LABEL: AICORE void vector_kernel
 // CHECK-SAME: (__gm__ float* [[VEC_GM:v[0-9]+]],
 // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>([[VEC_GM]], {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TPOP<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TPOP<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
 // CHECK: TLOAD
-// CHECK: TFREE<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: TFREE<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
 
 // RESOLVE-LABEL: func.func @cube_kernel
 // RESOLVE-NOT: pto.reserve_buffer
diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp
index 0d20114af..2173e592f 100644
--- a/tools/ptoas/ptoas.cpp
+++ b/tools/ptoas/ptoas.cpp
@@ -1737,6 +1737,7 @@ int mlir::pto::compilePTOASModule(
   if (failed(applyPassManagerCLOptions(pm)))
     return 1;
 
+  pm.addNestedPass<mlir::func::FuncOp>(pto::createPTOCanonicalizeIRPass());
   pm.addNestedPass<mlir::func::FuncOp>(
       pto::createPTOAssignDefaultFrontendPipeIdPass());
   pm.addNestedPass<mlir::func::FuncOp>(

From 6fececbe3bedd33bc40e80640ab9a3844d0f9b95 Mon Sep 17 00:00:00 2001
From: Dongyan Chen <chendongyan@isrc.iscas.ac.cn>
Date: Mon, 15 Jun 2026 06:45:48 +0000
Subject: [PATCH 2/4] test: update lit expectations for canonical rank-5 views

---
 include/PTO/Transforms/Passes.h               |  2 +-
 ...tload_tprefetch_low_precision_a5_valid.pto |  6 ++---
 .../tpush_tpop_globaltensor_frontend_a3.pto   | 12 +++++-----
 .../tpush_tpop_globaltensor_frontend_a5.pto   | 24 +++++++++----------
 .../lit/pto/tstore_low_precision_a5_valid.pto |  6 ++---
 5 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h
index e22e19e75..59ad36c93 100644
--- a/include/PTO/Transforms/Passes.h
+++ b/include/PTO/Transforms/Passes.h
@@ -103,8 +103,8 @@ std::unique_ptr<Pass> createPTOValidateVPTOEmissionIRPass();
 std::unique_ptr<Pass> createExpandTileOpPass();
 std::unique_ptr<Pass> createExpandTileOpPass(const ExpandTileOpOptions &options);
 std::unique_ptr<Pass> createFoldTileBufIntrinsicsPass();
-std::unique_ptr<Pass> createPTOCanonicalizeIRPass();
 std::unique_ptr<Pass> createFoldTileBufIntrinsicsPass(llvm::StringRef foldMode);
+std::unique_ptr<Pass> createPTOCanonicalizeIRPass();
 std::unique_ptr<Pass>
 createPTOInlineLibCallPass(const PTOInlineLibCallOptions &options = {});
 void registerPTOViewToMemrefPass();
diff --git a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto
index affb44872..eb3f2c4e1 100644
--- a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto
+++ b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto
@@ -23,7 +23,7 @@ module {
   }
 }
 
-// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: memref<16x16x!pto.hif8>)
+// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<1x1x1x16x16xf8E4M3FN>, %arg1: memref<1x1x1x16x16x!pto.hif8>)
 // CHECK: pto.declare_tile_memref -> memref<16x16x!pto.hif8
-// CHECK: pto.tload ins(%arg0 : memref<16x16xf8E4M3FN>) outs(
-// CHECK: pto.tprefetch ins(%arg1 : memref<16x16x!pto.hif8>) outs(
+// CHECK: pto.tload ins(%arg0 : memref<1x1x1x16x16xf8E4M3FN>) outs(
+// CHECK: pto.tprefetch ins(%arg1 : memref<1x1x1x16x16x!pto.hif8>) outs(
diff --git a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto
index 68cbea217..2d18717dd 100644
--- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto
+++ b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto
@@ -82,16 +82,16 @@ module {
 // RESOLVE-NOT: pto.reserve_buffer
 // RESOLVE-NOT: pto.import_reserved_buffer
 // RESOLVE: pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 1024, slot_num = 8, flag_base = 0, nosplit = true}
-// RESOLVE: %{{.*}} = pto.declare_global {__pto.globaltensor_strides = array<i64: 16, 1>} -> !pto.tensor_view<16x16xf32>
-// RESOLVE: pto.talloc(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0}
-// RESOLVE: pto.tpush(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0}
+// RESOLVE: %{{.*}} = pto.declare_global {__pto.globaltensor_strides = array<i64: 16, 16, 16, 16, 1>} -> !pto.tensor_view<1x1x1x16x16xf32>
+// RESOLVE: pto.talloc(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0}
+// RESOLVE: pto.tpush(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0}
 // RESOLVE-LABEL: func.func @vector_kernel
 // RESOLVE-NOT: pto.reserve_buffer
 // RESOLVE-NOT: pto.import_reserved_buffer
-// RESOLVE: pto.tpop(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0}
-// RESOLVE: pto.tfree(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0}
+// RESOLVE: pto.tpop(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0}
+// RESOLVE: pto.tfree(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0}
 
 // GSS-LABEL: AICORE void cube_kernel
 // GSS: TALLOC<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>
 // GSS: TSTORE
-// GSS: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>
+// GSS: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>
\ No newline at end of file
diff --git a/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto b/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto
index 8562c1207..bcbdb0b0c 100644
--- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto
+++ b/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto
@@ -83,24 +83,24 @@ module {
 
 // CHECK-LABEL: AICORE void cube_c2v_kernel(__gm__ float*
 // CHECK: TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_C2V_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TALLOC<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]);
-// CHECK: TPUSH<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]);
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_C2V_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TALLOC<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]);
+// CHECK: TPUSH<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]);
 
 // CHECK-LABEL: AICORE void vector_c2v_kernel(__gm__ float*
 // CHECK: TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_C2V_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TPOP<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
-// CHECK: TFREE<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_C2V_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TPOP<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: TFREE<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
 
 // CHECK-LABEL: AICORE void vector_v2c_kernel(__gm__ float*
 // CHECK: TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_V2C_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TALLOC<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]);
-// CHECK: TPUSH<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]);
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_V2C_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TALLOC<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]);
+// CHECK: TPUSH<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]);
 
 // CHECK-LABEL: AICORE void cube_v2c_kernel(__gm__ float*
 // CHECK: TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_V2C_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TPOP<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
-// CHECK: TFREE<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_V2C_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TPOP<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: TFREE<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
diff --git a/test/lit/pto/tstore_low_precision_a5_valid.pto b/test/lit/pto/tstore_low_precision_a5_valid.pto
index ab0783062..3c46a3ba4 100644
--- a/test/lit/pto/tstore_low_precision_a5_valid.pto
+++ b/test/lit/pto/tstore_low_precision_a5_valid.pto
@@ -24,8 +24,8 @@ module {
   }
 }
 
-// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: memref<16x16x!pto.hif8>, %arg2: i64)
+// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<1x1x1x16x16xf8E4M3FN>, %arg1: memref<1x1x1x16x16x!pto.hif8>, %arg2: i64)
 // CHECK: pto.tstore ins(
-// CHECK: outs(%arg0 : memref<16x16xf8E4M3FN>)
+// CHECK: outs(%arg0 : memref<1x1x1x16x16xf8E4M3FN>)
 // CHECK: pto.tstore ins(
-// CHECK: outs(%arg1 : memref<16x16x!pto.hif8>)
+// CHECK: outs(%arg1 : memref<1x1x1x16x16x!pto.hif8>)

From 0a1ed40806a46466af6e6b47a98c58d3693f2f76 Mon Sep 17 00:00:00 2001
From: Dongyan Chen <chendongyan@isrc.iscas.ac.cn>
Date: Wed, 17 Jun 2026 07:34:43 +0000
Subject: [PATCH 3/4] fix stride rule + DN misidentification in canonicalize
 pass, gate on VPTO

---
 include/PTO/Transforms/Passes.td              |  19 ++-
 lib/PTO/IR/PTO.cpp                            |  10 ++
 lib/PTO/Transforms/PTOCanonicalizeIR.cpp      | 156 ++++++++++++++++--
 .../issue31_partition_view_parser_compat.pto  |   8 +-
 .../issue783_canonicalize_rank2_dn_views.pto  |  34 ++++
 ... issue783_canonicalize_rank2_nd_views.pto} |   2 +-
 .../pto/issue783_canonicalize_rank5_noop.pto  |  29 ++++
 ...tload_tprefetch_low_precision_a5_valid.pto |   6 +-
 .../tpush_tpop_globaltensor_frontend_a3.pto   |  24 +--
 .../tpush_tpop_globaltensor_frontend_a5.pto   |  24 +--
 .../lit/pto/tstore_low_precision_a5_valid.pto |   6 +-
 tools/ptoas/ptoas.cpp                         |   9 +-
 12 files changed, 275 insertions(+), 52 deletions(-)
 create mode 100644 test/lit/pto/issue783_canonicalize_rank2_dn_views.pto
 rename test/lit/pto/{issue783_canonicalize_rank2_views.pto => issue783_canonicalize_rank2_nd_views.pto} (94%)
 create mode 100644 test/lit/pto/issue783_canonicalize_rank5_noop.pto

diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td
index abe31b018..a897034d1 100644
--- a/include/PTO/Transforms/Passes.td
+++ b/include/PTO/Transforms/Passes.td
@@ -481,12 +481,21 @@ def FoldTileBufIntrinsics : Pass<"pto-fold-tile-buf-intrinsics", "mlir::func::Fu
 }
 
 def PTOCanonicalizeIR : Pass<"pto-canonicalize-ir", "func::FuncOp"> {
-  let summary = "Canonicalize PTO IR forms before backend lowering";
+  let summary = "Canonicalize rank-2 view descriptors into rank-5 form (VPTO-only)";
   let description = [{
-    Rewrites shorthand or legacy PTO IR forms into canonical forms before
-    backend-specific lowering. Currently this canonicalizes rank-2 tensor_view /
-    partition_tensor_view descriptors into the canonical right-aligned rank-5
-    form: [R, C] -> [1, 1, 1, R, C].
+    Rewrites rank-2 tensor_view / partition_tensor_view descriptors into the
+    canonical right-aligned rank-5 form [1, 1, 1, R, C], matching the 5D
+    descriptor layout expected by VPTO lowering. Stride expansion uses the
+    same cumulative-product rule as rightAlignTo5D and
+    buildGlobalTensorShapeAndStride: stride[i] = shape[i+1] * stride[i+1].
+
+    Currently gated on --pto-backend=vpto to limit blast radius. A3/A5
+    EmitC codegen already pads strides to rank-5 via InferPTOLayout and
+    buildGlobalTensorShapeAndStride, so it does not need this pass at the
+    IR level. The gate can be lifted once the pass is proven stable.
+
+    A post-canonicalization verification detects any surviving rank-2 view
+    types to prevent silent failures when new view-consuming ops are added.
   }];
   let constructor = "mlir::pto::createPTOCanonicalizeIRPass()";
   let dependentDialects = [
diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp
index 3b14a3034..787a3b125 100644
--- a/lib/PTO/IR/PTO.cpp
+++ b/lib/PTO/IR/PTO.cpp
@@ -1734,6 +1734,16 @@ static std::optional<pto::Layout> getLogicalViewLayout(Value value) {
   if (auto part = value.getDefiningOp<pto::PartitionViewOp>())
     return getLogicalViewLayout(part.getSource());
   if (auto make = value.getDefiningOp<pto::MakeTensorViewOp>()) {
+    // Prefer the explicit layout attribute when available.  After rank-2 →
+    // rank-5 canonicalization, the padded leading strides satisfy the ND
+    // (row-major) recurrence even for DN (col-major) data, so inferLayout
+    // alone would misclassify DN as ND (the col-major recurrence breaks at
+    // the boundary between padded unit-extent dims and real dims).  The
+    // layout attribute carries the *intended* memory layout and is the
+    // authoritative source — inferLayout is only a fallback for views that
+    // lack an explicit layout.
+    if (auto layoutAttr = make.getLayoutAttr())
+      return layoutAttr.getLayout();
     auto tvTy = dyn_cast<pto::TensorViewType>(make.getResult().getType());
     if (!tvTy)
       return std::nullopt;
diff --git a/lib/PTO/Transforms/PTOCanonicalizeIR.cpp b/lib/PTO/Transforms/PTOCanonicalizeIR.cpp
index cf5ee8283..c4495d0d1 100644
--- a/lib/PTO/Transforms/PTOCanonicalizeIR.cpp
+++ b/lib/PTO/Transforms/PTOCanonicalizeIR.cpp
@@ -28,17 +28,49 @@ using namespace mlir::pto;
 
 namespace {
 
+// ---------------------------------------------------------------------------
+// Design note: which ops need structural rewriting vs. type-only walk
+// ---------------------------------------------------------------------------
+//
+// This pass canonicalizes rank-2 TensorViewType / PartitionTensorViewType
+// into the right-aligned rank-5 form [1, 1, 1, R, C] used by all backends
+// (A3, A5, VPTO EmitC codegen and the 5D memref rank in PTOViewToMemref).
+//
+// Ops that carry **rank-dependent operands** must be structurally rewritten
+// (their operand count or operand values change when rank changes):
+//   - MakeTensorViewOp  : shape/strides expanded from 2 → 5
+//   - PartitionViewOp   : offsets/sizes expanded from 2 → 5
+//   - GetTensorViewDimOp / GetTensorViewStrideOp : dim index offset by +3
+//
+// Ops that only **carry view-typed operands/results** (no rank-dependent
+// operand structure) are handled by the type walk (canonicalizeValueTypes)
+// which in-place mutates TensorViewType and PartitionTensorViewType from
+// rank-2 to rank-5:
+//   - TAllocToAivOp, TAllocToAicOp, DeclareGlobalOp (producers)
+//   - TAllocOp, TPushOp, TPopOp, TFreeOp, AicInitializePipeOp,
+//     AivInitializePipeOp, TensorViewAddrOp (consumers)
+//   - All PTODpsType consumers (TLoadOp, TStoreOp, TMatmulOp, etc.)
+//   - All PTOPipeEntryType consumers (TPushToAivOp, TPopFromAicOp, etc.)
+//
+// A post-canonicalization verification (verifyNoRank2ViewSurvivors) detects
+// any surviving rank-2 view types to prevent silent failures when new
+// view-consuming ops with rank-dependent operands are added.
+//
+// NZ layout cannot appear on rank-2 views (it requires rank >= 5 with
+// shape[2] == 16), so only ND and DN strides need expansion logic.
+// ---------------------------------------------------------------------------
+
 constexpr unsigned kLogicalRank2 = 2;
 constexpr unsigned kCanonicalRank5 = 5;
 constexpr int64_t kUnitExtent = 1;
-constexpr unsigned kRank2Rows = 0;
-constexpr unsigned kRank2Cols = 1;
+constexpr unsigned kRank2RowDim = 0; // row dimension index in rank-2 view
+constexpr unsigned kRank2ColDim = 1; // column dimension index in rank-2 view
 constexpr int64_t kRank2ToRank5DimOffset = 3;
 
 static SmallVector<int64_t, kCanonicalRank5>
 rightAlignRank2Shape(ArrayRef<int64_t> shape) {
-  return {kUnitExtent, kUnitExtent, kUnitExtent, shape[kRank2Rows],
-          shape[kRank2Cols]};
+  return {kUnitExtent, kUnitExtent, kUnitExtent, shape[kRank2RowDim],
+          shape[kRank2ColDim]};
 }
 
 static Value getOrCreateIndexConstant(OpBuilder &builder, Location loc,
@@ -48,17 +80,81 @@ static Value getOrCreateIndexConstant(OpBuilder &builder, Location loc,
 
 static SmallVector<Value, kCanonicalRank5>
 prependThreeValues(ValueRange values, Value fill) {
-  return {fill, fill, fill, values[kRank2Rows], values[kRank2Cols]};
+  return {fill, fill, fill, values[kRank2RowDim], values[kRank2ColDim]};
 }
 
+// ---------------------------------------------------------------------------
+// Stride expansion: uses the same cumulative-product rule as
+// rightAlignTo5D (InferPTOLayout.cpp) and buildGlobalTensorShapeAndStride
+// (PTOToEmitC.cpp): stride[i] = shape[i+1] * stride[i+1].
+//
+// For a rank-2 view [R, C] right-aligned into [1, 1, 1, R, C]:
+//   - ND (row-major): original strides = [C, 1]
+//     padded strides: stride[2] = shape[3]*stride[3] = R*C,
+//                    stride[1] = shape[2]*stride[2] = 1*R*C = R*C,
+//                    stride[0] = shape[1]*stride[1] = 1*R*C = R*C
+//     → [R*C, R*C, R*C, C, 1]
+//
+//   - DN (col-major): original strides = [1, R]
+//     padded strides: stride[2] = shape[3]*stride[3] = R*1 = R,
+//                    stride[1] = shape[2]*stride[2] = 1*R = R,
+//                    stride[0] = shape[1]*stride[1] = 1*R = R
+//     → [R, R, R, 1, R]
+//
+// Note: the ND branch was previously incorrectly using rowStride (=C) for
+// all three leading dims, producing [C, C, C, C, 1] instead of the correct
+// cumulative product [R*C, R*C, R*C, C, 1]. The DN branch was correct by
+// coincidence because colStride == R and the cumulative product of unit-extent
+// leading dims also collapses to R.
+// ---------------------------------------------------------------------------
 static SmallVector<Value, kCanonicalRank5>
-buildCanonicalRank2Strides(MakeTensorViewOp op) {
-  Value rowStride = op.getStrides()[kRank2Rows];
-  Value colStride = op.getStrides()[kRank2Cols];
+buildCanonicalRank2Strides(MakeTensorViewOp op, IRRewriter &rewriter) {
+  Value rowStride = op.getStrides()[kRank2RowDim];
+  Value colStride = op.getStrides()[kRank2ColDim];
+
+  rewriter.setInsertionPoint(op);
+  auto loc = op.getLoc();
+
   auto layout = op.getLayoutAttr();
-  if (layout && layout.getLayout() == Layout::DN)
+
+  // Leading-stride derivation (rule: stride[i] = shape[i+1] * stride[i+1]).
+  //
+  // The rank-2 operands are SSA values, not constants, so any product needed
+  // for the padded strides has to be emitted as IR. With the view
+  // right-aligned into [1, 1, 1, R, C], rank-5 dims 3 and 4 carry the
+  // original row and column entries:
+  //   shape[3] = R, stride[3] = rowStride, stride[4] = colStride.
+  //
+  // Applying the rule to the three padded dims:
+  //   stride[2] = shape[3] * stride[3] = R * rowStride
+  //   stride[1] = shape[2] * stride[2] = 1 * (R * rowStride) = R * rowStride
+  //   stride[0] = shape[1] * stride[1] = 1 * (R * rowStride) = R * rowStride
+  //
+  // The unit-extent padding only collapses stride[0] and stride[1] onto
+  // stride[2]. It does NOT collapse stride[2] onto stride[3], because
+  // shape[3] = R is a real extent. The leading strides are therefore
+  // R * rowStride, not rowStride.
+  //
+  // The DN branch below reuses colStride for this value (see its comment);
+  // the ND/default path materializes shape[kRank2RowDim] * rowStride.
+  if (layout && layout.getLayout() == Layout::DN) {
+    // DN (col-major): strides = [1, R]
+    // Cumulative product: stride[2] = shape[3]*stride[3] = R*1 = R,
+    //                     stride[1] = 1*R = R, stride[0] = 1*R = R.
+    // Since colStride = R for DN, this collapses to colStride for all
+    // three leading dims. This is the same as the old DN branch.
     return {colStride, colStride, colStride, rowStride, colStride};
-  return {rowStride, rowStride, rowStride, rowStride, colStride};
+  }
+
+  // ND (row-major) or no explicit layout attr (default = ND):
+  // strides = [rowStride, colStride] where rowStride = C, colStride = 1.
+  // Cumulative product: stride[2] = shape[kRank2RowDim] * stride[3],
+  //                     stride[1] = 1 * stride[2],
+  //                     stride[0] = 1 * stride[2].
+  // = shape[kRank2RowDim] * rowStride for all three leading dims.
+  Value rowsValue = op.getShape()[kRank2RowDim];
+  Value leadingStride = rewriter.create<arith::MulIOp>(loc, rowsValue, rowStride);
+  return {leadingStride, leadingStride, leadingStride, rowStride, colStride};
 }
 
 static bool isRank2ViewLike(Type type) {
@@ -112,7 +208,7 @@ static LogicalResult rewriteMakeTensorView(MakeTensorViewOp op,
   SmallVector<Value, kCanonicalRank5> newShape =
       prependThreeValues(op.getShape(), one);
   SmallVector<Value, kCanonicalRank5> newStrides =
-      buildCanonicalRank2Strides(op);
+      buildCanonicalRank2Strides(op, rewriter);
   auto newType = cast<TensorViewType>(canonicalViewType(oldType));
 
   auto newOp = rewriter.create<MakeTensorViewOp>(
@@ -207,6 +303,36 @@ static void canonicalizeValueTypes(func::FuncOp func) {
   });
 }
 
+/// Verify that no rank-2 view types survived canonicalization.
+/// This catches cases where a new op with rank-dependent operands
+/// was added but not given a structural rewrite in this pass.
+static LogicalResult verifyNoRank2ViewSurvivors(func::FuncOp func) {
+  bool anyFailed = false;
+  func.walk([&](Operation *op) {
+    for (Region &region : op->getRegions()) {
+      for (Block &block : region) {
+        for (BlockArgument arg : block.getArguments()) {
+          if (isRank2ViewLike(arg.getType())) {
+            emitError(arg.getLoc())
+                << "rank-2 view type survived canonicalization: "
+                << arg.getType() << " as block argument";
+            anyFailed = true;
+          }
+        }
+      }
+    }
+    for (OpResult result : op->getResults()) {
+      if (isRank2ViewLike(result.getType())) {
+        emitError(op->getLoc())
+            << "rank-2 view type survived canonicalization: "
+            << result.getType() << " in op " << op->getName();
+        anyFailed = true;
+      }
+    }
+  });
+  return anyFailed ? failure() : success();
+}
+
 struct PTOCanonicalizeIRPass
     : public mlir::pto::impl::PTOCanonicalizeIRBase<PTOCanonicalizeIRPass> {
   void runOnOperation() override {
@@ -249,6 +375,14 @@ struct PTOCanonicalizeIRPass
         return;
       }
     }
+
+    // Post-canonicalization verification: ensure no rank-2 view types
+    // survived. If any do, it means an op with rank-dependent operands
+    // was not given a structural rewrite.
+    if (failed(verifyNoRank2ViewSurvivors(func))) {
+      signalPassFailure();
+      return;
+    }
   }
 };
 
diff --git a/test/lit/pto/issue31_partition_view_parser_compat.pto b/test/lit/pto/issue31_partition_view_parser_compat.pto
index e1eef5fd9..f6f5bfef8 100755
--- a/test/lit/pto/issue31_partition_view_parser_compat.pto
+++ b/test/lit/pto/issue31_partition_view_parser_compat.pto
@@ -46,9 +46,9 @@ module {
 }
 
 // CHECK-LABEL: func.func @new_format_static
-// CHECK: %[[SV0:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}}
-// CHECK: pto.tload ins(%[[SV0]] : !pto.partition_tensor_view<1x1x1x16x32xf32>)
+// CHECK: %[[SV0:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view<?x?xf32>{{$}}
+// CHECK: pto.tload ins(%[[SV0]] : !pto.partition_tensor_view<16x32xf32>)
 // CHECK-LABEL: func.func @old_format_static
-// CHECK: %[[SV1:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}}
+// CHECK: %[[SV1:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view<?x?xf32>{{$}}
 // CHECK-LABEL: func.func @old_format_dynamic
-// CHECK: %[[SV2:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}}
+// CHECK: %[[SV2:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view<?x?xf32>{{$}}
diff --git a/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto b/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto
new file mode 100644
index 000000000..7a53ce866
--- /dev/null
+++ b/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto
@@ -0,0 +1,34 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Tests that rank-2 DN (col-major) tensor_view canonicalization produces
+// correct cumulative-product strides and that the layout attribute is
+// preserved, preventing misidentification as ND after rank-5 padding.
+
+// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR
+
+module {
+  func.func @dn_layout_kernel(%src: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+
+    // DN layout: strides = [1, 16] — col-major (stride[0]=1, stride[1]=rows)
+    %tv = pto.make_tensor_view %src, shape = [%c16, %c32], strides = [%c1, %c16] {layout = #pto.layout<dn>} : !pto.tensor_view<?x?xf32>
+    %sv = pto.partition_view %tv, offsets = [%c0, %c0], sizes = [%c16, %c32] : !pto.tensor_view<?x?xf32>
+
+    %tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=32, v_row=16, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tload ins(%sv : !pto.partition_tensor_view<16x32xf32>) outs(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=32, v_row=16, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    return
+  }
+}
+
+// IR: pto.make_tensor_view {{.*}} strides = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] {{.*}} {layout = #pto.layout<dn>} : !pto.tensor_view<1x1x1x?x?xf32>
+// IR: pto.partition_view {{.*}} offsets = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}], sizes = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] : !pto.tensor_view<1x1x1x?x?xf32>
+// IR: !pto.partition_tensor_view<1x1x1x16x32xf32>
diff --git a/test/lit/pto/issue783_canonicalize_rank2_views.pto b/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto
similarity index 94%
rename from test/lit/pto/issue783_canonicalize_rank2_views.pto
rename to test/lit/pto/issue783_canonicalize_rank2_nd_views.pto
index 8caecf343..b2c01fcbc 100644
--- a/test/lit/pto/issue783_canonicalize_rank2_views.pto
+++ b/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto
@@ -1,4 +1,4 @@
-// RUN: ptoas --pto-arch=a5 --emit-pto-ir --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s
+// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s
 
 module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
   func.func @canonicalize_rank2_views(%src: !pto.ptr<bf16, gm>, %dst: !pto.ptr<bf16, gm>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
diff --git a/test/lit/pto/issue783_canonicalize_rank5_noop.pto b/test/lit/pto/issue783_canonicalize_rank5_noop.pto
new file mode 100644
index 000000000..bfbce5481
--- /dev/null
+++ b/test/lit/pto/issue783_canonicalize_rank5_noop.pto
@@ -0,0 +1,29 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file that in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Tests that the canonicalization pass is a no-op on already-canonical rank-5 views.
+
+// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @noop_rank5(%src: !pto.ptr<bf16, gm>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c256 = arith.constant 256 : index
+    %c8192 = arith.constant 8192 : index
+
+    // Already rank-5 canonical form — pass should not modify
+    %src_view = pto.make_tensor_view %src, shape = [%c1, %c1, %c1, %c16, %c8192], strides = [%c8192, %c8192, %c8192, %c8192, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<1x1x1x16x8192xbf16>
+
+    // CHECK: pto.make_tensor_view {{.*}} shape = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], strides = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] {layout = #pto.layout<nd>} : !pto.tensor_view<1x1x1x16x8192xbf16>
+    // CHECK-NOT: arith.muli
+    // CHECK-NOT: arith.addi
+
+    return
+  }
+}
diff --git a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto
index eb3f2c4e1..affb44872 100644
--- a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto
+++ b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto
@@ -23,7 +23,7 @@ module {
   }
 }
 
-// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<1x1x1x16x16xf8E4M3FN>, %arg1: memref<1x1x1x16x16x!pto.hif8>)
+// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: memref<16x16x!pto.hif8>)
 // CHECK: pto.declare_tile_memref -> memref<16x16x!pto.hif8
-// CHECK: pto.tload ins(%arg0 : memref<1x1x1x16x16xf8E4M3FN>) outs(
-// CHECK: pto.tprefetch ins(%arg1 : memref<1x1x1x16x16x!pto.hif8>) outs(
+// CHECK: pto.tload ins(%arg0 : memref<16x16xf8E4M3FN>) outs(
+// CHECK: pto.tprefetch ins(%arg1 : memref<16x16x!pto.hif8>) outs(
diff --git a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto
index 2d18717dd..9ad19ec8f 100644
--- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto
+++ b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto
@@ -66,32 +66,32 @@ module {
 // CHECK-LABEL: AICORE void cube_kernel
 // CHECK-SAME: (__gm__ float* [[CUBE_GM:v[0-9]+]],
 // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>([[CUBE_GM]], {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TALLOC<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]);
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TALLOC<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]);
 // CHECK: TSTORE
-// CHECK: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]);
+// CHECK: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]);
 // CHECK-LABEL: AICORE void vector_kernel
 // CHECK-SAME: (__gm__ float* [[VEC_GM:v[0-9]+]],
 // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>([[VEC_GM]], {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TPOP<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TPOP<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
 // CHECK: TLOAD
-// CHECK: TFREE<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: TFREE<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
 
 // RESOLVE-LABEL: func.func @cube_kernel
 // RESOLVE-NOT: pto.reserve_buffer
 // RESOLVE-NOT: pto.import_reserved_buffer
 // RESOLVE: pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 1024, slot_num = 8, flag_base = 0, nosplit = true}
-// RESOLVE: %{{.*}} = pto.declare_global {__pto.globaltensor_strides = array<i64: 16, 16, 16, 16, 1>} -> !pto.tensor_view<1x1x1x16x16xf32>
-// RESOLVE: pto.talloc(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0}
-// RESOLVE: pto.tpush(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0}
+// RESOLVE: %{{.*}} = pto.declare_global {__pto.globaltensor_strides = array<i64: 16, 1>} -> !pto.tensor_view<16x16xf32>
+// RESOLVE: pto.talloc(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0}
+// RESOLVE: pto.tpush(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0}
 // RESOLVE-LABEL: func.func @vector_kernel
 // RESOLVE-NOT: pto.reserve_buffer
 // RESOLVE-NOT: pto.import_reserved_buffer
-// RESOLVE: pto.tpop(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0}
-// RESOLVE: pto.tfree(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0}
+// RESOLVE: pto.tpop(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0}
+// RESOLVE: pto.tfree(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0}
 
 // GSS-LABEL: AICORE void cube_kernel
 // GSS: TALLOC<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>
 // GSS: TSTORE
-// GSS: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>
\ No newline at end of file
+// GSS: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>
diff --git a/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto b/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto
index bcbdb0b0c..8562c1207 100644
--- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto
+++ b/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto
@@ -83,24 +83,24 @@ module {
 
 // CHECK-LABEL: AICORE void cube_c2v_kernel(__gm__ float*
 // CHECK: TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_C2V_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TALLOC<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]);
-// CHECK: TPUSH<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]);
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_C2V_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TALLOC<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]);
+// CHECK: TPUSH<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]);
 
 // CHECK-LABEL: AICORE void vector_c2v_kernel(__gm__ float*
 // CHECK: TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_C2V_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TPOP<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
-// CHECK: TFREE<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_C2V_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TPOP<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: TFREE<TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
 
 // CHECK-LABEL: AICORE void vector_v2c_kernel(__gm__ float*
 // CHECK: TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_V2C_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TALLOC<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]);
-// CHECK: TPUSH<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]);
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_V2C_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TALLOC<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]);
+// CHECK: TPUSH<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]);
 
 // CHECK-LABEL: AICORE void cube_v2c_kernel(__gm__ float*
 // CHECK: TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}});
-// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_V2C_ENTRY:v[0-9]+]](nullptr);
-// CHECK: TPOP<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
-// CHECK: TFREE<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_V2C_ENTRY:v[0-9]+]](nullptr);
+// CHECK: TPOP<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
+// CHECK: TFREE<TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>, GlobalTensor<float, pto::Shape<1, 1, 1, 16, 16>, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>(
diff --git a/test/lit/pto/tstore_low_precision_a5_valid.pto b/test/lit/pto/tstore_low_precision_a5_valid.pto
index 3c46a3ba4..ab0783062 100644
--- a/test/lit/pto/tstore_low_precision_a5_valid.pto
+++ b/test/lit/pto/tstore_low_precision_a5_valid.pto
@@ -24,8 +24,8 @@ module {
   }
 }
 
-// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<1x1x1x16x16xf8E4M3FN>, %arg1: memref<1x1x1x16x16x!pto.hif8>, %arg2: i64)
+// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: memref<16x16x!pto.hif8>, %arg2: i64)
 // CHECK: pto.tstore ins(
-// CHECK: outs(%arg0 : memref<1x1x1x16x16xf8E4M3FN>)
+// CHECK: outs(%arg0 : memref<16x16xf8E4M3FN>)
 // CHECK: pto.tstore ins(
-// CHECK: outs(%arg1 : memref<1x1x1x16x16x!pto.hif8>)
+// CHECK: outs(%arg1 : memref<16x16x!pto.hif8>)
diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp
index 2173e592f..5bb682167 100644
--- a/tools/ptoas/ptoas.cpp
+++ b/tools/ptoas/ptoas.cpp
@@ -1737,7 +1737,14 @@ int mlir::pto::compilePTOASModule(
   if (failed(applyPassManagerCLOptions(pm)))
     return 1;
 
-  pm.addNestedPass<mlir::func::FuncOp>(pto::createPTOCanonicalizeIRPass());
+  // Rank-2 → rank-5 view canonicalization is currently gated on the VPTO
+  // backend to limit blast radius.  A3/A5 EmitC codegen already pads strides
+  // to rank-5 via InferPTOLayout and buildGlobalTensorShapeAndStride, so it
+  // does not need the canonicalization pass at the IR level.  When VPTO
+  // validation is complete and the pass is proven stable, the gate can be
+  // lifted to make it unconditional for all backends.
+  if (effectiveBackend == PTOBackend::VPTO)
+    pm.addNestedPass<mlir::func::FuncOp>(pto::createPTOCanonicalizeIRPass());
   pm.addNestedPass<mlir::func::FuncOp>(
       pto::createPTOAssignDefaultFrontendPipeIdPass());
   pm.addNestedPass<mlir::func::FuncOp>(

From 63274209de44f3d4668670dfb40e7bb9d74dfa1e Mon Sep 17 00:00:00 2001
From: Dongyan Chen <chendongyan@isrc.iscas.ac.cn>
Date: Wed, 17 Jun 2026 08:24:37 +0000
Subject: [PATCH 4/4] test(pto): pass --emit-pto-ir in issue783 tests; drop rank5 noop test

---
 .../issue783_canonicalize_rank2_dn_views.pto  |  4 +--
 .../issue783_canonicalize_rank2_nd_views.pto  |  2 +-
 .../pto/issue783_canonicalize_rank5_noop.pto  | 29 -------------------
 3 files changed, 3 insertions(+), 32 deletions(-)
 delete mode 100644 test/lit/pto/issue783_canonicalize_rank5_noop.pto

diff --git a/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto b/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto
index 7a53ce866..156adb1f1 100644
--- a/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto
+++ b/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto
@@ -10,7 +10,7 @@
 // correct cumulative-product strides and that the layout attribute is
 // preserved, preventing misidentification as ND after rank-5 padding.
 
-// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR
+// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-pto-ir --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR
 
 module {
   func.func @dn_layout_kernel(%src: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
@@ -29,6 +29,6 @@ module {
   }
 }
 
-// IR: pto.make_tensor_view {{.*}} strides = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] {{.*}} {layout = #pto.layout<dn>} : !pto.tensor_view<1x1x1x?x?xf32>
+// IR: pto.make_tensor_view {{.*}} strides = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] {layout = #pto.layout<dn>} : !pto.tensor_view<1x1x1x?x?xf32>
 // IR: pto.partition_view {{.*}} offsets = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}], sizes = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] : !pto.tensor_view<1x1x1x?x?xf32>
 // IR: !pto.partition_tensor_view<1x1x1x16x32xf32>
diff --git a/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto b/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto
index b2c01fcbc..6cec333cd 100644
--- a/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto
+++ b/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto
@@ -1,4 +1,4 @@
-// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s
+// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-pto-ir --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s
 
 module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
   func.func @canonicalize_rank2_views(%src: !pto.ptr<bf16, gm>, %dst: !pto.ptr<bf16, gm>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
diff --git a/test/lit/pto/issue783_canonicalize_rank5_noop.pto b/test/lit/pto/issue783_canonicalize_rank5_noop.pto
deleted file mode 100644
index bfbce5481..000000000
--- a/test/lit/pto/issue783_canonicalize_rank5_noop.pto
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2026 Huawei Technologies Co., Ltd.
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-// CANN Open Software License Agreement Version 2.0 (the "License").
-// Please refer to the License for details. You may not use this file that in compliance with the License.
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-// See LICENSE in the root of the software repository for the full text of the License.
-
-// Tests that the canonicalization pass is a no-op on already-canonical rank-5 views.
-
-// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s
-
-module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
-  func.func @noop_rank5(%src: !pto.ptr<bf16, gm>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-    %c1 = arith.constant 1 : index
-    %c16 = arith.constant 16 : index
-    %c256 = arith.constant 256 : index
-    %c8192 = arith.constant 8192 : index
-
-    // Already rank-5 canonical form — pass should not modify
-    %src_view = pto.make_tensor_view %src, shape = [%c1, %c1, %c1, %c16, %c8192], strides = [%c8192, %c8192, %c8192, %c8192, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<1x1x1x16x8192xbf16>
-
-    // CHECK: pto.make_tensor_view {{.*}} shape = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], strides = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] {layout = #pto.layout<nd>} : !pto.tensor_view<1x1x1x16x8192xbf16>
-    // CHECK-NOT: arith.muli
-    // CHECK-NOT: arith.addi
-
-    return
-  }
-}