From 59267e3e192cc7c0b61f34d5a4d2ad1bec89f393 Mon Sep 17 00:00:00 2001 From: Dongyan Chen Date: Fri, 12 Jun 2026 06:43:34 +0000 Subject: [PATCH 1/4] fix(pto): canonicalize rank2 view IR forms --- include/PTO/Transforms/Passes.h | 1 + include/PTO/Transforms/Passes.td | 15 + lib/PTO/Transforms/CMakeLists.txt | 1 + lib/PTO/Transforms/PTOCanonicalizeIR.cpp | 259 ++++++++++++++++++ .../issue31_partition_view_parser_compat.pto | 8 +- .../pto/issue783_canonicalize_rank2_views.pto | 26 ++ .../tpush_tpop_globaltensor_frontend_a3.pto | 12 +- tools/ptoas/ptoas.cpp | 1 + 8 files changed, 313 insertions(+), 10 deletions(-) create mode 100644 lib/PTO/Transforms/PTOCanonicalizeIR.cpp create mode 100644 test/lit/pto/issue783_canonicalize_rank2_views.pto diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h index 87cc454af..e22e19e75 100644 --- a/include/PTO/Transforms/Passes.h +++ b/include/PTO/Transforms/Passes.h @@ -103,6 +103,7 @@ std::unique_ptr createPTOValidateVPTOEmissionIRPass(); std::unique_ptr createExpandTileOpPass(); std::unique_ptr createExpandTileOpPass(const ExpandTileOpOptions &options); std::unique_ptr createFoldTileBufIntrinsicsPass(); +std::unique_ptr createPTOCanonicalizeIRPass(); std::unique_ptr createFoldTileBufIntrinsicsPass(llvm::StringRef foldMode); std::unique_ptr createPTOInlineLibCallPass(const PTOInlineLibCallOptions &options = {}); diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index e92152dbf..abe31b018 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -480,6 +480,21 @@ def FoldTileBufIntrinsics : Pass<"pto-fold-tile-buf-intrinsics", "mlir::func::Fu ]; } +def PTOCanonicalizeIR : Pass<"pto-canonicalize-ir", "func::FuncOp"> { + let summary = "Canonicalize PTO IR forms before backend lowering"; + let description = [{ + Rewrites shorthand or legacy PTO IR forms into canonical forms before + backend-specific lowering. 
Currently this canonicalizes rank-2 tensor_view / + partition_tensor_view descriptors into the canonical right-aligned rank-5 + form: [R, C] -> [1, 1, 1, R, C]. + }]; + let constructor = "mlir::pto::createPTOCanonicalizeIRPass()"; + let dependentDialects = [ + "mlir::pto::PTODialect", + "mlir::arith::ArithDialect" + ]; +} + def PTOInlineLibCall : Pass<"pto-inline-libcall", "ModuleOp"> { let summary = "Materialize OP-Lib instance bodies and inline OP-Lib calls"; let description = [{ diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt index 2ebc448a7..e372c3d71 100644 --- a/lib/PTO/Transforms/CMakeLists.txt +++ b/lib/PTO/Transforms/CMakeLists.txt @@ -55,6 +55,7 @@ add_mlir_dialect_library(PTOTransforms PTORemoveRedundantBarrier.cpp InferPTOLayout.cpp PTOA5NormalizeTMovPass.cpp + PTOCanonicalizeIR.cpp PTOMaterializeTileHandles.cpp BufferizableOpInterfaceImpl.cpp ConvertToPTOOp.cpp diff --git a/lib/PTO/Transforms/PTOCanonicalizeIR.cpp b/lib/PTO/Transforms/PTOCanonicalizeIR.cpp new file mode 100644 index 000000000..cf5ee8283 --- /dev/null +++ b/lib/PTO/Transforms/PTOCanonicalizeIR.cpp @@ -0,0 +1,259 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include "PTO/IR/PTO.h" +#include "PTO/Transforms/Passes.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +#include + +namespace mlir { +namespace pto { +#define GEN_PASS_DEF_PTOCANONICALIZEIR +#include "PTO/Transforms/Passes.h.inc" +} // namespace pto +} // namespace mlir + +using namespace mlir; +using namespace mlir::pto; + +namespace { + +constexpr unsigned kLogicalRank2 = 2; +constexpr unsigned kCanonicalRank5 = 5; +constexpr int64_t kUnitExtent = 1; +constexpr unsigned kRank2Rows = 0; +constexpr unsigned kRank2Cols = 1; +constexpr int64_t kRank2ToRank5DimOffset = 3; + +static SmallVector +rightAlignRank2Shape(ArrayRef shape) { + return {kUnitExtent, kUnitExtent, kUnitExtent, shape[kRank2Rows], + shape[kRank2Cols]}; +} + +static Value getOrCreateIndexConstant(OpBuilder &builder, Location loc, + int64_t value) { + return builder.create(loc, value); +} + +static SmallVector +prependThreeValues(ValueRange values, Value fill) { + return {fill, fill, fill, values[kRank2Rows], values[kRank2Cols]}; +} + +static SmallVector +buildCanonicalRank2Strides(MakeTensorViewOp op) { + Value rowStride = op.getStrides()[kRank2Rows]; + Value colStride = op.getStrides()[kRank2Cols]; + auto layout = op.getLayoutAttr(); + if (layout && layout.getLayout() == Layout::DN) + return {colStride, colStride, colStride, rowStride, colStride}; + return {rowStride, rowStride, rowStride, rowStride, colStride}; +} + +static bool isRank2ViewLike(Type type) { + if (auto viewType = dyn_cast(type)) + return viewType.getRank() == kLogicalRank2; + if (auto viewType = dyn_cast(type)) + return viewType.getRank() == kLogicalRank2; + return false; +} + +static Type canonicalViewType(Type type) { + if (auto viewType = dyn_cast(type)) { + if (viewType.getRank() == kLogicalRank2) + return TensorViewType::get(type.getContext(), + rightAlignRank2Shape(viewType.getShape()), 
+ viewType.getElementType()); + return type; + } + if (auto viewType = dyn_cast(type)) { + if (viewType.getRank() == kLogicalRank2) + return PartitionTensorViewType::get( + type.getContext(), rightAlignRank2Shape(viewType.getShape()), + viewType.getElementType()); + return type; + } + return type; +} + +static bool canonicalizeValueType(Value value) { + Type oldType = value.getType(); + Type newType = canonicalViewType(oldType); + if (newType == oldType) + return false; + value.setType(newType); + return true; +} + +static LogicalResult rewriteMakeTensorView(MakeTensorViewOp op, + IRRewriter &rewriter) { + auto oldType = dyn_cast(op.getResult().getType()); + if (!oldType || oldType.getRank() != kLogicalRank2) + return success(); + + if (op.getShape().size() != kLogicalRank2 || + op.getStrides().size() != kLogicalRank2) + return op.emitOpError( + "rank-2 tensor_view must have exactly 2 shape and stride operands"); + + rewriter.setInsertionPoint(op); + Value one = getOrCreateIndexConstant(rewriter, op.getLoc(), kUnitExtent); + SmallVector newShape = + prependThreeValues(op.getShape(), one); + SmallVector newStrides = + buildCanonicalRank2Strides(op); + auto newType = cast(canonicalViewType(oldType)); + + auto newOp = rewriter.create( + op.getLoc(), newType, op.getPtr(), newShape, newStrides, + op.getLayoutAttr()); + rewriter.replaceOp(op, newOp.getResult()); + return success(); +} + +static LogicalResult rewritePartitionView(PartitionViewOp op, + IRRewriter &rewriter) { + auto sourceType = dyn_cast(op.getSource().getType()); + auto resultType = dyn_cast(op.getResult().getType()); + if (!sourceType || !resultType) + return success(); + + if (op.getOffsets().size() != kLogicalRank2 || + op.getSizes().size() != kLogicalRank2) + return success(); + + if (sourceType.getRank() != kCanonicalRank5) + return op.emitOpError( + "rank-2 partition_tensor_view normalization expects canonical rank-5 " + "source tensor_view"); + + rewriter.setInsertionPoint(op); + Value zero = 
getOrCreateIndexConstant(rewriter, op.getLoc(), 0); + Value one = getOrCreateIndexConstant(rewriter, op.getLoc(), kUnitExtent); + SmallVector newOffsets = + prependThreeValues(op.getOffsets(), zero); + SmallVector newSizes = + prependThreeValues(op.getSizes(), one); + auto newType = cast(canonicalViewType(resultType)); + + auto newOp = rewriter.create( + op.getLoc(), newType, op.getSource(), newOffsets, newSizes); + rewriter.replaceOp(op, newOp.getResult()); + return success(); +} + +static Value buildCanonicalDimIndex(Value dimIndex, IRRewriter &rewriter, + Location loc) { + rewriter.setInsertionPointAfterValue(dimIndex); + Value offset = + getOrCreateIndexConstant(rewriter, loc, kRank2ToRank5DimOffset); + return rewriter.create(loc, dimIndex, offset); +} + +static void rewriteTensorViewDimOperand(Operation *op, Value dimIndex, + IRRewriter &rewriter) { + Value newDim = buildCanonicalDimIndex(dimIndex, rewriter, op->getLoc()); + op->setOperand(1, newDim); +} + +static void canonicalizeFunctionType(func::FuncOp func) { + auto oldType = func.getFunctionType(); + SmallVector inputs; + SmallVector results; + bool changed = false; + + inputs.reserve(oldType.getNumInputs()); + for (Type type : oldType.getInputs()) { + Type newType = canonicalViewType(type); + changed |= newType != type; + inputs.push_back(newType); + } + + results.reserve(oldType.getNumResults()); + for (Type type : oldType.getResults()) { + Type newType = canonicalViewType(type); + changed |= newType != type; + results.push_back(newType); + } + + if (changed) + func.setFunctionType(FunctionType::get(func.getContext(), inputs, results)); +} + +static void canonicalizeValueTypes(func::FuncOp func) { + canonicalizeFunctionType(func); + + func->walk([](Operation *op) { + for (Region ®ion : op->getRegions()) { + for (Block &block : region) { + for (BlockArgument arg : block.getArguments()) + canonicalizeValueType(arg); + } + } + + for (OpResult result : op->getResults()) + canonicalizeValueType(result); + 
}); +} + +struct PTOCanonicalizeIRPass + : public mlir::pto::impl::PTOCanonicalizeIRBase { + void runOnOperation() override { + func::FuncOp func = getOperation(); + SmallVector makeViews; + SmallVector partitionViews; + SmallVector> dimIndexOps; + + func.walk([&](MakeTensorViewOp op) { + if (isRank2ViewLike(op.getResult().getType())) + makeViews.push_back(op); + }); + func.walk([&](PartitionViewOp op) { + if (op.getOffsets().size() == kLogicalRank2 && + op.getSizes().size() == kLogicalRank2) + partitionViews.push_back(op); + }); + func.walk([&](GetTensorViewDimOp op) { + if (isRank2ViewLike(op.getTensorView().getType())) + dimIndexOps.emplace_back(op.getOperation(), op.getDimIndex()); + }); + func.walk([&](GetTensorViewStrideOp op) { + if (isRank2ViewLike(op.getTensorView().getType())) + dimIndexOps.emplace_back(op.getOperation(), op.getDimIndex()); + }); + + IRRewriter rewriter(func.getContext()); + for (MakeTensorViewOp op : makeViews) { + if (failed(rewriteMakeTensorView(op, rewriter))) { + signalPassFailure(); + return; + } + } + for (auto [op, dimIndex] : dimIndexOps) + rewriteTensorViewDimOperand(op, dimIndex, rewriter); + canonicalizeValueTypes(func); + for (PartitionViewOp op : partitionViews) { + if (failed(rewritePartitionView(op, rewriter))) { + signalPassFailure(); + return; + } + } + } +}; + +} // namespace + +std::unique_ptr mlir::pto::createPTOCanonicalizeIRPass() { + return std::make_unique(); +} diff --git a/test/lit/pto/issue31_partition_view_parser_compat.pto b/test/lit/pto/issue31_partition_view_parser_compat.pto index f6f5bfef8..e1eef5fd9 100755 --- a/test/lit/pto/issue31_partition_view_parser_compat.pto +++ b/test/lit/pto/issue31_partition_view_parser_compat.pto @@ -46,9 +46,9 @@ module { } // CHECK-LABEL: func.func @new_format_static -// CHECK: %[[SV0:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view{{$}} -// CHECK: pto.tload ins(%[[SV0]] : !pto.partition_tensor_view<16x32xf32>) 
+// CHECK: %[[SV0:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}} +// CHECK: pto.tload ins(%[[SV0]] : !pto.partition_tensor_view<1x1x1x16x32xf32>) // CHECK-LABEL: func.func @old_format_static -// CHECK: %[[SV1:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view{{$}} +// CHECK: %[[SV1:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}} // CHECK-LABEL: func.func @old_format_dynamic -// CHECK: %[[SV2:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view{{$}} +// CHECK: %[[SV2:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}} diff --git a/test/lit/pto/issue783_canonicalize_rank2_views.pto b/test/lit/pto/issue783_canonicalize_rank2_views.pto new file mode 100644 index 000000000..8caecf343 --- /dev/null +++ b/test/lit/pto/issue783_canonicalize_rank2_views.pto @@ -0,0 +1,26 @@ +// RUN: ptoas --pto-arch=a5 --emit-pto-ir --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + func.func @canonicalize_rank2_views(%src: !pto.ptr, %dst: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c512 = arith.constant 512 : index + %c8192 = arith.constant 8192 : index + + %src_view = pto.make_tensor_view %src, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout} : !pto.tensor_view + %dst_view = pto.make_tensor_view %dst, shape = [%c16, 
%c8192], strides = [%c8192, %c1] {layout = #pto.layout} : !pto.tensor_view + %src_part = pto.partition_view %src_view, offsets = [%c0, %c512], sizes = [%c16, %c512] : !pto.tensor_view -> !pto.partition_tensor_view<16x512xbf16> + %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c512], sizes = [%c16, %c512] : !pto.tensor_view -> !pto.partition_tensor_view<16x512xbf16> + %tile = pto.declare_tile -> !pto.tile_buf + pto.tload ins(%src_part : !pto.partition_tensor_view<16x512xbf16>) outs(%tile : !pto.tile_buf) + pto.section.vector { + } + return + } +} + +// CHECK: pto.make_tensor_view {{.*}} shape = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}], strides = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] {{.*}} : !pto.tensor_view<1x1x1x?x?xbf16> +// CHECK: pto.partition_view {{.*}} offsets = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}], sizes = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] : !pto.tensor_view<1x1x1x?x?xbf16> +// CHECK: !pto.partition_tensor_view<1x1x1x16x512xbf16> +// CHECK-NOT: !pto.partition_tensor_view<16x512xbf16> diff --git a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto index 9ad19ec8f..68cbea217 100644 --- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto +++ b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto @@ -66,17 +66,17 @@ module { // CHECK-LABEL: AICORE void cube_kernel // CHECK-SAME: (__gm__ float* [[CUBE_GM:v[0-9]+]], // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>([[CUBE_GM]], {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_ENTRY:v[0-9]+]](nullptr); -// CHECK: TALLOC, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]); +// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_ENTRY:v[0-9]+]](nullptr); +// CHECK: TALLOC, GlobalTensor, pto::Stride<16, 
16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]); // CHECK: TSTORE -// CHECK: TPUSH, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]); +// CHECK: TPUSH, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]); // CHECK-LABEL: AICORE void vector_kernel // CHECK-SAME: (__gm__ float* [[VEC_GM:v[0-9]+]], // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>([[VEC_GM]], {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_ENTRY:v[0-9]+]](nullptr); -// CHECK: TPOP, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_ENTRY:v[0-9]+]](nullptr); +// CHECK: TPOP, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( // CHECK: TLOAD -// CHECK: TFREE, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: TFREE, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( // RESOLVE-LABEL: func.func @cube_kernel // RESOLVE-NOT: pto.reserve_buffer diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index 0d20114af..2173e592f 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -1737,6 +1737,7 @@ int mlir::pto::compilePTOASModule( if (failed(applyPassManagerCLOptions(pm))) return 1; + pm.addNestedPass(pto::createPTOCanonicalizeIRPass()); pm.addNestedPass( pto::createPTOAssignDefaultFrontendPipeIdPass()); pm.addNestedPass( From 6fececbe3bedd33bc40e80640ab9a3844d0f9b95 Mon Sep 17 00:00:00 2001 From: Dongyan Chen Date: Mon, 15 Jun 2026 06:45:48 +0000 Subject: [PATCH 2/4] Update testcases. 
--- include/PTO/Transforms/Passes.h | 2 +- ...tload_tprefetch_low_precision_a5_valid.pto | 6 ++--- .../tpush_tpop_globaltensor_frontend_a3.pto | 12 +++++----- .../tpush_tpop_globaltensor_frontend_a5.pto | 24 +++++++++---------- .../lit/pto/tstore_low_precision_a5_valid.pto | 6 ++--- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h index e22e19e75..59ad36c93 100644 --- a/include/PTO/Transforms/Passes.h +++ b/include/PTO/Transforms/Passes.h @@ -103,8 +103,8 @@ std::unique_ptr createPTOValidateVPTOEmissionIRPass(); std::unique_ptr createExpandTileOpPass(); std::unique_ptr createExpandTileOpPass(const ExpandTileOpOptions &options); std::unique_ptr createFoldTileBufIntrinsicsPass(); -std::unique_ptr createPTOCanonicalizeIRPass(); std::unique_ptr createFoldTileBufIntrinsicsPass(llvm::StringRef foldMode); +std::unique_ptr createPTOCanonicalizeIRPass(); std::unique_ptr createPTOInlineLibCallPass(const PTOInlineLibCallOptions &options = {}); void registerPTOViewToMemrefPass(); diff --git a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto index affb44872..eb3f2c4e1 100644 --- a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto +++ b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto @@ -23,7 +23,7 @@ module { } } -// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: memref<16x16x!pto.hif8>) +// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<1x1x1x16x16xf8E4M3FN>, %arg1: memref<1x1x1x16x16x!pto.hif8>) // CHECK: pto.declare_tile_memref -> memref<16x16x!pto.hif8 -// CHECK: pto.tload ins(%arg0 : memref<16x16xf8E4M3FN>) outs( -// CHECK: pto.tprefetch ins(%arg1 : memref<16x16x!pto.hif8>) outs( +// CHECK: pto.tload ins(%arg0 : memref<1x1x1x16x16xf8E4M3FN>) outs( +// CHECK: pto.tprefetch ins(%arg1 : memref<1x1x1x16x16x!pto.hif8>) outs( diff --git 
a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto index 68cbea217..2d18717dd 100644 --- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto +++ b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto @@ -82,16 +82,16 @@ module { // RESOLVE-NOT: pto.reserve_buffer // RESOLVE-NOT: pto.import_reserved_buffer // RESOLVE: pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 1024, slot_num = 8, flag_base = 0, nosplit = true} -// RESOLVE: %{{.*}} = pto.declare_global {__pto.globaltensor_strides = array} -> !pto.tensor_view<16x16xf32> -// RESOLVE: pto.talloc(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0} -// RESOLVE: pto.tpush(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0} +// RESOLVE: %{{.*}} = pto.declare_global {__pto.globaltensor_strides = array} -> !pto.tensor_view<1x1x1x16x16xf32> +// RESOLVE: pto.talloc(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0} +// RESOLVE: pto.tpush(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0} // RESOLVE-LABEL: func.func @vector_kernel // RESOLVE-NOT: pto.reserve_buffer // RESOLVE-NOT: pto.import_reserved_buffer -// RESOLVE: pto.tpop(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0} -// RESOLVE: pto.tfree(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0} +// RESOLVE: pto.tpop(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0} +// RESOLVE: pto.tfree(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0} // GSS-LABEL: AICORE void cube_kernel // GSS: TALLOC // GSS: TSTORE -// GSS: TPUSH +// GSS: TPUSH \ No newline at end of file diff --git a/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto b/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto index 8562c1207..bcbdb0b0c 100644 --- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto +++ 
b/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto @@ -83,24 +83,24 @@ module { // CHECK-LABEL: AICORE void cube_c2v_kernel(__gm__ float* // CHECK: TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_C2V_ENTRY:v[0-9]+]](nullptr); -// CHECK: TALLOC, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]); -// CHECK: TPUSH, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]); +// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_C2V_ENTRY:v[0-9]+]](nullptr); +// CHECK: TALLOC, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]); +// CHECK: TPUSH, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]); // CHECK-LABEL: AICORE void vector_c2v_kernel(__gm__ float* // CHECK: TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_C2V_ENTRY:v[0-9]+]](nullptr); -// CHECK: TPOP, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( -// CHECK: TFREE, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_C2V_ENTRY:v[0-9]+]](nullptr); +// CHECK: TPOP, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: TFREE, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( // CHECK-LABEL: AICORE void vector_v2c_kernel(__gm__ float* // CHECK: TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}}); -// 
CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_V2C_ENTRY:v[0-9]+]](nullptr); -// CHECK: TALLOC, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]); -// CHECK: TPUSH, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]); +// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_V2C_ENTRY:v[0-9]+]](nullptr); +// CHECK: TALLOC, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]); +// CHECK: TPUSH, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]); // CHECK-LABEL: AICORE void cube_v2c_kernel(__gm__ float* // CHECK: TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_V2C_ENTRY:v[0-9]+]](nullptr); -// CHECK: TPOP, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( -// CHECK: TFREE, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_V2C_ENTRY:v[0-9]+]](nullptr); +// CHECK: TPOP, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: TFREE, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( diff --git a/test/lit/pto/tstore_low_precision_a5_valid.pto b/test/lit/pto/tstore_low_precision_a5_valid.pto index ab0783062..3c46a3ba4 100644 --- a/test/lit/pto/tstore_low_precision_a5_valid.pto +++ b/test/lit/pto/tstore_low_precision_a5_valid.pto @@ -24,8 +24,8 @@ module { } } -// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: 
memref<16x16x!pto.hif8>, %arg2: i64) +// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<1x1x1x16x16xf8E4M3FN>, %arg1: memref<1x1x1x16x16x!pto.hif8>, %arg2: i64) // CHECK: pto.tstore ins( -// CHECK: outs(%arg0 : memref<16x16xf8E4M3FN>) +// CHECK: outs(%arg0 : memref<1x1x1x16x16xf8E4M3FN>) // CHECK: pto.tstore ins( -// CHECK: outs(%arg1 : memref<16x16x!pto.hif8>) +// CHECK: outs(%arg1 : memref<1x1x1x16x16x!pto.hif8>) From 0a1ed40806a46466af6e6b47a98c58d3693f2f76 Mon Sep 17 00:00:00 2001 From: Dongyan Chen Date: Wed, 17 Jun 2026 07:34:43 +0000 Subject: [PATCH 3/4] fix stride rule + DN misidentification in canonicalize pass, gate on VPTO --- include/PTO/Transforms/Passes.td | 19 ++- lib/PTO/IR/PTO.cpp | 10 ++ lib/PTO/Transforms/PTOCanonicalizeIR.cpp | 156 ++++++++++++++++-- .../issue31_partition_view_parser_compat.pto | 8 +- .../issue783_canonicalize_rank2_dn_views.pto | 34 ++++ ... issue783_canonicalize_rank2_nd_views.pto} | 2 +- .../pto/issue783_canonicalize_rank5_noop.pto | 29 ++++ ...tload_tprefetch_low_precision_a5_valid.pto | 6 +- .../tpush_tpop_globaltensor_frontend_a3.pto | 24 +-- .../tpush_tpop_globaltensor_frontend_a5.pto | 24 +-- .../lit/pto/tstore_low_precision_a5_valid.pto | 6 +- tools/ptoas/ptoas.cpp | 9 +- 12 files changed, 275 insertions(+), 52 deletions(-) create mode 100644 test/lit/pto/issue783_canonicalize_rank2_dn_views.pto rename test/lit/pto/{issue783_canonicalize_rank2_views.pto => issue783_canonicalize_rank2_nd_views.pto} (94%) create mode 100644 test/lit/pto/issue783_canonicalize_rank5_noop.pto diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index abe31b018..a897034d1 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -481,12 +481,21 @@ def FoldTileBufIntrinsics : Pass<"pto-fold-tile-buf-intrinsics", "mlir::func::Fu } def PTOCanonicalizeIR : Pass<"pto-canonicalize-ir", "func::FuncOp"> { - let summary = "Canonicalize PTO IR forms before backend lowering"; + let 
summary = "Canonicalize rank-2 view descriptors into rank-5 form (VPTO-only)"; let description = [{ - Rewrites shorthand or legacy PTO IR forms into canonical forms before - backend-specific lowering. Currently this canonicalizes rank-2 tensor_view / - partition_tensor_view descriptors into the canonical right-aligned rank-5 - form: [R, C] -> [1, 1, 1, R, C]. + Rewrites rank-2 tensor_view / partition_tensor_view descriptors into the + canonical right-aligned rank-5 form [1, 1, 1, R, C], matching the 5D + descriptor layout expected by VPTO lowering. Stride expansion uses the + same cumulative-product rule as rightAlignTo5D and + buildGlobalTensorShapeAndStride: stride[i] = shape[i+1] * stride[i+1]. + + Currently gated on --pto-backend=vpto to limit blast radius. A3/A5 + EmitC codegen already pads strides to rank-5 via InferPTOLayout and + buildGlobalTensorShapeAndStride, so it does not need this pass at the + IR level. The gate can be lifted once the pass is proven stable. + + A post-canonicalization verification detects any surviving rank-2 view + types to prevent silent failures when new view-consuming ops are added. }]; let constructor = "mlir::pto::createPTOCanonicalizeIRPass()"; let dependentDialects = [ diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 3b14a3034..787a3b125 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -1734,6 +1734,16 @@ static std::optional getLogicalViewLayout(Value value) { if (auto part = value.getDefiningOp()) return getLogicalViewLayout(part.getSource()); if (auto make = value.getDefiningOp()) { + // Prefer the explicit layout attribute when available. After rank-2 → + // rank-5 canonicalization, the padded leading strides satisfy the ND + // (row-major) recurrence even for DN (col-major) data, so inferLayout + // alone would misclassify DN as ND (the col-major recurrence breaks at + // the boundary between padded unit-extent dims and real dims). 
The + // layout attribute carries the *intended* memory layout and is the + // authoritative source — inferLayout is only a fallback for views that + // lack an explicit layout. + if (auto layoutAttr = make.getLayoutAttr()) + return layoutAttr.getLayout(); auto tvTy = dyn_cast(make.getResult().getType()); if (!tvTy) return std::nullopt; diff --git a/lib/PTO/Transforms/PTOCanonicalizeIR.cpp b/lib/PTO/Transforms/PTOCanonicalizeIR.cpp index cf5ee8283..c4495d0d1 100644 --- a/lib/PTO/Transforms/PTOCanonicalizeIR.cpp +++ b/lib/PTO/Transforms/PTOCanonicalizeIR.cpp @@ -28,17 +28,49 @@ using namespace mlir::pto; namespace { +// --------------------------------------------------------------------------- +// Design note: which ops need structural rewriting vs. type-only walk +// --------------------------------------------------------------------------- +// +// This pass canonicalizes rank-2 TensorViewType / PartitionTensorViewType +// into the right-aligned rank-5 form [1, 1, 1, R, C] used by all backends +// (A3, A5, VPTO EmitC codegen and the 5D memref rank in PTOViewToMemref). +// +// Ops that carry **rank-dependent operands** must be structurally rewritten +// (their operand count or operand values change when rank changes): +// - MakeTensorViewOp : shape/strides expanded from 2 → 5 +// - PartitionViewOp : offsets/sizes expanded from 2 → 5 +// - GetTensorViewDimOp / GetTensorViewStrideOp : dim index offset by +3 +// +// Ops that only **carry view-typed operands/results** (no rank-dependent +// operand structure) are handled by the type walk (canonicalizeValueTypes) +// which in-place mutates TensorViewType and PartitionTensorViewType from +// rank-2 to rank-5: +// - TAllocToAivOp, TAllocToAicOp, DeclareGlobalOp (producers) +// - TAllocOp, TPushOp, TPopOp, TFreeOp, AicInitializePipeOp, +// AivInitializePipeOp, TensorViewAddrOp (consumers) +// - All PTODpsType consumers (TLoadOp, TStoreOp, TMatmulOp, etc.) 
+// - All PTOPipeEntryType consumers (TPushToAivOp, TPopFromAicOp, etc.) +// +// A post-canonicalization verification (verifyNoRank2ViewSurvivors) detects +// any surviving rank-2 view types to prevent silent failures when new +// view-consuming ops with rank-dependent operands are added. +// +// NZ layout cannot appear on rank-2 views (it requires rank >= 5 with +// shape[2] == 16), so only ND and DN strides need expansion logic. +// --------------------------------------------------------------------------- + constexpr unsigned kLogicalRank2 = 2; constexpr unsigned kCanonicalRank5 = 5; constexpr int64_t kUnitExtent = 1; -constexpr unsigned kRank2Rows = 0; -constexpr unsigned kRank2Cols = 1; +constexpr unsigned kRank2RowDim = 0; // row dimension index in rank-2 view +constexpr unsigned kRank2ColDim = 1; // column dimension index in rank-2 view constexpr int64_t kRank2ToRank5DimOffset = 3; static SmallVector rightAlignRank2Shape(ArrayRef shape) { - return {kUnitExtent, kUnitExtent, kUnitExtent, shape[kRank2Rows], - shape[kRank2Cols]}; + return {kUnitExtent, kUnitExtent, kUnitExtent, shape[kRank2RowDim], + shape[kRank2ColDim]}; } static Value getOrCreateIndexConstant(OpBuilder &builder, Location loc, @@ -48,17 +80,81 @@ static Value getOrCreateIndexConstant(OpBuilder &builder, Location loc, static SmallVector prependThreeValues(ValueRange values, Value fill) { - return {fill, fill, fill, values[kRank2Rows], values[kRank2Cols]}; + return {fill, fill, fill, values[kRank2RowDim], values[kRank2ColDim]}; } +// --------------------------------------------------------------------------- +// Stride expansion: uses the same cumulative-product rule as +// rightAlignTo5D (InferPTOLayout.cpp) and buildGlobalTensorShapeAndStride +// (PTOToEmitC.cpp): stride[i] = shape[i+1] * stride[i+1]. 
+// +// For a rank-2 view [R, C] right-aligned into [1, 1, 1, R, C]: +// - ND (row-major): original strides = [C, 1] +// padded strides: stride[2] = shape[3]*stride[3] = R*C, +// stride[1] = shape[2]*stride[2] = 1*R*C = R*C, +// stride[0] = shape[1]*stride[1] = 1*R*C = R*C +// → [R*C, R*C, R*C, C, 1] +// +// - DN (col-major): original strides = [1, R] +// padded strides: stride[2] = shape[3]*stride[3] = R*1 = R, +// stride[1] = shape[2]*stride[2] = 1*R = R, +// stride[0] = shape[1]*stride[1] = 1*R = R +// → [R, R, R, 1, R] +// +// Note: the ND branch was previously incorrectly using rowStride (=C) for +// all three leading dims, producing [C, C, C, C, 1] instead of the correct +// cumulative product [R*C, R*C, R*C, C, 1]. The DN branch was correct by +// coincidence because colStride == R and the cumulative product of unit-extent +// leading dims also collapses to R. +// --------------------------------------------------------------------------- static SmallVector -buildCanonicalRank2Strides(MakeTensorViewOp op) { - Value rowStride = op.getStrides()[kRank2Rows]; - Value colStride = op.getStrides()[kRank2Cols]; +buildCanonicalRank2Strides(MakeTensorViewOp op, IRRewriter &rewriter) { + Value rowStride = op.getStrides()[kRank2RowDim]; + Value colStride = op.getStrides()[kRank2ColDim]; + + rewriter.setInsertionPoint(op); + auto loc = op.getLoc(); + auto layout = op.getLayoutAttr(); - if (layout && layout.getLayout() == Layout::DN) + + // For ND (row-major): original strides = [rowStride, colStride] + // where rowStride = C (shape[1]) and colStride = 1. + // Cumulative product rule for leading dims: + // stride[2] = shape[3] * stride[3] = rowStride_vals * rowStride + // But shape[3] and stride[3] are SSA values, not constants. + // We compute: shape[kRank2RowDim] * rowStride for stride[2], + // 1 * (shape[kRank2RowDim] * rowStride) for strides [0..1]. 
+ // + // Note: although shape[0..2] are all 1 (unit-extent padding), the + // leading strides do not collapse to stride[3]: stride[2] still picks up + // the row extent shape[3] = R via the cumulative-product rule. + // + // For ND, stride[3] = rowStride, and applying the cumulative rule + // stride[i] = shape[i+1]*stride[i+1] to the padded dims gives: + // stride[2] = shape[3] * stride[3] = R * rowStride. + // stride[1] = shape[2] * stride[2] = 1 * (R*rowStride) = R*rowStride. + // stride[0] = shape[1] * stride[1] = 1 * (R*rowStride) = R*rowStride. + // So the leading strides are R*rowStride, not rowStride. + // + // We must compute the product: shape[kRank2RowDim] * rowStride. + if (layout && layout.getLayout() == Layout::DN) { + // DN (col-major): strides = [1, R] + // Cumulative product: stride[2] = shape[3]*stride[3] = R*1 = R, + // stride[1] = 1*R = R, stride[0] = 1*R = R. + // Since colStride = R for DN, this collapses to colStride for all + // three leading dims. This is the same as the old DN branch. return {colStride, colStride, colStride, rowStride, colStride}; - return {rowStride, rowStride, rowStride, rowStride, colStride}; + } + + // ND (row-major) or no explicit layout attr (default = ND): + // strides = [rowStride, colStride] where rowStride = C, colStride = 1. + // Cumulative product: stride[2] = shape[kRank2RowDim] * stride[3], + // stride[1] = 1 * stride[2], + // stride[0] = 1 * stride[2]. + // = shape[kRank2RowDim] * rowStride for all three leading dims. 
+ Value rowsValue = op.getShape()[kRank2RowDim]; + Value leadingStride = rewriter.create(loc, rowsValue, rowStride); + return {leadingStride, leadingStride, leadingStride, rowStride, colStride}; } static bool isRank2ViewLike(Type type) { @@ -112,7 +208,7 @@ static LogicalResult rewriteMakeTensorView(MakeTensorViewOp op, SmallVector newShape = prependThreeValues(op.getShape(), one); SmallVector newStrides = - buildCanonicalRank2Strides(op); + buildCanonicalRank2Strides(op, rewriter); auto newType = cast(canonicalViewType(oldType)); auto newOp = rewriter.create( @@ -207,6 +303,36 @@ static void canonicalizeValueTypes(func::FuncOp func) { }); } +/// Verify that no rank-2 view types survived canonicalization. +/// This catches cases where a new op with rank-dependent operands +/// was added but not given a structural rewrite in this pass. +static LogicalResult verifyNoRank2ViewSurvivors(func::FuncOp func) { + bool anyFailed = false; + func.walk([&](Operation *op) { + for (Region &region : op->getRegions()) { + for (Block &block : region) { + for (BlockArgument arg : block.getArguments()) { + if (isRank2ViewLike(arg.getType())) { + emitError(arg.getLoc()) + << "rank-2 view type survived canonicalization: " + << arg.getType() << " as block argument"; + anyFailed = true; + } + } + } + } + for (OpResult result : op->getResults()) { + if (isRank2ViewLike(result.getType())) { + emitError(op->getLoc()) + << "rank-2 view type survived canonicalization: " + << result.getType() << " in op " << op->getName(); + anyFailed = true; + } + } + }); + return anyFailed ? failure() : success(); +} + struct PTOCanonicalizeIRPass : public mlir::pto::impl::PTOCanonicalizeIRBase { void runOnOperation() override { @@ -249,6 +375,14 @@ struct PTOCanonicalizeIRPass return; } } + + // Post-canonicalization verification: ensure no rank-2 view types + // survived. If any do, it means an op with rank-dependent operands + // was not given a structural rewrite. 
+ if (failed(verifyNoRank2ViewSurvivors(func))) { + signalPassFailure(); + return; + } } }; diff --git a/test/lit/pto/issue31_partition_view_parser_compat.pto b/test/lit/pto/issue31_partition_view_parser_compat.pto index e1eef5fd9..f6f5bfef8 100755 --- a/test/lit/pto/issue31_partition_view_parser_compat.pto +++ b/test/lit/pto/issue31_partition_view_parser_compat.pto @@ -46,9 +46,9 @@ module { } // CHECK-LABEL: func.func @new_format_static -// CHECK: %[[SV0:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}} -// CHECK: pto.tload ins(%[[SV0]] : !pto.partition_tensor_view<1x1x1x16x32xf32>) +// CHECK: %[[SV0:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view{{$}} +// CHECK: pto.tload ins(%[[SV0]] : !pto.partition_tensor_view<16x32xf32>) // CHECK-LABEL: func.func @old_format_static -// CHECK: %[[SV1:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}} +// CHECK: %[[SV1:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view{{$}} // CHECK-LABEL: func.func @old_format_dynamic -// CHECK: %[[SV2:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : !pto.tensor_view<1x1x1x?x?xf32>{{$}} +// CHECK: %[[SV2:.*]] = pto.partition_view %{{.*}}, offsets = [%{{.*}}, %{{.*}}], sizes = [%{{.*}}, %{{.*}}] : !pto.tensor_view{{$}} diff --git a/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto b/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto new file mode 100644 index 000000000..7a53ce866 --- /dev/null +++ b/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto @@ -0,0 +1,34 @@ +// Copyright (c) 2026 Huawei 
Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Tests that rank-2 DN (col-major) tensor_view canonicalization produces +// correct cumulative-product strides and that the layout attribute is +// preserved, preventing misidentification as ND after rank-5 padding. + +// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR + +module { + func.func @dn_layout_kernel(%src: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + + // DN layout: strides = [1, 16] — col-major (stride[0]=1, stride[1]=rows) + %tv = pto.make_tensor_view %src, shape = [%c16, %c32], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %sv = pto.partition_view %tv, offsets = [%c0, %c0], sizes = [%c16, %c32] : !pto.tensor_view + + %tile = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%sv : !pto.partition_tensor_view<16x32xf32>) outs(%tile : !pto.tile_buf) + return + } +} + +// IR: pto.make_tensor_view {{.*}} strides = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] {{.*}} {layout = #pto.layout} : !pto.tensor_view<1x1x1x?x?xf32> +// IR: pto.partition_view {{.*}} offsets = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}], sizes = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, 
{{%[^,]+}}, {{%[^]]+}}] : !pto.tensor_view<1x1x1x?x?xf32> +// IR: !pto.partition_tensor_view<1x1x1x16x32xf32> diff --git a/test/lit/pto/issue783_canonicalize_rank2_views.pto b/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto similarity index 94% rename from test/lit/pto/issue783_canonicalize_rank2_views.pto rename to test/lit/pto/issue783_canonicalize_rank2_nd_views.pto index 8caecf343..b2c01fcbc 100644 --- a/test/lit/pto/issue783_canonicalize_rank2_views.pto +++ b/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto @@ -1,4 +1,4 @@ -// RUN: ptoas --pto-arch=a5 --emit-pto-ir --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s +// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { func.func @canonicalize_rank2_views(%src: !pto.ptr, %dst: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { diff --git a/test/lit/pto/issue783_canonicalize_rank5_noop.pto b/test/lit/pto/issue783_canonicalize_rank5_noop.pto new file mode 100644 index 000000000..bfbce5481 --- /dev/null +++ b/test/lit/pto/issue783_canonicalize_rank5_noop.pto @@ -0,0 +1,29 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file that in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Tests that the canonicalization pass is a no-op on already-canonical rank-5 views. 
+ +// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + func.func @noop_rank5(%src: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + %c8192 = arith.constant 8192 : index + + // Already rank-5 canonical form — pass should not modify + %src_view = pto.make_tensor_view %src, shape = [%c1, %c1, %c1, %c16, %c8192], strides = [%c8192, %c8192, %c8192, %c8192, %c1] {layout = #pto.layout} : !pto.tensor_view<1x1x1x16x8192xbf16> + + // CHECK: pto.make_tensor_view {{.*}} shape = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], strides = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] {layout = #pto.layout} : !pto.tensor_view<1x1x1x16x8192xbf16> + // CHECK-NOT: arith.muli + // CHECK-NOT: arith.addi + + return + } +} diff --git a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto index eb3f2c4e1..affb44872 100644 --- a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto +++ b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto @@ -23,7 +23,7 @@ module { } } -// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<1x1x1x16x16xf8E4M3FN>, %arg1: memref<1x1x1x16x16x!pto.hif8>) +// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: memref<16x16x!pto.hif8>) // CHECK: pto.declare_tile_memref -> memref<16x16x!pto.hif8 -// CHECK: pto.tload ins(%arg0 : memref<1x1x1x16x16xf8E4M3FN>) outs( -// CHECK: pto.tprefetch ins(%arg1 : memref<1x1x1x16x16x!pto.hif8>) outs( +// CHECK: pto.tload ins(%arg0 : memref<16x16xf8E4M3FN>) outs( +// CHECK: pto.tprefetch ins(%arg1 : memref<16x16x!pto.hif8>) outs( diff --git a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto 
b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto index 2d18717dd..9ad19ec8f 100644 --- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto +++ b/test/lit/pto/tpush_tpop_globaltensor_frontend_a3.pto @@ -66,32 +66,32 @@ module { // CHECK-LABEL: AICORE void cube_kernel // CHECK-SAME: (__gm__ float* [[CUBE_GM:v[0-9]+]], // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>([[CUBE_GM]], {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_ENTRY:v[0-9]+]](nullptr); -// CHECK: TALLOC, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]); +// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_ENTRY:v[0-9]+]](nullptr); +// CHECK: TALLOC, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]); // CHECK: TSTORE -// CHECK: TPUSH, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]); +// CHECK: TPUSH, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_ENTRY]]); // CHECK-LABEL: AICORE void vector_kernel // CHECK-SAME: (__gm__ float* [[VEC_GM:v[0-9]+]], // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 8, true>([[VEC_GM]], {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_ENTRY:v[0-9]+]](nullptr); -// CHECK: TPOP, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_ENTRY:v[0-9]+]](nullptr); +// CHECK: TPOP, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( // CHECK: TLOAD -// CHECK: TFREE, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: TFREE, GlobalTensor, 
pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( // RESOLVE-LABEL: func.func @cube_kernel // RESOLVE-NOT: pto.reserve_buffer // RESOLVE-NOT: pto.import_reserved_buffer // RESOLVE: pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 1024, slot_num = 8, flag_base = 0, nosplit = true} -// RESOLVE: %{{.*}} = pto.declare_global {__pto.globaltensor_strides = array} -> !pto.tensor_view<1x1x1x16x16xf32> -// RESOLVE: pto.talloc(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0} -// RESOLVE: pto.tpush(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0} +// RESOLVE: %{{.*}} = pto.declare_global {__pto.globaltensor_strides = array} -> !pto.tensor_view<16x16xf32> +// RESOLVE: pto.talloc(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0} +// RESOLVE: pto.tpush(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0} // RESOLVE-LABEL: func.func @vector_kernel // RESOLVE-NOT: pto.reserve_buffer // RESOLVE-NOT: pto.import_reserved_buffer -// RESOLVE: pto.tpop(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0} -// RESOLVE: pto.tfree(%{{.*}}, %{{.*}} : !pto.tensor_view<1x1x1x16x16xf32>, !pto.pipe) {split = 0} +// RESOLVE: pto.tpop(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0} +// RESOLVE: pto.tfree(%{{.*}}, %{{.*}} : !pto.tensor_view<16x16xf32>, !pto.pipe) {split = 0} // GSS-LABEL: AICORE void cube_kernel // GSS: TALLOC // GSS: TSTORE -// GSS: TPUSH \ No newline at end of file +// GSS: TPUSH diff --git a/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto b/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto index bcbdb0b0c..8562c1207 100644 --- a/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto +++ b/test/lit/pto/tpush_tpop_globaltensor_frontend_a5.pto @@ -83,24 +83,24 @@ module { // CHECK-LABEL: AICORE void cube_c2v_kernel(__gm__ float* // CHECK: TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>({{.*}}, 
{{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_C2V_ENTRY:v[0-9]+]](nullptr); -// CHECK: TALLOC, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]); -// CHECK: TPUSH, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]); +// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_C2V_ENTRY:v[0-9]+]](nullptr); +// CHECK: TALLOC, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]); +// CHECK: TPUSH, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[CUBE_C2V_ENTRY]]); // CHECK-LABEL: AICORE void vector_c2v_kernel(__gm__ float* // CHECK: TPipe<0, Direction::DIR_C2V_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_C2V_ENTRY:v[0-9]+]](nullptr); -// CHECK: TPOP, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( -// CHECK: TFREE, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_C2V_ENTRY:v[0-9]+]](nullptr); +// CHECK: TPOP, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: TFREE, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( // CHECK-LABEL: AICORE void vector_v2c_kernel(__gm__ float* // CHECK: TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[VEC_V2C_ENTRY:v[0-9]+]](nullptr); -// CHECK: TALLOC, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, 
TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]); -// CHECK: TPUSH, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]); +// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[VEC_V2C_ENTRY:v[0-9]+]](nullptr); +// CHECK: TALLOC, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]); +// CHECK: TPUSH, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>({{.*}}, [[VEC_V2C_ENTRY]]); // CHECK-LABEL: AICORE void cube_v2c_kernel(__gm__ float* // CHECK: TPipe<0, Direction::DIR_V2C_GM, 1024, 8, 8, true>({{.*}}, {{.*}}, {{.*}}); -// CHECK: GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> [[CUBE_V2C_ENTRY:v[0-9]+]](nullptr); -// CHECK: TPOP, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( -// CHECK: TFREE, GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> [[CUBE_V2C_ENTRY:v[0-9]+]](nullptr); +// CHECK: TPOP, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( +// CHECK: TFREE, GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>, TileSplitAxis::TILE_NO_SPLIT>( diff --git a/test/lit/pto/tstore_low_precision_a5_valid.pto b/test/lit/pto/tstore_low_precision_a5_valid.pto index 3c46a3ba4..ab0783062 100644 --- a/test/lit/pto/tstore_low_precision_a5_valid.pto +++ b/test/lit/pto/tstore_low_precision_a5_valid.pto @@ -24,8 +24,8 @@ module { } } -// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<1x1x1x16x16xf8E4M3FN>, %arg1: memref<1x1x1x16x16x!pto.hif8>, %arg2: i64) +// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: memref<16x16x!pto.hif8>, %arg2: i64) // CHECK: pto.tstore ins( -// 
CHECK: outs(%arg0 : memref<1x1x1x16x16xf8E4M3FN>) +// CHECK: outs(%arg0 : memref<16x16xf8E4M3FN>) // CHECK: pto.tstore ins( -// CHECK: outs(%arg1 : memref<1x1x1x16x16x!pto.hif8>) +// CHECK: outs(%arg1 : memref<16x16x!pto.hif8>) diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index 2173e592f..5bb682167 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -1737,7 +1737,14 @@ int mlir::pto::compilePTOASModule( if (failed(applyPassManagerCLOptions(pm))) return 1; - pm.addNestedPass(pto::createPTOCanonicalizeIRPass()); + // Rank-2 → rank-5 view canonicalization is currently gated on the VPTO + // backend to limit blast radius. A3/A5 EmitC codegen already pads strides + // to rank-5 via InferPTOLayout and buildGlobalTensorShapeAndStride, so it + // does not need the canonicalization pass at the IR level. When VPTO + // validation is complete and the pass is proven stable, the gate can be + // lifted to make it unconditional for all backends. + if (effectiveBackend == PTOBackend::VPTO) + pm.addNestedPass(pto::createPTOCanonicalizeIRPass()); pm.addNestedPass( pto::createPTOAssignDefaultFrontendPipeIdPass()); pm.addNestedPass( From 63274209de44f3d4668670dfb40e7bb9d74dfa1e Mon Sep 17 00:00:00 2001 From: Dongyan Chen Date: Wed, 17 Jun 2026 08:24:37 +0000 Subject: [PATCH 4/4] Update testcases. 
--- .../issue783_canonicalize_rank2_dn_views.pto | 4 +-- .../issue783_canonicalize_rank2_nd_views.pto | 2 +- .../pto/issue783_canonicalize_rank5_noop.pto | 29 ------------------- 3 files changed, 3 insertions(+), 32 deletions(-) delete mode 100644 test/lit/pto/issue783_canonicalize_rank5_noop.pto diff --git a/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto b/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto index 7a53ce866..156adb1f1 100644 --- a/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto +++ b/test/lit/pto/issue783_canonicalize_rank2_dn_views.pto @@ -10,7 +10,7 @@ // correct cumulative-product strides and that the layout attribute is // preserved, preventing misidentification as ND after rank-5 padding. -// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR +// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-pto-ir --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR module { func.func @dn_layout_kernel(%src: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { @@ -29,6 +29,6 @@ module { } } -// IR: pto.make_tensor_view {{.*}} strides = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] {{.*}} {layout = #pto.layout} : !pto.tensor_view<1x1x1x?x?xf32> +// IR: pto.make_tensor_view {{.*}} strides = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] {layout = #pto.layout} : !pto.tensor_view<1x1x1x?x?xf32> // IR: pto.partition_view {{.*}} offsets = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}], sizes = [{{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^,]+}}, {{%[^]]+}}] : !pto.tensor_view<1x1x1x?x?xf32> // IR: !pto.partition_tensor_view<1x1x1x16x32xf32> diff --git a/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto b/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto index b2c01fcbc..6cec333cd 100644 --- a/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto +++ 
b/test/lit/pto/issue783_canonicalize_rank2_nd_views.pto @@ -1,4 +1,4 @@ -// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s +// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-pto-ir --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { func.func @canonicalize_rank2_views(%src: !pto.ptr, %dst: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { diff --git a/test/lit/pto/issue783_canonicalize_rank5_noop.pto b/test/lit/pto/issue783_canonicalize_rank5_noop.pto deleted file mode 100644 index bfbce5481..000000000 --- a/test/lit/pto/issue783_canonicalize_rank5_noop.pto +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file that in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// Tests that the canonicalization pass is a no-op on already-canonical rank-5 views. 
- -// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-canonicalize-ir %s -o /dev/null 2>&1 | FileCheck %s - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @noop_rank5(%src: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c256 = arith.constant 256 : index - %c8192 = arith.constant 8192 : index - - // Already rank-5 canonical form — pass should not modify - %src_view = pto.make_tensor_view %src, shape = [%c1, %c1, %c1, %c16, %c8192], strides = [%c8192, %c8192, %c8192, %c8192, %c1] {layout = #pto.layout} : !pto.tensor_view<1x1x1x16x8192xbf16> - - // CHECK: pto.make_tensor_view {{.*}} shape = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], strides = [%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] {layout = #pto.layout} : !pto.tensor_view<1x1x1x16x8192xbf16> - // CHECK-NOT: arith.muli - // CHECK-NOT: arith.addi - - return - } -}